/// <summary>
/// Prints the title of the current page and queues every distinct link matched under the
/// <c>pagesYY</c> element of <c>#listdiv</c> as a new target request.
/// </summary>
/// <param name="context">Data-flow context that carries the downloaded response.</param>
/// <returns>A completed task holding <see cref="DataFlowResult.Success"/>.</returns>
public override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var response = context.GetResponse();

    Console.ForegroundColor = ConsoleColor.Blue;
    Console.WriteLine("第一页:" + context.GetSelectable().XPath(".//title").GetValue());
    Console.ForegroundColor = ConsoleColor.White;

    var pages = context.GetSelectable().XPath("//*[@id=\"listdiv\"]/div[@class='pagesYY']/div/a[not(@class)]/@href").GetValues();

    // A set states the "already queued" intent directly; Add returns false for a repeated link.
    var seenPages = new HashSet<string>();
    var requestList = new List<Request>();
    foreach (var page in pages)
    {
        // A blank href cannot be requested, and a repeated one must only be queued once.
        if (string.IsNullOrWhiteSpace(page) || !seenPages.Add(page))
        {
            continue;
        }

        requestList.Add(new Request
        {
            Url = page,
            OwnerId = response.Request.OwnerId
        });
    }

    if (requestList.Count > 0)
    {
        context.AddTargetRequests(requestList.ToArray());
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Records the page URL and title, then queues one target request per distinct tag link found
/// under <c>#post_rank</c>, attaching the link text to the request as its <c>tag</c> property.
/// </summary>
/// <param name="context">Data-flow context that carries the downloaded response.</param>
/// <returns>A completed task holding <see cref="DataFlowResult.Success"/>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var response = context.GetResponse();
    context.AddItem("URL", response.Request.Url);
    context.AddItem("Title", context.GetSelectable().XPath(".//title").GetValue());

    // Maps tag URL -> tag name.
    var tags = new Dictionary<string, string>();
    var tagNodes = context.GetSelectable().XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a").Nodes();
    foreach (var node in tagNodes)
    {
        var url = node.XPath("./@href").GetValue();

        // Dictionary.Add throws on a null or repeated key, so an anchor without an href or a
        // link that appears twice would abort the whole parse. The first occurrence wins.
        if (string.IsNullOrWhiteSpace(url) || tags.ContainsKey(url))
        {
            continue;
        }

        var name = node.GetValue();
        tags.Add(url, name);
        Console.WriteLine("url:" + url + " - name:" + name);
    }

    var requests = new List<Request>();
    foreach (var tag in tags)
    {
        var request = new Request
        {
            Url = tag.Key,
            OwnerId = response.Request.OwnerId,
        };
        request.Properties.Add("tag", tag.Value);
        requests.Add(request);
    }

    context.AddTargetRequests(requests.ToArray());
    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Collects the pagination URLs of the detail picture view (unclassed anchors under
/// <c>#pages</c>) and queues each distinct one as a target request that carries over the
/// <c>tag</c> and <c>referer</c> properties of the current request.
/// </summary>
/// <param name="context">Data-flow context that carries the downloaded response.</param>
public static void GetDetailPageUrl(DataFlowContext context)
{
    var response = context.GetResponse();
    var pages = context.GetSelectable().XPath("//*[@id=\"pages\"]/a[not(@class)]/@href").GetValues();

    // A set states the "already queued" intent directly; Add returns false for a repeated link.
    var seenPages = new HashSet<string>();
    var requestList = new List<Request>();
    foreach (var page in pages)
    {
        // A blank href cannot be requested, and a repeated one must only be queued once.
        if (string.IsNullOrWhiteSpace(page) || !seenPages.Add(page))
        {
            continue;
        }

        var request = new Request
        {
            Url = page,
            OwnerId = response.Request.OwnerId,
        };
        request.AddProperty("tag", response.Request.GetProperty("tag"));
        request.AddProperty("referer", response.Request.GetProperty("referer"));
        requestList.Add(request);
    }

    if (requestList.Count > 0)
    {
        context.AddTargetRequests(requestList.ToArray());
    }
}
/// <summary>
/// Prints the title of the current page to the console in yellow. No items or follow-up
/// requests are produced.
/// </summary>
/// <param name="context">Data-flow context that carries the downloaded response.</param>
/// <returns>A completed task holding <see cref="DataFlowResult.Success"/>.</returns>
public override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var title = context.GetSelectable().XPath(".//title").GetValue();

    Console.ForegroundColor = ConsoleColor.Yellow;
    Console.WriteLine("page 页:" + title);
    Console.ForegroundColor = ConsoleColor.White;

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Stores the request URL and the page title as the items "URL" and "Title".
/// </summary>
/// <param name="context">Data-flow context that carries the downloaded response.</param>
/// <returns>A completed task holding <see cref="DataFlowResult.Success"/>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var pageUrl = context.Response.Request.Url;
    context.AddItem("URL", pageUrl);

    var pageTitle = context.GetSelectable().XPath(".//title").GetValue();
    context.AddItem("Title", pageTitle);

    var outcome = DataFlowResult.Success;
    return Task.FromResult(outcome);
}
/// <summary>
/// Reads the two profile name spans into the items "author" and "username", hands every link
/// matching the GitHub <c>owner/repository</c> pattern to <c>AddFollowRequests</c>, and
/// terminates the flow when no user name was found.
/// </summary>
/// <param name="context">Data-flow context that carries the downloaded response.</param>
/// <returns>
/// A completed task holding <see cref="DataFlowResult.Terminated"/> when the user name is blank,
/// otherwise <see cref="DataFlowResult.Success"/>.
/// </returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var page = context.GetSelectable();

    // Parse the data.
    var fullName = page.XPath("//span[@class='p-name vcard-fullname d-block overflow-hidden']").GetValue();
    var userName = page.XPath("//span[@class='p-nickname vcard-username d-block']").GetValue();
    context.AddItem("author", fullName);
    context.AddItem("username", userName);

    // Add the target links.
    var repositoryLinks = page.Links().Regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").GetValues();
    AddFollowRequests(context, repositoryLinks);

    if (!string.IsNullOrWhiteSpace(userName))
    {
        return Task.FromResult(DataFlowResult.Success);
    }

    // Nothing was parsed: drop the items and skip the subsequent steps (storage etc.).
    context.ClearItems();
    return Task.FromResult(DataFlowResult.Terminated);
}
/// <summary>
/// Reads the first entry of <c>#subject_list</c> into the items "author" and "username",
/// hands every Douban book tag link to <c>AddTargetRequests</c>, and terminates the flow when
/// the first entry's heading text is blank.
/// </summary>
/// <param name="context">Data-flow context that carries the downloaded response.</param>
/// <returns>
/// A completed task holding <see cref="DataFlowResult.Terminated"/> when the heading text is
/// blank, otherwise <see cref="DataFlowResult.Success"/>.
/// </returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var selectable = context.GetSelectable();

    // Parse the data.
    var name = selectable.XPath("//*[@id=\"subject_list\"]/ul/li[1]/div[2]/h2/a")
               .GetValue();
    var author = selectable.XPath("//*[@id=\"subject_list\"]/ul/li[1]/div[2]/div[1]")
                 .GetValue();

    context.AddItem("author", author);
    context.AddItem("username", name);

    // Add the target links. Every dot of the host name is escaped so that it matches a literal
    // '.' only; an unescaped dot would accept any character in that position.
    var urls = selectable.Links().Regex("(https://book\\.douban\\.com/tag/[\\w\\-]+)").GetValues();
    AddTargetRequests(context, urls);

    // Nothing was parsed: drop the items and skip the subsequent steps (storage etc.).
    if (string.IsNullOrWhiteSpace(name))
    {
        context.ClearItems();
        return Task.FromResult(DataFlowResult.Terminated);
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Reads the two profile name spans into the items "author" and "username", hands every link
/// matching the GitHub <c>owner/repository</c> pattern to <c>AddTargetRequests</c>, and
/// terminates the flow when no user name was found.
/// </summary>
/// <param name="context">Data-flow context that carries the downloaded response.</param>
/// <returns>
/// A completed task holding <see cref="DataFlowResult.Terminated"/> when the user name is blank,
/// otherwise <see cref="DataFlowResult.Success"/>.
/// </returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var document = context.GetSelectable();

    // Extract the two name fields.
    var displayName = document.XPath("//span[@class='p-name vcard-fullname d-block overflow-hidden']").GetValue();
    var login = document.XPath("//span[@class='p-nickname vcard-username d-block']").GetValue();

    context.AddItem("author", displayName);
    context.AddItem("username", login);

    // Queue the matching links.
    var links = document.Links().Regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").GetValues();
    AddTargetRequests(context, links);

    // A blank user name means nothing was parsed, so the items are dropped and the flow stops.
    var nothingParsed = string.IsNullOrWhiteSpace(login);
    if (nothingParsed)
    {
        context.ClearItems();
    }

    return Task.FromResult(nothingParsed ? DataFlowResult.Terminated : DataFlowResult.Success);
}
/// <summary>
/// Records the page URL and title, then, for every distinct tag link found under
/// <c>#post_rank</c>, queues a target request and calls <c>CreateDirByTag</c> with the link text.
/// </summary>
/// <param name="context">Data-flow context that carries the downloaded response.</param>
/// <returns>A completed task holding <see cref="DataFlowResult.Success"/>.</returns>
public override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var response = context.GetResponse();
    context.AddItem("URL", response.Request.Url);
    context.AddItem("Title", context.GetSelectable().XPath(".//title").GetValue());

    // Maps tag URL -> tag name.
    var tags = new Dictionary<string, string>();
    var tagNodes = context.GetSelectable().XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a").Nodes();
    foreach (var node in tagNodes)
    {
        var url = node.XPath("./@href").GetValue();

        // Dictionary.Add throws on a null or repeated key, so an anchor without an href or a
        // link that appears twice would abort the whole parse. The first occurrence wins.
        if (string.IsNullOrWhiteSpace(url) || tags.ContainsKey(url))
        {
            continue;
        }

        var name = node.GetValue();
        tags.Add(url, name);
        Console.WriteLine("url:" + url + " - name:" + name);
    }

    var requests = new List<Request>();
    foreach (var sub in tags)
    {
        var request = new Request
        {
            Url = sub.Key,
            OwnerId = response.Request.OwnerId
        };
        requests.Add(request);
        CreateDirByTag(sub.Value);
    }

    context.AddTargetRequests(requests.ToArray());
    return Task.FromResult(DataFlowResult.Success);
}
//public DatabaseSpiderDataParser()
//{
//    CanParse = DataParserHelper.CanParseByRegex("cnblogs\\.com");
//    QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".");
//}

/// <summary>
/// Stores the request URL and page title as plain items and additionally registers a
/// <c>CnblogsEntry</c> parse result (with its table metadata) filled from the same two values.
/// </summary>
/// <param name="context">Data-flow context that carries the downloaded response.</param>
/// <returns>A completed task holding <see cref="DataFlowResult.Success"/>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var pageUrl = context.Response.Request.Url;
    context.AddItem("URL", pageUrl);

    var pageTitle = context.GetSelectable().XPath(".//title").GetValue();
    context.AddItem("Title", pageTitle);

    // Register the entity type under its full type name together with its table metadata.
    var entryTypeName = typeof(EntitySpider.CnblogsEntry).FullName;
    var entry = new EntitySpider.CnblogsEntry();
    context.Add(entryTypeName, entry.GetTableMetadata());

    // Fill the single entry and publish it as the parse result for that type.
    var parsedEntries = new ParseResult<EntitySpider.CnblogsEntry>();
    entry.WebSite = pageUrl;
    entry.Url = pageUrl;
    entry.Title = pageTitle;
    parsedEntries.Add(entry);
    context.AddParseItem(entryTypeName, parsedEntries);

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Extracts the image addresses from the picture browsing page (<c>#hgallery/img/@src</c>) and
/// hands one download request per image to <c>ImageDownloader</c>. Each request carries the
/// current request's <c>tag</c> and <c>referer</c> properties plus the page title as <c>subject</c>.
/// </summary>
/// <param name="context">Data-flow context that carries the downloaded response.</param>
public static void GetDetailPictureUrl(DataFlowContext context)
{
    var sourceRequest = context.Response.Request;
    context.AddItem("URL", sourceRequest.Url);

    // The title is the same for every image on the page, so it is evaluated once and reused
    // instead of re-running the XPath query inside the loop.
    var title = context.GetSelectable().XPath(".//title").GetValue();
    context.AddItem("Title", title);

    var images = context.GetSelectable().XPath("//*[@id=\"hgallery\"]/img/@src").GetValues();
    foreach (var image in images)
    {
        // An <img> without a usable src cannot be downloaded.
        if (string.IsNullOrWhiteSpace(image))
        {
            continue;
        }

        // Build the download request for this image URL.
        var request = new Request
        {
            Url = image,
            OwnerId = sourceRequest.OwnerId
        };
        request.AddProperty("tag", sourceRequest.GetProperty("tag"));
        request.AddProperty("referer", sourceRequest.GetProperty("referer"));
        request.AddProperty("subject", title);
        ImageDownloader.GetInstance().AddRequest(request);
    }
}
/// <summary>
/// Records the page URL and title, then calls <c>CreateFromRequest</c> once for every tag link
/// found under <c>#post_rank</c>.
/// </summary>
/// <param name="context">Data-flow context that carries the downloaded response.</param>
/// <returns>A completed task holding <see cref="DataFlowResult.Success"/>.</returns>
public override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var response = context.GetResponse();
    var currentRequest = response.Request;

    context.AddItem("URL", currentRequest.Url);

    var pageTitle = context.GetSelectable().XPath(".//title").GetValue();
    context.AddItem("Title", pageTitle);

    var tagLinks = context.GetSelectable().XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a/@href").GetValues();
    foreach (var tagLink in tagLinks)
    {
        // NOTE(review): whatever CreateFromRequest returns is discarded and nothing is added to
        // the context here - confirm the helper enqueues the request itself.
        CreateFromRequest(currentRequest, tagLink);
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Prints the title of the current page to the console in yellow and then delegates to
/// <c>GetSubjectUrl</c>.
/// </summary>
/// <param name="context">Data-flow context that carries the downloaded response.</param>
/// <returns>A completed task holding <see cref="DataFlowResult.Success"/>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    // Switch the colour first, exactly as before, then build and print the banner line.
    Console.ForegroundColor = ConsoleColor.Yellow;
    var pageTitle = context.GetSelectable().XPath(".//title").GetValue();
    var banner = "page 页:" + pageTitle;
    Console.WriteLine(banner);
    Console.ForegroundColor = ConsoleColor.White;

    GetSubjectUrl(context);

    var outcome = DataFlowResult.Success;
    return Task.FromResult(outcome);
}
/// <summary>
/// Looks up the first link selected by <c>_nextPageSelector</c> and, when one exists and passes
/// the optional <c>CanParse</c> filter, adds a follow request for it whose page index is one
/// higher than the current request's.
/// </summary>
/// <param name="context">Data-flow context that carries the downloaded response.</param>
/// <returns>A completed task holding <see cref="DataFlowResult.Success"/>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var nextPageUrl = context.GetSelectable().XPath(_nextPageSelector).Links().GetValue();

    // No next-page link on this page: nothing to follow.
    if (string.IsNullOrWhiteSpace(nextPageUrl))
    {
        return Task.FromResult(DataFlowResult.Success);
    }

    var nextPageRequest = CreateFromRequest(context, nextPageUrl);
    nextPageRequest.PageIndex = context.Response.Request.PageIndex + 1;

    // Without a filter every request is accepted; otherwise the filter decides.
    var accepted = CanParse == null || CanParse(nextPageRequest);
    if (accepted)
    {
        context.FollowRequests.Add(nextPageRequest);
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Walks every element with class <c>news_block</c> and reads its link, summary and view count.
/// </summary>
/// <remarks>
/// NOTE(review): the members of <c>News</c> are not visible from here, so the extracted values
/// are not yet copied into the created objects and the list is not handed to the context -
/// this parser looks unfinished; confirm the intended mapping.
/// </remarks>
/// <param name="context">Data-flow context that carries the downloaded response.</param>
/// <returns>A completed task holding <see cref="DataFlowResult.Success"/>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    // An XPath step needs a node test, so "*" is required before the predicate.
    var news = context.GetSelectable().XPath(".//*[@class=\"news_block\"]").Nodes();
    var newsObjs = new List<News>();
    foreach (var item in news)
    {
        var url = item.Select(Selectors.XPath(".//h2[@class=\"news_entry\"]/a/@href")).GetValue();
        var summary = item.Select(Selectors.XPath(".//div[@class=\"entry_summary\"]")).GetValue();

        // The predicate must be closed with "]".
        var viewText = item.Select(Selectors.XPath(".//span[@class=\"view\"]")).GetValue();

        // Strip the " 人浏览" suffix before parsing. string.Replace throws when its first
        // argument is empty, so the suffix has to be the value that is searched for.
        // TryParse keeps one malformed counter from aborting the whole page; it then counts as 0.
        int views;
        var digits = (viewText ?? string.Empty).Replace(" 人浏览", "").Trim();
        if (!int.TryParse(digits, out views))
        {
            views = 0;
        }

        newsObjs.Add(new News { });
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Gets the subject URLs (anchors inside <c>galleryli_title</c> under <c>#listdiv</c>) and queues
/// one target request per link, carrying over the current request's <c>tag</c> property and
/// using the current URL as <c>referer</c>.
/// </summary>
/// <param name="context">Data-flow context that carries the downloaded response.</param>
public static void GetSubjectUrl(DataFlowContext context)
{
    var response = context.GetResponse();
    var subjectLinks = context.GetSelectable().XPath("//*[@id=\"listdiv\"]/ul/li/div[@class='galleryli_title']/a/@href").GetValues();

    var pending = new List<Request>();
    foreach (var link in subjectLinks)
    {
        var subjectRequest = new Request();
        subjectRequest.Url = link;
        subjectRequest.OwnerId = response.Request.OwnerId;
        subjectRequest.AddProperty("tag", response.Request.GetProperty("tag"));
        subjectRequest.AddProperty("referer", response.Request.Url);
        pending.Add(subjectRequest);
    }

    // Nothing matched: leave the context untouched.
    if (pending.Count == 0)
    {
        return;
    }

    context.AddTargetRequests(pending.ToArray());
}
/// <summary>
/// Parses the pagination of an album: every distinct link matched under the <c>pagesYY</c>
/// element of <c>#listdiv</c> is queued as a target request.
/// </summary>
/// <param name="context">Data-flow context that carries the downloaded response.</param>
public static void GetSubjectPageUrl(DataFlowContext context)
{
    var response = context.GetResponse();
    var visited = new Dictionary<string, string>();
    var hrefs = context.GetSelectable().XPath("//*[@id=\"listdiv\"]/div[@class='pagesYY']/div/a[not(@class)]/@href").GetValues();

    var queued = new List<Request>();
    foreach (var href in hrefs)
    {
        if (visited.ContainsKey(href))
        {
            continue;
        }

        try
        {
            var pageRequest = new Request();
            pageRequest.Url = href;
            pageRequest.OwnerId = response.Request.OwnerId;

            // The tag is a fixed literal here; it is not copied from the incoming request.
            pageRequest.AddProperty("tag", "萝莉");

            queued.Add(pageRequest);

            // Marked as visited only after the request was built and queued successfully.
            visited.Add(href, href);
        }
        catch (Exception e)
        {
            // A failure for one link is reported on the console and the loop carries on.
            Console.WriteLine(e);
        }
    }

    if (queued.Count == 0)
    {
        return;
    }

    context.AddTargetRequests(queued.ToArray());
}
/// <summary>
/// Parses the current page into objects of type <typeparamref name="T"/> as described by
/// <c>_model</c> and merges them into the context's parse items for that type.
/// </summary>
/// <remarks>
/// The request properties, overlaid with the model's shared value selectors, form the
/// environment handed to <c>ParseObject</c>. Without a model selector the whole page yields at
/// most one object; with one, each selected node (optionally limited by <c>Take</c>) yields one.
/// </remarks>
/// <param name="context">Data-flow context that carries the downloaded response.</param>
/// <returns>A completed task holding <see cref="DataFlowResult.Success"/>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    // Make sure the table metadata for this type is registered exactly once.
    if (!context.Contains(_model.TypeName))
    {
        context.Add(_model.TypeName, _tableMetadata);
    }

    var selectable = context.GetSelectable();
    var results = new ParseResult<T>();

    if (selectable.Properties == null)
    {
        selectable.Properties = new Dictionary<string, object>();
    }

    // Start the environment from the request properties ...
    var environments = new Dictionary<string, string>();
    foreach (var property in context.Response.Request.Properties)
    {
        environments.Add(property.Key, property.Value);
    }

    // ... and let the shared value selectors add to or override it.
    if (_model.ShareValueSelectors != null)
    {
        foreach (var shared in _model.ShareValueSelectors)
        {
            string key = shared.Name;
            var sharedValue = selectable.Select(shared.ToSelector()).GetValue();
            environments[key] = sharedValue;
        }
    }

    if (_model.Selector == null)
    {
        // Single extractor: the page itself is the only candidate.
        var single = ParseObject(environments, selectable, 0);
        if (single == null)
        {
            Logger?.LogWarning($"解析到空数据,类型: {_model.TypeName}");
        }
        else
        {
            results.Add(single);
        }
    }
    else
    {
        var nodes = selectable.SelectList(_model.Selector.ToSelector()).Nodes()?.ToList();
        if (nodes != null)
        {
            // Keep only the first or last Take nodes when a limit is configured.
            if (_model.Take > 0 && nodes.Count > _model.Take)
            {
                if (_model.TakeFromHead)
                {
                    nodes = nodes.Take(_model.Take).ToList();
                }
                else
                {
                    nodes = nodes.Skip(nodes.Count - _model.Take).ToList();
                }
            }

            var index = 0;
            foreach (var node in nodes)
            {
                var parsed = ParseObject(environments, node, index);
                if (parsed == null)
                {
                    Logger?.LogWarning($"解析到空数据,类型: {_model.TypeName}");
                }
                else
                {
                    results.Add(parsed);
                }

                index++;
            }
        }
    }

    if (results.Count == 0)
    {
        return Task.FromResult(DataFlowResult.Success);
    }

    // Append to an existing result set for this type, or register a new one.
    var existing = context.GetParseItem(_model.TypeName);
    if (existing == null)
    {
        context.AddParseItem(_model.TypeName, results);
    }
    else
    {
        ((ParseResult<T>)existing).AddRange(results);
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Extracts fields from the page as described by <c>_mapping</c> and stores them as JSON in the
/// "Content" item.
/// </summary>
/// <remarks>
/// <list type="bullet">
/// <item>Without a mapping nothing is extracted.</item>
/// <item>When a depth of at least 1 is configured, pages at any other depth have their items cleared.</item>
/// <item>With an item selector, every matched node becomes one dictionary in a JSON array.</item>
/// <item>Without an item selector, the field mapping is applied to the whole page; values have tabs removed and are trimmed.</item>
/// <item>With neither, the raw response text is stored.</item>
/// </list>
/// </remarks>
/// <param name="context">Data-flow context that carries the downloaded response.</param>
/// <returns>A completed task holding <see cref="DataFlowResult.Success"/>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    if (_mapping == null)
    {
        return Task.FromResult(DataFlowResult.Success);
    }

    // Only pages at the configured depth are of interest.
    if (_mapping.Deepth.GetValueOrDefault() >= 1
        && context.Response.Request.Depth != _mapping.Deepth.Value)
    {
        context.ClearItems();
        return Task.FromResult(DataFlowResult.Success);
    }

    var pageUrl = context.Response.Request.Url;
    var hasFields = _mapping.Mapping != null && _mapping.Mapping.Length > 0;

    if (!string.IsNullOrWhiteSpace(_mapping.ItemCssSelector))
    {
        var items = new List<Dictionary<string, string>>();

        // Without field definitions no item can be produced. Checking first also avoids
        // iterating a null Mapping, which the page-level branch below already guards against.
        if (hasFields)
        {
            var itemNodes = context.GetSelectable().XPath(_mapping.ItemCssSelector).Nodes();
            foreach (var node in itemNodes)
            {
                var item = new Dictionary<string, string>();
                foreach (var field in _mapping.Mapping)
                {
                    // Indexer assignment tolerates a field name that is configured twice.
                    item[field.Field] = node.XPath(field.CssSelector).GetValue();
                }

                item["PageSourceURL"] = pageUrl;
                items.Add(item);
            }
        }

        if (items.Count > 0)
        {
            context.AddItem("Content", JsonConvert.SerializeObject(items));
        }
    }
    else if (hasFields)
    {
        var item = new Dictionary<string, string>();
        foreach (var field in _mapping.Mapping)
        {
            var value = context.GetSelectable().XPath(field.CssSelector).GetValue();
            if (value != null)
            {
                value = value.Replace("\t", "").Trim();
            }

            // Indexer assignment tolerates a field name that is configured twice.
            item[field.Field] = value;
        }

        item["PageSourceURL"] = pageUrl;
        context.AddItem("Content", JsonConvert.SerializeObject(item, Formatting.Indented));
    }
    else
    {
        // No field mapping at all: keep the raw page.
        context.AddItem("PageSourceURL", pageUrl);
        context.AddItem("Content", context.Response.RawText);
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Stores the request URL as "URL" and the values of every <c>div</c> with class <c>quote</c>
/// as "Quotes".
/// </summary>
/// <param name="context">Data-flow context that carries the downloaded response.</param>
/// <returns>A completed task holding <see cref="DataFlowResult.Success"/>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var pageUrl = context.Response.Request.Url;
    context.AddItem("URL", pageUrl);

    var quotes = context.GetSelectable().XPath("//div[@class='quote']").GetValues();
    context.AddItem("Quotes", quotes);

    var outcome = DataFlowResult.Success;
    return Task.FromResult(outcome);
}