/// <summary>
/// Records the page URL and title as result items, then queues one follow-up request for
/// every distinct tag link found in the "tag_div" list. The tag's display text is stored
/// on each request under the "tag" property.
/// </summary>
/// <param name="context">Data-flow context that supplies the response and the selectable document.</param>
/// <returns>A completed task holding <c>DataFlowResult.Success</c>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var response = context.GetResponse();
    context.AddItem("URL", response.Request.Url);
    context.AddItem("Title", context.GetSelectable().XPath(".//title").GetValue());

    // Maps tag href -> tag display text.
    var tags = new Dictionary<string, string>();
    var tagNodes = context.GetSelectable()
        .XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a")
        .Nodes();
    foreach (var node in tagNodes)
    {
        var url = node.XPath("./@href").GetValue();
        var name = node.GetValue();

        // Dictionary.Add throws on a null or repeated key; an anchor without an href or a
        // tag listed twice must not abort the whole parse, so such entries are skipped
        // (the first occurrence of an href wins).
        if (string.IsNullOrWhiteSpace(url) || tags.ContainsKey(url))
        {
            continue;
        }

        tags.Add(url, name);
        Console.WriteLine("url:" + url + " - name:" + name);
    }

    var requests = new List<Request>();
    foreach (var tag in tags)
    {
        var request = new Request
        {
            Url = tag.Key,
            OwnerId = response.Request.OwnerId,
        };
        request.Properties.Add("tag", tag.Value);
        requests.Add(request);
    }

    context.AddTargetRequests(requests.ToArray());
    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Queues one follow-up request for every distinct pagination link of the detailed image
/// view, i.e. each anchor without a class attribute directly under the element whose id
/// is "pages". The "tag" and "referer" properties of the current request are copied onto
/// each new request.
/// </summary>
/// <param name="context">Data-flow context that supplies the response and the selectable document.</param>
public static void GetDetailPageUrl(DataFlowContext context)
{
    var response = context.GetResponse();
    var pages = context.GetSelectable().XPath("//*[@id=\"pages\"]/a[not(@class)]/@href").GetValues();

    // A set is all that is needed to drop repeated links.
    var seenPages = new HashSet<string>();
    var requestList = new List<Request>();
    foreach (var page in pages)
    {
        // Skip blank hrefs (they cannot be fetched) and links that were already queued;
        // HashSet.Add returns false when the value is already present.
        if (string.IsNullOrWhiteSpace(page) || !seenPages.Add(page))
        {
            continue;
        }

        var request = new Request
        {
            Url = page,
            OwnerId = response.Request.OwnerId,
        };
        request.AddProperty("tag", response.Request.GetProperty("tag"));
        request.AddProperty("referer", response.Request.GetProperty("referer"));
        requestList.Add(request);
    }

    if (requestList.Count > 0)
    {
        context.AddTargetRequests(requestList.ToArray());
    }
}
/// <summary>
/// Logs the title of the current page to the console and queues one follow-up request
/// for every distinct pagination link found in the "pagesYY" block of the "listdiv"
/// element (anchors without a class attribute).
/// </summary>
/// <param name="context">Data-flow context that supplies the response and the selectable document.</param>
/// <returns>A completed task holding <c>DataFlowResult.Success</c>.</returns>
public override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var response = context.GetResponse();

    // Resolve the title before touching the console so a selector failure cannot leave
    // the colour changed.
    var title = context.GetSelectable().XPath(".//title").GetValue();

    // Restore whatever colour was active before, rather than forcing white, so the
    // caller's console settings survive this call.
    var previousColor = Console.ForegroundColor;
    Console.ForegroundColor = ConsoleColor.Blue;
    try
    {
        Console.WriteLine("第一页:" + title);
    }
    finally
    {
        Console.ForegroundColor = previousColor;
    }

    var pages = context.GetSelectable()
        .XPath("//*[@id=\"listdiv\"]/div[@class='pagesYY']/div/a[not(@class)]/@href")
        .GetValues();

    var seenPages = new HashSet<string>();
    var requestList = new List<Request>();
    foreach (var page in pages)
    {
        // Skip blank hrefs and links that were already queued; HashSet.Add returns false
        // when the value is already present.
        if (string.IsNullOrWhiteSpace(page) || !seenPages.Add(page))
        {
            continue;
        }

        requestList.Add(new Request
        {
            Url = page,
            OwnerId = response.Request.OwnerId,
        });
    }

    if (requestList.Count > 0)
    {
        context.AddTargetRequests(requestList.ToArray());
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Records the page URL and title as result items, queues one follow-up request for
/// every distinct tag link found in the "tag_div" list, and calls
/// <c>CreateDirByTag</c> with each tag's display text.
/// </summary>
/// <param name="context">Data-flow context that supplies the response and the selectable document.</param>
/// <returns>A completed task holding <c>DataFlowResult.Success</c>.</returns>
public override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var response = context.GetResponse();
    context.AddItem("URL", response.Request.Url);
    context.AddItem("Title", context.GetSelectable().XPath(".//title").GetValue());

    // Maps tag href -> tag display text.
    var tags = new Dictionary<string, string>();
    var tagNodes = context.GetSelectable()
        .XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a")
        .Nodes();
    foreach (var node in tagNodes)
    {
        var url = node.XPath("./@href").GetValue();
        var name = node.GetValue();

        // Dictionary.Add throws on a null or repeated key; an anchor without an href or a
        // tag listed twice must not abort the whole parse, so such entries are skipped
        // (the first occurrence of an href wins).
        if (string.IsNullOrWhiteSpace(url) || tags.ContainsKey(url))
        {
            continue;
        }

        tags.Add(url, name);
        Console.WriteLine("url:" + url + " - name:" + name);
    }

    var requests = new List<Request>();
    foreach (var sub in tags)
    {
        var request = new Request
        {
            Url = sub.Key,
            OwnerId = response.Request.OwnerId
        };
        requests.Add(request);
        CreateDirByTag(sub.Value);
    }

    context.AddTargetRequests(requests.ToArray());
    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Runs the parsing pipeline for one response: applies the optional <c>CanParse</c>
/// filter, invokes the optional <c>Selectable</c> hook, awaits <c>Parse</c>, and then
/// queues the follow-up requests produced by the optional <c>Follow</c> hook.
/// </summary>
/// <param name="context">Data-flow context that supplies the response being handled.</param>
/// <returns>
/// <c>DataFlowResult.Terminated</c> when the filter rejects the request; the result of
/// <c>Parse</c> when that is Failed or Terminated; <c>DataFlowResult.Failed</c> when any
/// exception is thrown; otherwise <c>DataFlowResult.Success</c>.
/// </returns>
public override async Task<DataFlowResult> HandleAsync(DataFlowContext context)
{
    try
    {
        var response = context.GetResponse();
        var request = response.Request;

        // Terminate the data flow when a filter is configured and rejects this request.
        if (CanParse != null && !CanParse(request))
        {
            return DataFlowResult.Terminated;
        }

        Selectable?.Invoke(context);

        var parserResult = await Parse(context);
        if (parserResult == DataFlowResult.Failed || parserResult == DataFlowResult.Terminated)
        {
            return parserResult;
        }

        var urls = Follow?.Invoke(context);
        if (urls != null && urls.Length > 0)
        {
            var followRequests = new List<Request>();
            foreach (var url in urls)
            {
                var followRequest = CreateFromRequest(request, url);

                // CanParse is optional (see the check above). With no filter configured,
                // every follow request is accepted instead of dereferencing a null delegate.
                if (CanParse == null || CanParse(followRequest))
                {
                    followRequests.Add(followRequest);
                }
            }

            context.AddTargetRequests(followRequests.ToArray());
        }

        return DataFlowResult.Success;
    }
    catch (Exception e)
    {
        // Any failure in the pipeline is logged and reported as Failed rather than rethrown.
        Logger?.LogError($"数据解析发生异常: {e}");
        return DataFlowResult.Failed;
    }
}
/// <summary>
/// Reads the subject links (anchors inside the "galleryli_title" blocks of the "listdiv"
/// list) and queues one request per link. Each request inherits the owner id and "tag"
/// property of the current request and records the current request's URL as "referer".
/// </summary>
/// <param name="context">Data-flow context that supplies the response and the selectable document.</param>
public static void GetSubjectUrl(DataFlowContext context)
{
    var response = context.GetResponse();
    var subjectLinks = context.GetSelectable()
        .XPath("//*[@id=\"listdiv\"]/ul/li/div[@class='galleryli_title']/a/@href")
        .GetValues();

    var queued = new List<Request>();
    foreach (var link in subjectLinks)
    {
        var parent = response.Request;

        var subjectRequest = new Request();
        subjectRequest.Url = link;
        subjectRequest.OwnerId = parent.OwnerId;
        subjectRequest.AddProperty("tag", parent.GetProperty("tag"));
        subjectRequest.AddProperty("referer", parent.Url);

        queued.Add(subjectRequest);
    }

    // Nothing found on the page: leave the context untouched.
    if (queued.Count == 0)
    {
        return;
    }

    context.AddTargetRequests(queued.ToArray());
}
/// <summary>
/// Parses the album pagination: queues one request for every distinct link found in the
/// "pagesYY" block of the "listdiv" element (anchors without a class attribute).
/// </summary>
/// <param name="context">Data-flow context that supplies the response and the selectable document.</param>
public static void GetSubjectPageUrl(DataFlowContext context)
{
    var response = context.GetResponse();
    var visited = new Dictionary<string, string>();
    var pageLinks = context.GetSelectable()
        .XPath("//*[@id=\"listdiv\"]/div[@class='pagesYY']/div/a[not(@class)]/@href")
        .GetValues();

    var queued = new List<Request>();
    foreach (var link in pageLinks)
    {
        // Each link is queued at most once.
        if (visited.ContainsKey(link))
        {
            continue;
        }

        try
        {
            var pageRequest = new Request();
            pageRequest.Url = link;
            pageRequest.OwnerId = response.Request.OwnerId;

            // NOTE(review): the tag is a fixed literal here, whereas GetSubjectUrl copies
            // it from the parent request - confirm this is intended.
            pageRequest.AddProperty("tag", "萝莉");

            queued.Add(pageRequest);
            visited.Add(link, link);
        }
        catch (Exception e)
        {
            // Best effort: a link that fails is printed and skipped; the rest are still processed.
            Console.WriteLine(e);
        }
    }

    if (queued.Count == 0)
    {
        return;
    }

    context.AddTargetRequests(queued.ToArray());
}