/// <summary>
/// Parses a category list page: queues one request per category tile found under
/// the <c>TypeList</c> container and, when the pager marks this as the first page,
/// queues requests for page 2 through the last page.
/// </summary>
/// <param name="context">Data-flow context holding the response and its selectable document.</param>
/// <returns>A completed task carrying <see cref="DataFlowResult.Success"/>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var currentRequest = context.Response.Request;

    // Category URL -> category name. Dictionary.Add throws on a null or duplicate
    // key, so tiles without an href and repeated links are skipped instead of
    // aborting the whole page.
    var tags = new Dictionary<string, string>();
    var tagNodes = context.Selectable.XPath(".//div[@class='TypeList']//ul//li").Nodes();
    foreach (var node in tagNodes)
    {
        var url = node.XPath(".//a[@class='TypeBigPics']/@href").GetValue();
        var name = node.XPath(".//div[@class='ListTit']").GetValue();
        if (string.IsNullOrWhiteSpace(url) || tags.ContainsKey(url))
        {
            continue;
        }

        tags.Add(url, name);
    }

    // Both values are the same for every category request, so resolve them once.
    var subject = context.Selectable.XPath(".//title").GetValue();
    var referer = currentRequest.GetProperty("referer");

    var requests = new List<Request>();
    foreach (var tag in tags)
    {
        var request = new Request { Url = tag.Key, OwnerId = currentRequest.OwnerId };
        request.AddProperty("tag", tag.Value);
        // Fall back to the category URL itself when the current request has no referer.
        request.AddProperty("referer", referer ?? tag.Key);
        request.AddProperty("subject", subject);
        requests.Add(request);
    }

    // If this is the first page, find the last page and queue pages 2..last.
    // The third pager entry reading "2" is used as the first-page marker.
    var thisPage = context.Selectable.XPath(".//div[@class='NewPages']//ul//li[3]//a").GetValue();
    if (thisPage == "2")
    {
        // Matches the "<number>.htm" file name; the dot is escaped so it only
        // matches a literal '.'.
        var pageFilePattern = new Regex("\\d{1,3}\\.htm");
        var lastPageNodes = context.Selectable.XPath(".//div[@class='NewPages']//ul//li[last()]//a").Nodes();
        foreach (var node in lastPageNodes)
        {
            var lastPageFile = node.XPath("@href").Regex("\\d{1,3}\\.htm").GetValue();

            // Skip links whose href carries no usable page number; a total below 2
            // means there is nothing further to queue (and would otherwise yield a
            // negative array length).
            if (lastPageFile == null
                || !int.TryParse(lastPageFile.Replace(".htm", ""), out var totalPage)
                || totalPage < 2)
            {
                continue;
            }

            var pageRequests = new Request[totalPage - 1];
            for (var i = 2; i <= totalPage; i++)
            {
                pageRequests[i - 2] = new Request
                {
                    Url = pageFilePattern.Replace(currentRequest.Url, $"{i}.htm"),
                    OwnerId = currentRequest.OwnerId
                };
            }

            context.AddExtraRequests(pageRequests);
        }
    }

    context.AddExtraRequests(requests.ToArray());
    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Records the page URL and title as data items, then queues one request per tag
/// link found in the <c>post_rank</c> tag list, carrying the link text as the
/// <c>tag</c> property.
/// </summary>
/// <param name="context">Data-flow context holding the response and its selectable document.</param>
/// <returns>A completed task carrying <see cref="DataFlowResult.Success"/>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    context.AddData("URL", context.Response.Request.Url);
    context.AddData("Title", context.Selectable.XPath(".//title").GetValue());

    // Tag URL -> tag name. Dictionary.Add throws on a null or duplicate key, so
    // links without an href and repeated links are skipped rather than failing
    // the whole page.
    var tags = new Dictionary<string, string>();
    var tagNodes = context.Selectable
        .XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a").Nodes();
    foreach (var node in tagNodes)
    {
        var url = node.XPath("./@href").GetValue();
        var name = node.GetValue();
        if (string.IsNullOrWhiteSpace(url) || tags.ContainsKey(url))
        {
            continue;
        }

        tags.Add(url, name);
        Console.WriteLine("url:" + url + " - name:" + name);
    }

    var requests = new List<Request>();
    foreach (var tag in tags)
    {
        var request = new Request { Url = tag.Key, OwnerId = context.Response.Request.OwnerId };
        request.AddProperty("tag", tag.Value);
        requests.Add(request);
    }

    context.AddExtraRequests(requests.ToArray());
    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Collects the pagination URLs of the detail (image viewer) page and queues one
/// request per distinct URL, copying the current request's <c>tag</c> and
/// <c>referer</c> properties onto each.
/// </summary>
/// <param name="context">Data-flow context holding the response and its selectable document.</param>
public static void GetDetailPageUrl(DataFlowContext context)
{
    var pages = context.Selectable.XPath("//*[@id=\"pages\"]/a[not(@class)]/@href").GetValues();

    var currentRequest = context.Response.Request;
    var tag = currentRequest.GetProperty("tag");
    var referer = currentRequest.GetProperty("referer");

    // HashSet.Add returns false for a value it already holds, which gives
    // de-duplication in a single lookup.
    var seenPages = new HashSet<string>();
    var requestList = new List<Request>();
    foreach (var page in pages)
    {
        // Skip pager links without a usable href and URLs that were already queued.
        if (string.IsNullOrWhiteSpace(page) || !seenPages.Add(page))
        {
            continue;
        }

        var request = new Request { Url = page, OwnerId = currentRequest.OwnerId };
        request.AddProperty("tag", tag);
        request.AddProperty("referer", referer);
        requestList.Add(request);
    }

    if (requestList.Count > 0)
    {
        context.AddExtraRequests(requestList.ToArray());
    }
}
/// <summary>
/// Extracts the profile's full name and user name, queues every GitHub
/// repository-style link found on the page, and terminates the flow when no user
/// name could be extracted.
/// </summary>
/// <param name="context">Data-flow context holding the response and its selectable document.</param>
/// <returns>
/// <see cref="DataFlowResult.Terminated"/> (after clearing the collected data) when the
/// user name is blank; otherwise <see cref="DataFlowResult.Success"/>.
/// </returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var page = context.Selectable;

    // Extract the data items.
    var fullName = page
        .XPath("//span[@class='p-name vcard-fullname d-block overflow-hidden']")
        .GetValue();
    var userName = page
        .XPath("//span[@class='p-nickname vcard-username d-block']")
        .GetValue();
    context.AddData("author", fullName);
    context.AddData("username", userName);

    // Queue the target links.
    var targetLinks = page.Links().Regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").GetValues();
    foreach (var link in targetLinks)
    {
        var followUp = CreateFromRequest(context.Response.Request, link);
        context.AddExtraRequests(followUp);
    }

    if (!string.IsNullOrWhiteSpace(userName))
    {
        return Task.FromResult(DataFlowResult.Success);
    }

    // Nothing was parsed: drop the data so later steps (storage etc.) are skipped.
    context.ClearData();
    return Task.FromResult(DataFlowResult.Terminated);
}
/// <summary>
/// Runs the parsing step: builds the selectable document if needed, calls
/// <c>Parse</c>, and queues the follow-up requests returned by
/// <c>FollowRequestQuerier</c>.
/// </summary>
/// <param name="context">Processing context.</param>
/// <returns>
/// <see cref="DataFlowResult.Failed"/> when the context or response is missing or an
/// exception is thrown; the result of <c>Parse</c> when it is
/// <see cref="DataFlowResult.Failed"/> or <see cref="DataFlowResult.Terminated"/>;
/// otherwise <see cref="DataFlowResult.Success"/>.
/// </returns>
public override async Task<DataFlowResult> HandleAsync(DataFlowContext context)
{
    if (context?.Response == null)
    {
        Logger?.LogError("数据上下文或者响应内容为空");
        return DataFlowResult.Failed;
    }

    try
    {
        // Skip when the request does not match; this must not affect the other
        // data-flow processors.
        if (Required != null && !Required(context.Response.Request))
        {
            return DataFlowResult.Success;
        }

        if (context.Selectable == null)
        {
            context.Selectable = SelectableBuilder != null
                ? SelectableBuilder(context)
                : context.Response?.ToSelectable();
        }

        var dataFlowResult = await Parse(context);

        var requests = FollowRequestQuerier == null
            ? new List<Request>(0)
            : FollowRequestQuerier(context);

        if (requests != null && requests.Count > 0)
        {
            foreach (var request in requests)
            {
                if (request != null && (Required == null || Required(request)))
                {
                    // Force the OwnerId (and AgentId) here in case the user forgot
                    // to set it; requests that already have an OwnerId are left as-is.
                    if (string.IsNullOrWhiteSpace(request.OwnerId))
                    {
                        request.OwnerId = context.Response.Request.OwnerId;
                        request.AgentId = context.Response.Request.AgentId;
                    }

                    context.AddExtraRequests(request);
                }
            }
        }

        if (dataFlowResult == DataFlowResult.Failed || dataFlowResult == DataFlowResult.Terminated)
        {
            return dataFlowResult;
        }

        return DataFlowResult.Success;
    }
    catch (Exception e)
    {
        // Null-conditional access: the failure being reported may itself be caused
        // by a missing Request, and the handler must not throw while logging it.
        Logger?.LogError($"任务 {context.Response?.Request?.OwnerId} 数据解析发生异常: {e}");
        return DataFlowResult.Failed;
    }
}
/// <summary>
/// Parses an article page: when the page's <c>Next(...)</c> call reports more than
/// one page and the current page is "1", queues the remaining pages; then hands
/// every article image to <c>ImageDownloader</c>.
/// </summary>
/// <param name="context">Data-flow context holding the response and its selectable document.</param>
/// <returns>A completed task carrying <see cref="DataFlowResult.Success"/>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    Console.WriteLine(context.Response.Request.Url);

    var pagerNodes = context.Selectable.Regex("Next(.+)").Nodes();
    foreach (var node in pagerNodes)
    {
        // The page's Next(...) call carries the current page id and the total page count.
        var raw = node.GetValue();
        if (raw == null)
        {
            continue;
        }

        var el = raw.Replace("Next(", "").Replace("\\", "").Replace("\"", "");
        var elArry = el.Split(',');

        // Need at least "<current>,<total>"; a non-numeric total means there is
        // nothing to paginate.
        if (elArry.Length < 2 || !int.TryParse(elArry[1], out int pages))
        {
            continue;
        }

        if (pages > 1 && elArry[0] == "1")
        {
            // The title is the same for every follow-up page, so resolve it once.
            var title = context.Selectable.XPath(".//title").GetValue();
            var requests = new List<Request>();
            for (int i = 2; i <= pages; i++)
            {
                var request = new Request
                {
                    OwnerId = context.Response.Request.OwnerId,
                    Url = context.Response.Request.Url.Replace(".htm", $"_{i}.htm")
                };
                request.AddProperty("tag", title);
                requests.Add(request);
            }

            Console.WriteLine($"{context.Response.Request.Url}\t{pages}\t{requests.Count}");
            count += pages;
            context.AddExtraRequests(requests.ToArray());
        }
    }

    var imgNodes = context.Selectable.XPath(".//div[@id='ArticleId0']//p//a//img").Nodes();
    foreach (var imgNode in imgNodes)
    {
        var url = imgNode.XPath("@src").GetValue();
        var element = (imgNode as Selectable)?.Elements?.FirstOrDefault();

        // Skip images without a source or without a backing element instead of
        // failing the whole page on one bad node.
        if (string.IsNullOrWhiteSpace(url) || element == null)
        {
            continue;
        }

        // Re-parse the element with empty alt attributes removed before reading @alt.
        // NOTE(review): presumably works around markup that carries an empty alt
        // ahead of the real one — confirm against the target site.
        var alt = new Selectable(element.OuterHtml.Replace("alt=\"\"", ""));
        var name = alt.XPath("//img/@alt").GetValue();

        var request = new Request { Url = url, OwnerId = context.Response.Request.OwnerId };
        request.AddProperty("tag",
            context.Selectable.XPath(".//div[@class='position gray']//div[1]//a[2]").GetValue());
        // Fall back to the image URL itself when the current request has no referer.
        request.AddProperty("referer", context.Response.Request.GetProperty("referer") ?? url);
        request.AddProperty("subject", name);
        ImageDownloader.GetInstance().AddRequest(request);
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Reads the subject (gallery) links from the list page and queues one request per
/// link, passing along the current request's <c>tag</c> and using the current URL
/// as <c>referer</c>.
/// </summary>
/// <param name="context">Data-flow context holding the response and its selectable document.</param>
public static void GetSubjectUrl(DataFlowContext context)
{
    var subjectLinks = context.Selectable
        .XPath("//*[@id=\"listdiv\"]/ul/li/div[@class='galleryli_title']/a/@href")
        .GetValues();

    var pending = new List<Request>();
    foreach (var link in subjectLinks)
    {
        var subjectRequest = new Request
        {
            Url = link,
            OwnerId = context.Response.Request.OwnerId
        };
        subjectRequest.AddProperty("tag", context.Response.Request.GetProperty("tag"));
        subjectRequest.AddProperty("referer", context.Response.Request.Url);
        pending.Add(subjectRequest);
    }

    // Nothing found: leave the context untouched.
    if (pending.Count <= 0)
    {
        return;
    }

    context.AddExtraRequests(pending.ToArray());
}
/// <summary>
/// Walks every <c>news_block</c> on the page and queues a request for each entry's
/// link, attaching the entry's title, summary and view count as request properties.
/// </summary>
/// <param name="context">Data-flow context holding the response and its selectable document.</param>
/// <returns>A completed task carrying <see cref="DataFlowResult.Success"/>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var newsBlocks = context.Selectable.XPath(".//div[@class='news_block']").Nodes();
    foreach (var block in newsBlocks)
    {
        var titleNode = block.Select(Selectors.XPath(".//h2[@class='news_entry']"));
        var title = titleNode.GetValue(ValueOption.InnerText);

        var linkNode = block.Select(Selectors.XPath(".//h2[@class='news_entry']/a/@href"));
        var url = linkNode.GetValue();

        var summaryNode = block.Select(Selectors.XPath(".//div[@class='entry_summary']"));
        var summary = summaryNode.GetValue(ValueOption.InnerText);

        // Strip the trailing label so only the number text remains.
        // NOTE(review): this dereferences the extracted text directly — confirm
        // GetValue cannot yield null when the view span is absent.
        var viewNode = block.Select(Selectors.XPath(".//span[@class='view']"));
        var viewText = viewNode.GetValue(ValueOption.InnerText);
        var views = viewText.Replace(" 人浏览", "");

        var entryRequest = CreateFromRequest(context.Response.Request, url);
        entryRequest.AddProperty("title", title);
        entryRequest.AddProperty("summary", summary);
        entryRequest.AddProperty("views", views);
        context.AddExtraRequests(entryRequest);
    }

    var outcome = DataFlowResult.Success;
    return Task.FromResult(outcome);
}
/// <summary>
/// Parses the album list's pager and queues one request per distinct page URL.
/// </summary>
/// <param name="context">Data-flow context holding the response and its selectable document.</param>
public static void GetSubjectPageUrl(DataFlowContext context)
{
    var pagerLinks = context.Selectable
        .XPath("//*[@id=\"listdiv\"]/div[@class='pagesYY']/div/a[not(@class)]/@href")
        .GetValues();

    // URLs that have already been queued, keyed by the URL itself.
    var queued = new Dictionary<string, string>();
    var pending = new List<Request>();

    foreach (var link in pagerLinks)
    {
        if (queued.ContainsKey(link))
        {
            continue;
        }

        try
        {
            var pageRequest = new Request();
            pageRequest.Url = link;
            pageRequest.OwnerId = context.Response.Request.OwnerId;

            // NOTE(review): the tag is a fixed literal rather than the incoming
            // request's "tag" property — confirm this is intended.
            pageRequest.AddProperty("tag", "萝莉");

            pending.Add(pageRequest);
            queued.Add(link, link);
        }
        catch (Exception e)
        {
            // Best effort: report the failure for this link and move on to the next.
            Console.WriteLine(e);
        }
    }

    if (pending.Count == 0)
    {
        return;
    }

    context.AddExtraRequests(pending.ToArray());
}