/// <summary>
/// Prints the title of the current page to the console in yellow and reports success.
/// </summary>
/// <param name="context">Data-flow context whose selectable is queried for the page title.</param>
/// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
public override Task<DataFlowResult> Parse(DataFlowContext context)
{
    // Resolve the title first so a failing query cannot leave the console recoloured.
    var title = context.GetSelectable().XPath(".//title").GetValue();

    // Remember the current colour and put it back afterwards instead of forcing white,
    // which would clobber whatever colour the host console was using.
    var previousColor = Console.ForegroundColor;
    Console.ForegroundColor = ConsoleColor.Yellow;
    try
    {
        Console.WriteLine("page 页:" + title);
    }
    finally
    {
        Console.ForegroundColor = previousColor;
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Records the page URL and title, then queues one follow-up request per distinct tag link
/// found under the <c>post_rank</c> element, attaching the link text as the "tag" property.
/// </summary>
/// <param name="context">Data-flow context providing the response, the selectable and the request queue.</param>
/// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var response = context.GetResponse();
    context.AddItem("URL", response.Request.Url);
    context.AddItem("Title", context.GetSelectable().XPath(".//title").GetValue());

    // href -> link text, first occurrence wins.
    var tags = new Dictionary<string, string>();
    var tagNodes = context.GetSelectable()
        .XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a")
        .Nodes();

    // Defensive: tolerate a null node list rather than failing the whole parse.
    if (tagNodes != null)
    {
        foreach (var node in tagNodes)
        {
            var url = node.XPath("./@href").GetValue();
            var name = node.GetValue();

            // Dictionary.Add throws on a null or repeated key; a page that lists the same
            // link twice (or an anchor without href) must not abort the parse.
            if (string.IsNullOrEmpty(url) || tags.ContainsKey(url))
            {
                continue;
            }

            tags.Add(url, name);
            Console.WriteLine("url:" + url + " - name:" + name);
        }
    }

    var requests = new List<Request>();
    foreach (var tag in tags)
    {
        var request = new Request
        {
            Url = tag.Key,
            OwnerId = response.Request.OwnerId,
        };
        request.Properties.Add("tag", tag.Value);
        requests.Add(request);
    }

    if (requests.Count > 0)
    {
        context.AddTargetRequests(requests.ToArray());
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Collects the pagination URLs of the detailed picture view and queues one request
/// per distinct URL, carrying over the "tag" and "referer" properties of the current request.
/// </summary>
/// <param name="context">Data-flow context providing the response, the selectable and the request queue.</param>
public static void GetDetailPageUrl(DataFlowContext context)
{
    var response = context.GetResponse();
    var hrefs = context.GetSelectable().XPath("//*[@id=\"pages\"]/a[not(@class)]/@href").GetValues();

    // Used purely as a "seen" set: key and value are the same URL.
    var seen = new Dictionary<string, string>();
    var pending = new List<Request>();

    foreach (var href in hrefs)
    {
        if (seen.ContainsKey(href))
        {
            continue;
        }

        var next = new Request
        {
            Url = href,
            OwnerId = response.Request.OwnerId,
        };
        next.AddProperty("tag", response.Request.GetProperty("tag"));
        next.AddProperty("referer", response.Request.GetProperty("referer"));

        pending.Add(next);
        seen.Add(href, href);
    }

    // Nothing to enqueue when the page has no pagination links.
    if (pending.Count == 0)
    {
        return;
    }

    context.AddTargetRequests(pending.ToArray());
}
/// <summary>
/// Prints the title of the current page to the console in blue, then queues one request
/// per distinct pagination link found in the <c>pagesYY</c> block of the listing.
/// </summary>
/// <param name="context">Data-flow context providing the response, the selectable and the request queue.</param>
/// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
public override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var response = context.GetResponse();

    // Resolve the title first so a failing query cannot leave the console recoloured.
    var title = context.GetSelectable().XPath(".//title").GetValue();

    // Restore the caller's colour afterwards instead of forcing white.
    var previousColor = Console.ForegroundColor;
    Console.ForegroundColor = ConsoleColor.Blue;
    try
    {
        Console.WriteLine("第一页:" + title);
    }
    finally
    {
        Console.ForegroundColor = previousColor;
    }

    // Used purely as a "seen" set: key and value are the same URL.
    var pageSet = new Dictionary<string, string>();
    var pages = context.GetSelectable()
        .XPath("//*[@id=\"listdiv\"]/div[@class='pagesYY']/div/a[not(@class)]/@href")
        .GetValues();

    var requestList = new List<Request>();
    foreach (var page in pages)
    {
        if (pageSet.ContainsKey(page))
        {
            continue;
        }

        var request = new Request
        {
            Url = page,
            OwnerId = response.Request.OwnerId,
        };
        requestList.Add(request);
        pageSet.Add(page, page);
    }

    if (requestList.Count > 0)
    {
        context.AddTargetRequests(requestList.ToArray());
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Stores the request URL and the page title as the "URL" and "Title" items of the context.
/// </summary>
/// <param name="context">Data-flow context providing the response and the selectable.</param>
/// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var pageUrl = context.GetResponse().Request.Url;
    context.AddItem("URL", pageUrl);

    var pageTitle = context.GetSelectable().XPath(".//title").GetValue();
    context.AddItem("Title", pageTitle);

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Serializes the items collected in the context to JSON and writes them as one line
/// through <c>Writer</c>. The target path is "&lt;data folder of the owner&gt;/&lt;request hash&gt;.json".
/// </summary>
/// <param name="context">Data-flow context providing the response and the collected items.</param>
/// <returns><c>DataFlowResult.Success</c> once the line has been written.</returns>
protected override async Task<DataFlowResult> Store(DataFlowContext context)
{
    var request = context.GetResponse().Request;
    var folder = GetDataFolder(request.OwnerId);
    var file = Path.Combine(folder, $"{request.Hash}.json");

    // NOTE(review): CreateFile presumably prepares Writer for this path - confirm in the base class.
    CreateFile(file);

    var items = context.GetItems();
    var json = JsonConvert.SerializeObject(items);
    await Writer.WriteLineAsync(json);

    return DataFlowResult.Success;
}
/// <summary>
/// Copies the basic facts of the response (request URL, raw text, target URL, success flag
/// and elapsed milliseconds) into the context items. Does nothing when there is no response.
/// </summary>
/// <param name="context">Data-flow context providing the response.</param>
/// <returns>A completed task carrying <c>DataFlowResult.Success</c> in every case.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var response = context.GetResponse();
    if (response == null)
    {
        // No response to record; still reported as success.
        return Task.FromResult(DataFlowResult.Success);
    }

    context.AddItem("URL", response.Request.Url);
    context.AddItem("Content", response.RawText);
    context.AddItem("TargetUrl", response.TargetUrl);
    context.AddItem("Success", response.Success);
    context.AddItem("ElapsedMilliseconds", response.ElapsedMilliseconds);

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Records the page URL and title, then, for every distinct tag link found under the
/// <c>post_rank</c> element, queues a follow-up request and calls <c>CreateDirByTag</c>
/// with the link text.
/// </summary>
/// <param name="context">Data-flow context providing the response, the selectable and the request queue.</param>
/// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
public override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var response = context.GetResponse();
    context.AddItem("URL", response.Request.Url);
    context.AddItem("Title", context.GetSelectable().XPath(".//title").GetValue());

    // href -> link text, first occurrence wins.
    var tags = new Dictionary<string, string>();
    var tagNodes = context.GetSelectable()
        .XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a")
        .Nodes();

    // Defensive: tolerate a null node list rather than failing the whole parse.
    if (tagNodes != null)
    {
        foreach (var node in tagNodes)
        {
            var url = node.XPath("./@href").GetValue();
            var name = node.GetValue();

            // Dictionary.Add throws on a null or repeated key; a page that lists the same
            // link twice (or an anchor without href) must not abort the parse.
            if (string.IsNullOrEmpty(url) || tags.ContainsKey(url))
            {
                continue;
            }

            tags.Add(url, name);
            Console.WriteLine("url:" + url + " - name:" + name);
        }
    }

    var requests = new List<Request>();
    foreach (var sub in tags)
    {
        var request = new Request
        {
            Url = sub.Key,
            OwnerId = response.Request.OwnerId
        };
        requests.Add(request);

        // NOTE(review): presumably creates the output directory for this tag - helper is defined elsewhere.
        CreateDirByTag(sub.Value);
    }

    if (requests.Count > 0)
    {
        context.AddTargetRequests(requests.ToArray());
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Runs the parsing stage for the current response: applies the optional <c>CanParse</c>
/// filter, invokes the optional <c>Selectable</c> hook, calls <c>Parse</c>, and finally
/// queues the URLs produced by the optional <c>Follow</c> hook.
/// </summary>
/// <param name="context">Data-flow context providing the response and the request queue.</param>
/// <returns>
/// <c>DataFlowResult.Terminated</c> when <c>CanParse</c> rejects the request; the result of
/// <c>Parse</c> when it is Failed or Terminated; <c>DataFlowResult.Failed</c> when any
/// exception is thrown (it is logged, not rethrown); otherwise <c>DataFlowResult.Success</c>.
/// </returns>
public override async Task<DataFlowResult> HandleAsync(DataFlowContext context)
{
    try
    {
        var response = context.GetResponse();
        var request = response.Request;

        // Read the delegate once so the null check and the invocations see the same value.
        var canParse = CanParse;

        // Terminate the data flow when a filter is configured and the request does not match it.
        if (canParse != null && !canParse(request))
        {
            return DataFlowResult.Terminated;
        }

        Selectable?.Invoke(context);

        var parserResult = await Parse(context);
        if (parserResult == DataFlowResult.Failed || parserResult == DataFlowResult.Terminated)
        {
            return parserResult;
        }

        var urls = Follow?.Invoke(context);
        if (urls != null && urls.Length > 0)
        {
            var followRequests = new List<Request>();
            foreach (var url in urls)
            {
                var followRequest = CreateFromRequest(request, url);

                // CanParse is optional: without a filter every follow-up request is accepted.
                // Invoking it unconditionally threw NullReferenceException and failed the flow.
                if (canParse == null || canParse(followRequest))
                {
                    followRequests.Add(followRequest);
                }
            }

            context.AddTargetRequests(followRequests.ToArray());
        }

        return DataFlowResult.Success;
    }
    catch (Exception e)
    {
        Logger?.LogError($"数据解析发生异常: {e}");
        return DataFlowResult.Failed;
    }
}
/// <summary>
/// Records the page URL and title, then queues a follow-up request for every tag link
/// found under the <c>post_rank</c> element.
/// </summary>
/// <param name="context">Data-flow context providing the response, the selectable and the request queue.</param>
/// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
public override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var response = context.GetResponse();
    context.AddItem("URL", response.Request.Url);
    context.AddItem("Title", context.GetSelectable().XPath(".//title").GetValue());

    var subs = context.GetSelectable()
        .XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a/@href")
        .GetValues();

    foreach (var sub in subs)
    {
        // The request built by CreateFromRequest used to be discarded, so the tag links
        // were never crawled. Hand each one to the context so it is actually queued.
        var followRequest = CreateFromRequest(response.Request, sub);
        context.AddTargetRequests(new[] { followRequest });
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Extracts the image addresses from a picture browsing page and hands one download
/// request per image to <c>ImageDownloader</c>. Also records the page URL and title
/// as the "URL" and "Title" items of the context.
/// </summary>
/// <param name="context">Data-flow context providing the response and the selectable.</param>
public static void GetDetailPictureUrl(DataFlowContext context)
{
    var response = context.GetResponse();

    // The title is the same for every image on the page: query it once and reuse it
    // instead of re-running the XPath for each image.
    var title = context.GetSelectable().XPath(".//title").GetValue();

    context.AddItem("URL", response.Request.Url);
    context.AddItem("Title", title);

    var images = context.GetSelectable().XPath("//*[@id=\"hgallery\"]/img/@src").GetValues();
    foreach (var image in images)
    {
        // Build the download request for this image URL.
        var request = new Request
        {
            Url = image,
            OwnerId = response.Request.OwnerId
        };
        request.AddProperty("tag", response.Request.GetProperty("tag"));
        request.AddProperty("referer", response.Request.GetProperty("referer"));
        request.AddProperty("subject", title);

        ImageDownloader.GetInstance().AddRequest(request);
    }
}
/// <summary>
/// Collects the subject addresses from the gallery titles of the listing and queues one
/// request per link. Each request inherits the "tag" property of the current request and
/// uses the current request URL as its "referer".
/// </summary>
/// <param name="context">Data-flow context providing the response, the selectable and the request queue.</param>
public static void GetSubjectUrl(DataFlowContext context)
{
    var response = context.GetResponse();
    var subjectLinks = context.GetSelectable()
        .XPath("//*[@id=\"listdiv\"]/ul/li/div[@class='galleryli_title']/a/@href")
        .GetValues();

    var pending = new List<Request>();
    foreach (var link in subjectLinks)
    {
        var subjectRequest = new Request
        {
            Url = link,
            OwnerId = response.Request.OwnerId,
        };
        subjectRequest.AddProperty("tag", response.Request.GetProperty("tag"));
        subjectRequest.AddProperty("referer", response.Request.Url);
        pending.Add(subjectRequest);
    }

    // Nothing to enqueue when the listing contains no subject links.
    if (pending.Count == 0)
    {
        return;
    }

    context.AddTargetRequests(pending.ToArray());
}
/// <summary>
/// Parses the pagination of an album listing and queues one request per distinct page link.
/// </summary>
/// <param name="context">Data-flow context providing the response, the selectable and the request queue.</param>
public static void GetSubjectPageUrl(DataFlowContext context)
{
    var response = context.GetResponse();
    var hrefs = context.GetSelectable()
        .XPath("//*[@id=\"listdiv\"]/div[@class='pagesYY']/div/a[not(@class)]/@href")
        .GetValues();

    // Used purely as a "seen" set: key and value are the same URL.
    var seen = new Dictionary<string, string>();
    var pending = new List<Request>();

    foreach (var href in hrefs)
    {
        if (seen.ContainsKey(href))
        {
            continue;
        }

        try
        {
            var pageRequest = new Request
            {
                Url = href,
                OwnerId = response.Request.OwnerId,
            };

            // NOTE(review): the tag is a fixed literal here rather than taken from the current request.
            pageRequest.AddProperty("tag", "萝莉");

            pending.Add(pageRequest);
            seen.Add(href, href);
        }
        catch (Exception e)
        {
            // A failure on one link is printed and the remaining links are still processed.
            Console.WriteLine(e);
        }
    }

    if (pending.Count == 0)
    {
        return;
    }

    context.AddTargetRequests(pending.ToArray());
}
/// <summary>
/// Extracts objects described by <c>Model</c> from the current page and stores them in the
/// context under <c>Model.TypeName</c>.
/// </summary>
/// <remarks>
/// The request properties, overlaid with the values of <c>Model.ShareValueSelectors</c>,
/// form the environment passed to <c>ParseObject</c>. When <c>Model.Selector</c> is null the
/// whole page is parsed as a single object; otherwise every node matched by the selector is
/// parsed, optionally limited to <c>Model.Take</c> nodes from the head or the tail.
/// </remarks>
/// <param name="context">Data-flow context providing the response, the selectable and the item store.</param>
/// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    // Register the table metadata for this model type the first time it is seen in the context.
    if (!context.Contains(Model.TypeName))
    {
        context.Add(Model.TypeName, TableMetadata);
    }

    var selectable = context.GetSelectable();
    List<dynamic> results = new List<dynamic>();
    if (selectable.Properties == null)
    {
        selectable.Properties = new Dictionary<string, object>();
    }

    // Start from the request properties ...
    var environments = new Dictionary<string, string>();
    foreach (var property in context.GetResponse().Request.Properties)
    {
        environments.Add(property.Key, property.Value);
    }

    // ... and let shared value selectors add to or overwrite them.
    if (Model.ShareValueSelectors != null)
    {
        foreach (var selector in Model.ShareValueSelectors)
        {
            string key = selector.Name;
            var shared = selectable.Select(selector.ToSelector()).GetValue();
            environments[key] = shared;
        }
    }

    if (Model.Selector == null)
    {
        // No list selector: the page itself is the single object.
        var obj = ParseObject(environments, selectable, 0);
        if (obj != null)
        {
            results.Add(obj);
        }
        else
        {
            Logger?.LogWarning($"解析到空数据,类型: {Model.TypeName}");
        }
    }
    else
    {
        var selector = Model.Selector.ToSelector();
        var list = selectable.SelectList(selector).Nodes()?.ToList();
        if (list != null)
        {
            // Trim to the configured number of nodes, from the head or from the tail.
            if (Model.Take > 0 && list.Count > Model.Take)
            {
                if (Model.TakeFromHead)
                {
                    list = list.Take(Model.Take).ToList();
                }
                else
                {
                    list = list.Skip(list.Count - Model.Take).ToList();
                }
            }

            for (var index = 0; index < list.Count; ++index)
            {
                var obj = ParseObject(environments, list[index], index);
                if (obj != null)
                {
                    results.Add(obj);
                }
                else
                {
                    Logger?.LogWarning($"解析到空数据,类型: {Model.TypeName}");
                }
            }
        }
    }

    if (results.Count > 0)
    {
        // Append to items already stored for this type, or store the new list.
        var items = context.GetItem(Model.TypeName);
        if (items == null)
        {
            context.AddItem(Model.TypeName, results);
        }
        else
        {
            items.AddRange(results);
        }
    }

    return Task.FromResult(DataFlowResult.Success);
}