/// <summary>
/// Reads two values from the page, stores them as the "author" and "username"
/// items, and queues every link matching the GitHub two-segment path pattern
/// as a follow request.
/// </summary>
/// <param name="context">Data-flow context providing the page selectable and receiving the items.</param>
/// <returns>
/// A completed task holding <c>DataFlowResult.Terminated</c> (after clearing the
/// items) when the username is null or whitespace; otherwise <c>DataFlowResult.Success</c>.
/// </returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    const string fullNameXPath = "//span[@class='p-name vcard-fullname d-block overflow-hidden']";
    const string userNameXPath = "//span[@class='p-nickname vcard-username d-block']";
    const string linkPattern = "(https://github\\.com/[\\w\\-]+/[\\w\\-]+)";

    var page = context.GetSelectable();

    // Parse data: the text of the full-name span and of the username span.
    var fullName = page.XPath(fullNameXPath).GetValue();
    var userName = page.XPath(userNameXPath).GetValue();
    context.AddItem("author", fullName);
    context.AddItem("username", userName);

    // Add target links: this happens whether or not a username was found.
    var matchedLinks = page.Links().Regex(linkPattern).GetValues();
    AddFollowRequests(context, matchedLinks);

    // If nothing was parsed, drop the items and skip the subsequent steps (storage etc).
    var outcome = DataFlowResult.Success;
    if (string.IsNullOrWhiteSpace(userName))
    {
        context.ClearItems();
        outcome = DataFlowResult.Terminated;
    }

    return Task.FromResult(outcome);
}
/// <summary>
/// Reads two values from the first entry of the element with id "subject_list",
/// stores them as the "author" and "username" items, and queues every link
/// matching the book.douban.com tag URL pattern as a target request.
/// </summary>
/// <param name="context">Data-flow context providing the page selectable and receiving the items.</param>
/// <returns>
/// A completed task holding <c>DataFlowResult.Terminated</c> (after clearing the
/// items) when the heading text is null or whitespace; otherwise <c>DataFlowResult.Success</c>.
/// </returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var selectable = context.GetSelectable();

    // Parse data: heading link text and first info block of the first list entry.
    // NOTE(review): the heading text is stored under the "username" key; presumably
    // kept for parity with the sibling parsers - confirm before renaming the key.
    var name = selectable.XPath("//*[@id=\"subject_list\"]/ul/li[1]/div[2]/h2/a")
        .GetValue();
    var author = selectable.XPath("//*[@id=\"subject_list\"]/ul/li[1]/div[2]/div[1]")
        .GetValue();
    context.AddItem("author", author);
    context.AddItem("username", name);

    // Add target links. Both dots of the host name are escaped so that only the
    // literal host "book.douban.com" matches; an unescaped '.' matches any character.
    var urls = selectable.Links().Regex("(https://book\\.douban\\.com/tag/[\\w\\-]+)").GetValues();
    AddTargetRequests(context, urls);

    // If nothing was parsed, drop the items and skip the subsequent steps (storage etc).
    if (string.IsNullOrWhiteSpace(name))
    {
        context.ClearItems();
        return Task.FromResult(DataFlowResult.Terminated);
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Reads two values from the page, stores them as the "author" and "username"
/// items, and queues every link matching the GitHub two-segment path pattern
/// as a target request.
/// </summary>
/// <param name="context">Data-flow context providing the page selectable and receiving the items.</param>
/// <returns>
/// A completed task holding <c>DataFlowResult.Success</c> when a non-blank username
/// was found; otherwise the items are cleared and <c>DataFlowResult.Terminated</c> is returned.
/// </returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var document = context.GetSelectable();

    // Parsing data: text of the full-name span, then of the username span.
    var authorNode = document.XPath("//span[@class='p-name vcard-fullname d-block overflow-hidden']");
    var authorText = authorNode.GetValue();
    var userNode = document.XPath("//span[@class='p-nickname vcard-username d-block']");
    var userText = userNode.GetValue();

    context.AddItem("author", authorText);
    context.AddItem("username", userText);

    // Add target links: queued regardless of whether a username was parsed.
    var candidateLinks = document.Links().Regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)");
    AddTargetRequests(context, candidateLinks.GetValues());

    // If the parsing is empty, discard the items so the next steps are skipped.
    var parsedUser = !string.IsNullOrWhiteSpace(userText);
    if (!parsedUser)
    {
        context.ClearItems();
    }

    return Task.FromResult(parsedUser ? DataFlowResult.Success : DataFlowResult.Terminated);
}
/// <summary>
/// Extracts page content according to <c>_mapping</c> and stores it in the context
/// under the "Content" item.
/// </summary>
/// <remarks>
/// <list type="bullet">
/// <item>No mapping configured: nothing is extracted.</item>
/// <item><c>Deepth</c> is at least 1 and differs from the request depth: the items
/// are cleared and nothing is extracted.</item>
/// <item><c>ItemCssSelector</c> set: every node it selects is mapped to a dictionary
/// of field values plus "PageSourceURL"; the list is serialized to JSON. Values are
/// stored as returned by the selector (not trimmed).</item>
/// <item>Only field mappings set: a single dictionary is built from the whole page,
/// with tabs removed and values trimmed, and serialized as indented JSON.</item>
/// <item>Neither set: the raw response text and the page URL are stored as-is.</item>
/// </list>
/// Although the mapping properties are named "CssSelector", their values are passed
/// to <c>XPath(...)</c>.
/// </remarks>
/// <param name="context">Data-flow context providing the response and receiving the items.</param>
/// <returns>A completed task that always holds <c>DataFlowResult.Success</c>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    if (_mapping == null)
    {
        return Task.FromResult(DataFlowResult.Success);
    }

    // Only pages at the configured depth are processed; others have their items dropped.
    if (_mapping.Deepth.GetValueOrDefault() >= 1
        && context.Response.Request.Depth != _mapping.Deepth.Value)
    {
        context.ClearItems();
        return Task.FromResult(DataFlowResult.Success);
    }

    var fields = _mapping.Mapping;
    var hasFields = fields != null && fields.Length > 0;

    if (!string.IsNullOrWhiteSpace(_mapping.ItemCssSelector))
    {
        var items = new List<Dictionary<string, string>>();

        // Guard against a missing field mapping: without fields there is nothing
        // to extract per node, and iterating a null mapping would throw.
        if (hasFields)
        {
            var itemNodes = context.GetSelectable().XPath(_mapping.ItemCssSelector).Nodes();
            foreach (var node in itemNodes)
            {
                var item = new Dictionary<string, string>();
                foreach (var field in fields)
                {
                    // Field selectors are evaluated relative to the current item node.
                    item.Add(field.Field, node.XPath(field.CssSelector).GetValue());
                }

                item.Add("PageSourceURL", context.Response.Request.Url);
                items.Add(item);
            }
        }

        if (items.Count > 0)
        {
            context.AddItem("Content", JsonConvert.SerializeObject(items));
        }
    }
    else if (hasFields)
    {
        // Fetch the selectable once instead of once per field.
        var selectable = context.GetSelectable();
        var item = new Dictionary<string, string>();
        foreach (var field in fields)
        {
            var value = selectable.XPath(field.CssSelector).GetValue();
            if (value != null)
            {
                value = value.Replace("\t", "").Trim();
            }

            item.Add(field.Field, value);
        }

        item.Add("PageSourceURL", context.Response.Request.Url);
        context.AddItem("Content", JsonConvert.SerializeObject(item, Formatting.Indented));
    }
    else
    {
        // No selectors configured at all: keep the raw page.
        context.AddItem("PageSourceURL", context.Response.Request.Url);
        context.AddItem("Content", context.Response.RawText);
    }

    return Task.FromResult(DataFlowResult.Success);
}