/// <summary>
/// Reads the full-name and username spans from the page, stores them as the
/// "author" and "username" items, and queues every link matching the
/// github.com two-segment pattern as a follow request.
/// </summary>
/// <param name="context">Data-flow context holding the downloaded page.</param>
/// <returns>
/// A completed task carrying <c>Terminated</c> when no username was found
/// (items are cleared first), otherwise <c>Success</c>.
/// </returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var page = context.GetSelectable();

    // Parse the data.
    var fullName = page.XPath("//span[@class='p-name vcard-fullname d-block overflow-hidden']").GetValue();
    var login = page.XPath("//span[@class='p-nickname vcard-username d-block']").GetValue();
    context.AddItem("author", fullName);
    context.AddItem("username", login);

    // Add the target links. This happens before the emptiness check below,
    // so links are queued even when the page yields no username.
    var links = page.Links().Regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").GetValues();
    AddFollowRequests(context, links);

    // If nothing was parsed, skip the subsequent steps (storage etc.).
    var outcome = DataFlowResult.Success;
    if (string.IsNullOrWhiteSpace(login))
    {
        context.ClearItems();
        outcome = DataFlowResult.Terminated;
    }

    return Task.FromResult(outcome);
}
/// <summary>
/// Stores the request URL and the page's <c>title</c> text as the "URL" and
/// "Title" items.
/// </summary>
/// <param name="context">Data-flow context holding the response to parse.</param>
/// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var requestUrl = context.Response.Request.Url;
    context.AddItem("URL", requestUrl);

    var pageTitle = context.GetSelectable().XPath(".//title").GetValue();
    context.AddItem("Title", pageTitle);

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Stores the request URL and page title, then turns every tag anchor found
/// under the <c>post_rank</c> element into a target request carrying the tag
/// text in its "tag" property.
/// </summary>
/// <param name="context">Data-flow context holding the response to parse.</param>
/// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var response = context.GetResponse();
    context.AddItem("URL", response.Request.Url);
    context.AddItem("Title", context.GetSelectable().XPath(".//title").GetValue());

    // href -> tag text, de-duplicated by href.
    var tags = new Dictionary<string, string>();
    var tagNodes = context.GetSelectable()
        .XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a")
        .Nodes();

    // Defensive: assumes Nodes() may yield null when nothing matches — TODO confirm.
    if (tagNodes != null)
    {
        foreach (var node in tagNodes)
        {
            var url = node.XPath("./@href").GetValue();
            var name = node.GetValue();
            Console.WriteLine("url:" + url + " - name:" + name);

            // Dictionary.Add throws on a null key (anchor without href) and on a
            // repeated key (same link listed twice); skip both instead of
            // aborting the whole parse. The first occurrence of a link wins.
            if (string.IsNullOrWhiteSpace(url) || tags.ContainsKey(url))
            {
                continue;
            }

            tags.Add(url, name);
        }
    }

    var requests = new List<Request>();
    foreach (var tag in tags)
    {
        var request = new Request
        {
            Url = tag.Key,
            OwnerId = response.Request.OwnerId,
        };
        request.Properties.Add("tag", tag.Value);
        requests.Add(request);
    }

    context.AddTargetRequests(requests.ToArray());
    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Reads the first entry of the <c>subject_list</c> element (heading link and
/// first info div), stores them as the "username" and "author" items, and
/// queues every book.douban.com tag link as a target request.
/// </summary>
/// <param name="context">Data-flow context holding the downloaded page.</param>
/// <returns>
/// A completed task carrying <c>Terminated</c> when the heading text is blank
/// (items are cleared first), otherwise <c>Success</c>.
/// </returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var selectable = context.GetSelectable();

    // Parse the data.
    var name = selectable.XPath("//*[@id=\"subject_list\"]/ul/li[1]/div[2]/h2/a").GetValue();
    var author = selectable.XPath("//*[@id=\"subject_list\"]/ul/li[1]/div[2]/div[1]").GetValue();
    // NOTE(review): the heading text is stored under "username"; the key is kept
    // as-is because downstream consumers may depend on it — confirm.
    context.AddItem("author", author);
    context.AddItem("username", name);

    // Add the target links. The dot in "book.douban" is escaped so that it only
    // matches a literal '.', not any character.
    var urls = selectable.Links().Regex("(https://book\\.douban\\.com/tag/[\\w\\-]+)").GetValues();
    AddTargetRequests(context, urls);

    // If nothing was parsed, skip the subsequent steps (storage etc.).
    if (string.IsNullOrWhiteSpace(name))
    {
        context.ClearItems();
        return Task.FromResult(DataFlowResult.Terminated);
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Extracts the full-name and username spans, records them as the "author"
/// and "username" items, and queues links matching the github.com
/// two-segment pattern as target requests.
/// </summary>
/// <param name="context">Data-flow context holding the downloaded page.</param>
/// <returns>
/// A completed task carrying <c>Terminated</c> (after clearing the items) when
/// the username is blank, otherwise <c>Success</c>.
/// </returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var document = context.GetSelectable();

    // Parse the data.
    var displayName = document
        .XPath("//span[@class='p-name vcard-fullname d-block overflow-hidden']")
        .GetValue();
    var userName = document
        .XPath("//span[@class='p-nickname vcard-username d-block']")
        .GetValue();
    context.AddItem("author", displayName);
    context.AddItem("username", userName);

    // Add the target links (queued regardless of the emptiness check below).
    AddTargetRequests(
        context,
        document.Links().Regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").GetValues());

    // A username is present: continue with the next steps.
    if (!string.IsNullOrWhiteSpace(userName))
    {
        return Task.FromResult(DataFlowResult.Success);
    }

    // Nothing was parsed: drop the items and skip the next step.
    context.ClearItems();
    return Task.FromResult(DataFlowResult.Terminated);
}
/// <summary>
/// Copies the request URL, raw text, target URL, success flag and elapsed
/// milliseconds of the response into the context items. Does nothing when the
/// context has no response.
/// </summary>
/// <param name="context">Data-flow context whose response is recorded.</param>
/// <returns>A completed task carrying <c>DataFlowResult.Success</c> in every case.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var response = context.Response;
    if (response == null)
    {
        // No response to record; still reported as success.
        return Task.FromResult(DataFlowResult.Success);
    }

    context.AddItem("URL", response.Request.Url);
    context.AddItem("Content", response.RawText);
    context.AddItem("TargetUrl", response.TargetUrl);
    context.AddItem("Success", response.Success);
    context.AddItem("ElapsedMilliseconds", response.ElapsedMilliseconds);

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Verifies that <c>MongoEntityStorage.HandleAsync</c> returns
/// <c>DataFlowResult.Success</c> for a context holding one table's metadata
/// and a single row, using mocked Mongo client, database and collection.
/// </summary>
public async Task Store_Should_Success()
{
    // Arrange: client -> database -> collection, all mocked.
    var collectionMock = new Mock<IMongoCollection<BsonDocument>>();

    var databaseMock = new Mock<IMongoDatabase>();
    databaseMock
        .Setup(db => db.GetCollection<BsonDocument>(It.IsAny<string>(), It.IsAny<MongoCollectionSettings>()))
        .Returns(collectionMock.Object);

    var clientMock = new Mock<IMongoClient>();
    clientMock
        .Setup(client => client.GetDatabase(It.IsAny<string>(), It.IsAny<MongoDatabaseSettings>()))
        .Returns(databaseMock.Object);

    var storage = new MongoEntityStorage(clientMock.Object);

    // Arrange: a context with metadata and one row registered under the same key.
    var flowContext = new DataFlowContext(Mock.Of<IServiceProvider>());
    flowContext.Add("table", new TableMetadata { Schema = new Schema("db", "table") });

    var row = new Dictionary<string, object> { { "Name", "Value" } };
    flowContext.AddItem("table", new object[] { row });

    // Act
    var outcome = await storage.HandleAsync(flowContext);

    // Assert
    Assert.Equal(DataFlowResult.Success, outcome);
}
/// <summary>
/// Stores the request URL and page title, then queues a target request for
/// every tag anchor found under the <c>post_rank</c> element and calls
/// <c>CreateDirByTag</c> with each tag's text.
/// </summary>
/// <param name="context">Data-flow context holding the response to parse.</param>
/// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
public override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var response = context.GetResponse();
    context.AddItem("URL", response.Request.Url);
    context.AddItem("Title", context.GetSelectable().XPath(".//title").GetValue());

    // href -> tag text, de-duplicated by href.
    var tags = new Dictionary<string, string>();
    var tagNodes = context.GetSelectable()
        .XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a")
        .Nodes();

    // Defensive: assumes Nodes() may yield null when nothing matches — TODO confirm.
    if (tagNodes != null)
    {
        foreach (var node in tagNodes)
        {
            var url = node.XPath("./@href").GetValue();
            var name = node.GetValue();
            Console.WriteLine("url:" + url + " - name:" + name);

            // Dictionary.Add throws on a null key (anchor without href) and on a
            // repeated key (same link listed twice); skip both instead of
            // aborting the whole parse. The first occurrence of a link wins.
            if (string.IsNullOrWhiteSpace(url) || tags.ContainsKey(url))
            {
                continue;
            }

            tags.Add(url, name);
        }
    }

    var requests = new List<Request>();
    foreach (var tag in tags)
    {
        requests.Add(new Request
        {
            Url = tag.Key,
            OwnerId = response.Request.OwnerId
        });
        CreateDirByTag(tag.Value);
    }

    context.AddTargetRequests(requests.ToArray());
    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Stores the request URL and page title as plain items and, in addition,
/// registers a <c>CnblogsEntry</c> entity (with its table metadata) carrying
/// the same URL and title as a parse item keyed by the entity's full type name.
/// </summary>
/// <param name="context">Data-flow context holding the response to parse.</param>
/// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    // Read the URL and the title once and reuse them for both the plain
    // items and the entity.
    var url = context.Response.Request.Url;
    context.AddItem("URL", url);

    var title = context.GetSelectable().XPath(".//title").GetValue();
    context.AddItem("Title", title);

    // Register the entity's table metadata, then the entity itself.
    var typeName = typeof(EntitySpider.CnblogsEntry).FullName;
    var entity = new EntitySpider.CnblogsEntry();
    context.Add(typeName, entity.GetTableMetadata());

    entity.WebSite = url;
    entity.Url = url;
    entity.Title = title;

    var items = new ParseResult<EntitySpider.CnblogsEntry>();
    items.Add(entity);
    context.AddParseItem(typeName, items);

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Extracts the image addresses from an image-browsing page and hands one
/// download request per image to <c>ImageDownloader</c>.
/// </summary>
/// <param name="context">Data-flow context holding the page's response.</param>
public static void GetDetailPictureUrl(DataFlowContext context)
{
    var sourceRequest = context.Response.Request;
    context.AddItem("URL", sourceRequest.Url);

    // The title is the same for every image on the page, so evaluate the
    // XPath once instead of once per image.
    var title = context.GetSelectable().XPath(".//title").GetValue();
    context.AddItem("Title", title);

    var images = context.GetSelectable().XPath("//*[@id=\"hgallery\"]/img/@src").GetValues();
    foreach (var image in images)
    {
        // Build the download request for this image URL, carrying over the
        // "tag" and "referer" properties of the page request.
        var request = new Request
        {
            Url = image,
            OwnerId = sourceRequest.OwnerId
        };
        request.AddProperty("tag", sourceRequest.GetProperty("tag"));
        request.AddProperty("referer", sourceRequest.GetProperty("referer"));
        request.AddProperty("subject", title);

        ImageDownloader.GetInstance().AddRequest(request);
    }
}
/// <summary>
/// Stores the request URL and page title, then calls
/// <c>CreateFromRequest</c> once for every tag link (href) found under the
/// <c>post_rank</c> element.
/// </summary>
/// <param name="context">Data-flow context holding the response to parse.</param>
/// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
public override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var response = context.GetResponse();
    context.AddItem("URL", response.Request.Url);

    var pageTitle = context.GetSelectable().XPath(".//title").GetValue();
    context.AddItem("Title", pageTitle);

    var tagLinks = context.GetSelectable()
        .XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a/@href")
        .GetValues();

    foreach (var tagLink in tagLinks)
    {
        // NOTE(review): any value returned by CreateFromRequest is ignored here —
        // confirm that it enqueues the new request itself.
        CreateFromRequest(response.Request, tagLink);
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Stores the request URL and the values of every <c>div</c> with class
/// <c>quote</c> as the "URL" and "Quotes" items.
/// </summary>
/// <param name="context">Data-flow context holding the response to parse.</param>
/// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    var requestUrl = context.Response.Request.Url;
    context.AddItem("URL", requestUrl);

    var quotes = context.GetSelectable().XPath("//div[@class='quote']").GetValues();
    context.AddItem("Quotes", quotes);

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Extracts content from the page according to <c>_mapping</c> and stores it
/// as JSON under the "Content" item.
/// </summary>
/// <remarks>
/// <para>No mapping: nothing is extracted.</para>
/// <para>Mapping with a depth of 1 or more: pages at any other request depth
/// have their items cleared and are not parsed.</para>
/// <para>Item selector set: every matching node becomes one dictionary of
/// mapped fields; the list is serialized when it is non-empty.</para>
/// <para>No item selector: the mapped fields are read from the whole page
/// (tabs removed, trimmed) and serialized indented; when there are no field
/// mappings the raw response text is stored instead.</para>
/// </remarks>
/// <param name="context">Data-flow context holding the response to parse.</param>
/// <returns>A completed task carrying <c>DataFlowResult.Success</c> in every case.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    if (_mapping == null)
    {
        return Task.FromResult(DataFlowResult.Success);
    }

    // "Deepth" is the member name as declared on the mapping type.
    if (_mapping.Deepth.GetValueOrDefault() >= 1
        && context.Response.Request.Depth != _mapping.Deepth.Value)
    {
        context.ClearItems();
        return Task.FromResult(DataFlowResult.Success);
    }

    if (!string.IsNullOrWhiteSpace(_mapping.ItemCssSelector))
    {
        var items = new List<Dictionary<string, string>>();
        var itemNodes = context.GetSelectable().XPath(_mapping.ItemCssSelector).Nodes();

        // Defensive: assumes Nodes() may yield null when nothing matches — TODO confirm.
        if (itemNodes != null)
        {
            foreach (var node in itemNodes)
            {
                var item = new Dictionary<string, string>();

                // The field list may be absent even when an item selector is
                // configured; treat that as "no fields" rather than failing.
                if (_mapping.Mapping != null)
                {
                    foreach (var field in _mapping.Mapping)
                    {
                        item.Add(field.Field, node.XPath(field.CssSelector).GetValue());
                    }
                }

                if (item.Count > 0)
                {
                    item.Add("PageSourceURL", context.Response.Request.Url);
                    items.Add(item);
                }
            }
        }

        if (items.Count > 0)
        {
            context.AddItem("Content", JsonConvert.SerializeObject(items));
        }

        return Task.FromResult(DataFlowResult.Success);
    }

    if (_mapping.Mapping == null || _mapping.Mapping.Length == 0)
    {
        // No field mappings at all: keep the raw page.
        context.AddItem("PageSourceURL", context.Response.Request.Url);
        context.AddItem("Content", context.Response.RawText);
        return Task.FromResult(DataFlowResult.Success);
    }

    var pageItem = new Dictionary<string, string>();
    foreach (var field in _mapping.Mapping)
    {
        var value = context.GetSelectable().XPath(field.CssSelector).GetValue();
        if (value != null)
        {
            value = value.Replace("\t", "").Trim();
        }

        pageItem.Add(field.Field, value);
    }

    if (pageItem.Count > 0)
    {
        pageItem.Add("PageSourceURL", context.Response.Request.Url);
        context.AddItem("Content", JsonConvert.SerializeObject(pageItem, Formatting.Indented));
    }

    return Task.FromResult(DataFlowResult.Success);
}
/// <summary>
/// Parses the page into zero or more objects described by <c>Model</c> and
/// appends them to the context item keyed by <c>Model.TypeName</c>.
/// </summary>
/// <remarks>
/// The request properties, overlaid with the values of
/// <c>Model.ShareValueSelectors</c>, form the environment passed to
/// <c>ParseObject</c>. When <c>Model.Selector</c> is null the whole page is
/// parsed as a single object; otherwise each selected node is parsed, limited
/// to <c>Model.Take</c> nodes from the head or the tail when that limit is positive.
/// </remarks>
/// <param name="context">Data-flow context holding the response to parse.</param>
/// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
protected override Task<DataFlowResult> Parse(DataFlowContext context)
{
    // Make sure the table metadata for this model is registered exactly once.
    if (!context.Contains(Model.TypeName))
    {
        context.Add(Model.TypeName, TableMetadata);
    }

    var page = context.GetSelectable();
    var parsed = new List<dynamic>();
    if (page.Properties == null)
    {
        page.Properties = new Dictionary<string, object>();
    }

    // Environment = request properties, then shared selector values on top.
    var environment = new Dictionary<string, string>();
    foreach (var requestProperty in context.GetResponse().Request.Properties)
    {
        environment.Add(requestProperty.Key, requestProperty.Value);
    }

    if (Model.ShareValueSelectors != null)
    {
        foreach (var shared in Model.ShareValueSelectors)
        {
            // Indexer assignment inserts the key or overwrites an existing one.
            environment[shared.Name] = page.Select(shared.ToSelector()).GetValue();
        }
    }

    if (Model.Selector == null)
    {
        // No entity selector: the whole page is one object.
        var single = ParseObject(environment, page, 0);
        if (single == null)
        {
            Logger?.LogWarning($"解析到空数据,类型: {Model.TypeName}");
        }
        else
        {
            parsed.Add(single);
        }
    }
    else
    {
        var nodes = page.SelectList(Model.Selector.ToSelector()).Nodes()?.ToList();
        if (nodes != null)
        {
            // Apply the optional Take limit, from the head or from the tail.
            if (Model.Take > 0 && nodes.Count > Model.Take)
            {
                nodes = Model.TakeFromHead
                    ? nodes.Take(Model.Take).ToList()
                    : nodes.Skip(nodes.Count - Model.Take).ToList();
            }

            var index = 0;
            foreach (var node in nodes)
            {
                var entity = ParseObject(environment, node, index);
                if (entity == null)
                {
                    Logger?.LogWarning($"解析到空数据,类型: {Model.TypeName}");
                }
                else
                {
                    parsed.Add(entity);
                }

                index++;
            }
        }
    }

    if (parsed.Count > 0)
    {
        // Append to an existing item for this type, or create it.
        var existing = context.GetItem(Model.TypeName);
        if (existing != null)
        {
            existing.AddRange(parsed);
        }
        else
        {
            context.AddItem(Model.TypeName, parsed);
        }
    }

    return Task.FromResult(DataFlowResult.Success);
}