Beispiel #1
0
            /// <summary>
            /// Reads the profile's full name and user name from the page, stores them as
            /// the "author" and "username" items, and passes every link matching the
            /// GitHub owner/repository pattern to <c>AddFollowRequests</c>.
            /// </summary>
            /// <param name="context">Data-flow context of the page being processed.</param>
            /// <returns>
            /// <c>DataFlowResult.Terminated</c> (after clearing the items) when no user name
            /// was found; otherwise <c>DataFlowResult.Success</c>.
            /// </returns>
            protected override Task <DataFlowResult> Parse(DataFlowContext context)
            {
                var page = context.GetSelectable();

                // Parse the data.
                var fullName = page.XPath("//span[@class='p-name vcard-fullname d-block overflow-hidden']")
                               .GetValue();
                var userName = page.XPath("//span[@class='p-nickname vcard-username d-block']")
                               .GetValue();
                context.AddItem("author", fullName);
                context.AddItem("username", userName);

                // Add the target links. This happens before the emptiness check below,
                // so links are handed over even when the page itself yields no user name.
                var repositoryLinks = page.Links().Regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").GetValues();
                AddFollowRequests(context, repositoryLinks);

                if (!string.IsNullOrWhiteSpace(userName))
                {
                    return Task.FromResult(DataFlowResult.Success);
                }

                // Nothing was parsed: drop the items and skip the following steps (storage etc.).
                context.ClearItems();
                return Task.FromResult(DataFlowResult.Terminated);
            }
Beispiel #2
0
            /// <summary>
            /// Stores the requested URL and the text of the page's <c>title</c> element
            /// as the "URL" and "Title" items.
            /// </summary>
            /// <param name="context">Data-flow context of the page being processed.</param>
            /// <returns>Always a completed task carrying <c>DataFlowResult.Success</c>.</returns>
            protected override Task <DataFlowResult> Parse(DataFlowContext context)
            {
                var requestUrl = context.Response.Request.Url;
                context.AddItem("URL", requestUrl);

                var pageTitle = context.GetSelectable().XPath(".//title").GetValue();
                context.AddItem("Title", pageTitle);

                return Task.FromResult(DataFlowResult.Success);
            }
Beispiel #3
0
            /// <summary>
            /// Stores the page URL and title, then turns every tag anchor found under the
            /// "post_rank" element into a target request carrying the anchor text as its
            /// "tag" property.
            /// </summary>
            /// <remarks>
            /// Anchors without an href and anchors whose href was already seen on this page
            /// are skipped; previously either case made <c>Dictionary.Add</c> throw and
            /// aborted the whole parse.
            /// </remarks>
            /// <param name="context">Data-flow context of the page being processed.</param>
            /// <returns>Always a completed task carrying <c>DataFlowResult.Success</c>.</returns>
            protected override Task <DataFlowResult> Parse(DataFlowContext context)
            {
                var response   = context.GetResponse();
                var selectable = context.GetSelectable();

                context.AddItem("URL", response.Request.Url);
                context.AddItem("Title", selectable.XPath(".//title").GetValue());

                var tagNodes = selectable.XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a").Nodes();

                var seenUrls = new HashSet<string>();
                var requests = new List<Request>();

                // NOTE(review): null guard in case Nodes() yields null when nothing matches — confirm.
                if (tagNodes != null)
                {
                    foreach (var node in tagNodes)
                    {
                        var url  = node.XPath("./@href").GetValue();
                        var name = node.GetValue();

                        // HashSet.Add returns false for a URL that was already collected,
                        // so the first occurrence of each link wins.
                        if (string.IsNullOrWhiteSpace(url) || !seenUrls.Add(url))
                        {
                            continue;
                        }

                        Console.WriteLine("url:" + url + " - name:" + name);

                        var request = new Request
                        {
                            Url     = url,
                            OwnerId = response.Request.OwnerId,
                        };
                        request.Properties.Add("tag", name);
                        requests.Add(request);
                    }
                }

                context.AddTargetRequests(requests.ToArray());

                return Task.FromResult(DataFlowResult.Success);
            }
Beispiel #4
0
            /// <summary>
            /// Reads the title link and the first info line of the first entry in the
            /// "subject_list" element, stores them as the "username" and "author" items,
            /// and passes every Douban book tag link on the page to <c>AddTargetRequests</c>.
            /// </summary>
            /// <param name="context">Data-flow context of the page being processed.</param>
            /// <returns>
            /// <c>DataFlowResult.Terminated</c> (after clearing the items) when the title
            /// link text is empty; otherwise <c>DataFlowResult.Success</c>.
            /// </returns>
            protected override Task <DataFlowResult> Parse(DataFlowContext context)
            {
                var selectable = context.GetSelectable();

                // Parse the data.
                var name = selectable.XPath("//*[@id=\"subject_list\"]/ul/li[1]/div[2]/h2/a")
                           .GetValue();
                var author = selectable.XPath("//*[@id=\"subject_list\"]/ul/li[1]/div[2]/div[1]")
                             .GetValue();

                context.AddItem("author", author);
                context.AddItem("username", name);

                // Add the target links. Both dots of the host name are escaped so the
                // pattern matches "book.douban.com" literally; an unescaped '.' would
                // accept any character in that position.
                var urls = selectable.Links().Regex("(https://book\\.douban\\.com/tag/[\\w\\-]+)").GetValues();

                AddTargetRequests(context, urls);

                // If nothing was parsed, skip the following steps (storage etc.).
                if (string.IsNullOrWhiteSpace(name))
                {
                    context.ClearItems();
                    return Task.FromResult(DataFlowResult.Terminated);
                }

                return Task.FromResult(DataFlowResult.Success);
            }
Beispiel #5
0
            /// <summary>
            /// Extracts the full name and user name shown on the page into the "author"
            /// and "username" items and hands every link matching the GitHub
            /// owner/repository pattern to <c>AddTargetRequests</c>.
            /// </summary>
            /// <param name="context">Data-flow context of the page being processed.</param>
            /// <returns>
            /// <c>DataFlowResult.Terminated</c> with the items cleared when the user name is
            /// blank; <c>DataFlowResult.Success</c> otherwise.
            /// </returns>
            protected override Task <DataFlowResult> Parse(DataFlowContext context)
            {
                var document = context.GetSelectable();

                // Parsing data
                var displayName = document
                                  .XPath("//span[@class='p-name vcard-fullname d-block overflow-hidden']")
                                  .GetValue();
                var login = document
                            .XPath("//span[@class='p-nickname vcard-username d-block']")
                            .GetValue();

                context.AddItem("author", displayName);
                context.AddItem("username", login);

                // Add target links (done regardless of whether the parse produced a user name)
                var targets = document.Links().Regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").GetValues();
                AddTargetRequests(context, targets);

                var nothingParsed = string.IsNullOrWhiteSpace(login);
                if (nothingParsed)
                {
                    // An empty parse discards the items so the next steps are skipped
                    context.ClearItems();
                }

                return Task.FromResult(nothingParsed ? DataFlowResult.Terminated : DataFlowResult.Success);
            }
Beispiel #6
0
 /// <summary>
 /// Copies the basic facts of the response (request URL, raw text, target URL,
 /// success flag and elapsed milliseconds) into the context items. Does nothing
 /// when the context has no response.
 /// </summary>
 /// <param name="context">Data-flow context of the page being processed.</param>
 /// <returns>Always a completed task carrying <c>DataFlowResult.Success</c>.</returns>
 protected override Task <DataFlowResult> Parse(DataFlowContext context)
 {
     var response = context.Response;
     if (response == null)
     {
         return Task.FromResult(DataFlowResult.Success);
     }

     context.AddItem("URL", response.Request.Url);
     context.AddItem("Content", response.RawText);
     context.AddItem("TargetUrl", response.TargetUrl);
     context.AddItem("Success", response.Success);
     context.AddItem("ElapsedMilliseconds", response.ElapsedMilliseconds);

     return Task.FromResult(DataFlowResult.Success);
 }
        /// <summary>
        /// Checks that <c>MongoEntityStorage.HandleAsync</c> returns
        /// <c>DataFlowResult.Success</c> for a context holding one "table" row, using a
        /// fully mocked Mongo client, database and collection.
        /// </summary>
        public async Task Store_Should_Success()
        {
            // Arrange: mocked chain client -> database -> collection.
            var collectionMock = new Mock <IMongoCollection <BsonDocument> >();

            var databaseMock = new Mock <IMongoDatabase>();
            databaseMock
            .Setup(db => db.GetCollection <BsonDocument>(It.IsAny <string>(), It.IsAny <MongoCollectionSettings>()))
            .Returns(collectionMock.Object);

            var clientMock = new Mock <IMongoClient>();
            clientMock
            .Setup(client => client.GetDatabase(It.IsAny <string>(), It.IsAny <MongoDatabaseSettings>()))
            .Returns(databaseMock.Object);

            var storage = new MongoEntityStorage(clientMock.Object);

            // Arrange: a context with table metadata and a single row registered under the same key.
            var context = new DataFlowContext(Mock.Of <IServiceProvider>());
            context.Add("table", new TableMetadata {
                Schema = new Schema("db", "table")
            });

            var row = new Dictionary <string, object> {
                { "Name", "Value" }
            };
            context.AddItem("table", new object[] { row });

            // Act
            var outcome = await storage.HandleAsync(context);

            // Assert
            Assert.Equal(DataFlowResult.Success, outcome);
        }
Beispiel #8
0
            /// <summary>
            /// Stores the page URL and title, then queues one target request per tag
            /// anchor found under the "post_rank" element and calls
            /// <c>CreateDirByTag</c> with the anchor text of each queued tag.
            /// </summary>
            /// <remarks>
            /// Anchors without an href and anchors whose href was already seen on this page
            /// are skipped; previously either case made <c>Dictionary.Add</c> throw and
            /// aborted the whole parse.
            /// </remarks>
            /// <param name="context">Data-flow context of the page being processed.</param>
            /// <returns>Always a completed task carrying <c>DataFlowResult.Success</c>.</returns>
            public override Task <DataFlowResult> Parse(DataFlowContext context)
            {
                var response   = context.GetResponse();
                var selectable = context.GetSelectable();

                context.AddItem("URL", response.Request.Url);
                context.AddItem("Title", selectable.XPath(".//title").GetValue());

                var tagNodes = selectable.XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a").Nodes();

                var seenUrls = new HashSet<string>();
                var requests = new List<Request>();

                // NOTE(review): null guard in case Nodes() yields null when nothing matches — confirm.
                if (tagNodes != null)
                {
                    foreach (var node in tagNodes)
                    {
                        var url  = node.XPath("./@href").GetValue();
                        var name = node.GetValue();

                        // HashSet.Add returns false for a URL that was already collected,
                        // so the first occurrence of each link wins.
                        if (string.IsNullOrWhiteSpace(url) || !seenUrls.Add(url))
                        {
                            continue;
                        }

                        Console.WriteLine("url:" + url + " - name:" + name);

                        requests.Add(new Request
                        {
                            Url     = url,
                            OwnerId = response.Request.OwnerId
                        });

                        CreateDirByTag(name);
                    }
                }

                context.AddTargetRequests(requests.ToArray());

                return Task.FromResult(DataFlowResult.Success);
            }
            //public DatabaseSpiderDataParser()
            //{
            //	CanParse = DataParserHelper.CanParseByRegex("cnblogs\\.com");
            //	QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".");
            //}

            /// <summary>
            /// Stores the page URL and title as plain items and additionally emits a
            /// <c>EntitySpider.CnblogsEntry</c> parse item (with its table metadata
            /// registered under the entity's full type name) filled from the same values.
            /// </summary>
            /// <param name="context">Data-flow context of the page being processed.</param>
            /// <returns>Always a completed task carrying <c>DataFlowResult.Success</c>.</returns>
            protected override Task <DataFlowResult> Parse(DataFlowContext context)
            {
                var requestUrl = context.Response.Request.Url;
                context.AddItem("URL", requestUrl);

                var pageTitle = context.GetSelectable().XPath(".//title").GetValue();
                context.AddItem("Title", pageTitle);

                #region add mysql database
                var entryTypeName = typeof(EntitySpider.CnblogsEntry).FullName;
                var entry         = new EntitySpider.CnblogsEntry();

                // The metadata is registered before the entry's fields are filled in.
                context.Add(entryTypeName, entry.GetTableMetadata());

                // The request URL is used for both the web site and the URL fields.
                entry.WebSite = requestUrl;
                entry.Url     = requestUrl;
                entry.Title   = pageTitle;

                var parsedEntries = new ParseResult <EntitySpider.CnblogsEntry>();
                parsedEntries.Add(entry);
                context.AddParseItem(entryTypeName, parsedEntries);
                #endregion

                return Task.FromResult(DataFlowResult.Success);
            }
Beispiel #10
0
        /// <summary>
        /// Extracts the image addresses from a picture-browsing page and hands one
        /// download request per image to <c>ImageDownloader</c>.
        /// </summary>
        /// <remarks>
        /// Each request copies the "tag" and "referer" properties of the request that
        /// produced this page and carries the page title as its "subject" property.
        /// The title is read once and reused instead of re-running the XPath query
        /// for every image; blank image addresses are skipped.
        /// </remarks>
        /// <param name="context">Data-flow context of the picture-browsing page.</param>
        public static void GetDetailPictureUrl(DataFlowContext context)
        {
            var selectable    = context.GetSelectable();
            var sourceRequest = context.Response.Request;
            var title         = selectable.XPath(".//title").GetValue();

            context.AddItem("URL", sourceRequest.Url);
            context.AddItem("Title", title);

            var images = selectable.XPath("//*[@id=\"hgallery\"]/img/@src").GetValues();

            foreach (var image in images)
            {
                // An <img> without a usable src cannot be downloaded.
                if (string.IsNullOrWhiteSpace(image))
                {
                    continue;
                }

                // Queue the image URL for download.
                var request = new Request
                {
                    Url     = image,
                    OwnerId = sourceRequest.OwnerId
                };
                request.AddProperty("tag", sourceRequest.GetProperty("tag"));
                request.AddProperty("referer", sourceRequest.GetProperty("referer"));
                request.AddProperty("subject", title);
                ImageDownloader.GetInstance().AddRequest(request);
            }
        }
Beispiel #11
0
            /// <summary>
            /// Stores the page URL and title, then calls <c>CreateFromRequest</c> once for
            /// every tag link (href) found under the "post_rank" element, passing the
            /// request that produced this page.
            /// </summary>
            /// <param name="context">Data-flow context of the page being processed.</param>
            /// <returns>Always a completed task carrying <c>DataFlowResult.Success</c>.</returns>
            public override Task <DataFlowResult> Parse(DataFlowContext context)
            {
                var sourceRequest = context.GetResponse().Request;

                context.AddItem("URL", sourceRequest.Url);

                var pageTitle = context.GetSelectable().XPath(".//title").GetValue();
                context.AddItem("Title", pageTitle);

                // Only the href attribute of each tag anchor is needed here.
                var tagLinks = context.GetSelectable()
                               .XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a/@href")
                               .GetValues();

                foreach (var tagLink in tagLinks)
                {
                    CreateFromRequest(sourceRequest, tagLink);
                }

                return Task.FromResult(DataFlowResult.Success);
            }
Beispiel #12
0
 /// <summary>
 /// Stores the requested URL as "URL" and the values of all
 /// <c>div</c> elements with class "quote" as "Quotes".
 /// </summary>
 /// <param name="context">Data-flow context of the page being processed.</param>
 /// <returns>Always a completed task carrying <c>DataFlowResult.Success</c>.</returns>
 protected override Task <DataFlowResult> Parse(DataFlowContext context)
 {
     var requestUrl = context.Response.Request.Url;
     context.AddItem("URL", requestUrl);

     var quotes = context.GetSelectable().XPath("//div[@class='quote']").GetValues();
     context.AddItem("Quotes", quotes);

     return Task.FromResult(DataFlowResult.Success);
 }
Beispiel #13
0
 /// <summary>
 /// Extracts page content according to the configured <c>_mapping</c> and stores it
 /// as the "Content" item.
 /// </summary>
 /// <remarks>
 /// Behaviour, in order:
 /// no mapping configured - nothing is extracted;
 /// a depth of 1 or more is configured and the request depth differs - the items are cleared;
 /// an item selector is configured - one record per matched node, serialized as a JSON array;
 /// only field mappings are configured - a single record, serialized as indented JSON;
 /// otherwise - the raw response text is stored.
 /// Every record also carries the page address under "PageSourceURL".
 /// Selectors are evaluated with XPath even though the settings are named "CssSelector".
 /// A null field list in the item-selector case is treated as "no fields" instead of
 /// throwing, and repeated field names overwrite each other instead of throwing.
 /// </remarks>
 /// <param name="context">Data-flow context of the page being processed.</param>
 /// <returns>Always a completed task carrying <c>DataFlowResult.Success</c>.</returns>
 protected override Task <DataFlowResult> Parse(DataFlowContext context)
 {
     if (_mapping == null)
     {
         return Task.FromResult(DataFlowResult.Success);
     }

     // With a configured depth of 1 or more, only pages at exactly that depth are parsed.
     if (_mapping.Deepth.GetValueOrDefault() >= 1 &&
         context.Response.Request.Depth != _mapping.Deepth.Value)
     {
         context.ClearItems();
         return Task.FromResult(DataFlowResult.Success);
     }

     var fields    = _mapping.Mapping;
     var hasFields = fields != null && fields.Length > 0;

     if (!string.IsNullOrWhiteSpace(_mapping.ItemCssSelector))
     {
         // Without fields every record would be empty and discarded, so there is nothing to do.
         if (hasFields)
         {
             var items     = new List <dynamic>();
             var itemNodes = context.GetSelectable().XPath(_mapping.ItemCssSelector).Nodes();

             // NOTE(review): null guard in case Nodes() yields null when nothing matches — confirm.
             if (itemNodes != null)
             {
                 foreach (var node in itemNodes)
                 {
                     var item = new Dictionary <string, string>();
                     foreach (var field in fields)
                     {
                         // Indexer assignment: a repeated field name overwrites instead of throwing.
                         item[field.Field] = node.XPath(field.CssSelector).GetValue();
                     }
                     item["PageSourceURL"] = context.Response.Request.Url;
                     items.Add(item);
                 }
             }

             if (items.Count > 0)
             {
                 context.AddItem("Content", JsonConvert.SerializeObject(items));
             }
         }
     }
     else if (hasFields)
     {
         var item = new Dictionary <string, string>();
         foreach (var field in fields)
         {
             var value = context.GetSelectable().XPath(field.CssSelector).GetValue();
             if (value != null)
             {
                 // Strip tabs and surrounding whitespace from the extracted text.
                 value = value.Replace("\t", "").Trim();
             }
             item[field.Field] = value;
         }
         item["PageSourceURL"] = context.Response.Request.Url;
         context.AddItem("Content", JsonConvert.SerializeObject(item, Formatting.Indented));
     }
     else
     {
         // No selectors at all: keep the whole page.
         context.AddItem("PageSourceURL", context.Response.Request.Url);
         context.AddItem("Content", context.Response.RawText);
     }

     return Task.FromResult(DataFlowResult.Success);
 }
Beispiel #14
0
        /// <summary>
        /// Parses the page into objects of the configured <c>Model</c> and appends them
        /// to the context item stored under <c>Model.TypeName</c>.
        /// </summary>
        /// <remarks>
        /// Steps: register the table metadata if the context does not contain it yet;
        /// collect environment values from the request properties and the model's shared
        /// value selectors (a shared value replaces a property of the same name); then
        /// either parse the whole page as one object (no model selector) or parse each
        /// node matched by the model selector, optionally limited to <c>Model.Take</c>
        /// nodes from the head or the tail of the list.
        /// </remarks>
        /// <param name="context">Data-flow context of the page being processed.</param>
        /// <returns>Always a completed task carrying <c>DataFlowResult.Success</c>.</returns>
        protected override Task <DataFlowResult> Parse(DataFlowContext context)
        {
            if (!context.Contains(Model.TypeName))
            {
                context.Add(Model.TypeName, TableMetadata);
            }

            var selectable = context.GetSelectable();
            if (selectable.Properties == null)
            {
                selectable.Properties = new Dictionary <string, object>();
            }

            // Environment values start out as a copy of the request properties.
            var environments = new Dictionary <string, string>();
            foreach (var property in context.GetResponse().Request.Properties)
            {
                environments.Add(property.Key, property.Value);
            }

            if (Model.ShareValueSelectors != null)
            {
                foreach (var shared in Model.ShareValueSelectors)
                {
                    // Indexer assignment adds the key or overwrites an existing value.
                    environments[shared.Name] = selectable.Select(shared.ToSelector()).GetValue();
                }
            }

            var parsedObjects = new List <dynamic>();

            if (Model.Selector == null)
            {
                // No model selector: the whole page is a single object.
                var single = ParseObject(environments, selectable, 0);
                if (single == null)
                {
                    Logger?.LogWarning($"解析到空数据,类型: {Model.TypeName}");
                }
                else
                {
                    parsedObjects.Add(single);
                }
            }
            else
            {
                var nodes = selectable.SelectList(Model.Selector.ToSelector()).Nodes()?.ToList();
                if (nodes != null)
                {
                    var limit = Model.Take;
                    if (limit > 0 && nodes.Count > limit)
                    {
                        // Keep only the first or the last 'limit' nodes.
                        if (Model.TakeFromHead)
                        {
                            nodes = nodes.Take(limit).ToList();
                        }
                        else
                        {
                            nodes = nodes.Skip(nodes.Count - limit).ToList();
                        }
                    }

                    var index = 0;
                    foreach (var node in nodes)
                    {
                        var parsed = ParseObject(environments, node, index);
                        if (parsed == null)
                        {
                            Logger?.LogWarning($"解析到空数据,类型: {Model.TypeName}");
                        }
                        else
                        {
                            parsedObjects.Add(parsed);
                        }

                        ++index;
                    }
                }
            }

            if (parsedObjects.Count == 0)
            {
                return Task.FromResult(DataFlowResult.Success);
            }

            // Append to results already stored for this type, or store a new list.
            var existing = context.GetItem(Model.TypeName);
            if (existing == null)
            {
                context.AddItem(Model.TypeName, parsedObjects);
            }
            else
            {
                existing.AddRange(parsedObjects);
            }

            return Task.FromResult(DataFlowResult.Success);
        }