Exemple #1
0
            /// <summary>
            /// Writes the page title to the console and hands every distinct pagination
            /// link found on the page to <c>context.AddTargetRequests</c>.
            /// </summary>
            /// <param name="context">Data-flow context of the page being parsed.</param>
            /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
            public override Task<DataFlowResult> Parse(DataFlowContext context)
            {
                var response = context.GetResponse();

                Console.ForegroundColor = ConsoleColor.Blue;
                Console.WriteLine("第一页:" + context.GetSelectable().XPath(".//title").GetValue());
                Console.ForegroundColor = ConsoleColor.White;

                var pages = context.GetSelectable().XPath("//*[@id=\"listdiv\"]/div[@class='pagesYY']/div/a[not(@class)]/@href").GetValues();

                // A set is all that is needed to de-duplicate the hrefs.
                var seenPages = new HashSet<string>();
                var requestList = new List<Request>();

                foreach (var page in pages)
                {
                    // Skip blank hrefs, and hrefs that were already queued (Add returns false for those).
                    if (string.IsNullOrWhiteSpace(page) || !seenPages.Add(page))
                    {
                        continue;
                    }

                    requestList.Add(new Request
                    {
                        Url = page,
                        OwnerId = response.Request.OwnerId
                    });
                }

                if (requestList.Count > 0)
                {
                    context.AddTargetRequests(requestList.ToArray());
                }

                return Task.FromResult(DataFlowResult.Success);
            }
Exemple #2
0
            /// <summary>
            /// Records the page URL and title as items, then turns every tag link found
            /// on the page into a request carrying the tag name in its "tag" property.
            /// </summary>
            /// <param name="context">Data-flow context of the page being parsed.</param>
            /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
            protected override Task<DataFlowResult> Parse(DataFlowContext context)
            {
                var response = context.GetResponse();

                context.AddItem("URL", response.Request.Url);
                context.AddItem("Title", context.GetSelectable().XPath(".//title").GetValue());

                // href -> tag name
                var tags = new Dictionary<string, string>();
                var tagNodes = context.GetSelectable().XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a").Nodes();

                foreach (var node in tagNodes)
                {
                    var url = node.XPath("./@href").GetValue();
                    var name = node.GetValue();
                    Console.WriteLine("url:" + url + " - name:" + name);

                    // Dictionary.Add throws on a null or repeated key, which would abort the
                    // whole parse; ignore blank hrefs and keep the first name seen for a link.
                    if (string.IsNullOrWhiteSpace(url) || tags.ContainsKey(url))
                    {
                        continue;
                    }

                    tags.Add(url, name);
                }

                var requests = new List<Request>();

                foreach (var tag in tags)
                {
                    var request = new Request
                    {
                        Url = tag.Key,
                        OwnerId = response.Request.OwnerId,
                    };
                    request.Properties.Add("tag", tag.Value);

                    requests.Add(request);
                }

                context.AddTargetRequests(requests.ToArray());

                return Task.FromResult(DataFlowResult.Success);
            }
Exemple #3
0
        /// <summary>
        /// Collects the pagination links of the detailed picture view and hands each
        /// distinct link to <c>context.AddTargetRequests</c> as a new request.
        /// </summary>
        /// <remarks>
        /// Every new request copies the "tag" and "referer" properties from the request
        /// that produced the current response.
        /// </remarks>
        /// <param name="context">Data-flow context of the page being parsed.</param>
        public static void GetDetailPageUrl(DataFlowContext context)
        {
            var response = context.GetResponse();
            var pages = context.GetSelectable().XPath("//*[@id=\"pages\"]/a[not(@class)]/@href").GetValues();

            // A set is all that is needed to de-duplicate the hrefs.
            var seenPages = new HashSet<string>();
            var requestList = new List<Request>();

            foreach (var page in pages)
            {
                // Skip blank hrefs, and hrefs that were already queued (Add returns false for those).
                if (string.IsNullOrWhiteSpace(page) || !seenPages.Add(page))
                {
                    continue;
                }

                var request = new Request
                {
                    Url = page,
                    OwnerId = response.Request.OwnerId,
                };
                request.AddProperty("tag", response.Request.GetProperty("tag"));
                request.AddProperty("referer", response.Request.GetProperty("referer"));
                requestList.Add(request);
            }

            if (requestList.Count > 0)
            {
                context.AddTargetRequests(requestList.ToArray());
            }
        }
Exemple #4
0
            /// <summary>
            /// Writes the title of the current page to the console in yellow.
            /// No items are added and no further requests are created here.
            /// </summary>
            /// <param name="context">Data-flow context of the page being parsed.</param>
            /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
            public override Task<DataFlowResult> Parse(DataFlowContext context)
            {
                // The response is fetched but not used by the code below.
                var response = context.GetResponse();

                Console.ForegroundColor = ConsoleColor.Yellow;
                var title = context.GetSelectable().XPath(".//title").GetValue();
                Console.WriteLine("page 页:" + title);
                Console.ForegroundColor = ConsoleColor.White;

                return Task.FromResult(DataFlowResult.Success);
            }
Exemple #5
0
            /// <summary>
            /// Stores the URL of the request and the page title as the "URL" and "Title" items.
            /// </summary>
            /// <param name="context">Data-flow context of the page being parsed.</param>
            /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
            protected override Task<DataFlowResult> Parse(DataFlowContext context)
            {
                var pageUrl = context.Response.Request.Url;
                context.AddItem("URL", pageUrl);

                var pageTitle = context.GetSelectable().XPath(".//title").GetValue();
                context.AddItem("Title", pageTitle);

                return Task.FromResult(DataFlowResult.Success);
            }
Exemple #6
0
            /// <summary>
            /// Extracts the "author" and "username" items from the page, passes the GitHub
            /// links found on it to <c>AddFollowRequests</c>, and terminates the flow when
            /// no username could be extracted.
            /// </summary>
            /// <param name="context">Data-flow context of the page being parsed.</param>
            /// <returns>
            /// A completed task carrying <c>DataFlowResult.Terminated</c> when the username is
            /// null or whitespace, otherwise <c>DataFlowResult.Success</c>.
            /// </returns>
            protected override Task<DataFlowResult> Parse(DataFlowContext context)
            {
                var page = context.GetSelectable();

                // Parse the data.
                var fullName = page.XPath("//span[@class='p-name vcard-fullname d-block overflow-hidden']").GetValue();
                var userName = page.XPath("//span[@class='p-nickname vcard-username d-block']").GetValue();
                context.AddItem("author", fullName);
                context.AddItem("username", userName);

                // Add the target links.
                var repositoryLinks = page.Links().Regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").GetValues();
                AddFollowRequests(context, repositoryLinks);

                if (!string.IsNullOrWhiteSpace(userName))
                {
                    return Task.FromResult(DataFlowResult.Success);
                }

                // Nothing was parsed: drop the items and skip the later steps (storage etc.).
                context.ClearItems();
                return Task.FromResult(DataFlowResult.Terminated);
            }
Exemple #7
0
            /// <summary>
            /// Extracts the first entry of the subject list into the "author" and "username"
            /// items, passes the tag links found on the page to <c>AddTargetRequests</c>,
            /// and terminates the flow when the entry name could not be extracted.
            /// </summary>
            /// <param name="context">Data-flow context of the page being parsed.</param>
            /// <returns>
            /// A completed task carrying <c>DataFlowResult.Terminated</c> when the extracted name
            /// is null or whitespace, otherwise <c>DataFlowResult.Success</c>.
            /// </returns>
            protected override Task<DataFlowResult> Parse(DataFlowContext context)
            {
                var selectable = context.GetSelectable();

                // Parse the data.
                var name = selectable.XPath("//*[@id=\"subject_list\"]/ul/li[1]/div[2]/h2/a")
                           .GetValue();
                var author = selectable.XPath("//*[@id=\"subject_list\"]/ul/li[1]/div[2]/div[1]")
                             .GetValue();

                context.AddItem("author", author);
                context.AddItem("username", name);

                // Add the target links. Both dots of the host name are escaped so that they
                // match a literal '.' only, not an arbitrary character.
                var urls = selectable.Links().Regex("(https://book\\.douban\\.com/tag/[\\w\\-]+)").GetValues();

                AddTargetRequests(context, urls);

                // Nothing was parsed: drop the items and skip the later steps (storage etc.).
                if (string.IsNullOrWhiteSpace(name))
                {
                    context.ClearItems();
                    return Task.FromResult(DataFlowResult.Terminated);
                }

                return Task.FromResult(DataFlowResult.Success);
            }
Exemple #8
0
            /// <summary>
            /// Extracts the "author" and "username" items from the page and passes the GitHub
            /// links found on it to <c>AddTargetRequests</c>.
            /// </summary>
            /// <param name="context">Data-flow context of the page being parsed.</param>
            /// <returns>
            /// A completed task carrying <c>DataFlowResult.Success</c> when a username was found;
            /// otherwise the items are cleared and <c>DataFlowResult.Terminated</c> is returned.
            /// </returns>
            protected override Task<DataFlowResult> Parse(DataFlowContext context)
            {
                var document = context.GetSelectable();

                // Parsing data
                var authorText = document.XPath("//span[@class='p-name vcard-fullname d-block overflow-hidden']").GetValue();
                var nickname = document.XPath("//span[@class='p-nickname vcard-username d-block']").GetValue();

                context.AddItem("author", authorText);
                context.AddItem("username", nickname);

                // Add target links
                var targetUrls = document.Links().Regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").GetValues();
                AddTargetRequests(context, targetUrls);

                // An empty username means nothing useful was parsed, so the next steps are skipped.
                var hasUserName = !string.IsNullOrWhiteSpace(nickname);
                if (!hasUserName)
                {
                    context.ClearItems();
                }

                return Task.FromResult(hasUserName ? DataFlowResult.Success : DataFlowResult.Terminated);
            }
Exemple #9
0
            /// <summary>
            /// Records the page URL and title as items, then creates one request per tag
            /// link found on the page and calls <c>CreateDirByTag</c> with each tag name.
            /// </summary>
            /// <param name="context">Data-flow context of the page being parsed.</param>
            /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
            public override Task<DataFlowResult> Parse(DataFlowContext context)
            {
                var response = context.GetResponse();

                context.AddItem("URL", response.Request.Url);
                context.AddItem("Title", context.GetSelectable().XPath(".//title").GetValue());

                // href -> tag name
                var tags = new Dictionary<string, string>();
                var tagNodes = context.GetSelectable().XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a").Nodes();

                foreach (var node in tagNodes)
                {
                    var url = node.XPath("./@href").GetValue();
                    var name = node.GetValue();
                    Console.WriteLine("url:" + url + " - name:" + name);

                    // Dictionary.Add throws on a null or repeated key, which would abort the
                    // whole parse; ignore blank hrefs and keep the first name seen for a link.
                    if (string.IsNullOrWhiteSpace(url) || tags.ContainsKey(url))
                    {
                        continue;
                    }

                    tags.Add(url, name);
                }

                var requests = new List<Request>();

                foreach (var sub in tags)
                {
                    var request = new Request
                    {
                        Url = sub.Key,
                        OwnerId = response.Request.OwnerId
                    };
                    requests.Add(request);

                    CreateDirByTag(sub.Value);
                }

                context.AddTargetRequests(requests.ToArray());

                return Task.FromResult(DataFlowResult.Success);
            }
            //public DatabaseSpiderDataParser()
            //{
            //	CanParse = DataParserHelper.CanParseByRegex("cnblogs\\.com");
            //	QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".");
            //}

            /// <summary>
            /// Stores the page URL and title as items and additionally registers a
            /// <c>CnblogsEntry</c> entity holding the same values as a parse item.
            /// </summary>
            /// <param name="context">Data-flow context of the page being parsed.</param>
            /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
            protected override Task<DataFlowResult> Parse(DataFlowContext context)
            {
                var sourceUrl = context.Response.Request.Url;
                context.AddItem("URL", sourceUrl);

                var pageTitle = context.GetSelectable().XPath(".//title").GetValue();
                context.AddItem("Title", pageTitle);

                #region add mysql database
                var entryTypeName = typeof(EntitySpider.CnblogsEntry).FullName;
                var entry = new EntitySpider.CnblogsEntry();

                // The table metadata is registered under the entity's full type name.
                context.Add(entryTypeName, entry.GetTableMetadata());

                entry.WebSite = sourceUrl;
                entry.Url = sourceUrl;
                entry.Title = pageTitle;

                var parsedEntries = new ParseResult<EntitySpider.CnblogsEntry>();
                parsedEntries.Add(entry);
                context.AddParseItem(entryTypeName, parsedEntries);
                #endregion

                return Task.FromResult(DataFlowResult.Success);
            }
Exemple #11
0
        /// <summary>
        /// Extracts the image addresses from a picture browsing page and submits one
        /// download request per image to <c>ImageDownloader</c>.
        /// </summary>
        /// <remarks>
        /// The page URL and title are also stored as the "URL" and "Title" items. Each image
        /// request copies the "tag" and "referer" properties of the current request and
        /// carries the page title in its "subject" property.
        /// </remarks>
        /// <param name="context">Data-flow context of the page being parsed.</param>
        public static void GetDetailPictureUrl(DataFlowContext context)
        {
            // Both values are the same for every image, so they are looked up once
            // instead of once per loop iteration.
            var sourceRequest = context.Response.Request;
            var title = context.GetSelectable().XPath(".//title").GetValue();

            context.AddItem("URL", sourceRequest.Url);
            context.AddItem("Title", title);

            var images = context.GetSelectable().XPath("//*[@id=\"hgallery\"]/img/@src").GetValues();

            foreach (var image in images)
            {
                // Build the download request for this image URL.
                var request = new Request
                {
                    Url = image,
                    OwnerId = sourceRequest.OwnerId
                };
                request.AddProperty("tag", sourceRequest.GetProperty("tag"));
                request.AddProperty("referer", sourceRequest.GetProperty("referer"));
                request.AddProperty("subject", title);
                ImageDownloader.GetInstance().AddRequest(request);
            }
        }
Exemple #12
0
            /// <summary>
            /// Stores the page URL and title as items and calls <c>CreateFromRequest</c>
            /// for every tag link found on the page.
            /// </summary>
            /// <param name="context">Data-flow context of the page being parsed.</param>
            /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
            public override Task<DataFlowResult> Parse(DataFlowContext context)
            {
                var response = context.GetResponse();

                context.AddItem("URL", response.Request.Url);

                var pageTitle = context.GetSelectable().XPath(".//title").GetValue();
                context.AddItem("Title", pageTitle);

                var tagLinks = context.GetSelectable().XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a/@href").GetValues();

                foreach (var tagLink in tagLinks)
                {
                    // NOTE(review): whatever CreateFromRequest returns is not used here.
                    // If it only builds a request without queueing it, these links are
                    // never followed - confirm against the helper's implementation.
                    CreateFromRequest(response.Request, tagLink);
                }

                return Task.FromResult(DataFlowResult.Success);
            }
Exemple #13
0
            /// <summary>
            /// Writes the page title to the console in yellow and then delegates to
            /// <c>GetSubjectUrl</c> for the current context.
            /// </summary>
            /// <param name="context">Data-flow context of the page being parsed.</param>
            /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
            protected override Task<DataFlowResult> Parse(DataFlowContext context)
            {
                Console.ForegroundColor = ConsoleColor.Yellow;
                var title = context.GetSelectable().XPath(".//title").GetValue();
                Console.WriteLine("page 页:" + title);
                Console.ForegroundColor = ConsoleColor.White;

                GetSubjectUrl(context);

                return Task.FromResult(DataFlowResult.Success);
            }
Exemple #14
0
        /// <summary>
        /// Looks up the next-page link selected by <c>_nextPageSelector</c> and, when one is
        /// found, adds a follow request for it with the page index increased by one.
        /// </summary>
        /// <param name="context">Data-flow context of the page being parsed.</param>
        /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
        protected override Task<DataFlowResult> Parse(DataFlowContext context)
        {
            var nextUrl = context.GetSelectable().XPath(_nextPageSelector).Links().GetValue();

            // No next-page link on this page: nothing to follow.
            if (string.IsNullOrWhiteSpace(nextUrl))
            {
                return Task.FromResult(DataFlowResult.Success);
            }

            var nextRequest = CreateFromRequest(context, nextUrl);
            nextRequest.PageIndex = context.Response.Request.PageIndex + 1;

            // When a CanParse filter is set, it decides whether the request is followed.
            var accepted = CanParse == null || CanParse(nextRequest);
            if (accepted)
            {
                context.FollowRequests.Add(nextRequest);
            }

            return Task.FromResult(DataFlowResult.Success);
        }
Exemple #15
0
            /// <summary>
            /// Walks over every element with class "news_block" and reads its link, summary
            /// and view count.
            /// </summary>
            /// <remarks>
            /// NOTE(review): the extracted values are not yet copied into the <c>News</c>
            /// objects, and the collected list is not handed to the context - this looks
            /// unfinished; confirm the intended property names and output before completing it.
            /// </remarks>
            /// <param name="context">Data-flow context of the page being parsed.</param>
            /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
            protected override Task<DataFlowResult> Parse(DataFlowContext context)
            {
                // "//*[...]" - a predicate needs a node test in front of it.
                var news = context.GetSelectable().XPath(".//*[@class=\"news_block\"]").Nodes();
                var newsObjs = new List<News>();

                foreach (var item in news)
                {
                    var url = item.Select(Selectors.XPath(".//h2[@class=\"news_entry\"]/a/@href")).GetValue();
                    var summary = item.Select(Selectors.XPath(".//div[@class=\"entry_summary\"]")).GetValue();

                    // The view counter is rendered as "<number> 人浏览"; strip the suffix before
                    // parsing. (string.Replace throws when the text to search for is empty, so
                    // the suffix must be the first argument.)
                    var viewText = item.Select(Selectors.XPath(".//span[@class=\"view\"]")).GetValue();
                    int views = 0;
                    if (viewText != null)
                    {
                        // TryParse leaves 0 in place when the remaining text is not a number.
                        int.TryParse(viewText.Replace(" 人浏览", "").Trim(), out views);
                    }

                    newsObjs.Add(new News
                    {
                    });
                }

                //context.AddItem("Title",);
                return Task.FromResult(DataFlowResult.Success);
            }
Exemple #16
0
        /// <summary>
        /// Collects the subject links listed on the page and hands them to
        /// <c>context.AddTargetRequests</c>.
        /// </summary>
        /// <remarks>
        /// Each new request copies the "tag" property of the current request and stores
        /// the current request URL in its "referer" property.
        /// </remarks>
        /// <param name="context">Data-flow context of the page being parsed.</param>
        public static void GetSubjectUrl(DataFlowContext context)
        {
            var response = context.GetResponse();
            var subjectLinks = context.GetSelectable().XPath("//*[@id=\"listdiv\"]/ul/li/div[@class='galleryli_title']/a/@href").GetValues();
            var subjectRequests = new List<Request>();

            foreach (var link in subjectLinks)
            {
                var subjectRequest = new Request();
                subjectRequest.Url = link;
                subjectRequest.OwnerId = response.Request.OwnerId;
                subjectRequest.AddProperty("tag", response.Request.GetProperty("tag"));
                subjectRequest.AddProperty("referer", response.Request.Url);

                subjectRequests.Add(subjectRequest);
            }

            // Nothing found: leave the context untouched.
            if (subjectRequests.Count == 0)
            {
                return;
            }

            context.AddTargetRequests(subjectRequests.ToArray());
        }
Exemple #17
0
        /// <summary>
        /// Parses the pagination of an album list and hands each distinct page link to
        /// <c>context.AddTargetRequests</c> as a new request.
        /// </summary>
        /// <param name="context">Data-flow context of the page being parsed.</param>
        public static void GetSubjectPageUrl(DataFlowContext context)
        {
            var response = context.GetResponse();
            var pages = context.GetSelectable().XPath("//*[@id=\"listdiv\"]/div[@class='pagesYY']/div/a[not(@class)]/@href").GetValues();

            // A set is all that is needed to de-duplicate the hrefs.
            var seenPages = new HashSet<string>();
            var requestList = new List<Request>();

            foreach (var page in pages)
            {
                // Skip blank hrefs, and hrefs that were already handled (Add returns false for those).
                if (string.IsNullOrWhiteSpace(page) || !seenPages.Add(page))
                {
                    continue;
                }

                try
                {
                    var request = new Request
                    {
                        Url = page,
                        OwnerId = response.Request.OwnerId,
                    };
                    // NOTE(review): the tag is a fixed literal here; an earlier, disabled
                    // variant copied it from the current request - confirm which is intended.
                    request.AddProperty("tag", "萝莉");
                    requestList.Add(request);
                }
                catch (Exception e)
                {
                    // Best effort: a link that cannot be turned into a request is logged and skipped.
                    Console.WriteLine(e);
                }
            }

            if (requestList.Count > 0)
            {
                context.AddTargetRequests(requestList.ToArray());
            }
        }
Exemple #18
0
        /// <summary>
        /// Parses the current page into objects of type <typeparamref name="T"/> as described
        /// by <c>_model</c> and stores them in the context under the model's type name.
        /// </summary>
        /// <remarks>
        /// The values passed to <c>ParseObject</c> as environment are the properties of the
        /// current request, extended or overridden by the model's shared value selectors.
        /// With a model selector, every selected node (optionally limited by <c>Take</c>)
        /// is parsed on its own; without one, the whole page is parsed as a single object.
        /// </remarks>
        /// <param name="context">Data-flow context of the page being parsed.</param>
        /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
        protected override Task<DataFlowResult> Parse(DataFlowContext context)
        {
            // Register the table metadata once per context.
            if (!context.Contains(_model.TypeName))
            {
                context.Add(_model.TypeName, _tableMetadata);
            }

            var page = context.GetSelectable();
            var parsed = new ParseResult<T>();

            if (page.Properties == null)
            {
                page.Properties = new Dictionary<string, object>();
            }

            // Start from the request properties ...
            var environments = new Dictionary<string, string>();
            foreach (var requestProperty in context.Response.Request.Properties)
            {
                environments.Add(requestProperty.Key, requestProperty.Value);
            }

            // ... then add the shared values; a shared value replaces a request property of the same name.
            if (_model.ShareValueSelectors != null)
            {
                foreach (var shared in _model.ShareValueSelectors)
                {
                    environments[shared.Name] = page.Select(shared.ToSelector()).GetValue();
                }
            }

            if (_model.Selector == null)
            {
                // No item selector: the page as a whole yields at most one object.
                var single = ParseObject(environments, page, 0);
                if (single == null)
                {
                    Logger?.LogWarning($"解析到空数据,类型: {_model.TypeName}");
                }
                else
                {
                    parsed.Add(single);
                }
            }
            else
            {
                var itemSelector = _model.Selector.ToSelector();
                var nodes = page.SelectList(itemSelector).Nodes()?.ToList();

                if (nodes != null)
                {
                    // Keep only the first or the last Take nodes when a limit is configured.
                    if (_model.Take > 0 && nodes.Count > _model.Take)
                    {
                        if (_model.TakeFromHead)
                        {
                            nodes = nodes.Take(_model.Take).ToList();
                        }
                        else
                        {
                            nodes = nodes.Skip(nodes.Count - _model.Take).ToList();
                        }
                    }

                    for (var index = 0; index < nodes.Count; index++)
                    {
                        var parsedItem = ParseObject(environments, nodes[index], index);
                        if (parsedItem == null)
                        {
                            Logger?.LogWarning($"解析到空数据,类型: {_model.TypeName}");
                        }
                        else
                        {
                            parsed.Add(parsedItem);
                        }
                    }
                }
            }

            if (parsed.Count > 0)
            {
                // Append to results already stored for this type, or store a new result set.
                var existing = context.GetParseItem(_model.TypeName);
                if (existing != null)
                {
                    ((ParseResult<T>)existing).AddRange(parsed);
                }
                else
                {
                    context.AddParseItem(_model.TypeName, parsed);
                }
            }

            return Task.FromResult(DataFlowResult.Success);
        }
Exemple #19
0
 /// <summary>
 /// Extracts content from the page according to <c>_mapping</c> and stores it in the
 /// "Content" item as JSON (or as the raw page text when no field mapping is configured).
 /// </summary>
 /// <remarks>
 /// Without a mapping nothing is extracted. When the mapping specifies a depth of 1 or
 /// more, pages whose request depth differs have their items cleared and are skipped.
 /// With an item selector, one dictionary per selected node is produced; otherwise a
 /// single dictionary is built for the whole page. Every dictionary also carries the
 /// request URL under "PageSourceURL".
 /// </remarks>
 /// <param name="context">Data-flow context of the page being parsed.</param>
 /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
 protected override Task<DataFlowResult> Parse(DataFlowContext context)
 {
     if (_mapping == null)
     {
         return Task.FromResult(DataFlowResult.Success);
     }

     // Only pages at the configured depth are extracted.
     if (_mapping.Deepth.GetValueOrDefault() >= 1
         && context.Response.Request.Depth != _mapping.Deepth.Value)
     {
         context.ClearItems();
         return Task.FromResult(DataFlowResult.Success);
     }

     if (!string.IsNullOrWhiteSpace(_mapping.ItemCssSelector))
     {
         var items = new List<dynamic>();
         var itemNodes = context.GetSelectable().XPath(_mapping.ItemCssSelector).Nodes();
         foreach (var node in itemNodes)
         {
             var item = new Dictionary<string, string>();

             // The field mapping may be missing (the single-item branch below checks
             // for that too); without fields the node yields no item.
             if (_mapping.Mapping != null)
             {
                 foreach (var field in _mapping.Mapping)
                 {
                     // Indexer instead of Add: a repeated field name overwrites rather than throws.
                     item[field.Field] = node.XPath(field.CssSelector).GetValue();
                 }
             }

             if (item.Count > 0)
             {
                 item["PageSourceURL"] = context.Response.Request.Url;
                 items.Add(item);
             }
         }

         if (items.Count > 0)
         {
             context.AddItem("Content", JsonConvert.SerializeObject(items));
         }
     }
     else if (_mapping.Mapping != null && _mapping.Mapping.Length > 0)
     {
         var item = new Dictionary<string, string>();
         foreach (var field in _mapping.Mapping)
         {
             var value = context.GetSelectable().XPath(field.CssSelector).GetValue();
             if (value != null)
             {
                 value = value.Replace("\t", "").Trim();
             }

             // Indexer instead of Add: a repeated field name overwrites rather than throws.
             item[field.Field] = value;
         }

         if (item.Count > 0)
         {
             item["PageSourceURL"] = context.Response.Request.Url;
             context.AddItem("Content", JsonConvert.SerializeObject(item, Formatting.Indented));
         }
     }
     else
     {
         // No field mapping at all: store the raw page text.
         context.AddItem("PageSourceURL", context.Response.Request.Url);
         context.AddItem("Content", context.Response.RawText);
     }

     return Task.FromResult(DataFlowResult.Success);
 }
Exemple #20
0
 /// <summary>
 /// Stores the request URL as the "URL" item and the values of all
 /// <c>div</c> elements with class "quote" as the "Quotes" item.
 /// </summary>
 /// <param name="context">Data-flow context of the page being parsed.</param>
 /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
 protected override Task<DataFlowResult> Parse(DataFlowContext context)
 {
     var sourceUrl = context.Response.Request.Url;
     context.AddItem("URL", sourceUrl);

     var quotes = context.GetSelectable().XPath("//div[@class='quote']").GetValues();
     context.AddItem("Quotes", quotes);

     return Task.FromResult(DataFlowResult.Success);
 }