Пример #1
0
        /// <summary>
        /// Parses a listing page. Queues one follow-up request per entry of the
        /// <c>TypeList</c> block and, when the third pager item reads "2" (presumably this
        /// means the current page is the first one; confirm against the site markup),
        /// also queues pages 2..last of the listing.
        /// </summary>
        /// <param name="context">Data-flow context holding the response and its selectable document.</param>
        /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
        protected override Task <DataFlowResult> Parse(DataFlowContext context)
        {
            var sourceRequest = context.Response.Request;

            // Entry URL -> entry title. Entries without a link, or repeating a link that was
            // already collected, are skipped: Dictionary.Add throws on a null or duplicate key,
            // which would abort parsing of the whole page.
            var tags = new Dictionary<string, string>();
            foreach (var node in context.Selectable.XPath(".//div[@class='TypeList']//ul//li").Nodes())
            {
                var url  = node.XPath(".//a[@class='TypeBigPics']/@href").GetValue();
                var name = node.XPath(".//div[@class='ListTit']").GetValue();
                if (string.IsNullOrWhiteSpace(url) || tags.ContainsKey(url))
                {
                    continue;
                }

                tags.Add(url, name);
            }

            // The page title is the same for every entry, so it is looked up once.
            var subject  = context.Selectable.XPath(".//title").GetValue();
            var requests = new List<Request>();
            foreach (var tag in tags)
            {
                var request = new Request
                {
                    Url     = tag.Key,
                    OwnerId = sourceRequest.OwnerId
                };
                request.AddProperty("tag", tag.Value);
                // Keep the referer inherited from the current request; fall back to the entry URL.
                request.AddProperty("referer", sourceRequest.GetProperty("referer") ?? tag.Key);
                request.AddProperty("subject", subject);
                requests.Add(request);
            }

            // If the current page is the first one, find the last page and queue pages 2..last.
            var thisPage = context.Selectable.XPath(".//div[@class='NewPages']//ul//li[3]//a").GetValue();
            if (thisPage == "2")
            {
                var pageFilePattern = new Regex("\\d{1,3}.htm");
                var lastPageLinks   = context.Selectable.XPath(".//div[@class='NewPages']//ul//li[last()]//a").Nodes();
                foreach (var node in lastPageLinks)
                {
                    // The href of the last pager item ends in "<number>.htm"; that number is the page count.
                    var lastPage = node.XPath("@href").Regex("\\d{1,3}.htm").GetValue()?.Replace(".htm", "");

                    // An unparsable or too-small count would otherwise yield a negative array length.
                    if (!int.TryParse(lastPage, out var totalPage) || totalPage < 2)
                    {
                        continue;
                    }

                    var pageRequests = new Request[totalPage - 1];
                    for (var page = 2; page <= totalPage; page++)
                    {
                        pageRequests[page - 2] = new Request
                        {
                            Url     = pageFilePattern.Replace(sourceRequest.Url, $"{page}.htm"),
                            OwnerId = sourceRequest.OwnerId
                        };
                    }

                    context.AddExtraRequests(pageRequests);
                }
            }

            context.AddExtraRequests(requests.ToArray());
            return Task.FromResult(DataFlowResult.Success);
        }
Пример #2
0
            /// <summary>
            /// Records the page URL and title as data items, then queues one follow-up request
            /// for every distinct tag link found in the <c>post_rank</c> block.
            /// </summary>
            /// <param name="context">Data-flow context holding the response and its selectable document.</param>
            /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
            protected override Task <DataFlowResult> Parse(DataFlowContext context)
            {
                context.AddData("URL", context.Response.Request.Url);
                context.AddData("Title", context.Selectable.XPath(".//title").GetValue());

                // Tag URL -> tag text.
                var tags     = new Dictionary<string, string>();
                var tagNodes = context.Selectable
                               .XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a").Nodes();

                foreach (var node in tagNodes)
                {
                    var url  = node.XPath("./@href").GetValue();
                    var name = node.GetValue();

                    // Dictionary.Add throws on a null or repeated key; a link without an href or
                    // one that was already collected must not abort the whole page.
                    if (string.IsNullOrWhiteSpace(url) || tags.ContainsKey(url))
                    {
                        continue;
                    }

                    tags.Add(url, name);
                    Console.WriteLine("url:" + url + " - name:" + name);
                }

                var requests = new List<Request>();

                foreach (var tag in tags)
                {
                    var request = new Request
                    {
                        Url     = tag.Key,
                        OwnerId = context.Response.Request.OwnerId
                    };
                    request.AddProperty("tag", tag.Value);

                    requests.Add(request);
                }

                context.AddExtraRequests(requests.ToArray());

                return Task.FromResult(DataFlowResult.Success);
            }
Пример #3
0
        /// <summary>
        /// Collects the pagination URLs of a detail (image viewing) page and queues one
        /// request per distinct URL, carrying over the current request's "tag" and
        /// "referer" properties.
        /// </summary>
        /// <param name="context">Data-flow context holding the response and its selectable document.</param>
        public static void GetDetailPageUrl(DataFlowContext context)
        {
            // Tracks hrefs that have already been turned into a request.
            var seenPages   = new HashSet<string>();
            var pages       = context.Selectable.XPath("//*[@id=\"pages\"]/a[not(@class)]/@href").GetValues();
            var requestList = new List<Request>();

            foreach (var page in pages)
            {
                // Skip links without a usable href; HashSet.Add returns false for a repeated href.
                if (string.IsNullOrWhiteSpace(page) || !seenPages.Add(page))
                {
                    continue;
                }

                var sourceRequest = context.Response.Request;
                var request = new Request
                {
                    Url     = page,
                    OwnerId = sourceRequest.OwnerId
                };
                request.AddProperty("tag", sourceRequest.GetProperty("tag"));
                request.AddProperty("referer", sourceRequest.GetProperty("referer"));
                requestList.Add(request);
            }

            if (requestList.Count > 0)
            {
                context.AddExtraRequests(requestList.ToArray());
            }
        }
Пример #4
0
            /// <summary>
            /// Extracts the profile's full name and user name, queues every GitHub
            /// "owner/repository" style link found on the page, and terminates the flow
            /// (after clearing the collected data) when no user name was found.
            /// </summary>
            /// <param name="context">Data-flow context holding the response and its selectable document.</param>
            /// <returns>
            /// A completed task carrying <c>DataFlowResult.Terminated</c> when the user name is
            /// blank, otherwise <c>DataFlowResult.Success</c>.
            /// </returns>
            protected override Task <DataFlowResult> Parse(DataFlowContext context)
            {
                var page = context.Selectable;

                // Parse the data.
                var fullName = page.XPath("//span[@class='p-name vcard-fullname d-block overflow-hidden']").GetValue();
                var userName = page.XPath("//span[@class='p-nickname vcard-username d-block']").GetValue();
                context.AddData("author", fullName);
                context.AddData("username", userName);

                // Queue the target links.
                var repositoryLinks = page.Links()
                                          .Regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)")
                                          .GetValues();
                foreach (var link in repositoryLinks)
                {
                    var followUp = CreateFromRequest(context.Response.Request, link);
                    context.AddExtraRequests(followUp);
                }

                if (!string.IsNullOrWhiteSpace(userName))
                {
                    return Task.FromResult(DataFlowResult.Success);
                }

                // Nothing was parsed: drop the data and skip the later steps (storage etc.).
                context.ClearData();
                return Task.FromResult(DataFlowResult.Terminated);
            }
Пример #5
0
        /// <summary>
        /// Runs data parsing for the given context: checks that the request is accepted by
        /// <c>Required</c>, builds the selectable if it is missing, calls <c>Parse</c>, and
        /// then queues the follow-up requests returned by <c>FollowRequestQuerier</c>.
        /// </summary>
        /// <param name="context">Processing context.</param>
        /// <returns>
        /// <c>Failed</c> when the context or its response is null or an exception is caught;
        /// the result of <c>Parse</c> when that result is <c>Failed</c> or <c>Terminated</c>;
        /// <c>Success</c> otherwise, including when <c>Required</c> rejects the current request.
        /// </returns>
        public override async Task <DataFlowResult> HandleAsync(DataFlowContext context)
        {
            if (context?.Response == null)
            {
                Logger?.LogError("数据上下文或者响应内容为空");
                return DataFlowResult.Failed;
            }

            try
            {
                // A request that does not match is skipped without affecting the other
                // data-flow processors, hence Success rather than Failed.
                var accepted = Required == null || Required(context.Response.Request);
                if (!accepted)
                {
                    return DataFlowResult.Success;
                }

                if (context.Selectable == null)
                {
                    if (SelectableBuilder != null)
                    {
                        context.Selectable = SelectableBuilder(context);
                    }
                    else
                    {
                        context.Selectable = context.Response?.ToSelectable();
                    }
                }

                var parseResult = await Parse(context);

                if (FollowRequestQuerier != null)
                {
                    var followUps = FollowRequestQuerier(context);
                    if (followUps != null)
                    {
                        foreach (var followUp in followUps)
                        {
                            if (followUp == null)
                            {
                                continue;
                            }

                            if (Required != null && !Required(followUp))
                            {
                                continue;
                            }

                            // Force OwnerId (and AgentId) here so that a caller who forgot to
                            // set them does not cause an error later on.
                            if (string.IsNullOrWhiteSpace(followUp.OwnerId))
                            {
                                followUp.OwnerId = context.Response.Request.OwnerId;
                                followUp.AgentId = context.Response.Request.AgentId;
                            }

                            context.AddExtraRequests(followUp);
                        }
                    }
                }

                var stopped = parseResult == DataFlowResult.Failed || parseResult == DataFlowResult.Terminated;
                return stopped ? parseResult : DataFlowResult.Success;
            }
            catch (Exception e)
            {
                Logger?.LogError($"任务 {context.Response.Request.OwnerId} 数据解析发生异常: {e}");
                return DataFlowResult.Failed;
            }
        }
Пример #6
0
        /// <summary>
        /// Parses a gallery page. When the page script's <c>Next(...)</c> call reports more
        /// than one page and its first field is "1", the remaining pages are queued. Every
        /// image inside the <c>ArticleId0</c> block is then handed to <c>ImageDownloader</c>.
        /// </summary>
        /// <param name="context">Data-flow context holding the response and its selectable document.</param>
        /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
        protected override Task <DataFlowResult> Parse(DataFlowContext context)
        {
            var sourceRequest = context.Response.Request;
            Console.WriteLine(sourceRequest.Url);

            foreach (var node in context.Selectable.Regex("Next(.+)").Nodes())
            {
                // The page function carries the total page count and the current page id.
                // NOTE(review): the first field is treated as the current page and the second
                // as the page count — confirm against the site's script.
                var payload = node.GetValue().Replace("Next(", "").Replace("\\", "").Replace("\"", "");
                var fields  = payload.Split(',');

                // A match without at least two comma-separated fields cannot describe paging;
                // indexing fields[1] would throw IndexOutOfRangeException.
                if (fields.Length < 2)
                {
                    continue;
                }

                int.TryParse(fields[1], out int pages);
                if (pages > 1 && fields[0] == "1")
                {
                    // The title is the same for every page request, so it is looked up once.
                    var title    = context.Selectable.XPath(".//title").GetValue();
                    var requests = new List<Request>();
                    for (int i = 2; i <= pages; i++)
                    {
                        var request = new Request()
                        {
                            OwnerId = sourceRequest.OwnerId, Url = sourceRequest.Url.Replace(".htm", $"_{i}.htm")
                        };
                        request.AddProperty("tag", title);
                        requests.Add(request);
                    }
                    Console.WriteLine($"{sourceRequest.Url}\t{pages}\t{requests.Count}");
                    count += pages;
                    context.AddExtraRequests(requests.ToArray());
                }
            }

            var imgNodes = context.Selectable.XPath(".//div[@id='ArticleId0']//p//a//img").Nodes();

            foreach (var imgNode in imgNodes)
            {
                var url     = imgNode.XPath("@src").GetValue();
                var element = (imgNode as Selectable)?.Elements.FirstOrDefault();

                // Without a source URL or an underlying element there is nothing to download or
                // to read the alt text from; skip instead of failing the whole page.
                if (string.IsNullOrWhiteSpace(url) || element == null)
                {
                    continue;
                }

                // Empty alt="" attributes are stripped before reading @alt
                // (presumably so that a second, non-empty alt attribute is picked up — confirm).
                var alt     = new Selectable(element.OuterHtml.Replace("alt=\"\"", ""));
                var name    = alt.XPath("//img/@alt").GetValue();
                var request = new Request()
                {
                    Url = url, OwnerId = sourceRequest.OwnerId
                };
                request.AddProperty("tag", context.Selectable.XPath(".//div[@class='position gray']//div[1]//a[2]").GetValue());
                request.AddProperty("referer", sourceRequest.GetProperty("referer") ?? url);
                request.AddProperty("subject", name);
                ImageDownloader.GetInstance().AddRequest(request);
            }

            return Task.FromResult(DataFlowResult.Success);
        }
Пример #7
0
        /// <summary>
        /// Reads the subject links from the gallery list and queues one request per link,
        /// copying the current request's "tag" property and using the current URL as referer.
        /// </summary>
        /// <param name="context">Data-flow context holding the response and its selectable document.</param>
        public static void GetSubjectUrl(DataFlowContext context)
        {
            var subjectLinks = context.Selectable
                               .XPath("//*[@id=\"listdiv\"]/ul/li/div[@class='galleryli_title']/a/@href")
                               .GetValues();
            var subjectRequests = new List<Request>();

            foreach (var link in subjectLinks)
            {
                var subjectRequest = new Request();
                subjectRequest.Url     = link;
                subjectRequest.OwnerId = context.Response.Request.OwnerId;

                var inheritedTag = context.Response.Request.GetProperty("tag");
                subjectRequest.AddProperty("tag", inheritedTag);
                subjectRequest.AddProperty("referer", context.Response.Request.Url);

                subjectRequests.Add(subjectRequest);
            }

            // Nothing found on the page: leave the context untouched.
            if (subjectRequests.Count == 0)
            {
                return;
            }

            context.AddExtraRequests(subjectRequests.ToArray());
        }
Пример #8
0
            /// <summary>
            /// Walks every <c>news_block</c> element, reads its title, link, summary and view
            /// count, and queues a request for the link with those values attached as properties.
            /// </summary>
            /// <param name="context">Data-flow context holding the response and its selectable document.</param>
            /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
            protected override Task <DataFlowResult> Parse(DataFlowContext context)
            {
                var newsBlocks = context.Selectable.XPath(".//div[@class='news_block']").Nodes();

                foreach (var block in newsBlocks)
                {
                    var headlineNode = block.Select(Selectors.XPath(".//h2[@class='news_entry']"));
                    var headline     = headlineNode.GetValue(ValueOption.InnerText);

                    var link = block.Select(Selectors.XPath(".//h2[@class='news_entry']/a/@href")).GetValue();

                    var digestNode = block.Select(Selectors.XPath(".//div[@class='entry_summary']"));
                    var digest     = digestNode.GetValue(ValueOption.InnerText);

                    // The counter text carries a trailing label that is stripped, leaving the number.
                    var viewsText = block.Select(Selectors.XPath(".//span[@class='view']")).GetValue(ValueOption.InnerText);
                    var viewCount = viewsText.Replace(" 人浏览", "");

                    var followUp = CreateFromRequest(context.Response.Request, link);
                    followUp.AddProperty("title", headline);
                    followUp.AddProperty("summary", digest);
                    followUp.AddProperty("views", viewCount);
                    context.AddExtraRequests(followUp);
                }

                return Task.FromResult(DataFlowResult.Success);
            }
Пример #9
0
        /// <summary>
        /// Parses the pagination of an album list and queues one request per distinct page
        /// URL. Each request is given a fixed "tag" property.
        /// </summary>
        /// <param name="context">Data-flow context holding the response and its selectable document.</param>
        public static void GetSubjectPageUrl(DataFlowContext context)
        {
            // Tracks hrefs that have already been turned into a request.
            var seenPages = new HashSet<string>();
            var pages     = context.Selectable
                            .XPath("//*[@id=\"listdiv\"]/div[@class='pagesYY']/div/a[not(@class)]/@href").GetValues();
            var requestList = new List<Request>();

            foreach (var page in pages)
            {
                // Skip links without a usable href and hrefs that were already queued.
                if (string.IsNullOrWhiteSpace(page) || seenPages.Contains(page))
                {
                    continue;
                }

                try
                {
                    var request = new Request
                    {
                        Url     = page,
                        OwnerId = context.Response.Request.OwnerId
                    };
                    request.AddProperty("tag", "萝莉");
                    requestList.Add(request);

                    // Marked as seen only after the request was built successfully.
                    seenPages.Add(page);
                }
                catch (Exception e)
                {
                    // Best effort: a page link that cannot be turned into a request is
                    // reported on the console and the remaining links are still processed.
                    Console.WriteLine(e);
                }
            }

            if (requestList.Count > 0)
            {
                context.AddExtraRequests(requestList.ToArray());
            }
        }