Beispiel #1
0
            /// <summary>
            /// Prints the title of the current page to the console in yellow.
            /// No items or follow-up requests are added to the context.
            /// </summary>
            /// <param name="context">Data-flow context whose selectable supplies the page title.</param>
            /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
            public override Task<DataFlowResult> Parse(DataFlowContext context)
            {
                var title = context.GetSelectable().XPath(".//title").GetValue();

                // Remember the colour that was active before and restore it afterwards,
                // instead of forcing white onto whatever console theme the user runs.
                var previousColor = Console.ForegroundColor;
                Console.ForegroundColor = ConsoleColor.Yellow;
                try
                {
                    Console.WriteLine("page 页:" + title);
                }
                finally
                {
                    Console.ForegroundColor = previousColor;
                }

                return Task.FromResult(DataFlowResult.Success);
            }
Beispiel #2
0
            /// <summary>
            /// Records the page URL and title as items, then queues one follow-up request
            /// for every distinct tag link found below the "post_rank" element. The tag's
            /// text is attached to each request under the "tag" property.
            /// </summary>
            /// <param name="context">Data-flow context of the page being parsed.</param>
            /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
            protected override Task<DataFlowResult> Parse(DataFlowContext context)
            {
                var response = context.GetResponse();

                context.AddItem("URL", response.Request.Url);
                context.AddItem("Title", context.GetSelectable().XPath(".//title").GetValue());

                var tagNodes = context.GetSelectable().XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a").Nodes();

                var seenUrls = new HashSet<string>();
                var requests = new List<Request>();

                if (tagNodes != null)
                {
                    foreach (var node in tagNodes)
                    {
                        var url  = node.XPath("./@href").GetValue();
                        var name = node.GetValue();

                        // Anchors without an href and repeated links are skipped. The previous
                        // Dictionary.Add(url, name) threw on both and aborted the whole parse.
                        if (string.IsNullOrEmpty(url) || !seenUrls.Add(url))
                        {
                            continue;
                        }

                        Console.WriteLine("url:" + url + " - name:" + name);

                        var request = new Request
                        {
                            Url     = url,
                            OwnerId = response.Request.OwnerId,
                        };
                        request.Properties.Add("tag", name);
                        requests.Add(request);
                    }
                }

                context.AddTargetRequests(requests.ToArray());

                return Task.FromResult(DataFlowResult.Success);
            }
Beispiel #3
0
        /// <summary>
        /// Collects the pagination URLs of the detail image view and queues one request
        /// per distinct URL. Each request inherits the owner id and the "tag" and
        /// "referer" properties of the request that produced the current response.
        /// </summary>
        /// <param name="context">Data-flow context of the page being parsed.</param>
        public static void GetDetailPageUrl(DataFlowContext context)
        {
            var response    = context.GetResponse();
            var pages       = context.GetSelectable().XPath("//*[@id=\"pages\"]/a[not(@class)]/@href").GetValues();
            var seenPages   = new HashSet<string>();
            var requestList = new List<Request>();

            foreach (var page in pages)
            {
                // HashSet.Add returns false for a URL that was already queued. Null or empty
                // hrefs are skipped; a null key used to crash the Dictionary lookup.
                if (string.IsNullOrEmpty(page) || !seenPages.Add(page))
                {
                    continue;
                }

                var request = new Request
                {
                    Url     = page,
                    OwnerId = response.Request.OwnerId,
                };
                request.AddProperty("tag", response.Request.GetProperty("tag"));
                request.AddProperty("referer", response.Request.GetProperty("referer"));
                requestList.Add(request);
            }

            if (requestList.Count > 0)
            {
                context.AddTargetRequests(requestList.ToArray());
            }
        }
Beispiel #4
0
            /// <summary>
            /// Prints the page title in blue and queues one request for every distinct
            /// pagination link found in the "pagesYY" block of the listing.
            /// </summary>
            /// <param name="context">Data-flow context of the page being parsed.</param>
            /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
            public override Task<DataFlowResult> Parse(DataFlowContext context)
            {
                var response = context.GetResponse();
                var title    = context.GetSelectable().XPath(".//title").GetValue();

                // Restore whatever colour was active before, instead of forcing white.
                var previousColor = Console.ForegroundColor;
                Console.ForegroundColor = ConsoleColor.Blue;
                try
                {
                    Console.WriteLine("第一页:" + title);
                }
                finally
                {
                    Console.ForegroundColor = previousColor;
                }

                var pages       = context.GetSelectable().XPath("//*[@id=\"listdiv\"]/div[@class='pagesYY']/div/a[not(@class)]/@href").GetValues();
                var seenPages   = new HashSet<string>();
                var requestList = new List<Request>();

                foreach (var page in pages)
                {
                    // Skip repeated links, and hrefs that are null or empty
                    // (a null key used to crash the Dictionary lookup).
                    if (string.IsNullOrEmpty(page) || !seenPages.Add(page))
                    {
                        continue;
                    }

                    var request = new Request
                    {
                        Url     = page,
                        OwnerId = response.Request.OwnerId,
                    };
                    requestList.Add(request);
                }

                if (requestList.Count > 0)
                {
                    context.AddTargetRequests(requestList.ToArray());
                }

                return Task.FromResult(DataFlowResult.Success);
            }
Beispiel #5
0
            /// <summary>
            /// Stores the request URL and the page title as the items "URL" and "Title".
            /// </summary>
            /// <param name="context">Data-flow context of the page being parsed.</param>
            /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
            protected override Task<DataFlowResult> Parse(DataFlowContext context)
            {
                var pageUrl = context.GetResponse().Request.Url;
                context.AddItem("URL", pageUrl);

                var pageTitle = context.GetSelectable().XPath(".//title").GetValue();
                context.AddItem("Title", pageTitle);

                return Task.FromResult(DataFlowResult.Success);
            }
Beispiel #6
0
        /// <summary>
        /// Serializes the items of the context to JSON and writes them as one line
        /// through <c>Writer</c>. The target path is "&lt;data folder of the owner&gt;/&lt;request hash&gt;.json".
        /// </summary>
        /// <param name="context">Data-flow context holding the response and the parsed items.</param>
        /// <returns><c>DataFlowResult.Success</c> once the line has been written.</returns>
        protected override async Task<DataFlowResult> Store(DataFlowContext context)
        {
            var sourceRequest = context.GetResponse().Request;

            var folder   = GetDataFolder(sourceRequest.OwnerId);
            var fileName = $"{sourceRequest.Hash}.json";
            var path     = Path.Combine(folder, fileName);

            // NOTE(review): CreateFile presumably prepares the file that Writer targets - confirm.
            CreateFile(path);

            var json = JsonConvert.SerializeObject(context.GetItems());
            await Writer.WriteLineAsync(json);

            return DataFlowResult.Success;
        }
Beispiel #7
0
        /// <summary>
        /// Copies the basic facts of the response (URL, raw text, target URL, success flag
        /// and elapsed milliseconds) into the context items. Does nothing when there is
        /// no response.
        /// </summary>
        /// <param name="context">Data-flow context of the page being parsed.</param>
        /// <returns>A completed task carrying <c>DataFlowResult.Success</c> in both cases.</returns>
        protected override Task<DataFlowResult> Parse(DataFlowContext context)
        {
            var response = context.GetResponse();

            // Nothing to record without a response; still report success.
            if (response == null)
            {
                return Task.FromResult(DataFlowResult.Success);
            }

            context.AddItem("URL", response.Request.Url);
            context.AddItem("Content", response.RawText);
            context.AddItem("TargetUrl", response.TargetUrl);
            context.AddItem("Success", response.Success);
            context.AddItem("ElapsedMilliseconds", response.ElapsedMilliseconds);

            return Task.FromResult(DataFlowResult.Success);
        }
Beispiel #8
0
            /// <summary>
            /// Records the page URL and title as items, then, for every distinct tag link
            /// below the "post_rank" element, queues a follow-up request and calls
            /// <c>CreateDirByTag</c> with the tag's text.
            /// </summary>
            /// <param name="context">Data-flow context of the page being parsed.</param>
            /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
            public override Task<DataFlowResult> Parse(DataFlowContext context)
            {
                var response = context.GetResponse();

                context.AddItem("URL", response.Request.Url);
                context.AddItem("Title", context.GetSelectable().XPath(".//title").GetValue());

                var tagNodes = context.GetSelectable().XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a").Nodes();

                var seenUrls = new HashSet<string>();
                var requests = new List<Request>();

                if (tagNodes != null)
                {
                    foreach (var node in tagNodes)
                    {
                        var url  = node.XPath("./@href").GetValue();
                        var name = node.GetValue();

                        // Anchors without an href and repeated links are skipped. The previous
                        // Dictionary.Add(url, name) threw on both and aborted the whole parse.
                        if (string.IsNullOrEmpty(url) || !seenUrls.Add(url))
                        {
                            continue;
                        }

                        Console.WriteLine("url:" + url + " - name:" + name);

                        var request = new Request
                        {
                            Url     = url,
                            OwnerId = response.Request.OwnerId
                        };
                        requests.Add(request);

                        CreateDirByTag(name);
                    }
                }

                context.AddTargetRequests(requests.ToArray());

                return Task.FromResult(DataFlowResult.Success);
            }
Beispiel #9
0
        /// <summary>
        /// Runs the parsing step of the data flow: checks whether the request may be parsed,
        /// invokes the optional <c>Selectable</c> hook, calls <c>Parse</c>, and finally queues
        /// the URLs returned by the optional <c>Follow</c> hook as new requests.
        /// </summary>
        /// <param name="context">Data-flow context of the response being handled.</param>
        /// <returns>
        /// <c>Terminated</c> when <c>CanParse</c> rejects the request; the result of <c>Parse</c>
        /// when it is <c>Failed</c> or <c>Terminated</c>; <c>Failed</c> when any exception is
        /// thrown (it is logged, not rethrown); otherwise <c>Success</c>.
        /// </returns>
        public override async Task<DataFlowResult> HandleAsync(DataFlowContext context)
        {
            try
            {
                var response = context.GetResponse();
                var request  = response.Request;

                // Terminate the data flow when the request does not match.
                if (CanParse != null && !CanParse(request))
                {
                    return DataFlowResult.Terminated;
                }

                Selectable?.Invoke(context);

                var parserResult = await Parse(context);

                if (parserResult == DataFlowResult.Failed || parserResult == DataFlowResult.Terminated)
                {
                    return parserResult;
                }

                var urls = Follow?.Invoke(context);
                if (urls != null && urls.Length > 0)
                {
                    var followRequests = new List<Request>();
                    foreach (var url in urls)
                    {
                        var followRequest = CreateFromRequest(request, url);

                        // CanParse is optional (see the check above). Without a filter every
                        // follow-up request is accepted, instead of dereferencing a null delegate
                        // and turning the whole step into a failure.
                        if (CanParse == null || CanParse(followRequest))
                        {
                            followRequests.Add(followRequest);
                        }
                    }

                    context.AddTargetRequests(followRequests.ToArray());
                }

                return DataFlowResult.Success;
            }
            catch (Exception e)
            {
                Logger?.LogError($"数据解析发生异常: {e}");
                return DataFlowResult.Failed;
            }
        }
Beispiel #10
0
            /// <summary>
            /// Records the page URL and title as items and queues a follow-up request for
            /// every tag link found below the "post_rank" element.
            /// </summary>
            /// <param name="context">Data-flow context of the page being parsed.</param>
            /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
            public override Task<DataFlowResult> Parse(DataFlowContext context)
            {
                var response = context.GetResponse();

                context.AddItem("URL", response.Request.Url);
                context.AddItem("Title", context.GetSelectable().XPath(".//title").GetValue());

                var subs = context.GetSelectable().XPath("//*[@id=\"post_rank\"]/div[2]/div/div[@class='tag_div']/ul/li/a/@href").GetValues();

                foreach (var sub in subs)
                {
                    // The derived request used to be created and then dropped, so no tag page
                    // was ever scheduled. Hand it to the context so it is actually crawled.
                    context.AddTargetRequests(CreateFromRequest(response.Request, sub));
                }

                return Task.FromResult(DataFlowResult.Success);
            }
Beispiel #11
0
        /// <summary>
        /// Extracts the image addresses from an image-view page and hands one download
        /// request per image to <c>ImageDownloader</c>. The page URL and title are also
        /// stored as the items "URL" and "Title".
        /// </summary>
        /// <param name="context">Data-flow context of the page being parsed.</param>
        public static void GetDetailPictureUrl(DataFlowContext context)
        {
            var response = context.GetResponse();

            context.AddItem("URL", response.Request.Url);

            // The title and the inherited properties are the same for every image on the
            // page, so they are looked up once instead of once per image.
            var title = context.GetSelectable().XPath(".//title").GetValue();
            context.AddItem("Title", title);

            var images = context.GetSelectable().XPath("//*[@id=\"hgallery\"]/img/@src").GetValues();

            var tag     = response.Request.GetProperty("tag");
            var referer = response.Request.GetProperty("referer");

            foreach (var image in images)
            {
                // Build the download request for this image URL.
                var request = new Request
                {
                    Url     = image,
                    OwnerId = response.Request.OwnerId
                };
                request.AddProperty("tag", tag);
                request.AddProperty("referer", referer);
                request.AddProperty("subject", title);
                ImageDownloader.GetInstance().AddRequest(request);
            }
        }
Beispiel #12
0
        /// <summary>
        /// Reads the subject (gallery) links of a listing page and queues one request per
        /// link. Each request carries the "tag" property of the current request and uses
        /// the current request's URL as its "referer" property.
        /// </summary>
        /// <param name="context">Data-flow context of the listing page being parsed.</param>
        public static void GetSubjectUrl(DataFlowContext context)
        {
            var response    = context.GetResponse();
            var subjectUrls = context.GetSelectable().XPath("//*[@id=\"listdiv\"]/ul/li/div[@class='galleryli_title']/a/@href").GetValues();
            var pending     = new List<Request>();

            foreach (var subjectUrl in subjectUrls)
            {
                var subjectRequest = new Request();
                subjectRequest.Url     = subjectUrl;
                subjectRequest.OwnerId = response.Request.OwnerId;

                subjectRequest.AddProperty("tag", response.Request.GetProperty("tag"));
                subjectRequest.AddProperty("referer", response.Request.Url);

                pending.Add(subjectRequest);
            }

            // Nothing found on this page: leave the context untouched.
            if (pending.Count == 0)
            {
                return;
            }

            context.AddTargetRequests(pending.ToArray());
        }
Beispiel #13
0
        /// <summary>
        /// Parses the pagination links of an album listing and queues one request per
        /// distinct page URL. Every queued request gets a fixed "tag" property.
        /// </summary>
        /// <param name="context">Data-flow context of the listing page being parsed.</param>
        public static void GetSubjectPageUrl(DataFlowContext context)
        {
            var response    = context.GetResponse();
            var queuedPages = new Dictionary<string, string>();
            var pageUrls    = context.GetSelectable().XPath("//*[@id=\"listdiv\"]/div[@class='pagesYY']/div/a[not(@class)]/@href").GetValues();
            var pending     = new List<Request>();

            foreach (var pageUrl in pageUrls)
            {
                // Skip URLs that were already turned into a request.
                if (queuedPages.ContainsKey(pageUrl))
                {
                    continue;
                }

                try
                {
                    var pageRequest = new Request();
                    pageRequest.Url     = pageUrl;
                    pageRequest.OwnerId = response.Request.OwnerId;

                    // NOTE(review): the tag is a fixed literal here, not the value of the
                    // source request's "tag" property - confirm this is intended.
                    pageRequest.AddProperty("tag", "萝莉");
                    pending.Add(pageRequest);

                    queuedPages.Add(pageUrl, pageUrl);
                }
                catch (Exception e)
                {
                    // Best effort: report the problem and carry on with the next link.
                    Console.WriteLine(e);
                }
            }

            if (pending.Count == 0)
            {
                return;
            }

            context.AddTargetRequests(pending.ToArray());
        }
Beispiel #14
0
        /// <summary>
        /// Extracts objects described by <c>Model</c> from the current selectable and appends
        /// them to the context items stored under <c>Model.TypeName</c>.
        /// </summary>
        /// <remarks>
        /// Steps, in order:
        /// 1. store <c>TableMetadata</c> in the context under the type name if it is not there yet;
        /// 2. build the environment dictionary from the request properties, then add or overwrite
        ///    entries with the values of <c>Model.ShareValueSelectors</c>;
        /// 3. without a <c>Model.Selector</c>, parse the whole selectable as a single object;
        ///    otherwise parse each selected node, limited to <c>Model.Take</c> nodes taken from
        ///    the head or the tail when <c>Take</c> is positive;
        /// 4. add the non-null results to the context items.
        /// </remarks>
        /// <param name="context">Data-flow context of the page being parsed.</param>
        /// <returns>A completed task carrying <c>DataFlowResult.Success</c>.</returns>
        protected override Task<DataFlowResult> Parse(DataFlowContext context)
        {
            if (!context.Contains(Model.TypeName))
            {
                context.Add(Model.TypeName, TableMetadata);
            }

            var selectable = context.GetSelectable();
            var results    = new List<dynamic>();

            if (selectable.Properties == null)
            {
                selectable.Properties = new Dictionary<string, object>();
            }

            // Start with the properties of the request...
            var environments = new Dictionary<string, string>();
            foreach (var property in context.GetResponse().Request.Properties)
            {
                environments.Add(property.Key, property.Value);
            }

            // ...then let the shared value selectors add to or override them.
            if (Model.ShareValueSelectors != null)
            {
                foreach (var shared in Model.ShareValueSelectors)
                {
                    string key         = shared.Name;
                    var    sharedValue = selectable.Select(shared.ToSelector()).GetValue();

                    // The indexer inserts a missing key and overwrites an existing one.
                    environments[key] = sharedValue;
                }
            }

            if (Model.Selector == null)
            {
                // No list selector: the whole page yields at most one object.
                var obj = ParseObject(environments, selectable, 0);
                if (obj == null)
                {
                    Logger?.LogWarning($"解析到空数据,类型: {Model.TypeName}");
                }
                else
                {
                    results.Add(obj);
                }
            }
            else
            {
                var listSelector = Model.Selector.ToSelector();
                var nodes        = selectable.SelectList(listSelector).Nodes()?.ToList();

                if (nodes != null)
                {
                    // Trim to the configured number of nodes, from the head or from the tail.
                    if (Model.Take > 0 && nodes.Count > Model.Take)
                    {
                        if (Model.TakeFromHead)
                        {
                            nodes = nodes.Take(Model.Take).ToList();
                        }
                        else
                        {
                            nodes = nodes.Skip(nodes.Count - Model.Take).ToList();
                        }
                    }

                    for (var index = 0; index < nodes.Count; ++index)
                    {
                        var obj = ParseObject(environments, nodes[index], index);
                        if (obj == null)
                        {
                            Logger?.LogWarning($"解析到空数据,类型: {Model.TypeName}");
                        }
                        else
                        {
                            results.Add(obj);
                        }
                    }
                }
            }

            if (results.Count > 0)
            {
                // Append to the items already stored for this type, or store a new list.
                var existing = context.GetItem(Model.TypeName);
                if (existing != null)
                {
                    existing.AddRange(results);
                }
                else
                {
                    context.AddItem(Model.TypeName, results);
                }
            }

            return Task.FromResult(DataFlowResult.Success);
        }