Exemplo n.º 1
0
        protected void Extract(Page page)
        {
            // Runs every configured field's selector against this page and
            // collects the results. The first field that fails aborts the
            // whole page: nothing is added to page.Results in that case.
            var extracted = new ExtractResults();

            foreach (var field in Config.Fields)
            {
                try
                {
                    // Choose the raw text the selector operates on.
                    string input;
                    if (field.SourceType == SourceType.Page)
                    {
                        input = page.Html;
                    }
                    else if (field.SourceType == SourceType.UrlContext)
                    {
                        input = page.Request.Url;
                    }
                    else if (field.SourceType == SourceType.AttachedUrl)
                    {
                        // Not supported yet; surfaces as a field failure below.
                        throw new NotImplementedException();
                    }
                    else
                    {
                        throw new ArgumentOutOfRangeException();
                    }

                    // Dispatch to the extractor that matches the selector kind.
                    Result extractedField;
                    if (field.Selectortype == SelectorType.JsonPath)
                    {
                        extractedField = DoJson(input, field);
                    }
                    else if (field.Selectortype == SelectorType.XPath)
                    {
                        extractedField = DoHtml(input, field);
                    }
                    else if (field.Selectortype == SelectorType.Regex)
                    {
                        extractedField = DoRegex(input, field);
                    }
                    else
                    {
                        throw new ArgumentOutOfRangeException();
                    }

                    extracted.Add(extractedField);
                }
                catch (Exception e)
                {
                    // Log, count the failure, and give up on this page entirely.
                    Logger.Error($"{page.Request.Url} 抽取 {field.Selectortype} {field.Name} 失败 \r\n{e}");
                    FailCount++;
                    return;
                }
            }

            page.Results.Add(extracted);
            //AfterExtractField?.Invoke(page, result);
        }
Exemplo n.º 2
0
        static void Main(string[] args)
        {
            // Crawl longzhu's stream-list API, repeating hourly at :25.
            var config = new Config
            {
                Name     = "longzhu",
                ScanUrls = "http://api.plu.cn/tga/streams?max-results=50&start-index=0&sort-by=views&filter=0&game=0",
                Fields   = new[]
                {
                    new Field { Name = "title",     Selectortype = SelectorType.JsonPath, Selector = "$.channel.status" },
                    new Field { Name = "username",  Selectortype = SelectorType.JsonPath, Selector = "$.channel.name" },
                    new Field { Name = "online",    Selectortype = SelectorType.JsonPath, Selector = "$.viewers",           Type = FieldType.Int },
                    new Field { Name = "fanscount", Selectortype = SelectorType.JsonPath, Selector = "$.channel.followers", Type = FieldType.Int },
                    new Field { Name = "cate",      Selectortype = SelectorType.JsonPath, Selector = "$.game[0].name" }
                },
                RepeatWhen = RepeatWhenEver.hour,
                RepeatAt   = new TimeSpan(0, 25, 0),
            };

            crawler = new Crawler();

            // No post-download work needed; install an empty hook.
            crawler.Downloader.AfterDownloadPage = page => { };

            crawler.Processor.OnCustomExtract = page =>
            {
                // Each element of $.data.items describes one live stream;
                // build one ExtractResults per stream from the configured fields.
                var root  = JObject.Parse(page.Html);
                var items = JArray.FromObject(root["data"]["items"]);

                foreach (var item in items)
                {
                    var extracted = new ExtractResults();
                    foreach (var field in config.Fields)
                    {
                        extracted.Add(new Result(field.Name, item.SelectToken(field.Selector).ToString()));
                    }
                    page.Results.Add(extracted);
                }
            };

            crawler.Processor.OnProcessScanPage = page =>
            {
                // Fan out one listing request per 50-item page.
                // NOTE(review): follow-up urls request max-results=200 while the
                // stride is 50 — looks inconsistent with the scan url; confirm intended.
                var total     = int.Parse(page.GetJson("$.data.totalItems"));
                var pageCount = total / 50 + 1;

                for (var n = 1; n <= pageCount; n++)
                {
                    crawler.Schduler.AddUrl($"http://api.plu.cn/tga/streams?max-results=200&start-index={n * 50}&sort-by=views&filter=0&game=0");
                }
            };

            crawler.Setup(config);
            crawler.Start();
            Console.WriteLine("end");
            Console.ReadKey();
        }
Exemplo n.º 3
0
        static void Main(string[] args)
        {
            // Crawl chushou.tv's live listing, repeating hourly at :35.
            var config = new Config
            {
                Name     = "chushou",
                ScanUrls = "https://chushou.tv/live/down-v2.htm",
                Fields   = new[]
                {
                    new Field { Name = "title",     Selectortype = SelectorType.JsonPath, Selector = "$.name" },
                    new Field { Name = "username",  Selectortype = SelectorType.JsonPath, Selector = "$.meta.creator" },
                    new Field { Name = "online",    Selectortype = SelectorType.JsonPath, Selector = "$.meta.onlineCount",     Type = FieldType.Int },
                    new Field { Name = "fanscount", Selectortype = SelectorType.JsonPath, Selector = "$.meta.subscriberCount", Type = FieldType.Int },
                    new Field { Name = "cate",      Selectortype = SelectorType.JsonPath, Selector = "$.meta.gameName" }
                },
                RepeatWhen = RepeatWhenEver.hour,
                RepeatAt   = new TimeSpan(0, 35, 0),
            };

            crawler = new CrawlerDotNet.Core.Crawler();
            string lastpoint = "";

            crawler.Processor.OnProcessScanPage = page =>
            {
                // The API pages via a "breakpoint" cursor. A fresh cursor means
                // more data (queue as a scan page); a repeated cursor means the
                // end was reached (queue as a plain context url).
                var point    = page.GetJson("$.data.breakpoint");
                var pageType = point != lastpoint ? PageType.ScanUrl : PageType.ContextUrl;
                crawler.Schduler.AddUrl("https://chushou.tv/live/down-v2.htm?&breakpoint=" + point, pageType);
                lastpoint = point;
            };

            crawler.Processor.OnCustomExtract = page =>
            {
                // Each element of $.data.items describes one live room.
                var root  = JObject.Parse(page.Html);
                var items = JArray.FromObject(root["data"]["items"]);

                foreach (var item in items)
                {
                    var extracted = new ExtractResults();
                    foreach (var field in config.Fields)
                    {
                        extracted.Add(new Result(field.Name, item.SelectToken(field.Selector).ToString()));
                    }
                    page.Results.Add(extracted);
                }
            };

            crawler.Setup(config);
            crawler.Start();
            Console.WriteLine("end");
            Console.ReadKey();
        }
Exemplo n.º 4
0
        static void Main(string[] args)
        {
            // Crawl zhanqi.tv's live-room listing, repeating hourly at :10.
            // Most fields come from the listing JSON; "fanscount" requires a
            // second request into the room page, scraped with a regex.
            var config = new Config
            {
                Name     = "zhanqi",
                ScanUrls = "http://www.zhanqi.tv/api/static/v2.1/live/list/200/1.json",
                Fields   = new[]
                {
                    new Field
                    {
                        Name         = "title",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.title"
                    },
                    new Field
                    {
                        Name         = "username",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.nickname"
                    },
                    new Field
                    {
                        Name         = "online",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.online",
                        Type         = FieldType.Int,
                    },
                    new Field
                    {
                        // XPath of the element this regex targets, kept for reference:
                        ////*[@id="js-room-anchor-info-area"]/div[2]/div[1]/div/span[1]
                        Name         = "fanscount",
                        Selectortype = SelectorType.Regex,
                        Selector     = "js-room-follow-num\">([0-9]*)<",
                        Type         = FieldType.Int,
                    },
                    new Field
                    {
                        Name         = "cate",
                        Selector     = "$.newGameName",
                        Selectortype = SelectorType.JsonPath
                    },
                    new Field
                    {
                        Name         = "childcate",
                        Selector     = "$.gameName",
                        Selectortype = SelectorType.JsonPath
                    }
                },
                RepeatWhen = RepeatWhenEver.hour,
                RepeatAt   = new TimeSpan(0, 10, 0),
            };

            crawler = new CrawlerDotNet.Core.Crawler();

            // Page cursor shared between BeforeCrawl and AfterDownloadPage;
            // reset at the start of each crawl cycle.
            var curPage = 1;

            crawler.BeforeCrawl = () =>
            {
                curPage = 1;
            };

            crawler.Downloader.AfterDownloadPage = p =>
            {
                // If this page still returned rooms, queue the next page of the listing.

                var rooms = p.GetJson("$.data.rooms");

                if (rooms != "[]")
                {
                    curPage++;
                    crawler.Schduler.AddUrl($"http://www.zhanqi.tv/api/static/v2.1/live/list/200/{curPage}.json");
                }
            };
            crawler.Processor.OnCustomExtract = p =>
            {
                // Each element of $.data.rooms describes one live room.
                var j  = JObject.Parse(p.Html);
                var jr = JArray.FromObject(j["data"]["rooms"]);


                for (int i = 0; i < jr.Count; i++)
                {
                    var exres = new ExtractResults();
                    var info  = jr[i];
                    foreach (var f in config.Fields)
                    {
                        if (f.Name == "fanscount")
                        {
                            // Follower count is not in the listing JSON: fetch the
                            // room page synchronously and scrape it with the regex.


                            var fanspage = crawler.Downloader.DownloaderOnly(new Request(crawler.Schduler)
                            {
                                Url =
                                    "https://www.zhanqi.tv" +
                                    info.SelectToken("$.url").ToString()
                            });
                            var r = BaseProcessor.DoRegex(fanspage.Html, f);
                            // Regex miss (page layout change, offline room) defaults to "0".
                            if (r.Value == "")
                            {
                                r.Value = "0";
                            }
                            exres.Add(r);

                            continue;
                        }



                        var res = new Result(f.Name, info.SelectToken(f.Selector).ToString());
                        exres.Add(res);
                    }
                    p.Results.Add(exres);
                }
            };
            crawler.Setup(config);
            crawler.Start();

            Console.WriteLine("end");
            Console.ReadKey();
        }
Exemplo n.º 5
0
        static void Main(string[] args)
        {
            // Crawl panda.tv's live listing every 30 minutes. Most fields come
            // from the listing JSON; "fanscount" requires a second request to
            // the room_followinfo endpoint per room.
            #region c
            var c = new Config
            {
                Name     = "panda",
                ScanUrls = "https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno=1&pagenum=120",

                ContentUrlRegexes = new Regex("live_lists"),
                // Deliberately unmatchable pattern — no helper urls for this site.
                HelperUrlRegexes  = new Regex("789987"),

                Fields = new[]
                {
                    new Field
                    {
                        Name         = "title",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.name"
                    }, new Field
                    {
                        Name         = "username",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.userinfo.nickName"
                    }, new Field
                    {
                        Name         = "online",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.person_num",
                        Type         = FieldType.Int,
                    }, new Field
                    {
                        // Evaluated against the room_followinfo response, not the listing.
                        Name         = "fanscount",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.data.fans",
                        Type         = FieldType.Int,
                    }, new Field
                    {
                        Name         = "cate",
                        Selector     = "$.classification.cname",
                        Selectortype = SelectorType.JsonPath
                    }
                },
                RepeatAt = new TimeSpan(0, 30, 0),
            };
            #endregion
            crawler = new Crawler();
            // Sample endpoints for reference:
            ////https://www.panda.tv/room_followinfo?token=&roomid=1042806&_=1509522885105
            //https://www.panda.tv/1042806
            //https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno=3&pagenum=120&_=1509525309865

            crawler.Processor.OnCustomExtract = p =>
            {
                var j = JObject.Parse(p.Html);

                // Up to 120 rooms per listing page; stop at the first missing index.
                for (int i = 0; i < 120; i++)
                {
                    var roominfo = j.SelectToken($"$.data.items[{i}]");
                    if (roominfo == null)
                    {
                        break;
                    }
                    var exres = new ExtractResults();

                    foreach (var f in c.Fields)
                    {
                        if (f.Name == "fanscount")
                        {
                            // Follower count is not in the listing JSON: fetch it
                            // synchronously from the room_followinfo endpoint.


                            var fanspage = crawler.Downloader.DownloaderOnly(new Request(crawler.Schduler)
                            {
                                Url =
                                    "https://www.panda.tv/room_followinfo?token=&roomid=" +
                                    roominfo.SelectToken("$.id").ToString()
                            });
                            var r = BaseProcessor.DoJson(fanspage.Html, f);
                            exres.Add(r);

                            continue;
                        }


                        var res = new Result(f.Name, roominfo.SelectToken(f.Selector).ToString());
                        exres.Add(res);
                    }



                    p.Results.Add(exres);
                }
            };

            crawler.Processor.OnProcessScanPage = p =>
            {
                // Queue one listing request per 120-item page.
                //*[@id="pages-container"]/div/div/a[7]
                var total = int.Parse(p.GetJson("$.data.total"));

                var pageconut = total / 120 + (total % 120 > 0 ? 1 : 0);

#if DEBUG
                pageconut = 1;
#endif

                for (int i = 1; i <= pageconut; i++)
                {
                    crawler.Schduler.AddUrl($"https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno={i}&pagenum=120");
                }
            };
            crawler.Setup(c);
            crawler.Start();

            Console.ReadLine();
        }