/// <summary>
/// Runs every configured field selector against the given page and appends
/// the collected <see cref="ExtractResults"/> to <c>page.Results</c>.
/// </summary>
/// <param name="page">The downloaded page to extract fields from.</param>
protected void Extract(Page page)
{
    var results = new ExtractResults();
    foreach (var field in Config.Fields)
    {
        try
        {
            // Choose the raw text the selector operates on.
            string source;
            switch (field.SourceType)
            {
                case SourceType.Page:
                    source = page.Html;
                    break;
                case SourceType.AttachedUrl:
                    // Not supported yet — surfaces loudly instead of silently misbehaving.
                    throw new NotImplementedException();
                case SourceType.UrlContext:
                    source = page.Request.Url;
                    break;
                default:
                    throw new ArgumentOutOfRangeException();
            }

            // Dispatch to the selector engine matching the field's type.
            Result result;
            switch (field.Selectortype)
            {
                case SelectorType.JsonPath:
                    result = DoJson(source, field);
                    break;
                case SelectorType.XPath:
                    result = DoHtml(source, field);
                    break;
                case SelectorType.Regex:
                    result = DoRegex(source, field);
                    break;
                default:
                    throw new ArgumentOutOfRangeException();
            }

            results.Add(result);
        }
        catch (Exception e)
        {
            // Log and count the failure, then move on to the next field.
            // NOTE(review): this used to 'return', which discarded every
            // result already collected for the page when a single field
            // failed; skipping only the broken field preserves partial data.
            Logger.Error($"{page.Request.Url} 抽取 {field.Selectortype} {field.Name} 失败 \r\n{e}");
            FailCount++;
            continue;
        }
    }

    page.Results.Add(results);
    //AfterExtractField?.Invoke(page, result);
}
/// <summary>
/// Entry point for the longzhu.com crawler: lists live streams via the
/// tga/streams API (50 items per page) and extracts per-stream fields.
/// </summary>
static void Main(string[] args)
{
    var config = new Config
    {
        Name = "longzhu",
        ScanUrls = "http://api.plu.cn/tga/streams?max-results=50&start-index=0&sort-by=views&filter=0&game=0",
        Fields = new[]
        {
            new Field { Name = "title", Selectortype = SelectorType.JsonPath, Selector = "$.channel.status" },
            new Field { Name = "username", Selectortype = SelectorType.JsonPath, Selector = "$.channel.name" },
            new Field { Name = "online", Selectortype = SelectorType.JsonPath, Selector = "$.viewers", Type = FieldType.Int, },
            new Field { Name = "fanscount", Selectortype = SelectorType.JsonPath, Selector = "$.channel.followers", Type = FieldType.Int, },
            new Field { Name = "cate", Selector = "$.game[0].name", Selectortype = SelectorType.JsonPath }
        },
        RepeatWhen = RepeatWhenEver.hour,
        RepeatAt = new TimeSpan(0, 25, 0),
    };

    crawler = new Crawler();
    crawler.Downloader.AfterDownloadPage = p => { };

    // Custom extraction: each page body is JSON with a $.data.items array;
    // one ExtractResults per item, one Result per configured field.
    crawler.Processor.OnCustomExtract = p =>
    {
        var j = JObject.Parse(p.Html);
        var jr = JArray.FromObject(j["data"]["items"]);
        for (int i = 0; i < jr.Count; i++)
        {
            var exres = new ExtractResults();
            var info = jr[i];
            foreach (var f in config.Fields)
            {
                // SelectToken returns null when the path is absent; fall back
                // to "" instead of throwing and losing the whole page.
                var res = new Result(f.Name, info.SelectToken(f.Selector)?.ToString() ?? "");
                exres.Add(res);
            }
            p.Results.Add(exres);
        }
    };

    // Scan page: schedule one follow-up request per remaining 50-item page.
    crawler.Processor.OnProcessScanPage = p =>
    {
        var total = int.Parse(p.GetJson("$.data.totalItems"));
        // The scan URL already covered start-index 0..49, so only offsets
        // that still lie inside the total need scheduling. The original loop
        // requested max-results=200 while stepping by 50, fetching every
        // item four times, and over-counted pages (total/50 + 1).
        for (int i = 1; i * 50 < total; i++)
        {
            crawler.Schduler.AddUrl($"http://api.plu.cn/tga/streams?max-results=50&start-index={i * 50}&sort-by=views&filter=0&game=0");
        }
    };

    crawler.Setup(config);
    crawler.Start();
    Console.WriteLine("end");
    Console.ReadKey();
}
/// <summary>
/// Entry point for the chushou.tv crawler: pages through the live list via
/// its "breakpoint" cursor and extracts per-stream fields from the JSON.
/// </summary>
static void Main(string[] args)
{
    var config = new Config
    {
        Name = "chushou",
        ScanUrls = "https://chushou.tv/live/down-v2.htm",
        Fields = new[]
        {
            new Field { Name = "title", Selectortype = SelectorType.JsonPath, Selector = "$.name" },
            new Field { Name = "username", Selectortype = SelectorType.JsonPath, Selector = "$.meta.creator" },
            new Field { Name = "online", Selectortype = SelectorType.JsonPath, Selector = "$.meta.onlineCount", Type = FieldType.Int, },
            new Field { Name = "fanscount", Selectortype = SelectorType.JsonPath, Selector = "$.meta.subscriberCount", Type = FieldType.Int, },
            new Field { Name = "cate", Selector = "$.meta.gameName", Selectortype = SelectorType.JsonPath }
        },
        RepeatWhen = RepeatWhenEver.hour,
        RepeatAt = new TimeSpan(0, 35, 0),
    };

    crawler = new CrawlerDotNet.Core.Crawler();

    // Cursor-style pagination: each response carries $.data.breakpoint, which
    // is fed back as a query parameter to fetch the next slice. A repeated
    // breakpoint is scheduled as ContextUrl rather than ScanUrl — NOTE(review):
    // presumably so an unchanged cursor stops spawning new scan pages; confirm
    // against the scheduler's dedup behavior.
    string lastpoint = "";
    crawler.Processor.OnProcessScanPage = p =>
    {
        var point = p.GetJson("$.data.breakpoint");
        crawler.Schduler.AddUrl(
            "https://chushou.tv/live/down-v2.htm?&breakpoint=" + point,
            point != lastpoint ? PageType.ScanUrl : PageType.ContextUrl);
        lastpoint = point;
    };

    // Custom extraction: one ExtractResults per $.data.items entry.
    crawler.Processor.OnCustomExtract = p =>
    {
        var j = JObject.Parse(p.Html);
        var jr = JArray.FromObject(j["data"]["items"]);
        for (int i = 0; i < jr.Count; i++)
        {
            var exres = new ExtractResults();
            var info = jr[i];
            foreach (var f in config.Fields)
            {
                // SelectToken returns null for a missing path; use "" instead
                // of throwing a NullReferenceException for the whole page.
                var res = new Result(f.Name, info.SelectToken(f.Selector)?.ToString() ?? "");
                exres.Add(res);
            }
            p.Results.Add(exres);
        }
    };

    crawler.Setup(config);
    crawler.Start();
    Console.WriteLine("end");
    Console.ReadKey();
}
/// <summary>
/// Entry point for the zhanqi.tv crawler: pages through the static live-list
/// JSON API; the follower count is scraped separately from each room's HTML
/// page via a regex, because the list API does not expose it.
/// </summary>
static void Main(string[] args)
{
    var config = new Config
    {
        Name = "zhanqi",
        ScanUrls = "http://www.zhanqi.tv/api/static/v2.1/live/list/200/1.json",
        Fields = new[]
        {
            new Field { Name = "title", Selectortype = SelectorType.JsonPath, Selector = "$.title" },
            new Field { Name = "username", Selectortype = SelectorType.JsonPath, Selector = "$.nickname" },
            new Field { Name = "online", Selectortype = SelectorType.JsonPath, Selector = "$.online", Type = FieldType.Int, },
            new Field
            {
                ////*[@id="js-room-anchor-info-area"]/div[2]/div[1]/div/span[1]
                Name = "fanscount",
                Selectortype = SelectorType.Regex,
                Selector = "js-room-follow-num\">([0-9]*)<",
                Type = FieldType.Int,
            },
            new Field { Name = "cate", Selector = "$.newGameName", Selectortype = SelectorType.JsonPath },
            new Field { Name = "childcate", Selector = "$.gameName", Selectortype = SelectorType.JsonPath }
        },
        RepeatWhen = RepeatWhenEver.hour,
        RepeatAt = new TimeSpan(0, 10, 0),
    };

    crawler = new CrawlerDotNet.Core.Crawler();

    // Page counter for the list API; reset before each crawl cycle.
    // NOTE(review): curPage is mutated from the AfterDownloadPage callback —
    // if the downloader invokes callbacks concurrently this is a race; confirm
    // the downloader is single-threaded.
    var curPage = 1;
    crawler.BeforeCrawl = () => { curPage = 1; };

    crawler.Downloader.AfterDownloadPage = p =>
    {
        // If the page still returned rooms, schedule the next page; an empty
        // "[]" array marks the end of the list.
        var rooms = p.GetJson("$.data.rooms");
        if (rooms != "[]")
        {
            curPage++;
            crawler.Schduler.AddUrl($"http://www.zhanqi.tv/api/static/v2.1/live/list/200/{curPage}.json");
        }
    };

    crawler.Processor.OnCustomExtract = p =>
    {
        var j = JObject.Parse(p.Html);
        var jr = JArray.FromObject(j["data"]["rooms"]);
        for (int i = 0; i < jr.Count; i++)
        {
            var exres = new ExtractResults();
            var info = jr[i];
            foreach (var f in config.Fields)
            {
                if (f.Name == "fanscount")
                {
                    // The follower count only appears on the room's HTML page,
                    // so fetch it synchronously and regex it out.
                    var fanspage = crawler.Downloader.DownloaderOnly(new Request(crawler.Schduler)
                    {
                        Url = "https://www.zhanqi.tv" + info.SelectToken("$.url")?.ToString()
                    });
                    var r = BaseProcessor.DoRegex(fanspage.Html, f);
                    // Guard against both a non-matching regex ("") and a null
                    // value so the Int-typed field always parses.
                    if (string.IsNullOrEmpty(r.Value))
                    {
                        r.Value = "0";
                    }
                    exres.Add(r);
                    continue;
                }

                // SelectToken returns null for a missing path; use "" instead
                // of throwing and losing the whole page.
                var res = new Result(f.Name, info.SelectToken(f.Selector)?.ToString() ?? "");
                exres.Add(res);
            }
            p.Results.Add(exres);
        }
    };

    crawler.Setup(config);
    crawler.Start();
    Console.WriteLine("end");
    Console.ReadKey();
}
/// <summary>
/// Entry point for the panda.tv crawler: pages through live_lists (120 rooms
/// per page); the fan count comes from a separate room_followinfo request
/// per room, since the list API does not include it.
/// </summary>
static void Main(string[] args)
{
    #region c
    var c = new Config
    {
        Name = "panda",
        ScanUrls = "https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno=1&pagenum=120",
        ContentUrlRegexes = new Regex("live_lists"),
        // NOTE(review): "789987" looks like a deliberate never-matching
        // placeholder so no URL is classified as a helper page — confirm.
        HelperUrlRegexes = new Regex("789987"),
        Fields = new[]
        {
            new Field { Name = "title", Selectortype = SelectorType.JsonPath, Selector = "$.name" },
            new Field { Name = "username", Selectortype = SelectorType.JsonPath, Selector = "$.userinfo.nickName" },
            new Field { Name = "online", Selectortype = SelectorType.JsonPath, Selector = "$.person_num", Type = FieldType.Int, },
            new Field { Name = "fanscount", Selectortype = SelectorType.JsonPath, Selector = "$.data.fans", Type = FieldType.Int, },
            new Field { Name = "cate", Selector = "$.classification.cname", Selectortype = SelectorType.JsonPath }
        },
        RepeatAt = new TimeSpan(0, 30, 0),
    };
    #endregion

    crawler = new Crawler();
    ////https://www.panda.tv/room_followinfo?token=&roomid=1042806&_=1509522885105
    //https://www.panda.tv/1042806
    //https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno=3&pagenum=120&_=1509525309865

    crawler.Processor.OnCustomExtract = p =>
    {
        var j = JObject.Parse(p.Html);
        // Up to 120 items per page (pagenum=120); a null item marks the end
        // of a partially-filled last page.
        for (int i = 0; i < 120; i++)
        {
            var roominfo = j.SelectToken($"$.data.items[{i}]");
            if (roominfo == null)
            {
                break;
            }

            var exres = new ExtractResults();
            foreach (var f in c.Fields)
            {
                if (f.Name == "fanscount")
                {
                    // Fan count lives in a per-room follow-info endpoint.
                    var fanspage = crawler.Downloader.DownloaderOnly(new Request(crawler.Schduler)
                    {
                        Url = "https://www.panda.tv/room_followinfo?token=&roomid=" + roominfo.SelectToken("$.id")?.ToString()
                    });
                    var r = BaseProcessor.DoJson(fanspage.Html, f);
                    exres.Add(r);
                    continue;
                }

                // SelectToken returns null for a missing path; use "" instead
                // of throwing and losing the whole page.
                var res = new Result(f.Name, roominfo.SelectToken(f.Selector)?.ToString() ?? "");
                exres.Add(res);
            }
            p.Results.Add(exres);
        }
    };

    crawler.Processor.OnProcessScanPage = p =>
    {
        //*[@id="pages-container"]/div/div/a[7]
        var total = int.Parse(p.GetJson("$.data.total"));
        // Ceiling division: one page per full 120 plus one for a remainder.
        var pageCount = total / 120 + (total % 120 > 0 ? 1 : 0);
#if DEBUG
        pageCount = 1;
#endif
        for (int i = 1; i <= pageCount; i++)
        {
            crawler.Schduler.AddUrl($"https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno={i}&pagenum=120");
        }
    };

    crawler.Setup(c);
    crawler.Start();
    Console.ReadLine();
}