/// <summary>
/// Persists one batch of extraction results into MongoDB as a single document.
/// Results flagged with <c>Skip</c> are omitted; an empty document is never inserted.
/// A failed insert is logged and swallowed so the crawl keeps running.
/// </summary>
/// <param name="results">The extraction results to store.</param>
public override void OnHandel(ExtractResults results)
{
    Logger.Info("开始保存");

    var doc = new BsonDocument();
    foreach (var item in results)
    {
        if (item.Skip)
        {
            continue;
        }
        doc.Add(item.Key, item.Value);
    }

    Logger.Info("保存数量:" + doc.ElementCount);

    try
    {
        if (doc.ElementCount > 0)
        {
            _collection.InsertOne(doc);
        }
    }
    catch (Exception e)
    {
        // Best-effort persistence: log and continue on storage errors.
        Logger.Error(e, "保存出现问题");
    }

    Logger.Info("保存结束");
}
/// <summary>
/// Persists one batch of extraction results into MySQL as a single row.
/// Column names come from the crawler's field configuration (trusted);
/// all VALUES are passed as command parameters, never concatenated into
/// the SQL text, so quotes/backslashes in scraped data cannot break or
/// inject into the statement. Errors are logged and swallowed so the
/// crawl keeps running.
/// </summary>
/// <param name="results">The extraction results to store.</param>
public override void OnHandel(ExtractResults results)
{
    // using ensures the connection/command are closed even when Open(),
    // ChangeDatabase() or the insert throws (the original leaked on failure).
    using (var con = new MySqlConnection(Crawler.MysqlConfig.ConString))
    {
        con.Open();
        con.ChangeDatabase(DatabaseName);
        using (var cmd = new MySqlCommand { Connection = con })
        {
            // Column list and matching parameter placeholders.
            var keys = "timestamp,cname";
            var placeholders = "@timestamp,@cname";
            cmd.Parameters.AddWithValue("@timestamp", results.Timestamp);
            cmd.Parameters.AddWithValue("@cname", Config.Name);
            foreach (var result in results)
            {
                // result.Key is a configured field name, safe to use as a column
                // identifier; result.Value is scraped data and must be a parameter.
                keys += $",{result.Key}";
                placeholders += $",@{result.Key}";
                cmd.Parameters.AddWithValue($"@{result.Key}", result.Value);
            }
            try
            {
                cmd.CommandText = $"INSERT INTO {DataTableName}({keys}) VALUES({placeholders})";
                cmd.ExecuteNonQuery();
            }
            catch (Exception e)
            {
                Logger.Error("sql保存错误:" + e.Message);
            }
        }
    }
}
/// <summary>
/// Runs every configured field selector against the page and collects the
/// results. If extraction of any single field throws, the error is logged,
/// <c>FailCount</c> is incremented, and the whole page is abandoned — no
/// partial result set is attached to the page.
/// </summary>
/// <param name="page">The downloaded page to extract fields from.</param>
protected void Extract(Page page)
{
    var extracted = new ExtractResults();

    foreach (var field in Config.Fields)
    {
        try
        {
            // Choose the text the selector will operate on.
            string source;
            if (field.SourceType == SourceType.Page)
            {
                source = page.Html;
            }
            else if (field.SourceType == SourceType.UrlContext)
            {
                source = page.Request.Url;
            }
            else if (field.SourceType == SourceType.AttachedUrl)
            {
                throw new NotImplementedException();
            }
            else
            {
                throw new ArgumentOutOfRangeException();
            }

            // Dispatch on the selector flavour.
            Result fieldResult;
            if (field.Selectortype == SelectorType.JsonPath)
            {
                fieldResult = DoJson(source, field);
            }
            else if (field.Selectortype == SelectorType.XPath)
            {
                fieldResult = DoHtml(source, field);
            }
            else if (field.Selectortype == SelectorType.Regex)
            {
                fieldResult = DoRegex(source, field);
            }
            else
            {
                throw new ArgumentOutOfRangeException();
            }

            extracted.Add(fieldResult);
        }
        catch (Exception e)
        {
            Logger.Error($"{page.Request.Url} 抽取 {field.Selectortype} {field.Name} 失败 \r\n{e}");
            FailCount++;
            return;
        }
    }

    page.Results.Add(extracted);
    //AfterExtractField?.Invoke(page, result);
}
/// <summary>
/// Entry point for the "longzhu" live-stream crawler: configures JsonPath
/// fields, schedules one API request per 50-item page, and extracts every
/// stream item from each JSON payload.
/// </summary>
static void Main(string[] args)
{
    var config = new Config
    {
        Name = "longzhu",
        ScanUrls = "http://api.plu.cn/tga/streams?max-results=50&start-index=0&sort-by=views&filter=0&game=0",
        Fields = new[]
        {
            new Field { Name = "title", Selectortype = SelectorType.JsonPath, Selector = "$.channel.status" },
            new Field { Name = "username", Selectortype = SelectorType.JsonPath, Selector = "$.channel.name" },
            new Field { Name = "online", Selectortype = SelectorType.JsonPath, Selector = "$.viewers", Type = FieldType.Int, },
            new Field { Name = "fanscount", Selectortype = SelectorType.JsonPath, Selector = "$.channel.followers", Type = FieldType.Int, },
            new Field { Name = "cate", Selector = "$.game[0].name", Selectortype = SelectorType.JsonPath }
        },
        RepeatWhen = RepeatWhenEver.hour,
        RepeatAt = new TimeSpan(0, 25, 0),
    };
    crawler = new Crawler();
    // No-op hook kept from the original; TODO confirm whether the downloader
    // tolerates a null AfterDownloadPage before removing this.
    crawler.Downloader.AfterDownloadPage = p => { };
    crawler.Processor.OnCustomExtract = p =>
    {
        var j = JObject.Parse(p.Html);
        var jr = JArray.FromObject(j["data"]["items"]);
        for (int i = 0; i < jr.Count; i++)
        {
            var exres = new ExtractResults();
            var info = jr[i];
            foreach (var f in config.Fields)
            {
                // SelectToken returns null for a missing path; fall back to an
                // empty string instead of throwing a NullReferenceException.
                var res = new Result(f.Name, info.SelectToken(f.Selector)?.ToString() ?? "");
                exres.Add(res);
            }
            p.Results.Add(exres);
        }
    };
    crawler.Processor.OnProcessScanPage = p =>
    {
        var totalcount = p.GetJson("$.data.totalItems");
        // +1 covers the final partial page (page 0 is the scan URL itself).
        var pagecount = int.Parse(totalcount) / 50 + 1;
        for (int i = 1; i <= pagecount; i++)
        {
            // BUGFIX: page size must stay 50 to match the start-index stride of 50;
            // the previous max-results=200 fetched each record up to four times.
            crawler.Schduler.AddUrl($"http://api.plu.cn/tga/streams?max-results=50&start-index={i * 50}&sort-by=views&filter=0&game=0");
        }
    };
    crawler.Setup(config);
    crawler.Start();
    Console.WriteLine("end");
    Console.ReadKey();
}
/// <summary>
/// Entry point for the "chushou" live-stream crawler. The chushou API pages
/// via a "breakpoint" cursor: each response's breakpoint is fed back as the
/// next request; once the cursor stops changing the URL is queued as a plain
/// context URL, which ends the scan chain.
/// </summary>
static void Main(string[] args)
{
    var chushouConfig = new Config
    {
        Name = "chushou",
        ScanUrls = "https://chushou.tv/live/down-v2.htm",
        Fields = new[]
        {
            new Field { Name = "title", Selectortype = SelectorType.JsonPath, Selector = "$.name" },
            new Field { Name = "username", Selectortype = SelectorType.JsonPath, Selector = "$.meta.creator" },
            new Field { Name = "online", Selectortype = SelectorType.JsonPath, Selector = "$.meta.onlineCount", Type = FieldType.Int, },
            new Field { Name = "fanscount", Selectortype = SelectorType.JsonPath, Selector = "$.meta.subscriberCount", Type = FieldType.Int, },
            new Field { Name = "cate", Selector = "$.meta.gameName", Selectortype = SelectorType.JsonPath }
        },
        RepeatWhen = RepeatWhenEver.hour,
        RepeatAt = new TimeSpan(0, 35, 0),
    };

    crawler = new CrawlerDotNet.Core.Crawler();

    // Extract every item of the "data.items" array using the configured fields.
    crawler.Processor.OnCustomExtract = p =>
    {
        var root = JObject.Parse(p.Html);
        var items = JArray.FromObject(root["data"]["items"]);
        foreach (var item in items)
        {
            var rowResults = new ExtractResults();
            foreach (var field in chushouConfig.Fields)
            {
                rowResults.Add(new Result(field.Name, item.SelectToken(field.Selector).ToString()));
            }
            p.Results.Add(rowResults);
        }
    };

    // Follow the breakpoint cursor until it repeats.
    string previousBreakpoint = "";
    crawler.Processor.OnProcessScanPage = p =>
    {
        var breakpoint = p.GetJson("$.data.breakpoint");
        var pageType = breakpoint == previousBreakpoint ? PageType.ContextUrl : PageType.ScanUrl;
        crawler.Schduler.AddUrl("https://chushou.tv/live/down-v2.htm?&breakpoint=" + breakpoint, pageType);
        previousBreakpoint = breakpoint;
    };

    crawler.Setup(chushouConfig);
    crawler.Start();
    Console.WriteLine("end");
    Console.ReadKey();
}
/// <summary>
/// Hook invoked with each batch of extraction results; the base implementation
/// does nothing. Override in a pipeline to persist or forward the results.
/// (Name kept as-is — likely a typo for "OnHandle" — because renaming would
/// break every existing override.)
/// </summary>
/// <param name="results">The extracted results for one page/item.</param>
public virtual void OnHandel(ExtractResults results) { }
/// <summary>
/// Entry point for the "zhanqi" live-stream crawler. Most fields come from the
/// room-list JSON; "fanscount" is scraped from each room's HTML page via a
/// regex, which costs one extra synchronous download per room.
/// </summary>
static void Main(string[] args)
{
    var config = new Config
    {
        Name = "zhanqi",
        ScanUrls = "http://www.zhanqi.tv/api/static/v2.1/live/list/200/1.json",
        Fields = new[]
        {
            new Field { Name = "title", Selectortype = SelectorType.JsonPath, Selector = "$.title" },
            new Field { Name = "username", Selectortype = SelectorType.JsonPath, Selector = "$.nickname" },
            new Field { Name = "online", Selectortype = SelectorType.JsonPath, Selector = "$.online", Type = FieldType.Int, },
            new Field
            {
                // XPath of the same element, kept for reference:
                ////*[@id="js-room-anchor-info-area"]/div[2]/div[1]/div/span[1]
                Name = "fanscount", Selectortype = SelectorType.Regex,
                Selector = "js-room-follow-num\">([0-9]*)<", Type = FieldType.Int,
            },
            new Field { Name = "cate", Selector = "$.newGameName", Selectortype = SelectorType.JsonPath },
            new Field { Name = "childcate", Selector = "$.gameName", Selectortype = SelectorType.JsonPath }
        },
        RepeatWhen = RepeatWhenEver.hour,
        RepeatAt = new TimeSpan(0, 10, 0),
    };
    crawler = new CrawlerDotNet.Core.Crawler();

    // Page counter shared by the two callbacks below; reset before each crawl
    // because the crawler repeats hourly.
    // NOTE(review): curPage is mutated from AfterDownloadPage — if downloads run
    // concurrently this increments without synchronization; confirm the
    // downloader is single-threaded.
    var curPage = 1;
    crawler.BeforeCrawl = () => { curPage = 1; };
    crawler.Downloader.AfterDownloadPage = p =>
    {
        // If this page still has rooms, schedule the next page of the list JSON;
        // an empty "rooms" array marks the end of the listing.
        var rooms = p.GetJson("$.data.rooms");
        if (rooms != "[]")
        {
            curPage++;
            crawler.Schduler.AddUrl($"http://www.zhanqi.tv/api/static/v2.1/live/list/200/{curPage}.json");
        }
    };
    crawler.Processor.OnCustomExtract = p =>
    {
        var j = JObject.Parse(p.Html);
        var jr = JArray.FromObject(j["data"]["rooms"]);
        for (int i = 0; i < jr.Count; i++)
        {
            var exres = new ExtractResults();
            var info = jr[i];
            foreach (var f in config.Fields)
            {
                if (f.Name == "fanscount")
                {
                    // Follower count is not in the list JSON: synchronously fetch
                    // the room page and pull it out with the configured regex.
                    var fanspage = crawler.Downloader.DownloaderOnly(new Request(crawler.Schduler)
                    {
                        Url = "https://www.zhanqi.tv" + info.SelectToken("$.url").ToString()
                    });
                    var r = BaseProcessor.DoRegex(fanspage.Html, f);
                    // Regex miss yields an empty string; store "0" so the Int field parses.
                    if (r.Value == "") { r.Value = "0"; }
                    exres.Add(r);
                    continue;
                }
                var res = new Result(f.Name, info.SelectToken(f.Selector).ToString());
                exres.Add(res);
            }
            p.Results.Add(exres);
        }
    };
    crawler.Setup(config);
    crawler.Start();
    Console.WriteLine("end");
    Console.ReadKey();
}
/// <summary>
/// Entry point for the "panda" live-stream crawler. List pages carry 120 rooms
/// each; most fields come from the list JSON, while "fanscount" requires one
/// extra synchronous request per room against the room_followinfo endpoint.
/// </summary>
static void Main(string[] args)
{
    var pandaConfig = new Config
    {
        Name = "panda",
        ScanUrls = "https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno=1&pagenum=120",
        ContentUrlRegexes = new Regex("live_lists"),
        HelperUrlRegexes = new Regex("789987"),
        Fields = new[]
        {
            new Field { Name = "title", Selectortype = SelectorType.JsonPath, Selector = "$.name" },
            new Field { Name = "username", Selectortype = SelectorType.JsonPath, Selector = "$.userinfo.nickName" },
            new Field { Name = "online", Selectortype = SelectorType.JsonPath, Selector = "$.person_num", Type = FieldType.Int, },
            new Field { Name = "fanscount", Selectortype = SelectorType.JsonPath, Selector = "$.data.fans", Type = FieldType.Int, },
            new Field { Name = "cate", Selector = "$.classification.cname", Selectortype = SelectorType.JsonPath }
        },
        RepeatAt = new TimeSpan(0, 30, 0),
    };

    crawler = new Crawler();

    // Walk up to 120 room entries per list page; a null entry means the page
    // had fewer than 120 rooms, so stop early.
    crawler.Processor.OnCustomExtract = p =>
    {
        var root = JObject.Parse(p.Html);
        for (int index = 0; index < 120; index++)
        {
            var room = root.SelectToken($"$.data.items[{index}]");
            if (room == null)
            {
                break;
            }
            var rowResults = new ExtractResults();
            foreach (var field in pandaConfig.Fields)
            {
                if (field.Name == "fanscount")
                {
                    // Follower count lives behind a separate endpoint: fetch it
                    // synchronously for this room and extract via JsonPath.
                    var followPage = crawler.Downloader.DownloaderOnly(new Request(crawler.Schduler)
                    {
                        Url = "https://www.panda.tv/room_followinfo?token=&roomid=" + room.SelectToken("$.id").ToString()
                    });
                    rowResults.Add(BaseProcessor.DoJson(followPage.Html, field));
                    continue;
                }
                rowResults.Add(new Result(field.Name, room.SelectToken(field.Selector).ToString()));
            }
            p.Results.Add(rowResults);
        }
    };

    // Schedule one list URL per 120-room page (ceiling of total / 120).
    crawler.Processor.OnProcessScanPage = p =>
    {
        var total = int.Parse(p.GetJson("$.data.total"));
        var pageCount = total / 120;
        if (total % 120 > 0)
        {
            pageCount++;
        }
#if DEBUG
        pageCount = 1;
#endif
        for (int pageNo = 1; pageNo <= pageCount; pageNo++)
        {
            crawler.Schduler.AddUrl($"https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno={pageNo}&pagenum=120");
        }
    };

    crawler.Setup(pandaConfig);
    crawler.Start();
    Console.ReadLine();
}
/// <summary>
/// Locates the four corner cross-marks printed on a scanned sheet and, when
/// all four form a plausible quadrilateral, perspective-warps basicImage to
/// the target sheet size (tSheetSize). On failure the untouched basicImage is
/// returned and lExtractResult records the reason (FAILED / NOBLOB /
/// INVALIDAR); on success lExtractResult is OK and the warped sheet is
/// returned. When the corners cannot be found, the method retries through
/// ExtractOMRSheet with alternating-sign, growing contrast corrections
/// (magnitude up to 60).
/// </summary>
/// <param name="bitmap">Working image used for blob detection; presumably already flattened/thresholded — TODO confirm expected pixel format.</param>
/// <param name="basicImage">Source image that the final perspective warp is applied to; never modified on failure.</param>
/// <param name="minBlobWidHei">Minimum width AND height a blob must have to be considered by the blob counter.</param>
/// <param name="fillint">Forwarded unchanged to ExtractOMRSheet on retry; not used directly here.</param>
/// <param name="contint">Contrast-correction step of the current attempt; 0 on the first call.</param>
public Bitmap ExtractPaperFromFlattened(Bitmap bitmap, Bitmap basicImage, int minBlobWidHei, int fillint, int contint)
{
    // Lock the pixel data once; per-pixel access through Bitmap itself is slow.
    BitmapData bitmapData = bitmap.LockBits(
        new Rectangle(0, 0, bitmap.Width, bitmap.Height),
        ImageLockMode.ReadWrite, bitmap.PixelFormat);

    // Locate candidate blobs at least minBlobWidHei pixels in both dimensions.
    BlobCounter blobCounter = new BlobCounter();
    blobCounter.FilterBlobs = true;
    blobCounter.MinHeight = minBlobWidHei; // both thresholds must be supplied by the caller;
    blobCounter.MinWidth = minBlobWidHei;  // they can also be queried from the XML reader using OMREnums
    blobCounter.ProcessImage(bitmapData);
    Blob[] blobs = blobCounter.GetObjectsInformation();
    bitmap.UnlockBits(bitmapData);

    Graphics g = Graphics.FromImage(bitmap);
    // Pen yellowPen = new Pen(Color.Yellow, 2); // create pen in case image extraction
    // fails and we need to preview the blobs that were detected

    Rectangle[] rects = blobCounter.GetObjectsRectangles();
    Blob[] blobs2 = blobCounter.GetObjects(bitmap, false);

    // Paper detection relies on the cross-marks printed at the corners of the
    // sheet. The first pass scans the left quarter of the image against the
    // mirrored mark template (iMarkLeft); minbr/maxbr area ratios filter out
    // blobs far smaller or larger than a plausible mark for this image size.
    List<IntPoint> quad = new List<IntPoint>(); // sheet corner locations, as detected

    // Fewer than 4 blobs on the very first attempt: nothing worth matching.
    if (blobs2.GetLength(0) < 4 && contint == 0)
    {
        lExtractResult = ExtractResults.FAILED;
        return(basicImage);
    }

    // --- Left edge: match candidate blobs against the left corner mark. ---
    try
    {
        foreach (Blob blob in blobs2)
        {
            // Filter out very small or very large blobs, and keep only blobs in
            // the left quarter of the image.
            if (((double)blob.Area) / ((double)bitmap.Width * bitmap.Height) > minbr &&
                ((double)blob.Area) / ((double)bitmap.Width * bitmap.Height) < maxbr &&
                blob.Rectangle.X < (bitmap.Width) / 4)
            {
                // Filter out blobs with an implausible aspect ratio for a cross-mark.
                if ((double)blob.Rectangle.Width / blob.Rectangle.Height < 1.4 &&
                    (double)blob.Rectangle.Width / blob.Rectangle.Height > .6)
                {
                    // Resize the template to the blob's size, then compare.
                    cb1 = UnmanagedImage.FromManagedImage(ImageUtilities.ResizeImage(iMarkLeft, blob.Rectangle.Width, blob.Rectangle.Height));
                    if (isSame(blob.Image, cb1))
                    {
                        quad.Add(new IntPoint((int)blob.CenterOfGravity.X, (int)blob.CenterOfGravity.Y));
                    }
                }
            }
        }
    }
    catch (ArgumentException)
    {
        lExtractResult = ExtractResults.NOBLOB;
    }

    // Order the two left corners top-to-bottom (UpperLeft, LowerLeft).
    // The empty catch deliberately ignores the case of fewer than two corners;
    // the quad.Count check below handles it.
    try
    {
        if (quad[0].Y > quad[1].Y)
        {
            IntPoint tp = quad[0];
            quad[0] = quad[1];
            quad[1] = tp;
        }
    }
    catch { }

    // --- Right edge: same matching against the right corner mark template. ---
    try
    {
        foreach (Blob blob in blobs2)
        {
            if (((double)blob.Area) / ((double)bitmap.Width * bitmap.Height) > minbr &&
                ((double)blob.Area) / ((double)bitmap.Width * bitmap.Height) < maxbr &&
                blob.Rectangle.X > (bitmap.Width * 3) / 4)
            {
                if ((double)blob.Rectangle.Width / blob.Rectangle.Height < 1.4 &&
                    (double)blob.Rectangle.Width / blob.Rectangle.Height > .6)
                {
                    cb2 = UnmanagedImage.FromManagedImage(ImageUtilities.ResizeImage(iMarkRight, blob.Rectangle.Width, blob.Rectangle.Height));
                    if (isSame(blob.Image, cb2))
                    {
                        quad.Add(new IntPoint((int)blob.CenterOfGravity.X, (int)blob.CenterOfGravity.Y));
                    }
                }
            }
        }
    }
    catch (ArgumentException)
    {
        lExtractResult = ExtractResults.NOBLOB;
    }

    // Order the two right corners bottom-to-top (LowerRight, UpperRight),
    // completing the UpperLeft, LowerLeft, LowerRight, UpperRight sequence.
    try
    {
        if (quad[2].Y < quad[3].Y)
        {
            IntPoint tp = quad[2];
            quad[2] = quad[3];
            quad[3] = tp;
        }
    }
    catch { }
    g.Dispose();

    // Sanity-check the quadrilateral in case wrong blobs slipped through.
    if (quad.Count == 4)
    {
        if (((double)quad[1].Y - (double)quad[0].Y) / ((double)quad[2].Y - (double)quad[3].Y) < .75 ||
            ((double)quad[1].Y - (double)quad[0].Y) / ((double)quad[2].Y - (double)quad[3].Y) > 1.25)
        {
            quad.Clear(); // clear if both edges have wildly different lengths
        }
        else if (quad[0].X > bitmap.Width / 2 || quad[1].X > bitmap.Width / 2 ||
                 quad[2].X < bitmap.Width / 2 || quad[3].X < bitmap.Width / 2)
        {
            quad.Clear(); // clear if the sides appear to be "wrong sided"
        }
    }

    if (quad.Count != 4) // sheet not detected: retry recursively with new contrast
    {
        if (contint <= 60) // try altering the contrast correction on both sides of the number line
        {
            // Alternate the sign of contint while growing its magnitude by 5 per
            // attempt (…, +5, -5 -> +15, -15 -> +25, …), then re-run the whole
            // pipeline via ExtractOMRSheet — presumably the outer entry point
            // that re-flattens and calls back into this method; TODO confirm.
            if (contint >= 0)
            {
                contint += 5;
                contint *= -1;
                return(ExtractOMRSheet(basicImage, fillint, contint));
            }
            else
            {
                contint *= -1;
                contint += 10;
                return(ExtractOMRSheet(basicImage, fillint, contint));
            }
        }
        else // contrast correction yielded no result
        {
            lExtractResult = ExtractResults.FAILED;
            return(basicImage);
        }
    }
    else // sheet found
    {
        // Swap corners 1 and 3 into the ordering QuadrilateralTransformation expects.
        IntPoint tp2 = quad[3];
        quad[3] = quad[1];
        quad[1] = tp2;
        if (!CheckSheetAR(quad))
        {
            lExtractResult = ExtractResults.INVALIDAR;
            return(basicImage);
        }
        // Perspective warp only (no interpolation, no binary), to a fixed output size.
        QuadrilateralTransformation wrap = new QuadrilateralTransformation(quad);
        wrap.UseInterpolation = false;
        wrap.AutomaticSizeCalculaton = false;
        wrap.NewWidth = tSheetSize.Width;
        wrap.NewHeight = tSheetSize.Height;
        lExtractResult = ExtractResults.OK;
        //wrap.Apply(basicImage);//.Save("LastImg.jpg", ImageFormat.Jpeg); // create file backup for future use.
        return(wrap.Apply(basicImage)); // warp
    }
}