コード例 #1
0
        /// <summary>
        /// Collects the non-skipped results into one BSON document and inserts it
        /// into <c>_collection</c>. Nothing is inserted when the document ends up
        /// empty. Insert failures are logged and not rethrown.
        /// </summary>
        /// <param name="results">The extracted results to persist.</param>
        public override void OnHandel(ExtractResults results)
        {
            Logger.Info("开始保存");

            var document = new BsonDocument();
            foreach (var item in results)
            {
                // Results flagged as skipped are not stored.
                if (item.Skip)
                {
                    continue;
                }

                document.Add(item.Key, item.Value);
            }

            var elementCount = document.ElementCount;
            Logger.Info("保存数量:" + elementCount);

            if (elementCount > 0)
            {
                try
                {
                    _collection.InsertOne(document);
                }
                catch (Exception e)
                {
                    Logger.Error(e, "保存出现问题");
                }
            }

            Logger.Info("保存结束");
        }
コード例 #2
0
        /// <summary>
        /// Inserts one row built from <paramref name="results"/> into
        /// <c>DataTableName</c> of database <c>DatabaseName</c>.
        /// </summary>
        /// <remarks>
        /// Values are sent as command parameters instead of being concatenated into
        /// the SQL text, so quotes or SQL fragments inside scraped values can neither
        /// break nor alter the statement. Column names cannot be parameterized and
        /// are still taken verbatim from the result keys. The connection and command
        /// are disposed on every path, including when opening the connection or
        /// switching the database throws. Failures while executing the insert are
        /// logged and not rethrown.
        /// </remarks>
        /// <param name="results">The extracted results to persist.</param>
        public override void OnHandel(ExtractResults results)
        {
            using (var con = new MySqlConnection(Crawler.MysqlConfig.ConString))
            {
                con.Open();
                con.ChangeDatabase(DatabaseName);

                using (var cmd = new MySqlCommand { Connection = con })
                {
                    // Build the statement text: column list plus matching parameter placeholders.
                    var keys   = "timestamp,cname";
                    var values = "@timestamp,@cname";

                    // Values keep the same string form the original statement embedded.
                    cmd.Parameters.AddWithValue("@timestamp", $"{results.Timestamp}");
                    cmd.Parameters.AddWithValue("@cname", $"{Config.Name}");

                    var index = 0;
                    foreach (var result in results)
                    {
                        var parameterName = "@p" + index++;

                        keys   += $",{result.Key}";
                        values += $",{parameterName}";
                        cmd.Parameters.AddWithValue(parameterName, $"{result.Value}");
                    }

                    try
                    {
                        cmd.CommandText = $"INSERT INTO {DataTableName}({keys}) VALUES({values})";
                        cmd.ExecuteNonQuery();
                    }
                    catch (Exception e)
                    {
                        Logger.Error("sql保存错误:" + e.Message);
                    }
                }
            }
        }
コード例 #3
0
        /// <summary>
        /// Runs every configured field selector against <paramref name="page"/> and,
        /// when all of them succeed, appends the collected results to
        /// <c>page.Results</c>.
        /// </summary>
        /// <remarks>
        /// The first field that fails is logged, <c>FailCount</c> is incremented and
        /// the method returns without adding anything to the page.
        /// </remarks>
        /// <param name="page">The downloaded page to extract from.</param>
        protected void Extract(Page page)
        {
            var extracted = new ExtractResults();

            foreach (var field in Config.Fields)
            {
                try
                {
                    // Choose the text the selector is applied to.
                    string input;
                    if (field.SourceType == SourceType.Page)
                    {
                        input = page.Html;
                    }
                    else if (field.SourceType == SourceType.UrlContext)
                    {
                        input = page.Request.Url;
                    }
                    else if (field.SourceType == SourceType.AttachedUrl)
                    {
                        throw new NotImplementedException();
                    }
                    else
                    {
                        throw new ArgumentOutOfRangeException();
                    }

                    // Dispatch to the extractor matching the selector type.
                    Result fieldResult;
                    if (field.Selectortype == SelectorType.JsonPath)
                    {
                        fieldResult = DoJson(input, field);
                    }
                    else if (field.Selectortype == SelectorType.XPath)
                    {
                        fieldResult = DoHtml(input, field);
                    }
                    else if (field.Selectortype == SelectorType.Regex)
                    {
                        fieldResult = DoRegex(input, field);
                    }
                    else
                    {
                        throw new ArgumentOutOfRangeException();
                    }

                    extracted.Add(fieldResult);
                }
                catch (Exception e)
                {
                    Logger.Error($"{page.Request.Url} 抽取 {field.Selectortype} {field.Name} 失败 \r\n{e}");
                    FailCount++;
                    return;
                }
            }

            page.Results.Add(extracted);
            //AfterExtractField?.Invoke(page, result);
        }
コード例 #4
0
        /// <summary>
        /// Configures and starts the "longzhu" crawler: declares the fields to extract,
        /// wires the custom JSON extraction and the scan-page pagination, then blocks
        /// until a key is pressed.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Single page size used for the page-count arithmetic, the start-index stride
            // and the max-results of every paging request. It matches max-results=50 in
            // ScanUrls below. Previously the paging URLs asked for 200 items while
            // advancing by 50, so consecutive requests overlapped.
            const int pageSize = 50;

            var config = new Config
            {
                Name     = "longzhu",
                ScanUrls = "http://api.plu.cn/tga/streams?max-results=50&start-index=0&sort-by=views&filter=0&game=0",
                Fields   = new[]
                {
                    new Field
                    {
                        Name         = "title",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.channel.status"
                    },
                    new Field
                    {
                        Name         = "username",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.channel.name"
                    },
                    new Field
                    {
                        Name         = "online",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.viewers",
                        Type         = FieldType.Int,
                    },
                    new Field
                    {
                        Name         = "fanscount",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.channel.followers",
                        Type         = FieldType.Int,
                    },
                    new Field
                    {
                        Name         = "cate",
                        Selector     = "$.game[0].name",
                        Selectortype = SelectorType.JsonPath
                    }
                },
                RepeatWhen = RepeatWhenEver.hour,
                RepeatAt   = new TimeSpan(0, 25, 0),
            };

            crawler = new Crawler();



            crawler.Downloader.AfterDownloadPage = p =>
            {
            };
            crawler.Processor.OnCustomExtract = p =>
            {
                // Each element of $.data.items becomes one result set.
                var j  = JObject.Parse(p.Html);
                var jr = JArray.FromObject(j["data"]["items"]);


                for (int i = 0; i < jr.Count; i++)
                {
                    var exres = new ExtractResults();
                    var info  = jr[i];
                    foreach (var f in config.Fields)
                    {
                        var res = new Result(f.Name, info.SelectToken(f.Selector).ToString());
                        exres.Add(res);
                    }
                    p.Results.Add(exres);
                }
            };
            crawler.Processor.OnProcessScanPage = p =>
            {
                var totalCount = int.Parse(p.GetJson("$.data.totalItems"));

                // Ceiling division: number of pageSize-sized pages needed for totalCount items.
                var pageCount = (totalCount + pageSize - 1) / pageSize;

                // Page 0 (start-index=0) is the scan URL itself, so only pages 1..pageCount-1
                // are queued. The old "total / 50 + 1" with "i <= pagecount" always queued
                // at least one page past the end.
                for (int i = 1; i < pageCount; i++)
                {
                    crawler.Schduler.AddUrl($"http://api.plu.cn/tga/streams?max-results={pageSize}&start-index={i * pageSize}&sort-by=views&filter=0&game=0");
                }
            };
            crawler.Setup(config);
            crawler.Start();
            Console.WriteLine("end");
            Console.ReadKey();
        }
コード例 #5
0
        /// <summary>
        /// Configures and starts the "chushou" crawler: declares the fields to extract,
        /// follows the breakpoint cursor returned by each scan page, extracts one
        /// result set per element of <c>$.data.items</c>, then blocks until a key is pressed.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            var config = new Config
            {
                Name     = "chushou",
                ScanUrls = "https://chushou.tv/live/down-v2.htm",
                Fields   = new[]
                {
                    new Field
                    {
                        Name         = "title",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.name"
                    },
                    new Field
                    {
                        Name         = "username",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.meta.creator"
                    },
                    new Field
                    {
                        Name         = "online",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.meta.onlineCount",
                        Type         = FieldType.Int,
                    },
                    new Field
                    {
                        Name         = "fanscount",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.meta.subscriberCount",
                        Type         = FieldType.Int,
                    },
                    new Field
                    {
                        Name         = "cate",
                        Selector     = "$.meta.gameName",
                        Selectortype = SelectorType.JsonPath
                    }
                },
                RepeatWhen = RepeatWhenEver.hour,
                RepeatAt   = new TimeSpan(0, 35, 0),
            };

            crawler = new CrawlerDotNet.Core.Crawler();

            // Breakpoint seen on the previously processed scan page.
            string previousBreakpoint = "";

            crawler.Processor.OnProcessScanPage = p =>
            {
                var breakpoint = p.GetJson("$.data.breakpoint");

                // A breakpoint that differs from the previous one is queued as another scan page;
                // a repeated breakpoint is queued as a context page instead.
                var nextPageType = breakpoint != previousBreakpoint ? PageType.ScanUrl : PageType.ContextUrl;
                crawler.Schduler.AddUrl("https://chushou.tv/live/down-v2.htm?&breakpoint=" + breakpoint, nextPageType);

                previousBreakpoint = breakpoint;
            };
            crawler.Processor.OnCustomExtract = p =>
            {
                var root  = JObject.Parse(p.Html);
                var items = JArray.FromObject(root["data"]["items"]);

                foreach (var item in items)
                {
                    var row = new ExtractResults();
                    foreach (var field in config.Fields)
                    {
                        var text = item.SelectToken(field.Selector).ToString();
                        row.Add(new Result(field.Name, text));
                    }

                    p.Results.Add(row);
                }
            };

            crawler.Setup(config);
            crawler.Start();
            Console.WriteLine("end");
            Console.ReadKey();
        }
コード例 #6
0
 /// <summary>
 /// Handles a set of extraction results. This base implementation does nothing;
 /// the method is virtual so that derived classes can override it.
 /// </summary>
 /// <param name="results">The extraction results to handle.</param>
 public virtual void OnHandel(ExtractResults results)
 {
 }
コード例 #7
0
        /// <summary>
        /// Configures and starts the "zhanqi" crawler: declares the fields to extract,
        /// queues the next list page while the current one still contains rooms,
        /// extracts one result set per room (downloading the room page to read the
        /// follower count), then blocks until a key is pressed.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            var config = new Config
            {
                Name     = "zhanqi",
                ScanUrls = "http://www.zhanqi.tv/api/static/v2.1/live/list/200/1.json",
                Fields   = new[]
                {
                    new Field
                    {
                        Name         = "title",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.title"
                    },
                    new Field
                    {
                        Name         = "username",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.nickname"
                    },
                    new Field
                    {
                        Name         = "online",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.online",
                        Type         = FieldType.Int,
                    },
                    new Field
                    {
                        // XPath noted by the original author:
                        // //*[@id="js-room-anchor-info-area"]/div[2]/div[1]/div/span[1]
                        Name         = "fanscount",
                        Selectortype = SelectorType.Regex,
                        Selector     = "js-room-follow-num\">([0-9]*)<",
                        Type         = FieldType.Int,
                    },
                    new Field
                    {
                        Name         = "cate",
                        Selector     = "$.newGameName",
                        Selectortype = SelectorType.JsonPath
                    },
                    new Field
                    {
                        Name         = "childcate",
                        Selector     = "$.gameName",
                        Selectortype = SelectorType.JsonPath
                    }
                },
                RepeatWhen = RepeatWhenEver.hour,
                RepeatAt   = new TimeSpan(0, 10, 0),
            };

            crawler = new CrawlerDotNet.Core.Crawler();

            // Number of the most recently queued list page.
            var pageNumber = 1;

            crawler.BeforeCrawl = () =>
            {
                pageNumber = 1;
            };

            crawler.Downloader.AfterDownloadPage = p =>
            {
                // When the page still contains rooms, queue the next JSON list page.
                if (p.GetJson("$.data.rooms") != "[]")
                {
                    pageNumber++;
                    crawler.Schduler.AddUrl($"http://www.zhanqi.tv/api/static/v2.1/live/list/200/{pageNumber}.json");
                }
            };
            crawler.Processor.OnCustomExtract = p =>
            {
                var root  = JObject.Parse(p.Html);
                var rooms = JArray.FromObject(root["data"]["rooms"]);

                foreach (var room in rooms)
                {
                    var row = new ExtractResults();
                    foreach (var field in config.Fields)
                    {
                        if (field.Name != "fanscount")
                        {
                            row.Add(new Result(field.Name, room.SelectToken(field.Selector).ToString()));
                            continue;
                        }

                        // The follower count is not in the list JSON: download the room page
                        // and pull the number out with the field's regex.
                        var roomPage = crawler.Downloader.DownloaderOnly(new Request(crawler.Schduler)
                        {
                            Url = "https://www.zhanqi.tv" + room.SelectToken("$.url").ToString()
                        });

                        var fans = BaseProcessor.DoRegex(roomPage.Html, field);
                        if (fans.Value == "")
                        {
                            // An empty value is stored as zero.
                            fans.Value = "0";
                        }

                        row.Add(fans);
                    }

                    p.Results.Add(row);
                }
            };

            crawler.Setup(config);
            crawler.Start();

            Console.WriteLine("end");
            Console.ReadKey();
        }
コード例 #8
0
        /// <summary>
        /// Configures and starts the "panda" crawler: declares the fields to extract,
        /// extracts one result set per room of a list page (requesting the follow-info
        /// endpoint for the follower count), queues every list page reported by the
        /// scan page, then blocks until a line is read from the console.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            #region c
            var c = new Config
            {
                Name     = "panda",
                ScanUrls = "https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno=1&pagenum=120",

                ContentUrlRegexes = new Regex("live_lists"),
                HelperUrlRegexes  = new Regex("789987"),

                Fields = new[]
                {
                    new Field
                    {
                        Name         = "title",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.name"
                    },
                    new Field
                    {
                        Name         = "username",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.userinfo.nickName"
                    },
                    new Field
                    {
                        Name         = "online",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.person_num",
                        Type         = FieldType.Int,
                    },
                    new Field
                    {
                        Name         = "fanscount",
                        Selectortype = SelectorType.JsonPath,
                        Selector     = "$.data.fans",
                        Type         = FieldType.Int,
                    },
                    new Field
                    {
                        Name         = "cate",
                        Selector     = "$.classification.cname",
                        Selectortype = SelectorType.JsonPath
                    }
                },
                RepeatAt = new TimeSpan(0, 30, 0),
            };
            #endregion
            crawler = new Crawler();
            // Example URLs noted by the original author:
            //   https://www.panda.tv/room_followinfo?token=&roomid=1042806&_=1509522885105
            //   https://www.panda.tv/1042806
            //   https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno=3&pagenum=120&_=1509525309865

            crawler.Processor.OnCustomExtract = p =>
            {
                var root = JObject.Parse(p.Html);

                // A list page is read up to 120 rooms; stop at the first missing index.
                for (int index = 0; index < 120; index++)
                {
                    var room = root.SelectToken($"$.data.items[{index}]");
                    if (room == null)
                    {
                        break;
                    }

                    var row = new ExtractResults();
                    foreach (var field in c.Fields)
                    {
                        if (field.Name != "fanscount")
                        {
                            row.Add(new Result(field.Name, room.SelectToken(field.Selector).ToString()));
                            continue;
                        }

                        // The follower count comes from a separate follow-info request for the room.
                        var followPage = crawler.Downloader.DownloaderOnly(new Request(crawler.Schduler)
                        {
                            Url = "https://www.panda.tv/room_followinfo?token=&roomid=" + room.SelectToken("$.id").ToString()
                        });

                        row.Add(BaseProcessor.DoJson(followPage.Html, field));
                    }

                    p.Results.Add(row);
                }
            };

            crawler.Processor.OnProcessScanPage = p =>
            {
                // XPath noted by the original author: //*[@id="pages-container"]/div/div/a[7]
                var total = int.Parse(p.GetJson("$.data.total"));

                // Number of 120-room pages, rounded up.
                var pageCount = total / 120;
                if (total % 120 > 0)
                {
                    pageCount++;
                }

#if DEBUG
                pageCount = 1;
#endif

                for (int pageNo = 1; pageNo <= pageCount; pageNo++)
                {
                    crawler.Schduler.AddUrl($"https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno={pageNo}&pagenum=120");
                }
            };

            crawler.Setup(c);
            crawler.Start();

            Console.ReadLine();
        }
コード例 #9
0
        /// <summary>
        /// Searches <paramref name="bitmap"/> for the corner marks of a printed sheet
        /// and, when exactly four plausible marks are found, returns
        /// <paramref name="basicImage"/> transformed to the sheet quadrilateral.
        /// </summary>
        /// <remarks>
        /// Left-hand marks are matched against <c>iMarkLeft</c> in the left quarter of the
        /// image, right-hand marks against <c>iMarkRight</c> in the right quarter.
        /// When no valid quadrilateral is found the method retries through
        /// <c>ExtractOMRSheet</c> with <paramref name="contint"/> alternating in sign and
        /// growing in magnitude, and gives up once it exceeds 60.
        /// The outcome is recorded in <c>lExtractResult</c>.
        /// Compared with the previous version: the bitmap is unlocked even when blob
        /// processing throws, the unused <c>Graphics</c> object (which leaked on the
        /// early-return path) and unused blob/rectangle arrays are gone, and the
        /// ordering steps test the list size instead of swallowing index exceptions.
        /// </remarks>
        /// <param name="bitmap">Image in which blobs are detected.</param>
        /// <param name="basicImage">Image returned unchanged on failure, or transformed on success.</param>
        /// <param name="minBlobWidHei">Minimum blob width and height passed to the blob counter.</param>
        /// <param name="fillint">Passed through to <c>ExtractOMRSheet</c> on retry; not otherwise used here.</param>
        /// <param name="contint">Retry state; per the original comments a contrast-correction amount (0 on the first attempt, presumably - confirm against callers).</param>
        /// <returns>The transformed sheet image, or <paramref name="basicImage"/> when extraction fails.</returns>
        public Bitmap ExtractPaperFromFlattened(Bitmap bitmap, Bitmap basicImage, int minBlobWidHei, int fillint, int contint)
        {
            // step 2 - locating objects
            BlobCounter blobCounter = new BlobCounter();

            blobCounter.FilterBlobs = true;
            blobCounter.MinHeight   = minBlobWidHei; // both these values have to be given when calling the
            blobCounter.MinWidth    = minBlobWidHei; // method; they can also be queried from the XML reader using OMREnums

            // lock image, Bitmap itself takes much time to be processed
            BitmapData bitmapData = bitmap.LockBits(
                new Rectangle(0, 0, bitmap.Width, bitmap.Height),
                ImageLockMode.ReadWrite, bitmap.PixelFormat);
            try
            {
                blobCounter.ProcessImage(bitmapData);
            }
            finally
            {
                // Always release the lock, otherwise a failure here leaves the bitmap unusable.
                bitmap.UnlockBits(bitmapData);
            }

            Blob[] blobs2 = blobCounter.GetObjects(bitmap, false);
            // Detection of the paper relies on the cross marks printed on the corners of the sheet.
            // First, detect the left edge.
            // lc.jpg = mirrored image sample as located on the corner of the printed sheet.
            // The area limits filter out much smaller and much larger blobs depending on the image size;
            // they can be queried from the XML reader.
            List <IntPoint> quad = new List <IntPoint>(); // sheet corner locations (if any are detected)

            if (blobs2.GetLength(0) < 4 && contint == 0)
            {
                lExtractResult = ExtractResults.FAILED;
                return basicImage;
            }
            try
            {
                foreach (Blob blob in blobs2)
                {
                    if (
                        ((double)blob.Area) / ((double)bitmap.Width * bitmap.Height) > minbr &&
                        ((double)blob.Area) / ((double)bitmap.Width * bitmap.Height) < maxbr &&
                        blob.Rectangle.X < (bitmap.Width) / 4)     // filters out very small or very large blobs
                    {
                        if ((double)blob.Rectangle.Width / blob.Rectangle.Height < 1.4 &&
                            (double)blob.Rectangle.Width / blob.Rectangle.Height > .6) // filters out blobs with a badly wrong aspect ratio
                        {
                            cb1 = UnmanagedImage.FromManagedImage(ImageUtilities.ResizeImage(iMarkLeft, blob.Rectangle.Width, blob.Rectangle.Height));
                            if (isSame(blob.Image, cb1))
                            {
                                quad.Add(new IntPoint((int)blob.CenterOfGravity.X, (int)blob.CenterOfGravity.Y));
                            }
                        }
                    }
                }
            }
            catch (ArgumentException) { lExtractResult = ExtractResults.NOBLOB; }

            // Put the first two points in top-to-bottom order (upper left, lower left).
            // The count check replaces the former empty catch around the indexer.
            if (quad.Count >= 2 && quad[0].Y > quad[1].Y)
            {
                IntPoint tp = quad[0];
                quad[0] = quad[1];
                quad[1] = tp;
            }

            try
            {
                foreach (Blob blob in blobs2)
                {
                    if (
                        ((double)blob.Area) / ((double)bitmap.Width * bitmap.Height) > minbr &&
                        ((double)blob.Area) / ((double)bitmap.Width * bitmap.Height) < maxbr &&
                        blob.Rectangle.X > (bitmap.Width * 3) / 4)
                    {
                        if ((double)blob.Rectangle.Width / blob.Rectangle.Height < 1.4 &&
                            (double)blob.Rectangle.Width / blob.Rectangle.Height > .6)
                        {
                            cb2 = UnmanagedImage.FromManagedImage(ImageUtilities.ResizeImage(iMarkRight, blob.Rectangle.Width, blob.Rectangle.Height));
                            if (isSame(blob.Image, cb2))
                            {
                                quad.Add(new IntPoint((int)blob.CenterOfGravity.X, (int)blob.CenterOfGravity.Y));
                            }
                        }
                    }
                }
            }
            catch (ArgumentException) { lExtractResult = ExtractResults.NOBLOB; }

            // Put the third and fourth points in bottom-to-top order (lower right, upper right).
            if (quad.Count >= 4 && quad[2].Y < quad[3].Y)
            {
                IntPoint tp = quad[2];
                quad[2] = quad[3];
                quad[3] = tp;
            }

            // Again, filter out wrong blobs that passed as corner marks.
            if (quad.Count == 4)
            {
                // Ratio of the left edge length to the right edge length.
                double edgeRatio = ((double)quad[1].Y - (double)quad[0].Y) / ((double)quad[2].Y - (double)quad[3].Y);

                if (edgeRatio < .75 || edgeRatio > 1.25)
                {
                    quad.Clear(); // clear if both edges have badly mismatched lengths
                }
                else if (quad[0].X > bitmap.Width / 2 || quad[1].X > bitmap.Width / 2 || quad[2].X < bitmap.Width / 2 || quad[3].X < bitmap.Width / 2)
                {
                    quad.Clear(); // clear if the sides appear to be on the wrong half of the image
                }
            }
            if (quad.Count != 4)   // sheet not detected, recursive call
            {
                if (contint <= 60) // try altering the contrast correction on both sides of the number line
                {
                    if (contint >= 0)
                    {
                        contint += 5;
                        contint *= -1;
                        return ExtractOMRSheet(basicImage, fillint, contint);
                    }
                    else
                    {
                        contint *= -1;
                        contint += 10;
                        return ExtractOMRSheet(basicImage, fillint, contint);
                    }
                }
                else // contrast correction yielded no result
                {
                    lExtractResult = ExtractResults.FAILED;
                    return basicImage;
                }
            }
            else // sheet found
            {
                // Swap the second and fourth points to get the corner order used for the transformation.
                IntPoint tp2 = quad[3];
                quad[3] = quad[1];
                quad[1] = tp2;

                if (!CheckSheetAR(quad))
                {
                    lExtractResult = ExtractResults.INVALIDAR;
                    return basicImage;
                }

                QuadrilateralTransformation wrap = new QuadrilateralTransformation(quad);
                wrap.UseInterpolation        = false; // perspective warp only
                wrap.AutomaticSizeCalculaton = false;
                wrap.NewWidth  = tSheetSize.Width;
                wrap.NewHeight = tSheetSize.Height;
                lExtractResult = ExtractResults.OK;
                return wrap.Apply(basicImage);
            }
        }