/// <summary>
/// Fetches the page at <c>this.Url</c>, selects the elements matching <c>this.Selector</c>
/// and appends one new <c>CrawlResult</c> per selected element to <paramref name="results"/>.
/// </summary>
/// <param name="results">List that receives the newly created results.</param>
/// <returns>The same <paramref name="results"/> instance that was passed in.</returns>
/// <remarks>
/// Exceptions raised while fetching or querying the page propagate to the caller unchanged.
/// The former <c>catch (Exception e) { throw e; }</c> wrapper was removed because it only
/// reset the stack trace of the exception it rethrew.
/// </remarks>
public override List <CrawlResult> Process(List <CrawlResult> results)
        {
            // Reading .Result blocks until the page is available
            // (presumably GetPage returns a Task - confirm against PageCrawler).
            var document = PageCrawler.GetPage(this.Url).Result;

            foreach (var element in document.QuerySelectorAll(this.Selector))
            {
                var result = new CrawlResult();

                // Read the attribute once; elements without an href still yield a
                // result, just with Url left at its default value.
                var href = element.GetAttribute("href");
                if (href != null)
                {
                    result.Url = new AngleSharp.Url(this.Url, href);
                }

                results.Add(result);
            }

            return(results);
        }
 /**
  * Stores the given page data, derives the wiki page and its crawler from it,
  * and starts with an empty content buffer.
  *
  * @param pageData page data that is kept in isIssue and queried for the wiki page
  */
 private SetupTeardownIncluder(PageData pageData)
 {
     // Order matters: the crawler is obtained from the wiki page assigned just before it.
     isIssue = pageData;
     testPage = pageData.getWikiPage();
     pageCrawler = testPage.getPageCrawler();
     newPageContent = new StringBuffer();
 }
 /**
  * Stores the given page data, derives the wiki page and its crawler from it,
  * and starts with an empty content buffer.
  *
  * @param pageData page data that is kept in isIssue and queried for the wiki page
  */
 private SetupTeardownIncluder(PageData pageData)
 {
     // Order matters: the crawler is obtained from the wiki page assigned just before it.
     isIssue = pageData;
     testPage = pageData.getWikiPage();
     pageCrawler = testPage.getPageCrawler();
     newPageContent = new StringBuffer();
 }
        /// <summary>
        /// For every entry in <paramref name="results"/>, fetches the page at the entry's
        /// <c>Url</c> and appends a populated copy of each item in <c>this.CrawlItems</c>
        /// to the entry's <c>CrawlItems</c>.
        /// </summary>
        /// <param name="results">Results to enrich; each entry is modified in place.</param>
        /// <returns>The same <paramref name="results"/> instance that was passed in.</returns>
        public override List <CrawlResult> Process(List <CrawlResult> results)
        {
            foreach (var entry in results)
            {
                // Reading .Result blocks until the page for this entry is available.
                var page = PageCrawler.GetPage(entry.Url).Result;

                foreach (var template in this.CrawlItems)
                {
                    // Work on a copy so the configured template items are never mutated.
                    var extracted = new CrawlItem
                    {
                        Name     = template.Name,
                        Selector = template.Selector,
                        Attr     = template.Attr
                    };

                    // Value stays null when the selector matches nothing.
                    var match = page.QuerySelector(extracted.Selector);
                    extracted.Value = string.IsNullOrWhiteSpace(extracted.Attr)
                        ? match?.InnerHtml                      // no attribute requested: take the markup
                        : match?.GetAttribute(extracted.Attr);  // otherwise take the named attribute

                    entry.CrawlItems.Add(extracted);
                }
            }

            return(results);
        }
Exemplo n.º 5
0
 /// <summary>
 /// Stores the parser and crawler supplied by the derived class.
 /// </summary>
 /// <param name="htmlParser">Parser kept in the <c>htmlParser</c> member.</param>
 /// <param name="crawler">Crawler kept in the <c>crawler</c> member.</param>
 protected WebSearchCounter(HtmlParser htmlParser, PageCrawler crawler)
 {
     this.htmlParser = htmlParser;
     this.crawler = crawler;
 }
Exemplo n.º 6
0
 /// <summary>
 /// Stores the parser and crawler supplied by the derived class.
 /// </summary>
 /// <param name="htmlParser">Parser kept in the <c>htmlParser</c> member.</param>
 /// <param name="crawler">Crawler kept in the <c>crawler</c> member.</param>
 protected WebSearchCounter(HtmlParser htmlParser, PageCrawler crawler)
 {
     this.htmlParser = htmlParser;
     this.crawler = crawler;
 }
Exemplo n.º 7
0
        /// <summary>
        /// Starts one task per link returned by <c>PageCrawler.CrawlLinksPage</c> for
        /// <paramref name="locationLink"/>, waits for all of them and collects the
        /// classifications of the tasks that finished without an exception.
        /// </summary>
        /// <param name="locationLink">Link handed to <c>PageCrawler.CrawlLinksPage</c> to obtain the links to crawl.</param>
        /// <param name="season">Season value passed through to <c>CrawlClassification</c>.</param>
        /// <returns>
        /// The collected classifications; empty when no links were found. Faulted tasks are
        /// logged through <c>Log.Error</c> and contribute nothing to the list.
        /// </returns>
        private static List <MatchClassification> CrawlLocation(CrawlerLinkDetails locationLink, string season)
        {
            List <MatchClassification> matchClassifications = new List <MatchClassification>();
            var taskQueue = new Queue <Task <MatchClassification> >();

            foreach (var link in PageCrawler.CrawlLinksPage(locationLink))
            {
                // Copy into a fresh local so every lambda captures its own link
                // (compilers before C# 5 share the foreach variable across iterations).
                CrawlerLinkDetails l = link;
                taskQueue.Enqueue(Task <MatchClassification> .Factory.StartNew(() => CrawlClassification(l, l.LinkText, season)));
            }

            // ContinueWhenAll throws ArgumentException for an empty task array, so a
            // page without links must return before reaching it.
            if (taskQueue.Count == 0)
            {
                return(matchClassifications);
            }

            // The single continuation runs after every task has finished, so the list
            // is only touched by one thread at a time.
            Task.Factory.ContinueWhenAll(taskQueue.ToArray(),
                                         completedTasks =>
            {
                foreach (Task <MatchClassification> task in completedTasks)
                {
                    if (task.Exception == null)
                    {
                        matchClassifications.Add(task.Result);
                    }
                    else
                    {
                        Log.Error("Unexpected exception",
                                  task.Exception);
                    }
                }
            })
            .Wait();

            return(matchClassifications);
        }
Exemplo n.º 8
0
        /// <summary>
        /// Debug button handler: crawls a fixed set of pages concurrently, one
        /// <c>PageCrawler</c> per page, waits for all crawls to finish and shows the
        /// resulting rate (pages per elapsed second) in a message box.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void btnDebug_Click(object sender, EventArgs e)
        {
            ServicePointManager.DefaultConnectionLimit = 100;
            ServicePointManager.Expect100Continue      = false;

            // Page URL -> second argument of PageCrawler.CrawlPage (an encoding name).
            // Entries left as comments are disabled targets.
            var pageInfo = new Dictionary <string, string>
            {
                { "http://www.66ip.cn/1.html", "GB2312" },
                // { "http://ab57.ru/downloads/proxyold.txt", "UTF-8" },
                // { "http://www.atomintersoft.com/high_anonymity_elite_proxy_list", "UTF-8" },
                { "http://www.data5u.com/", "UTF-8" },
                { "http://www.goubanjia.com/", "UTF-8" },
                { "http://www.ip3366.net/free/?stype=1", "GB2312" },
                { "https://www.kuaidaili.com/free/inha/1", "UTF-8" },
                // { "http://www.proxylists.net/http_highanon.txt", "UTF-8" },
                // { "https://www.us-proxy.org/", "UTF-8" },
                { "http://www.xicidaili.com/nn/", "UTF-8" },

                { "http://www.baidu.com", "UTF-8" },
                // { "http://wwww.ganji.com", "UTF-8" },
                { "http://www.ifeng.com", "UTF-8" },
                { "http://www.douyu.com/", "UTF-8" },
                // { "http://www.oschina.net/", "UTF-8" },
                { "http://www.cnblogs.com", "UTF-8" },
                { "https://www.tuhu.cn", "UTF-8" },
                { "http://www.zuojiaju.com/", "UTF-8" },
                { "http://www.new-farmer.com/portal.php", "UTF-8" },
                { "http://www.fang.com", "UTF-8" },
                { "http://www.beiwo.tv", "UTF-8" }
            };

            // Disabled alternative: download http://www.hao123.com with WebClient, parse it
            // with HtmlParser and add the href of every <a> element to pageInfo as "UTF-8".

            // Create all tasks cold first so that task construction is not part of the timing.
            var crawlTasks = new List <Task>(pageInfo.Count);
            foreach (KeyValuePair <string, string> entry in pageInfo)
            {
                var crawler = new PageCrawler();
                crawlTasks.Add(new Task(() => crawler.CrawlPage(entry.Key, entry.Value)));
            }

            var timer = new Stopwatch();
            timer.Start();

            crawlTasks.ForEach(task => task.Start());

            // Blocks the calling thread until every crawl has finished.
            Task.WaitAll(crawlTasks.ToArray());

            double pagesPerSecond = pageInfo.Count / timer.Elapsed.TotalSeconds;
            MessageBox.Show(string.Format("速度: {0}", pagesPerSecond));
        }
Exemplo n.º 9
0
 /// <summary>
 /// Builds the <c>MatchClassification</c> for one classification link, including the
 /// scorecards returned by <c>PageCrawler.CrawlMatchListPage</c> for that link.
 /// </summary>
 /// <param name="classificationLink">Link supplying the name, URLs and the page to crawl.</param>
 /// <param name="location">Value stored in the classification's <c>Location</c>.</param>
 /// <param name="season">Season value passed through to <c>PageCrawler.CrawlMatchListPage</c>.</param>
 /// <returns>The populated classification.</returns>
 private static MatchClassification CrawlClassification(CrawlerLinkDetails classificationLink, string location, string season)
 {
     var classification = new MatchClassification();

     classification.Location         = location;
     classification.LocationIndexUrl = classificationLink.SourcePageUrl;
     classification.Name             = classificationLink.LinkText;
     classification.Url              = classificationLink.DestinationUrl;

     // The crawl runs last, after the plain property copies.
     classification.Scorecards = PageCrawler.CrawlMatchListPage(classificationLink, season);

     return(classification);
 }
Exemplo n.º 10
0
        /// <summary>
        /// Crawls one season: obtains the season's location links through
        /// <c>PageCrawler.CrawlLinksPage</c>, runs <c>CrawlLocation</c> for every link in
        /// its own task and gathers the classifications of all tasks that finished
        /// without an exception. Progress is written to the console.
        /// </summary>
        /// <param name="season">Season whose <c>Name</c> and <c>Url</c> drive the crawl.</param>
        /// <returns>
        /// Results carrying the season name and the gathered classifications; the list is
        /// empty when the season page yields no location links. Faulted tasks are logged
        /// through <c>Log.Error</c> and contribute nothing.
        /// </returns>
        public CrawlResults Crawl(Season season)
        {
            Console.Write("Crawling " + season.Name + " ");
            CrawlResults results = new CrawlResults {
                Season = season.Name
            };

            CrawlerLinkDetails seasonPage = new CrawlerLinkDetails
            {
                SourcePageType      = PageType.SeasonList,
                SourcePageUrl       = "http://cricketarchive.com/Archive/Seasons/index.html",
                DestinationPageType = PageType.LocationList,
                DestinationUrl      = season.Url,
                LinkText            = season.Name
            };

            List <CrawlerLinkDetails> locationLinks = PageCrawler.CrawlLinksPage(seasonPage);

            List <MatchClassification> classifications = new List <MatchClassification>();

            var taskQueue = new Queue <Task <List <MatchClassification> > >();

            foreach (var link in locationLinks)
            {
                // Copy into a fresh local so every lambda captures its own link
                // (compilers before C# 5 share the foreach variable across iterations).
                CrawlerLinkDetails l = link;
                taskQueue.Enqueue(Task <List <MatchClassification> > .Factory.StartNew(() => CrawlLocation(l, season.Name)));
            }

            // ContinueWhenAll throws ArgumentException for an empty task array, so the
            // gather step is skipped when there is nothing to wait for.
            if (taskQueue.Count > 0)
            {
                // The single continuation runs after every task has finished, so the
                // list is only touched by one thread at a time.
                Task.Factory.ContinueWhenAll(taskQueue.ToArray(),
                                             completedTasks =>
                {
                    foreach (Task <List <MatchClassification> > task in completedTasks)
                    {
                        if (task.Exception == null)
                        {
                            classifications.AddRange(task.Result);
                        }
                        else
                        {
                            Log.Error("Unexpected exception",
                                      task.Exception);
                        }
                    }
                })
                .Wait();
            }

            results.Classifications = classifications;
            Console.WriteLine(" done.");
            return(results);
        }
Exemplo n.º 11
0
 /// <summary>
 /// For every entry in <paramref name="results"/>, fetches the page at the entry's
 /// <c>Url</c> and adds each <c>Value</c> from <c>this.CrawlItems</c> that parses as an
 /// integer to the entry's <c>Categories</c>.
 /// </summary>
 /// <param name="results">Results to update; each entry is modified in place.</param>
 /// <returns>The same <paramref name="results"/> instance that was passed in.</returns>
 public override List <CrawlResult> Process(List <CrawlResult> results)
 {
     foreach (var entry in results)
     {
         // NOTE(review): the fetched page is never read below; the call is kept so the
         // fetch (and any exception it raises) still happens - confirm whether it is needed.
         var page = PageCrawler.GetPage(entry.Url).Result;

         foreach (var item in this.CrawlItems)
         {
             int categoryId;
             if (!int.TryParse(item.Value, out categoryId))
             {
                 // Values that are not integers are skipped.
                 continue;
             }

             entry.Categories.Add(categoryId);
         }
     }

     return(results);
 }
Exemplo n.º 12
0
 /// <summary>
 /// Creates the counter; both arguments are forwarded unchanged to the base constructor.
 /// </summary>
 /// <param name="parser">Parser passed to the base class.</param>
 /// <param name="pageCrawler">Crawler passed to the base class.</param>
 public GoogleWebSearchCounter(HtmlParser parser, PageCrawler pageCrawler)
     : base(parser, pageCrawler)
 {
     // No additional state of its own.
 }
Exemplo n.º 13
0
 /// <summary>
 /// Creates the counter; both arguments are forwarded unchanged to the base constructor.
 /// </summary>
 /// <param name="parser">Parser passed to the base class.</param>
 /// <param name="pageCrawler">Crawler passed to the base class.</param>
 public BingWebSearchCounter(HtmlParser parser, PageCrawler pageCrawler)
     : base(parser, pageCrawler)
 {
     // No additional state of its own.
 }
Exemplo n.º 14
0
 /// <summary>
 /// Creates the counter; both arguments are forwarded unchanged to the base constructor.
 /// </summary>
 /// <param name="parser">Parser passed to the base class.</param>
 /// <param name="pageCrawler">Crawler passed to the base class.</param>
 public YahooWebSearchCounter(HtmlParser parser, PageCrawler pageCrawler)
     : base(parser, pageCrawler)
 {
     // No additional state of its own.
 }