Example #1
        public void GenStuff()
        {
            PoliteWebCrawler crawler = new PoliteWebCrawler();

            crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;

            //PageObjects are being created as they are asynchronously found during the crawl
            crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;

            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            CrawlResult result = crawler.Crawl(new Uri(ConfigurationManager.AppSettings["HomePageURL"]));

            int count = result.CrawlContext.CrawledCount;

            Console.WriteLine(result.CrawlContext.ToJSON());

            Console.WriteLine(result.ToJSON());
            Console.WriteLine("Total Crawled Page Count = " + count);

            ////Parse txt file URLS and Get all page elements from each page and put into a dictionary
            //var xpathElements = CreateXpathsFromUrls();

            ////Get all values that are the same
            //var sameValues = GetSameValues(xpathElements);

            ////get New Elements that do not exist on multiple pages
            //var newElements = GetNewElements(sameValues, xpathElements);

            Console.WriteLine("hello");
        }
Example #2
        static void DoCrawl()
        {
            CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();

            crawlConfig.CrawlTimeoutSeconds  = 100;
            crawlConfig.MaxConcurrentThreads = 10;
            crawlConfig.MaxPagesToCrawl      = 5000;
            crawlConfig.UserAgentString      = "abot v1.0 http://code.google.com/p/abot";
            //crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue1", "1111");
            //crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue2", "2222");

            //Will use app.config for configuration (the crawlConfig object above is not passed to this constructor)
            PoliteWebCrawler crawler = new PoliteWebCrawler();

            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            CrawlResult result = crawler.Crawl(new Uri("http://sunnah.com/"));

            Console.WriteLine("Crawled content count: " + result.CrawlContext.CrawledCount);
            if (result.ErrorOccurred)
            {
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            }
            else
            {
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
            }
        }
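
The event handler methods wired up in these examples (crawler_ProcessPageCrawlStarting, crawler_ProcessPageCrawlCompleted, and the two "disallowed" handlers) are not shown. A minimal sketch of what they might look like, assuming the PageCrawlStartingArgs/PageCrawlCompletedArgs event args used by Abot 1.x (requires using System.Net; using Abot.Crawler; using Abot.Poco;):

        static void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
        {
            // Announce each page just before it is requested
            PageToCrawl pageToCrawl = e.PageToCrawl;
            Console.WriteLine("About to crawl link {0} which was found on page {1}", pageToCrawl.Uri.AbsoluteUri, pageToCrawl.ParentUri.AbsoluteUri);
        }

        static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;

            // Report whether the request itself succeeded
            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            else
                Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

            // Warn when a page came back empty
            if (string.IsNullOrEmpty(crawledPage.Content.Text))
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
        }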
Example #3
 protected AbotRecipeExtractor(PoliteWebCrawler politeWebCrawler, IHtmlRecipeParser htmlRecipeParser, ILogger <AbotRecipeExtractor> logger = null)
 {
     _politeWebCrawler = politeWebCrawler;
     _htmlRecipeParser = htmlRecipeParser;
     Logger            = logger;
     _politeWebCrawler.PageCrawlCompleted += OnPageCrawlCompleted;
 }
Example #4
        public WebSpider()
        {
            _crawler = new PoliteWebCrawler();

            _crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            _crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            _crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            _crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            _crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                CrawlDecision decision = new CrawlDecision {
                    Allow = true
                };

                var isCrawlDepth1 = pageToCrawl.CrawlDepth == 0 && !pageToCrawl.Uri.AbsoluteUri.Contains("www.baidu.com/s?wd");
                var isCrawlDepth2 = pageToCrawl.CrawlDepth == 1 && !pageToCrawl.Uri.AbsoluteUri.Contains("www.baidu.com/link");

                if (isCrawlDepth1 || isCrawlDepth2)
                {
                    return new CrawlDecision {
                        Allow = false, Reason = "Only want to crawl Baidu search result and redirect-link pages"
                    };
                }

                return(decision);
            });
        }
Example #5
        private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
        {
            IWebCrawler crawler = new PoliteWebCrawler();

            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                return(new CrawlDecision {
                    Allow =
                        (pageToCrawl.Uri.AbsoluteUri.StartsWith("https://home.treasury.gov") ||
                         pageToCrawl.Uri.AbsoluteUri.StartsWith("https://www.treasury.gov")) &&
                        !pageToCrawl.Uri.AbsoluteUri.EndsWith(".pdf")
                });
            });


            crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
            {
                return(new CrawlDecision {
                    Allow = true
                });
            });

            //Register a lambda expression that decides whether the links on a crawled page should be crawled; this one allows every page's links
            //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPageLinks method is run
            crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
            {
                return(new CrawlDecision {
                    Allow = true
                });
            });

            return(crawler);
        }
Example #6
        public void Crawl_MinCrawlDelayDelayZero_DomainRateLimiterNotCalled()
        {
            Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            CrawledPage homePage = new CrawledPage(_rootUri)
            {
                RawContent = "content here"
            };
            CrawledPage page1 = new CrawledPage(uri1);
            CrawledPage page2 = new CrawledPage(uri2);

            List <Uri> links = new List <Uri> {
                uri1, uri2
            };

            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(homePage);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(page1);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(page2);
            _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is <CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny <PageToCrawl>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny <CrawledPage>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });

            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            _unitUnderTest.Crawl(_rootUri);

            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny <Uri>()), Times.Never());
        }
Example #7
        public void Crawl_MinCrawlDelayGreaterThanZero_CallsDomainRateLimiter()
        {
            Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            CrawledPage homePage = new CrawledPage(_rootUri)
            {
                RawContent = "content here"
            };
            CrawledPage page1 = new CrawledPage(uri1);
            CrawledPage page2 = new CrawledPage(uri2);

            List <Uri> links = new List <Uri> {
                uri1, uri2
            };

            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(homePage);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(page1);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(page2);
            _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is <CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny <PageToCrawl>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny <CrawledPage>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });

            _dummyConfiguration.MinCrawlDelayPerDomainMilliSeconds = 1;//BY HAVING A CRAWL DELAY ABOVE ZERO WE EXPECT THE IDOMAINRATELIMITER TO BE CALLED
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            _unitUnderTest.Crawl(_rootUri);

            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny <Uri>()), Times.Exactly(3));//BY HAVING A CRAWL DELAY ABOVE ZERO WE EXPECT THE IDOMAINRATELIMITER TO BE CALLED
        }
Example #8
        private static void Main(string[] args)
        {
            try
            {
                Uri uriToCrawl = GetSiteToCrawl();

                // I'm using the default crawler
                var crawler = new PoliteWebCrawler();

                // I need to subscribe to this event in order to process pages that have been crawled
                crawler.PageCrawlCompletedAsync += ProcessPageCrawlCompleted;

                // Start the crawl
                CrawlResult crawlResult = crawler.Crawl(uriToCrawl);

                // Generate report
                Task <ReportResult> reportTask = GenerateReport();

                PrintResultInformation(reportTask.Result);
            }
            catch (Exception ex)
            {
                System.Console.ForegroundColor = ConsoleColor.Red;
                System.Console.WriteLine("There was an error while trying to crawl the page.");
                System.Console.Write(ex);
                System.Console.ReadKey();
            }
        }
Example #9
        static void Main(string[] args)
        {
            CrawlConfiguration crawlConfig = new CrawlConfiguration();

            crawlConfig.CrawlTimeoutSeconds  = 100;
            crawlConfig.MaxConcurrentThreads = 1;
            crawlConfig.MaxPagesToCrawl      = 1;


            PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig);


            crawler.PageCrawlStartingAsync  += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
            //crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
            //crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            CrawlResult result = crawler.Crawl(new Uri("http://www.kmhk.kmu.edu.tw/news/list.asp?P_classify=9")); //This is synchronous, it will not go to the next line until the crawl has completed

            if (result.ErrorOccurred)
            {
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            }
            else
            {
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
            }
        }
Example #10
        /// <summary>
        /// Run the crawler
        /// </summary>

        public void StartCrawl()
        {
            //Set up the crawler
            PoliteWebCrawler crawler = new PoliteWebCrawler();

            //Wire up the crawl event handlers
            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            //Start the crawl
            CrawlResult result = crawler.Crawl(new Uri(link)); //This is synchronous, it will not go to the next line until the crawl has completed

            //Report the result
            if (result.ErrorOccurred)
            {
                log.Error("Crawl of " + result.RootUri.AbsoluteUri + " completed with error: " + result.ErrorException.Message);
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            }
            else
            {
                log.Info("Crawl of " + result.RootUri.AbsoluteUri + " completed without error.");
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
            }
            flag = false;
        }
Example #11
        public WebsiteIndexer(string host, ICollection <string> ignoredPaths = null, int delayPerRequestMilliSeconds = 1000, int maxPagesToCrawl = 1000)
        {
            _host = host;

            var config = new CrawlConfiguration
            {
                MaxPagesToCrawl = maxPagesToCrawl,
                MinCrawlDelayPerDomainMilliSeconds = delayPerRequestMilliSeconds,
                IsExternalPageCrawlingEnabled      = false
            };

            Crawler = new PoliteWebCrawler(config)
            {
                ShouldCrawlPageDecisionMaker = (pageToCrawl, crawlContext) =>
                {
                    var ignored = string.IsNullOrEmpty(pageToCrawl.Uri?.AbsolutePath) || ignoredPaths?.Any(p => Regex.IsMatch(pageToCrawl.Uri.AbsolutePath, p)) == true;
                    if (ignored)
                    {
                        Console.WriteLine($"Ignored '{pageToCrawl.Uri?.AbsolutePath}'");
                        return(new CrawlDecision {
                            Allow = false, Reason = "Path matches pattern in blacklist"
                        });
                    }

                    return(new CrawlDecision {
                        Allow = true
                    });
                }
            };

            Crawler.PageCrawlCompleted += PageCrawlCompleted;
        }
Example #12
        public static void StartCrawlEbuyer(string url)
        {
            try
            {
                PoliteWebCrawler crawler = new PoliteWebCrawler();
                crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
                crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
                crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
                crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;


                TimeSpan ts = new TimeSpan(0, 0, 5);
                CancellationTokenSource cancellationTokenSource = new CancellationTokenSource(ts);
                CrawlResult             result = crawler.Crawl(new Uri(url), cancellationTokenSource);

                if (result.ErrorOccurred)
                {
                    Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
                }
                else
                {
                    Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
                }
            }
            catch (Exception)
            {
                // NOTE: any exception from the crawl is silently swallowed; extraction still runs below
            }
            ExtractingHtml.ExtractDetailsEbuyer();
        }
Example #13
        public void StartCrawl(string[] pages)
        {
            CrawlConfiguration(); // presumably a private helper (not shown) that initializes _crawlConfiguration

            PoliteWebCrawler crawler = new PoliteWebCrawler(_crawlConfiguration);

            crawler.PageCrawlStartingAsync        += Crawler_PageCrawlStartingAsync;
            crawler.PageCrawlCompletedAsync       += Crawler_PageCrawlCompletedAsync;
            crawler.PageCrawlDisallowedAsync      += Crawler_PageCrawlDisallowedAsync;
            crawler.PageLinksCrawlDisallowedAsync += Crawler_PageLinksCrawlDisallowedAsync;

            foreach (var page in pages)
            {
                _parser.IdentifyParser(page);

                var result = crawler.Crawl(new Uri(page));

                if (result.ErrorOccurred)
                {
                    Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
                }
                else
                {
                    Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
                }

                _parser.Save();
            }
        }
Example #14
        public async Task Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_PageIsDisallowed_DoesNotCallHttpRequester()
        {
            var homePage = new CrawledPage(_rootUri)
            {
                Content = new PageContent
                {
                    Text = "content here"
                }
            };

            _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny <string>())).Returns(0);
            _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny <string>(), It.IsAny <string>())).Returns(false);
            _fakeRobotsDotTextFinder.Setup(f => f.FindAsync(It.IsAny <Uri>())).Returns(Task.FromResult(_fakeRobotsDotText.Object));

            _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHtmlParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            await _unitUnderTest.CrawlAsync(_rootUri);

            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(It.IsAny <Uri>(), It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(homePage));
            _fakeRobotsDotText.VerifyAll();
            _fakeRobotsDotTextFinder.VerifyAll();
            _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny <Uri>(), It.IsAny <long>()), Times.Exactly(0));
            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny <Uri>()), Times.Exactly(0));
        }
Example #15
        public void Crawl(CrawlRequest request)
        {
            CrawlConfiguration crawlConfig = new CrawlConfiguration();

            crawlConfig.CrawlTimeoutSeconds  = 100;
            crawlConfig.MaxConcurrentThreads = 10;
            crawlConfig.MaxPagesToCrawl      = 1000;
            crawlConfig.UserAgentString      = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; abot v1.0 http://code.google.com/p/abot)";
            crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue1", "1111");
            crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue2", "2222");
            crawlConfig.MaxCrawlDepth            = 10;
            crawlConfig.DownloadableContentTypes = "text/html, text/plain";

            //Will use the manually created crawlConfig object above (null arguments fall back to Abot's default implementations)
            PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);

            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            CrawlResult result = crawler.Crawl(new Uri(request.EntryURL));

            if (result.ErrorOccurred)
            {
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            }
            else
            {
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
            }
        }
Example #16
        public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_PageIsDisallowed_IsIgnoreRobotsDotTextIfRootDisallowedEnabledTrue_CallsHttpRequester()
        {
            CrawledPage homePage = new CrawledPage(_rootUri)
            {
                Content = new PageContent
                {
                    Text = "content here"
                }
            };
            CrawledPage page1 = new CrawledPage(_rootUri);

            _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny <string>(), It.IsAny <string>())).Returns(false);
            _fakeRobotsDotTextFinder.Setup(f => f.Find(It.IsAny <Uri>())).Returns(_fakeRobotsDotText.Object);
            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(page1);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny <PageToCrawl>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;
            _dummyConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled = true;
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            _unitUnderTest.Crawl(_rootUri);

            _fakeCrawlDecisionMaker.VerifyAll();
            _fakeRobotsDotText.VerifyAll();
            _fakeRobotsDotTextFinder.VerifyAll();
            _fakeHttpRequester.VerifyAll();
        }
Example #17
        public void Crawl_IsRateLimited()
        {
            new PageRequester(new CrawlConfiguration {
                UserAgentString = "aaa"
            }).MakeRequest(new Uri("http://localhost.fiddler:1111/PageGenerator/ClearCounters"));

            CrawlConfiguration configuration = new CrawlConfiguration();

            configuration.MaxPagesToCrawl = 3;
            configuration.MinCrawlDelayPerDomainMilliSeconds = 1000; // 1 second * 2 pages = 2 (or more) seconds

            int pagesCrawledCount = 0;

            var crawler = new PoliteWebCrawler(configuration);

            crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

            var uriToCrawl = new Uri("http://localhost.fiddler:1111/");
            var start      = DateTime.Now;

            crawler.Crawl(uriToCrawl);
            var elapsed = DateTime.Now - start;

            Assert.GreaterOrEqual(elapsed.TotalMilliseconds, 2000);
            Assert.AreEqual(3, pagesCrawledCount);
        }
Example #18
        static void Main(string[] args)
        {
            CrawlConfiguration config = new CrawlConfiguration();

            config.MaxConcurrentThreads = 1; // Web Extractor is not currently thread-safe.

            // Create the PhantomJS instance. This will spawn a new PhantomJS process using phantomjs.exe.
            // Make sure to dispose this instance or you will have a zombie process!
            IWebDriver driver = CreatePhantomJsDriver(config);

            // Create the content extractor that uses PhantomJS.
            IWebContentExtractor extractor = new JavaScriptContentExtractor(driver);

            // Create a PageRequester that will use the extractor.
            IPageRequester requester = new PageRequester(config, extractor);

            using (IWebCrawler crawler = new PoliteWebCrawler(config, null, null, null, requester, null, null, null, null)) {
                crawler.PageCrawlCompleted += OnPageCrawlCompleted;

                CrawlResult result = crawler.Crawl(new Uri("http://wvtesting2.com/"));
                if (result.ErrorOccurred)
                {
                    Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
                }
                else
                {
                    Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
                }
            }

            Console.Read();
        }
Example #19
        public static void Main(string[] args)
        {
            PoliteWebCrawler crawler = new PoliteWebCrawler();

            crawler.PageCrawlCompletedAsync += Crawler_ProcessPageCrawlCompleted;
            var         start  = DateTime.Now;
            var         uri    = new Uri("https://lord.technology");
            CrawlResult result = crawler.Crawl(uri);

            if (result.ErrorOccurred)
            {
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            }
            else
            {
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
            }
            var finish = DateTime.Now;

            Console.WriteLine((finish - start).TotalMinutes);

            using (FileStream fs = File.Open(@"./crawl.json", FileMode.Create))
                using (StreamWriter sw = new StreamWriter(fs))
                    using (JsonWriter jw = new JsonTextWriter(sw))
                    {
                        jw.Formatting = Formatting.Indented;
                        JsonSerializer serializer = new JsonSerializer();
                        serializer.Serialize(jw, new { nodes = _pages, edges = _relationships });
                    }
        }
Example #20
        public async Task Crawl_MaxPagesTo25_OnlyCrawls25Pages()
        {
            await new PageRequester(new CrawlConfiguration {
                UserAgentString = "aaa"
            }).MakeRequestAsync(new Uri("http://localhost:1111/PageGenerator/ClearCounters"));

            CrawlConfiguration configuration = new CrawlConfiguration();

            configuration.MaxPagesToCrawl = 25;
            configuration.IsExternalPageCrawlingEnabled      = true;
            configuration.IsExternalPageLinksCrawlingEnabled = true;

            int pagesCrawledCount = 0;

            PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);

            crawler.PageCrawlCompleted += (a, b) =>
            {
                pagesCrawledCount++;
            };

            var res = await crawler.CrawlAsync(new Uri("http://localhost:1111/"));

            Assert.AreEqual(25, pagesCrawledCount);
        }
Example #21
//Crawling code for GSM
        public static void StartCrawlGSM(string url)
        {
            PoliteWebCrawler crawler = new PoliteWebCrawler();

            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStartingGSM;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompletedGSM;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowedGSM;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowedGSM;


            TimeSpan ts = new TimeSpan(0, 0, 0);
            CancellationTokenSource cancellationTokenSource = new CancellationTokenSource(ts);
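            // NOTE: the zero TimeSpan above makes this token cancel almost immediately, so the crawl is likely
            // stopped as soon as it starts (compare the 5-second timeout used in the Ebuyer example)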
            CrawlResult             result = crawler.Crawl(new Uri(url), cancellationTokenSource);

            if (result.ErrorOccurred)
            {
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            }
            else
            {
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
            }


            //FileStream fs = new FileStream("url.txt", FileMode.Open);
            //StreamReader sr = new StreamReader(fs);
            //string str = "";
            //while ((str = sr.ReadLine()) != null)
            //{
            //    StartCrawl(str);
            //}

            ExtractingHtml.ExtractingDetailsGSM();
        }
Example #22
        public async Task Crawl(string rootUri, int maxPages)
        {
            try
            {
                PoliteWebCrawler crawler = new PoliteWebCrawler(CreateCrawlConfiguration(maxPages), null, null, null, null, null, null, null, null);

                crawler.PageCrawlStartingAsync  += crawler_ProcessPageCrawlStarting;
                crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;

                CrawlResult result = crawler.Crawl(new Uri(rootUri)); //This is synchronous, it will not go to the next line until the crawl has completed
                if (result.ErrorOccurred)
                {
                    Console.WriteLine("Crawl of {0} ({1} pages) completed with error: {2}", result.RootUri.AbsoluteUri, PageCount, result.ErrorException.Message);
                }
                else
                {
                    Console.WriteLine("Crawl of {0} ({1} pages) completed without error.", result.RootUri.AbsoluteUri, PageCount);
                }

                await _handler.CrawlFinishedAsync();
            }
            catch (Exception)
            {
                // NOTE: crawl errors are silently ignored here
            }
        }
Example #23
        public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_PageIsDisallowed_DoesNotCallHttpRequester()
        {
            Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            CrawledPage homePage = new CrawledPage(_rootUri)
            {
                Content = new PageContent
                {
                    Text = "content here"
                }
            };
            CrawledPage page1 = new CrawledPage(uri1);
            CrawledPage page2 = new CrawledPage(uri2);

            List <Uri> links = new List <Uri> {
                uri1, uri2
            };

            _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny <string>())).Returns(0);
            _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny <string>(), It.IsAny <string>())).Returns(false);
            _fakeRobotsDotTextFinder.Setup(f => f.Find(It.IsAny <Uri>())).Returns(_fakeRobotsDotText.Object);

            _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            _unitUnderTest.Crawl(_rootUri);

            _fakeHttpRequester.Setup(f => f.MakeRequest(It.IsAny <Uri>(), It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(homePage);
            _fakeRobotsDotText.VerifyAll();
            _fakeRobotsDotTextFinder.VerifyAll();
            _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny <Uri>(), It.IsAny <long>()), Times.Exactly(0));
            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny <Uri>()), Times.Exactly(0));
        }
Example #24
 public KwestiaSmakuRecipeExtractor(
     PoliteWebCrawler politeWebCrawler,
     IHtmlRecipeParser htmlRecipeParser,
     ILogger <AbotRecipeExtractor> logger = null)
     : base(politeWebCrawler, htmlRecipeParser, logger)
 {
 }
Example #25
        public void Test(Uri uri)
        {
            pageCount = 0;
            baseUri   = uri;
            string message;

            CrawlConfiguration crawlConfiguration = new CrawlConfiguration();

            crawlConfiguration.MaxConcurrentThreads = 4;
            crawlConfiguration.UserAgentString      =
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
                "AppleWebKit/537.36 (KHTML, like Gecko) " +
                "Chrome/60.0.3112.113 Safari/537.36 bot";
            crawlConfiguration.MaxPagesToCrawl          = 10000;
            crawlConfiguration.DownloadableContentTypes =
                "text/html, text/plain, image/jpeg, image/pjpeg, image/png";
            crawlConfiguration.CrawlTimeoutSeconds = 100;
            crawlConfiguration.MinCrawlDelayPerDomainMilliSeconds = 1000;

            using PoliteWebCrawler crawler =
                      new PoliteWebCrawler(crawlConfiguration);

            crawler.PageCrawlStarting  += ProcessPageCrawlStarted;
            crawler.PageCrawlCompleted += ProcessPageCrawlCompleted;

            CrawlResult result = crawler.CrawlAsync(baseUri).Result; // blocks until the crawl completes

            if (result.ErrorOccurred)
            {
                message = StringTable.GetString(
                    "CRAWL_COMPLETE_ERROR",
                    CultureInfo.InstalledUICulture);

                Log.InfoFormat(
                    CultureInfo.InvariantCulture,
                    message,
                    result.RootUri.AbsoluteUri,
                    result.ErrorException.Message);
            }
            else
            {
                message = StringTable.GetString(
                    "CRAWL_COMPLETE_NO_ERROR",
                    CultureInfo.InstalledUICulture);

                Log.InfoFormat(
                    CultureInfo.InvariantCulture,
                    message,
                    result.RootUri.AbsoluteUri);
            }

            message = StringTable.GetString(
                "TOTAL_PAGES",
                CultureInfo.InstalledUICulture);
            Log.InfoFormat(
                CultureInfo.InvariantCulture,
                message,
                pageCount.ToString(CultureInfo.InvariantCulture));
        }
Example #26
        private static async Task DemoSimpleCrawler()
        {
            var config = new CrawlConfiguration
            {
                UserAgentString = "2019RLCrawlAThon",
                MaxPagesToCrawl = 0,
                MinCrawlDelayPerDomainMilliSeconds = 10,
            };
            var start   = new Uri("https://thailand.kyocera.com/");
            var crawler = new PoliteWebCrawler(
                config,
                new BetterDecisionMaker(start),
                null,
                new Scheduler(false, null, new PriorityUriRepository()),
                null,
                null,
                null,
                null,
                null);

            var files    = new HashSet <string>();
            var decMaker = new CrawlDecisionMaker();
            var batch    = new HashSet <string>();

            crawler.PageCrawlCompleted += Crawler_PageCrawlCompleted;
            crawler.PageCrawlCompleted += (sender, e) =>
            {
                if (new[] { ".exe", ".zip", ".tar" }.Any(c => e.CrawledPage.Uri.AbsolutePath.Contains(c)))
                {
                    lock (files)
                    {
                        Console.WriteLine("Found file: " + e.CrawledPage.Uri.Host + e.CrawledPage.Uri.LocalPath);
                        Console.WriteLine(e.CrawledPage.CrawlDepth);
                        if (!files.Contains(e.CrawledPage.Uri.ToString()))
                        {
                            files.Add(e.CrawledPage.Uri.ToString());
                            batch.Add(e.CrawledPage.Uri.ToString());
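                            // Once 10 new file URLs have accumulated, submit them to the bulk endpoint in one request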
                            if (batch.Count >= 10)
                            {
                                using (var httpClient = new HttpClient())
                                {
                                    using (var request = new HttpRequestMessage(new HttpMethod("POST"), "http://hackathon.reversinglabs.com/api/test/bulk"))
                                    {
                                        var base64authorization = Convert.ToBase64String(Encoding.ASCII.GetBytes("tztok_jadnici:7@dQ6dqq7YZggcd"));
                                        request.Headers.TryAddWithoutValidation("Authorization", $"Basic {base64authorization}");

                                        var body = "{\"crawlathon\": {\"query\": {\"site\": \"filehippo\", \"links\":[" + string.Join(", ", batch.Select(s => "\"" + s + "\"")) + "]}}}";
                                        request.Content = new StringContent(body, Encoding.UTF8, "application/json");
                                        var resp = httpClient.SendAsync(request).Result;
                                        batch.Clear();
                                    }
                                }
                            }
                        }
                    }
                }
            };
            var crawlResult = await crawler.CrawlAsync(start);
        }
Example #27
        public async Task Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_CrawlDelayAboveMinDomainCrawlDelay_CallsDomainRateLimiter()
        {
            var uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            var uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            var homePage = new CrawledPage(_rootUri)
            {
                Content = new PageContent
                {
                    Text = "content here"
                }
            };
            var page1 = new CrawledPage(uri1);
            var page2 = new CrawledPage(uri2);

            var links = new List <HyperLink>
            {
                new HyperLink()
                {
                    HrefValue = uri1
                },
                new HyperLink()
                {
                    HrefValue = uri2
                }
            };

            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(_rootUri, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(homePage));
            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri1, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(page1));
            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri2, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(page2));
            _fakeHtmlParser.Setup(f => f.GetLinks(It.Is <CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny <PageToCrawl>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny <CrawledPage>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldRecrawlPage(It.IsAny <CrawledPage>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = false
            });

            _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny <string>())).Returns(3);//this is more than the max configured crawl delay (should be ignored)
            _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny <string>(), It.IsAny <string>())).Returns(true);
            _fakeRobotsDotTextFinder.Setup(f => f.FindAsync(It.IsAny <Uri>())).Returns(Task.FromResult(_fakeRobotsDotText.Object));

            _dummyConfiguration.IsRespectRobotsDotTextEnabled       = true; //BY HAVING THIS EQUAL TO TRUE WE EXPECT THE IDOMAINRATELIMITER TO BE CALLED
            _dummyConfiguration.MaxRobotsDotTextCrawlDelayInSeconds = 2;    //This is less than the robots.txt crawl delay above (so this value should be used)
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHtmlParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            await _unitUnderTest.CrawlAsync(_rootUri);

            _fakeHttpRequester.VerifyAll();
            _fakeHtmlParser.VerifyAll();
            _fakeRobotsDotText.VerifyAll();
            _fakeRobotsDotTextFinder.VerifyAll();
            _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny <Uri>(), 2000), Times.Exactly(1));
            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny <Uri>()), Times.Exactly(3));//BY HAVING A CRAWL DELAY ABOVE ZERO WE EXPECT THE IDOMAINRATELIMITER TO BE CALLED
        }
Example #28
        static void Main(string[] args)
        {
            SiteMapFinder    finder  = new SiteMapFinder();
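            // SiteMapFinder is passed in the hyperlink-parser position of the constructor (the same slot the fake
            // IHyperLinkParser occupies in the test examples above), so page links are presumably discovered from the sitemap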
            PoliteWebCrawler crawler = new PoliteWebCrawler(null, null, null, null, null, finder, null, null, null);


            crawler.PageCrawlCompleted += Crawler_PageCrawlCompleted;
            CrawlResult result = crawler.Crawl(new Uri("http://tenders.rfpalertservices.com/sitemap/"));
        }
Example #29
        public async Task Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_ZeroCrawlDelay_StillCallsDomainRateLimiter()
        {
            var uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            var uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            var homePage = new CrawledPage(_rootUri)
            {
                Content = new PageContent
                {
                    Text = "content here"
                }
            };
            var page1 = new CrawledPage(uri1);
            var page2 = new CrawledPage(uri2);

            var links = new List <HyperLink>
            {
                new HyperLink()
                {
                    HrefValue = uri1
                },
                new HyperLink()
                {
                    HrefValue = uri2
                }
            };

            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(_rootUri, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(homePage));
            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri1, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(page1));
            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri2, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(page2));
            _fakeHtmlParser.Setup(f => f.GetLinks(It.Is <CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny <PageToCrawl>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny <CrawledPage>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldRecrawlPage(It.IsAny <CrawledPage>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = false
            });

            _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny <string>())).Returns(0);
            _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny <string>(), It.IsAny <string>())).Returns(true);
            _fakeRobotsDotTextFinder.Setup(f => f.FindAsync(It.IsAny <Uri>())).Returns(Task.FromResult(_fakeRobotsDotText.Object));

            _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHtmlParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            await _unitUnderTest.CrawlAsync(_rootUri);

            _fakeHttpRequester.VerifyAll();
            _fakeHtmlParser.VerifyAll();
            _fakeRobotsDotText.VerifyAll();
            _fakeRobotsDotTextFinder.VerifyAll();
            _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny <Uri>(), It.IsAny <long>()), Times.Exactly(0));
            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny <Uri>()), Times.Exactly(3));
        }
Example #30
 public void Constructor_ZeroMinCrawlDelay_DoesNotThrowExceptionCreatingAnIDomainRateLimiterWithLessThan1Millisec()
 {
     using (var unused = new PoliteWebCrawler(new CrawlConfiguration {
         MinCrawlDelayPerDomainMilliSeconds = 0
     },
                                              null, null, null, null, null, null, null, null))
     {
     }
 }