public async Task Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_RootPageIsAllowed_AllPagesBelowDisallowed_IsIgnoreRobotsDotTextIfRootDisallowedEnabledTrue_CallsHttpRequester()
        {
            CrawledPage homePage = new CrawledPage(_rootUri)
            {
                Content = new PageContent
                {
                    Text = "content here"
                }
            };
            CrawledPage page1 = new CrawledPage(_rootUri);

            _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(_rootUri.AbsoluteUri, It.IsAny <string>())).Returns(true);
            _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(_rootUri.AbsoluteUri + "aaaaa", It.IsAny <string>())).Returns(false);
            _fakeRobotsDotTextFinder.Setup(f => f.FindAsync(It.IsAny <Uri>())).Returns(Task.FromResult(_fakeRobotsDotText.Object));
            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(_rootUri, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(page1));
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny <PageToCrawl>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;
            _dummyConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled = true;
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            await _unitUnderTest.CrawlAsync(_rootUri);

            _fakeCrawlDecisionMaker.VerifyAll();
            _fakeRobotsDotText.VerifyAll();
            _fakeRobotsDotTextFinder.VerifyAll();
            _fakeHttpRequester.VerifyAll();
        }
        public async Task Crawl_MaxPagesTo25_OnlyCrawls25Pages()
        {
            await new PageRequester(new CrawlConfiguration {
                UserAgentString = "aaa"
            }).MakeRequestAsync(new Uri("http://localhost:1111/PageGenerator/ClearCounters"));

            CrawlConfiguration configuration = new CrawlConfiguration();

            configuration.MaxPagesToCrawl = 25;
            configuration.IsExternalPageCrawlingEnabled      = true;
            configuration.IsExternalPageLinksCrawlingEnabled = true;

            int pagesCrawledCount = 0;

            PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);

            crawler.PageCrawlCompleted += (a, b) =>
            {
                pagesCrawledCount++;
            };

            var res = await crawler.CrawlAsync(new Uri("http://localhost:1111/"));

            Assert.AreEqual(25, pagesCrawledCount);
        }
Example #3
        public async Task ExtractRecipesAsync(Uri recipeWebsiteUri, CancellationToken cancellationToken = default)
        {
            var crawlResult = await _politeWebCrawler.CrawlAsync(recipeWebsiteUri);

            //TODO: Error handling
            //TODO: Logging
        }
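A minimal sketch of how the two TODOs above could be filled in; it is not part of the original example. The _logger field is a hypothetical Microsoft.Extensions.Logging ILogger, and the CancellationTokenSource overload of CrawlAsync is the one shown in Example #26.

        public async Task ExtractRecipesAsync(Uri recipeWebsiteUri, CancellationToken cancellationToken = default)
        {
            // Forward the caller's token so the crawl can be cancelled.
            using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);

            var crawlResult = await _politeWebCrawler.CrawlAsync(recipeWebsiteUri, cts);

            if (crawlResult.ErrorOccurred)
            {
                // Hypothetical logger field; substitute whatever logging abstraction the project uses.
                _logger.LogError(crawlResult.ErrorException, "Crawl of {Uri} failed", recipeWebsiteUri);
                return;
            }

            _logger.LogInformation("Crawl of {Uri} completed without error", recipeWebsiteUri);
        }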
        public async Task Crawl_IsRateLimited()
        {
            await new PageRequester(new CrawlConfiguration {
                UserAgentString = "aaa"
            }).MakeRequestAsync(new Uri("http://localhost:1111/PageGenerator/ClearCounters"));

            CrawlConfiguration configuration = new CrawlConfiguration();

            configuration.MaxPagesToCrawl = 3;
            configuration.MinCrawlDelayPerDomainMilliSeconds = 1000; // 1 second * 2 pages = 2 (or more) seconds

            int pagesCrawledCount = 0;

            var crawler = new PoliteWebCrawler(configuration);

            crawler.PageCrawlCompleted += (a, b) => pagesCrawledCount++;

            var uriToCrawl = new Uri("http://localhost:1111/");
            var start      = DateTime.Now;
            await crawler.CrawlAsync(uriToCrawl);

            var elapsed = DateTime.Now - start;

            Assert.GreaterOrEqual(elapsed.TotalMilliseconds, 2000);
            Assert.AreEqual(3, pagesCrawledCount);
        }
Example #5
        public async Task Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_PageIsDisallowed_DoesNotCallHttpRequester()
        {
            var homePage = new CrawledPage(_rootUri)
            {
                Content = new PageContent
                {
                    Text = "content here"
                }
            };

            _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny <string>())).Returns(0);
            _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny <string>(), It.IsAny <string>())).Returns(false);
            _fakeRobotsDotTextFinder.Setup(f => f.FindAsync(It.IsAny <Uri>())).Returns(Task.FromResult(_fakeRobotsDotText.Object));
            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(It.IsAny <Uri>(), It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(homePage));

            _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHtmlParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            await _unitUnderTest.CrawlAsync(_rootUri);

            _fakeHttpRequester.Verify(f => f.MakeRequestAsync(It.IsAny <Uri>(), It.IsAny <Func <CrawledPage, CrawlDecision> >()), Times.Never());
            _fakeRobotsDotText.VerifyAll();
            _fakeRobotsDotTextFinder.VerifyAll();
            _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny <Uri>(), It.IsAny <long>()), Times.Exactly(0));
            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny <Uri>()), Times.Exactly(0));
        }
Example #6
        public void Test(Uri uri)
        {
            pageCount = 0;
            baseUri   = uri;
            string message;

            CrawlConfiguration crawlConfiguration = new CrawlConfiguration();

            crawlConfiguration.MaxConcurrentThreads = 4;
            crawlConfiguration.UserAgentString      =
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
                "AppleWebKit/537.36 (KHTML, like Gecko) " +
                "Chrome/60.0.3112.113 Safari/537.36 bot";
            crawlConfiguration.MaxPagesToCrawl          = 10000;
            crawlConfiguration.DownloadableContentTypes =
                "text/html, text/plain, image/jpeg, image/pjpeg, image/png";
            crawlConfiguration.CrawlTimeoutSeconds = 100;
            crawlConfiguration.MinCrawlDelayPerDomainMilliSeconds = 1000;

            using PoliteWebCrawler crawler =
                      new PoliteWebCrawler(crawlConfiguration);

            crawler.PageCrawlStarting  += ProcessPageCrawlStarted;
            crawler.PageCrawlCompleted += ProcessPageCrawlCompleted;

            CrawlResult result = crawler.CrawlAsync(baseUri).Result;

            if (result.ErrorOccurred)
            {
                message = StringTable.GetString(
                    "CRAWL_COMPLETE_ERROR",
                    CultureInfo.InstalledUICulture);

                Log.InfoFormat(
                    CultureInfo.InvariantCulture,
                    message,
                    result.RootUri.AbsoluteUri,
                    result.ErrorException.Message);
            }
            else
            {
                message = StringTable.GetString(
                    "CRAWL_COMPLETE_NO_ERROR",
                    CultureInfo.InstalledUICulture);

                Log.InfoFormat(
                    CultureInfo.InvariantCulture,
                    message,
                    result.RootUri.AbsoluteUri);
            }

            message = StringTable.GetString(
                "TOTAL_PAGES",
                CultureInfo.InstalledUICulture);
            Log.InfoFormat(
                CultureInfo.InvariantCulture,
                message,
                pageCount.ToString(CultureInfo.InvariantCulture));
        }
Example #7
        private static async Task DemoSimpleCrawler()
        {
            var config = new CrawlConfiguration
            {
                UserAgentString = "2019RLCrawlAThon",
                MaxPagesToCrawl = 0,
                MinCrawlDelayPerDomainMilliSeconds = 10,
            };
            var start   = new Uri("https://thailand.kyocera.com/");
            var crawler = new PoliteWebCrawler(
                config,
                new BetterDecisionMaker(start),
                null,
                new Scheduler(false, null, new PriorityUriRepository()),
                null,
                null,
                null,
                null,
                null);

            var files    = new HashSet <string>();
            var decMaker = new CrawlDecisionMaker();
            var batch    = new HashSet <string>();

            crawler.PageCrawlCompleted += Crawler_PageCrawlCompleted;
            crawler.PageCrawlCompleted += (sender, e) =>
            {
                if (new[] { ".exe", ".zip", ".tar" }.Any(c => e.CrawledPage.Uri.AbsolutePath.Contains(c)))
                {
                    lock (files)
                    {
                        Console.WriteLine("Found file: " + e.CrawledPage.Uri.Host + e.CrawledPage.Uri.LocalPath);
                        Console.WriteLine(e.CrawledPage.CrawlDepth);
                        if (!files.Contains(e.CrawledPage.Uri.ToString()))
                        {
                            files.Add(e.CrawledPage.Uri.ToString());
                            batch.Add(e.CrawledPage.Uri.ToString());
                            if (batch.Count >= 10)
                            {
                                using (var httpClient = new HttpClient())
                                {
                                    using (var request = new HttpRequestMessage(new HttpMethod("POST"), "http://hackathon.reversinglabs.com/api/test/bulk"))
                                    {
                                        var base64authorization = Convert.ToBase64String(Encoding.ASCII.GetBytes("tztok_jadnici:7@dQ6dqq7YZggcd"));
                                        request.Headers.TryAddWithoutValidation("Authorization", $"Basic {base64authorization}");

                                        var body = "{\"crawlathon\": {\"query\": {\"site\": \"filehippo\", \"links\":[" + string.Join(", ", batch.Select(s => "\"" + s + "\"")) + "]}}}";
                                        request.Content = new StringContent(body, Encoding.UTF8, "application/json");
                                        var resp = httpClient.SendAsync(request).Result;
                                        batch.Clear();
                                    }
                                }
                            }
                        }
                    }
                }
            };
            var crawlResult = await crawler.CrawlAsync(start);
        }
Example #8
        public async Task Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_CrawlDelayAboveMinDomainCrawlDelay_CallsDomainRateLimiter()
        {
            var uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            var uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            var homePage = new CrawledPage(_rootUri)
            {
                Content = new PageContent
                {
                    Text = "content here"
                }
            };
            var page1 = new CrawledPage(uri1);
            var page2 = new CrawledPage(uri2);

            var links = new List <HyperLink>
            {
                new HyperLink()
                {
                    HrefValue = uri1
                },
                new HyperLink()
                {
                    HrefValue = uri2
                }
            };

            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(_rootUri, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(homePage));
            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri1, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(page1));
            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri2, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(page2));
            _fakeHtmlParser.Setup(f => f.GetLinks(It.Is <CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny <PageToCrawl>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny <CrawledPage>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldRecrawlPage(It.IsAny <CrawledPage>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = false
            });

            _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny <string>())).Returns(3);//this is more than the max configured crawl delay (should be ignored)
            _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny <string>(), It.IsAny <string>())).Returns(true);
            _fakeRobotsDotTextFinder.Setup(f => f.FindAsync(It.IsAny <Uri>())).Returns(Task.FromResult(_fakeRobotsDotText.Object));

            _dummyConfiguration.IsRespectRobotsDotTextEnabled       = true; //with this set to true we expect the IDomainRateLimiter to be called
            _dummyConfiguration.MaxRobotsDotTextCrawlDelayInSeconds = 2;    //this is less than the robots.txt crawl delay, so it should be used instead
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHtmlParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            await _unitUnderTest.CrawlAsync(_rootUri);

            _fakeHttpRequester.VerifyAll();
            _fakeHtmlParser.VerifyAll();
            _fakeRobotsDotText.VerifyAll();
            _fakeRobotsDotTextFinder.VerifyAll();
            _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny <Uri>(), 2000), Times.Exactly(1));
            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny <Uri>()), Times.Exactly(3));//with a crawl delay above zero the IDomainRateLimiter should be called once per crawled page
        }
Example #9
        public async Task Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_ZeroCrawlDelay_StillCallsDomainRateLimiter()
        {
            var uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            var uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            var homePage = new CrawledPage(_rootUri)
            {
                Content = new PageContent
                {
                    Text = "content here"
                }
            };
            var page1 = new CrawledPage(uri1);
            var page2 = new CrawledPage(uri2);

            var links = new List <HyperLink>
            {
                new HyperLink()
                {
                    HrefValue = uri1
                },
                new HyperLink()
                {
                    HrefValue = uri2
                }
            };

            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(_rootUri, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(homePage));
            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri1, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(page1));
            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri2, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(page2));
            _fakeHtmlParser.Setup(f => f.GetLinks(It.Is <CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny <PageToCrawl>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny <CrawledPage>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldRecrawlPage(It.IsAny <CrawledPage>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = false
            });

            _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny <string>())).Returns(0);
            _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny <string>(), It.IsAny <string>())).Returns(true);
            _fakeRobotsDotTextFinder.Setup(f => f.FindAsync(It.IsAny <Uri>())).Returns(Task.FromResult(_fakeRobotsDotText.Object));

            _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHtmlParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            await _unitUnderTest.CrawlAsync(_rootUri);

            _fakeHttpRequester.VerifyAll();
            _fakeHtmlParser.VerifyAll();
            _fakeRobotsDotText.VerifyAll();
            _fakeRobotsDotTextFinder.VerifyAll();
            _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny <Uri>(), It.IsAny <long>()), Times.Exactly(0));
            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny <Uri>()), Times.Exactly(3));
        }
Example #10
        public async Task GetDoc()
        {
            var config = new CrawlConfiguration
            {
                MaxPagesToCrawl = 20,
            };
            var crawler = new PoliteWebCrawler(config);

            crawler.PageCrawlCompleted += Crawler_PageCrawlCompleted;

            var crawlResult = await crawler.CrawlAsync(new Uri("https://plantuml.com/"));
        }
Example #11
        private static async Task DemoSimpleCrawler()
        {
            var config = new CrawlConfiguration
            {
                MaxPagesToCrawl = 10,                     //Only crawl 10 pages
                MinCrawlDelayPerDomainMilliSeconds = 3000 //Wait this many millisecs between requests
            };
            var crawler = new PoliteWebCrawler(config);

            crawler.PageCrawlCompleted += PageCrawlCompleted;//Several events available...

            var crawlResult = await crawler.CrawlAsync(new Uri("https://www.onet.pl"));
        }
Example #12
        private static async Task DemoSimpleCrawler()
        {
            var config = new CrawlConfiguration
            {
                MaxPagesToCrawl = 25,
                MinCrawlDelayPerDomainMilliSeconds = 3000
            };
            var crawler = new PoliteWebCrawler(config);

            crawler.PageCrawlCompleted += Crawler_PageCrawlCompleted;

            var crawlResult = await crawler.CrawlAsync(new Uri("http://wvtesting2.com"));
        }
Example #13
        public async Task DownloadAllArticles()
        {
            var config = new CrawlConfiguration
            {
                MaxPagesToCrawl = 300,
                MinCrawlDelayPerDomainMilliSeconds = 300
            };
            var crawler = new PoliteWebCrawler(config);

            crawler.PageCrawlCompleted += PageCrawlCompleted;
            crawler.PageCrawlCompleted += Crawler_ProcessPageCrawlCompleted;

            var crawlResult = await crawler.CrawlAsync(new Uri(siteUrl));
        }
Example #14
        /// <summary>
        /// Initiates crawling over the given list of urls, searching each crawled
        /// page for the given text using a shared crawl configuration.
        /// </summary>
        /// <param name="UrlsList">Urls to be crawled.</param>
        /// <param name="SearchText">Text to search for.</param>
        /// <returns></returns>
        private static async Task SimpleCrawler(List <string> UrlsList, string SearchText)
        {
            var ListPagesUrls = new List <string>();
            int TotalNumber   = 0;

            //loop through a list
            foreach (var el in UrlsList)
            {
                var config = new CrawlConfiguration
                {
                    MaxPagesToCrawl = 10,                     //Only crawl 10 pages
                    MinCrawlDelayPerDomainMilliSeconds = 3000 //Wait this many millisecs between requests
                };

                // use polite crawler
                var crawler = new PoliteWebCrawler(config);

                crawler.PageCrawlStarting        += crawler_ProcessPageCrawlStarting;
                crawler.PageCrawlCompleted       += PageCrawlCompleted;//Several events available...
                crawler.PageCrawlDisallowed      += crawler_PageCrawlDisallowed;
                crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;

                //Setup Custom class to store data
                crawler.CrawlBag.CrawlExtension = new CrawlExtension()
                {
                    SearchWord = SearchText,
                    ListUrls   = new System.Collections.Generic.List <string>()
                };

                var test = new List <string>();

                var crawlResult = await crawler.CrawlAsync(new Uri(el));


                List <string> valueList = crawler.CrawlBag.CrawlExtension.ListUrls;

                // add urls found to list
                ListPagesUrls.AddRange(valueList);
                TotalNumber = TotalNumber + crawler.CrawlBag.CrawlExtension.NumberSearchFound;//increment the number of matches found
            }


            Console.WriteLine("==== Total number of sites with word [{0}] is {1}", SearchText, TotalNumer);
            Console.WriteLine("==== Site lists with word {0} =====\r", SearchText);
            foreach (var el in ListPagesUrs)
            {
                Console.WriteLine(el);
            }
        }
Example #15
        private static async Task DemoSimpleCrawler()
        {
            var config = new CrawlConfiguration
            {
                MaxPagesToCrawl = 25,                     //Crawl 25 pages
                MinCrawlDelayPerDomainMilliSeconds = 3000 // Wait 3 seconds between requests
            };

            var crawler     = new PoliteWebCrawler(config);
            var siteToCrawl = "http://wiprodigital.com";

            crawler.PageCrawlCompleted += PageCrawlCompleted;

            var crawlResult = await crawler.CrawlAsync(new Uri(siteToCrawl));
        }
Example #16
        private static async Task RecipeCrawler(string url)
        {
            var config = new CrawlConfiguration
            {
                MaxPagesToCrawl                    = 100,
                MaxConcurrentThreads               = 2,
                IsUriRecrawlingEnabled             = false,
                MinCrawlDelayPerDomainMilliSeconds = 3000,
                MaxCrawlDepth = 1
            };
            var crawler = new PoliteWebCrawler(config);

            crawler.PageCrawlCompleted += PageCrawlCompleted;//Several events available...
            var crawlResult = await crawler.CrawlAsync(new Uri(url));
        }
Example #17
        public async Task Crawl_MinCrawlDelayGreaterThanZero_CallsDomainRateLimiter()
        {
            var uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            var uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            var homePage = new CrawledPage(_rootUri)
            {
                Content = new PageContent
                {
                    Text = "content here"
                }
            };
            var page1 = new CrawledPage(uri1);
            var page2 = new CrawledPage(uri2);

            var links = new List <HyperLink>
            {
                new HyperLink()
                {
                    HrefValue = uri1
                },
                new HyperLink()
                {
                    HrefValue = uri2
                }
            };

            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(_rootUri, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(homePage));
            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri1, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(page1));
            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri2, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(page2));
            _fakeHtmlParser.Setup(f => f.GetLinks(It.Is <CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny <PageToCrawl>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny <CrawledPage>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldRecrawlPage(It.IsAny <CrawledPage>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = false
            });

            _dummyConfiguration.MinCrawlDelayPerDomainMilliSeconds = 1;//with a crawl delay above zero we expect the IDomainRateLimiter to be called
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHtmlParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            await _unitUnderTest.CrawlAsync(_rootUri);

            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny <Uri>()), Times.Exactly(3));//with a crawl delay above zero the IDomainRateLimiter should be called once per crawled page
        }
Example #18
        public async Task Crawl_MaxPagesTo25_OnlyCrawls25Pages()
        {
            var configuration = new CrawlConfiguration();

            configuration.MaxPagesToCrawl = 25;

            var pagesCrawledCount = 0;

            var crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);

            crawler.PageCrawlCompleted += (a, b) => pagesCrawledCount++;

            await crawler.CrawlAsync(new Uri("http://localhost.fiddler:1111/"));

            Assert.AreEqual(25, pagesCrawledCount);
        }
Example #19
        public async Task Crawl_MaxPagesTo5_WithCrawlDelay_OnlyCrawls5Pages()
        {
            var configuration = new CrawlConfiguration();

            configuration.MinCrawlDelayPerDomainMilliSeconds = 1000; //adding delay since it increases the chance of issues with abot crawling more than MaxPagesToCrawl.
            configuration.MaxPagesToCrawl = 5;

            var pagesCrawledCount = 0;

            var crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);

            crawler.PageCrawlCompleted += (a, b) => pagesCrawledCount++;

            await crawler.CrawlAsync(new Uri("http://localhost.fiddler:1111/"));

            Assert.AreEqual(5, pagesCrawledCount);
        }
Example #20
        private static async Task DemoSimpleCrawler()
        {
            var config = new CrawlConfiguration
            {
                MaxPagesToCrawl = 100,                         //Only crawl 100 pages
                MinCrawlDelayPerDomainMilliSeconds = 3000,     // Wait this many milliseconds between requests
                LoginUser              = "******", //
                LoginPassword          = "******",
                IsUriRecrawlingEnabled = true,
                UseDefaultCredentials  = true
            };

            var crawler = new PoliteWebCrawler(config);

            crawler.PageCrawlCompleted += PageCrawlCompleted;   // Several events available

            var crawlResult = await crawler.CrawlAsync(new Uri("https://www.udemy.com/"));
        }
Example #21
        static async Task Main(string[] args)
        {
            var config = new CrawlConfiguration
            {
                MaxPagesToCrawl             = 100_000,
                HttpRequestTimeoutInSeconds = 60,
            };
            var crawler = new PoliteWebCrawler(config, new KwestiaSmakuCrawlDecisionMaker(), null, null, null, null, null, null, null);

            crawler.PageCrawlCompleted += PageCrawlCompleted;
            var crawlResult = await crawler.CrawlAsync(new Uri("https://www.kwestiasmaku.com/"));

            await using (var file = File.OpenWrite("result.txt"))
            {
                var jsonResult = JsonConvert.SerializeObject(_recipeLinks);
                await file.WriteAsync(Encoding.UTF8.GetBytes(jsonResult));
            }
        }
Example #22
        public async Task Crawl_CrawlTimeoutIs1Sec_TimesOut()
        {
            var configuration = new CrawlConfiguration();

            configuration.CrawlTimeoutSeconds = 2;

            var pagesCrawledCount = 0;

            var crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);

            crawler.PageCrawlCompleted += (a, b) => pagesCrawledCount++;

            var result = await crawler.CrawlAsync(new Uri("http://localhost.fiddler:1111/"));

            Assert.IsFalse(result.ErrorOccurred);
            Assert.IsTrue(result.Elapsed.TotalSeconds < 8, "Took more than 8 seconds");
            Assert.IsTrue(pagesCrawledCount < 2, "Crawled more than 2 pages");
        }
Example #23
        public async Task Crawl_MaxPagesTo5_OnlyCrawls5Pages()
        {
            var configuration = new CrawlConfiguration
            {
                IsExternalPageCrawlingEnabled = true,
                MaxPagesToCrawl = 5
            };

            var pagesCrawledCount = 0;

            var crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);

            crawler.PageCrawlCompleted += (a, b) => pagesCrawledCount++;

            await crawler.CrawlAsync(new Uri("http://localhost.fiddler:1111/"));

            Assert.AreEqual(5, pagesCrawledCount);
        }
        public async Task Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_UsesCorrectUserAgentString()
        {
            Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            CrawledPage homePage = new CrawledPage(_rootUri)
            {
                Content = new PageContent
                {
                    Text = "content here"
                }
            };
            CrawledPage page1 = new CrawledPage(uri1);
            CrawledPage page2 = new CrawledPage(uri2);

            List <Uri> links = new List <Uri> {
                uri1, uri2
            };

            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(_rootUri, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(homePage));
            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri1, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(page1));
            _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri2, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(Task.FromResult(page2));
            _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is <CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny <PageToCrawl>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny <CrawledPage>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });

            _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny <string>())).Returns(0);
            _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny <string>(), It.IsAny <string>())).Returns(true);
            _fakeRobotsDotTextFinder.Setup(f => f.FindAsync(It.IsAny <Uri>())).Returns(Task.FromResult(_fakeRobotsDotText.Object));

            _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;
            _dummyConfiguration.RobotsDotTextUserAgentString  = "abcd";
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            await _unitUnderTest.CrawlAsync(_rootUri);

            _fakeRobotsDotText.Verify(f => f.GetCrawlDelay(_dummyConfiguration.RobotsDotTextUserAgentString), Times.Exactly(1));
            _fakeRobotsDotText.Verify(f => f.IsUrlAllowed(uri1.AbsoluteUri, _dummyConfiguration.RobotsDotTextUserAgentString), Times.Exactly(1));
            _fakeRobotsDotText.Verify(f => f.IsUrlAllowed(uri2.AbsoluteUri, _dummyConfiguration.RobotsDotTextUserAgentString), Times.Exactly(1));
        }
Example #25
        private static async Task DemoSimpleCrawler <T>(IParserSettings parserSettings, IParser <T> parser) where T : class
        {
            var config = new CrawlConfiguration
            {
                MaxPagesToCrawl = 20,                      //Only crawl 20 pages
                MinCrawlDelayPerDomainMilliSeconds = 1000, //Wait this many millisecs between requests
            };
            var crawler = new PoliteWebCrawler(config);

            crawler.PageCrawlCompleted += PageCrawlCompleted; // event

            //crawler.ShouldCrawlPageDecisionMaker = CrawlPage; // delegate

            crawler.CrawlBag.Parser   = parser;
            crawler.CrawlBag.Settings = parserSettings;


            var crawlResult = await crawler.CrawlAsync(new Uri(parserSettings.BaseUrl));
        }
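A hedged sketch of the PageCrawlCompleted handler the example above wires up. CrawlBag is Abot's dynamic property bag, so the Parser and Settings entries stored before the crawl can be read back from e.CrawlContext.CrawlBag; the Parse call is a placeholder, since IParser<T>'s real members are not shown in these examples.

        private static void PageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // Pages without text content (failed requests, binary responses) are skipped.
            if (string.IsNullOrEmpty(e.CrawledPage.Content?.Text))
            {
                Console.WriteLine($"Skipping {e.CrawledPage.Uri}: no usable content");
                return;
            }

            // CrawlBag is dynamic; these are the entries stored before CrawlAsync was called.
            dynamic bag      = e.CrawlContext.CrawlBag;
            dynamic parser   = bag.Parser;
            dynamic settings = bag.Settings;

            // Placeholder call; substitute whatever members your IParser<T> actually exposes.
            var item = parser.Parse(e.CrawledPage, settings);

            Console.WriteLine($"Parsed {e.CrawledPage.Uri} (depth {e.CrawledPage.CrawlDepth})");
        }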
Example #26
        public async Task Crawl_Synchronous_CancellationTokenCancelled_StopsCrawl()
        {
            var cancellationTokenSource = new CancellationTokenSource();
            var timer = new System.Timers.Timer(800);

            timer.Elapsed += (o, e) =>
            {
                cancellationTokenSource.Cancel();
                timer.Stop();
                timer.Dispose();
            };
            timer.Start();

            var crawler = new PoliteWebCrawler();
            var result  = await crawler.CrawlAsync(new Uri("http://localhost.fiddler:1111/"), cancellationTokenSource);

            Assert.IsTrue(result.ErrorOccurred);
            Assert.IsTrue(result.ErrorException is OperationCanceledException);
        }
Example #27
        private static async Task DoCrawl(Smithy smithy)
        {
            var config = new CrawlConfiguration();

            if (!string.IsNullOrEmpty(smithy.User) && !string.IsNullOrEmpty(smithy.Pass))
            {
                config.MaxConcurrentThreads = smithy.Threads;
                config.MaxCrawlDepth        = smithy.Depth;
                config.MinCrawlDelayPerDomainMilliSeconds = smithy.Delay;
                config.MaxPagesToCrawl = smithy.MaxPages;
                config.MaxRetryCount   = 1;
                //HttpServicePointConnectionLimit = 2000,
                config.HttpRequestTimeoutInSeconds = smithy.Timeout;
                config.LoginUser     = smithy.User;
                config.LoginPassword = smithy.Pass;
            }

            if (!string.IsNullOrEmpty(smithy.User) || !string.IsNullOrEmpty(smithy.Pass))
            {
                if (string.IsNullOrEmpty(smithy.Pass) || string.IsNullOrEmpty(smithy.User))
                {
                    Console.WriteLine("Please specify both a username and a password if using basic auth");
                    System.Environment.Exit(1);
                }
            }

            else
            {
                config.MaxConcurrentThreads = smithy.Threads;
                config.MaxCrawlDepth        = smithy.Depth;
                config.MinCrawlDelayPerDomainMilliSeconds = smithy.Delay;
                config.MaxPagesToCrawl = smithy.MaxPages;
                config.MaxRetryCount   = 1;
                //HttpServicePointConnectionLimit = 2000,
                config.HttpRequestTimeoutInSeconds = smithy.Timeout;
            }

            var crawler = new PoliteWebCrawler(config);

            crawler.PageCrawlCompleted += PageCrawlCompleted;

            var crawlResult = await crawler.CrawlAsync(new Uri(smithy.Url));
        }
Example #28
        public async Task <List <CrawlerPage> > CollectLinks(string URL, IEnumerable <ICrawlerFilter> IncludeFilters = null, IEnumerable <ICrawlerFilter> ExcludeFilters = null)
        {
            var result  = new List <CrawlerPage>();
            var crawler = new PoliteWebCrawler(AbotProvider.DefaultConfig);

            var useIncludeFilters = IncludeFilters != null && IncludeFilters.Count() > 0;
            var useExcludeFilters = ExcludeFilters != null && ExcludeFilters.Count() > 0;

            crawler.PageCrawlCompleted += (s, e) => {
                var currentURL = e.CrawledPage.Uri.GetLeftPart(UriPartial.Path); // remove query-params
                Debug.WriteLine($"Page {e.CrawledPage.Uri}, Depth: {e.CrawledPage.CrawlDepth} -> {e.CrawledPage.ParsedLinks?.Count()} Links");

                var match = IncludeFilters?.FirstOrDefault(f => f.Execute(currentURL));
                if (useIncludeFilters && match == null)
                {
                    Debug.WriteLine("skipped by Include filter");
                    return;
                }

                match = ExcludeFilters?.FirstOrDefault(f => f.Execute(currentURL));
                if (useExcludeFilters && match != null)
                {
                    Debug.WriteLine("skipped by Exclude filter");
                    return;
                }

                result.Add(new CrawlerPage
                {
                    URL         = currentURL,
                    ContentType = (match is CrawlerVideoFilter) ? eCrawlerContentType.VIDEO : eCrawlerContentType.CONTENT,
                    Depth       = e.CrawledPage.CrawlDepth,
                    Links       = e.CrawledPage.ParsedLinks?.Select(x => x.HrefValue.ToString()).ToList()
                });
            };

            var crawlSummary = await crawler.CrawlAsync(new Uri(URL));

            Debug.WriteLine($"[Completed] {crawlSummary.CrawlContext.CrawledCount}");

            return(result);
        }
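A short usage sketch for CollectLinks above; the url is a placeholder, and calling the method with no filters collects every crawled page.

        public async Task PrintCollectedLinksAsync()
        {
            // With no include/exclude filters, every crawled page is returned.
            List <CrawlerPage> pages = await CollectLinks("https://example.com");

            foreach (var page in pages)
            {
                Debug.WriteLine($"{page.URL} (depth {page.Depth}, {page.Links?.Count ?? 0} links)");
            }
        }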
        public async Task Crawl_MaxPagesTo5_WithCrawlDelay_OnlyCrawls5Pages()
        {
            await new PageRequester(new CrawlConfiguration {
                UserAgentString = "aaa"
            }).MakeRequestAsync(new Uri("http://localhost:1111/PageGenerator/ClearCounters"));

            CrawlConfiguration configuration = new CrawlConfiguration();

            configuration.MinCrawlDelayPerDomainMilliSeconds = 1000; //adding delay since it increases the chance of issues with abot crawling more than MaxPagesToCrawl.
            configuration.MaxPagesToCrawl = 5;

            int pagesCrawledCount = 0;

            PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);

            crawler.PageCrawlCompleted += (a, b) => pagesCrawledCount++;

            await crawler.CrawlAsync(new Uri("http://localhost:1111/"));

            Assert.AreEqual(5, pagesCrawledCount);
        }
Example #30
        public async Task Crawl_IsRateLimited()
        {
            var configuration = new CrawlConfiguration();

            configuration.MaxPagesToCrawl = 3;
            configuration.MinCrawlDelayPerDomainMilliSeconds = 1000; // 1 second * 2 pages = 2 (or more) seconds

            var pagesCrawledCount = 0;

            var crawler = new PoliteWebCrawler(configuration);

            crawler.PageCrawlCompleted += (a, b) => pagesCrawledCount++;

            var uriToCrawl = new Uri("http://localhost.fiddler:1111/");
            var start      = DateTime.Now;
            await crawler.CrawlAsync(uriToCrawl);

            var elapsed = DateTime.Now - start;

            Assert.IsTrue(elapsed.TotalMilliseconds > 2000);
            Assert.AreEqual(3, pagesCrawledCount);
        }