Example No. 1
        private CrawlDecision ShouldCrawlPageContent(CrawledPage page, CrawlContext context)
        {
            var result = new CrawlDecision();

            if (page.Uri.ToString().Contains("product") ||
                //page.Uri.ToString().Contains("lenovo") ||
                //page.Uri.ToString().Contains("laptop") ||
                page.Uri.ToString().Contains("productVariantGroup") ||
                page.Uri.ToString().Contains("-pc"))
            {
                result.Allow = true;
                if (page.Uri.ToString().Contains("-pch")) //"-pch" also matches the "-pc" filter above but is not a product page
                {
                    result.Reason = "Not a product";
                    result.Allow  = false;
                }
            }
            else
            {
                result.Reason = "Not a product";
                result.Allow  = false;
            }

            return result;
        }
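The method above has the signature Abot expects from a page-content decision delegate. A minimal sketch of how it might be wired up, assuming Abot's PoliteWebCrawler and its ShouldDownloadPageContent hook (both appear in the examples below); StartCrawl and the seed url are illustrative:

        private void StartCrawl()
        {
            var crawler = new PoliteWebCrawler();

            //ShouldDownloadPageContent runs after the default decision maker,
            //so ShouldCrawlPageContent only sees pages that were already allowed.
            crawler.ShouldDownloadPageContent(ShouldCrawlPageContent);

            CrawlResult result = crawler.Crawl(new Uri("http://example.com/")); //illustrative seed
            if (result.ErrorOccurred)
            {
                Console.WriteLine("Crawl ended with an error: " + result.ErrorException.Message);
            }
        }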
Example No. 2
        public WebSpider()
        {
            _crawler = new PoliteWebCrawler();

            _crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            _crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            _crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            _crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            _crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                CrawlDecision decision = new CrawlDecision {
                    Allow = true
                };

                //Depth 0 should be the Baidu search results page; depth 1 should be a Baidu redirect link
                var isNotBaiduSearchPage   = pageToCrawl.CrawlDepth == 0 && !pageToCrawl.Uri.AbsoluteUri.Contains("www.baidu.com/s?wd");
                var isNotBaiduRedirectLink = pageToCrawl.CrawlDepth == 1 && !pageToCrawl.Uri.AbsoluteUri.Contains("www.baidu.com/link");

                if (isNotBaiduSearchPage || isNotBaiduRedirectLink)
                {
                    return new CrawlDecision {
                        Allow = false, Reason = "Don't want to crawl pages outside the Baidu search results"
                    };
                }

                return decision;
            });
        }
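The four event handlers registered above are not shown in this snippet. A hedged sketch of what they might look like; the handler bodies are illustrative, while the event-args types and their properties are Abot's:

        void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
        {
            Console.WriteLine("About to crawl " + e.PageToCrawl.Uri.AbsoluteUri);
        }

        void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            Console.WriteLine("Crawl of " + e.CrawledPage.Uri.AbsoluteUri + " completed");
        }

        void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
        {
            Console.WriteLine("Did not crawl " + e.PageToCrawl.Uri.AbsoluteUri + ": " + e.DisallowedReason);
        }

        void crawler_PageLinksCrawlDisallowed(object sender, PageLinksCrawlDisallowedArgs e)
        {
            Console.WriteLine("Did not crawl links on " + e.CrawledPage.Uri.AbsoluteUri + ": " + e.DisallowedReason);
        }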
Example No. 3
        protected virtual bool ShouldCrawlPage(PageToCrawl pageToCrawl)
        {
            if (_maxPagesToCrawlLimitReachedOrScheduled)
            {
                return false;
            }

            CrawlDecision shouldCrawlPageDecision = _crawlDecisionMaker.ShouldCrawlPage(pageToCrawl, _crawlContext);

            if (!shouldCrawlPageDecision.Allow &&
                shouldCrawlPageDecision.Reason.Contains("MaxPagesToCrawl limit of"))
            {
                _maxPagesToCrawlLimitReachedOrScheduled = true;
                _logger.Info("MaxPagesToCrawlLimit has been reached or scheduled. No more pages will be scheduled.");
                return false;
            }

            if (shouldCrawlPageDecision.Allow)
            {
                shouldCrawlPageDecision = (_shouldCrawlPageDecisionMaker != null)
                    ? _shouldCrawlPageDecisionMaker.Invoke(pageToCrawl, _crawlContext)
                    : new CrawlDecision { Allow = true };
            }

            if (!shouldCrawlPageDecision.Allow)
            {
                _logger.DebugFormat("Page [{0}] not crawled, [{1}]", pageToCrawl.Uri.AbsoluteUri, shouldCrawlPageDecision.Reason);
                FirePageCrawlDisallowedEventAsync(pageToCrawl, shouldCrawlPageDecision.Reason);
                FirePageCrawlDisallowedEvent(pageToCrawl, shouldCrawlPageDecision.Reason);
            }

            SignalCrawlStopIfNeeded(shouldCrawlPageDecision);
            return shouldCrawlPageDecision.Allow;
        }
Example No. 4
        public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            if (pageToCrawl == null)
            {
                return CrawlDecision.DisallowCrawl("Null crawled page");
            }

            if (crawlContext == null)
            {
                return CrawlDecision.DisallowCrawl("Null crawl context");
            }

            if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
            {
                return CrawlDecision.DisallowCrawl("Crawl depth is above max");
            }

            if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
            {
                return CrawlDecision.DisallowCrawl("Scheme does not begin with http");
            }

            //TODO Do we want to ignore redirect chains (i.e. not treat them as separate page crawls)?
            if (!pageToCrawl.IsRetry &&
                crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
                crawlContext.CrawledCount > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
            {
                return CrawlDecision.DisallowCrawl(string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl));
            }

            return CrawlDecision.AllowCrawl();
        }
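The limits this method enforces all come from the CrawlConfiguration carried by the CrawlContext. A sketch of a configuration that exercises both the depth branch and the page-count branch; the values are arbitrary:

        var config = new CrawlConfiguration
        {
            MaxCrawlDepth   = 2,  //pages deeper than this are disallowed
            MaxPagesToCrawl = 100 //a value of 0 disables the MaxPagesToCrawl check above
        };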
Example No. 5
        public void ShouldCrawlPage_NullCrawlContext_ReturnsFalse()
        {
            CrawlDecision result = _unitUnderTest.ShouldCrawlPage(new PageToCrawl(new Uri("http://a.com/")), null);

            Assert.IsFalse(result.Allow);
            Assert.AreEqual("Null crawl context", result.Reason);
        }
Example No. 6
        public void ShouldCrawlPageLinks_IsExternalPageLinksCrawlingEnabledFalse_InternalLink_ReturnsTrue()
        {
            CrawlDecision result = _unitUnderTest.ShouldCrawlPageLinks(
                new CrawledPage(new Uri("http://b.com/a.html"))
            {
                Content = new PageContent
                {
                    Text = "aaaa"
                },
                IsInternal = true
            },
                new CrawlContext
            {
                RootUri            = new Uri("http://a.com/"),
                CrawlConfiguration = new CrawlConfiguration
                {
                    IsExternalPageLinksCrawlingEnabled = false
                }
            });

            Assert.AreEqual(true, result.Allow);
            Assert.AreEqual("", result.Reason);
            Assert.IsFalse(result.ShouldHardStopCrawl);
            Assert.IsFalse(result.ShouldStopCrawl);
        }
Example No. 7
        public virtual CrawlDecision ShouldRecrawlPage(CrawledPage crawledPage, CrawlContext crawlContext)
        {
            if (crawledPage == null)
            {
                return CrawlDecision.DisallowCrawl("Null crawled page");
            }

            if (crawlContext == null)
            {
                return CrawlDecision.DisallowCrawl("Null crawl context");
            }

            if (crawledPage.Exception == null)
            {
                return CrawlDecision.DisallowCrawl("WebException did not occur");
            }

            if (crawlContext.CrawlConfiguration.MaxRetryCount < 1)
            {
                return CrawlDecision.AllowCrawl("无限次重试"); //"unlimited retries"
            }

            if (crawledPage.RetryCount >= crawlContext.CrawlConfiguration.MaxRetryCount)
            {
                return CrawlDecision.DisallowCrawl("MaxRetryCount has been reached");
            }

            return CrawlDecision.AllowCrawl();
        }
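ShouldRecrawlPage only allows a retry when the request actually failed with an exception, and the number of retries is capped by MaxRetryCount. A hedged sketch of the relevant configuration; the values are illustrative and MinRetryDelayInMilliseconds is assumed to be the companion setting:

        var retryConfig = new CrawlConfiguration
        {
            MaxRetryCount = 3,                  //per the method above, a value below 1 retries without limit
            MinRetryDelayInMilliseconds = 2000  //wait between retries of the same page
        };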
Example No. 8
        public void ShouldCrawlPage_OverMaxPagesToCrawlPerDomain_IsRetry_ReturnsTrue()
        {
            Uri uri = new Uri("http://a.com/");
            CrawlConfiguration config = new CrawlConfiguration
            {
                MaxPagesToCrawlPerDomain = 100
            };
            ConcurrentDictionary <string, int> countByDomain = new ConcurrentDictionary <string, int>();

            countByDomain.TryAdd(uri.Authority, 100);
            CrawlContext crawlContext = new CrawlContext
            {
                CrawlConfiguration = config,
                CrawlStartDate     = DateTime.Now,
                CrawlCountByDomain = countByDomain
            };

            CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
                new PageToCrawl(new Uri(uri.AbsoluteUri + "anotherpage"))
            {
                IsRetry    = true,
                IsInternal = true
            },
                crawlContext);

            Assert.IsTrue(result.Allow);
            Assert.IsFalse(result.ShouldHardStopCrawl);
            Assert.IsFalse(result.ShouldStopCrawl);
        }
Example No. 9
        protected virtual bool ShouldCrawlPage(PageToCrawl pageToCrawl)
        {
            CrawlDecision shouldCrawlPageDecision = _crawlDecisionMaker.ShouldCrawlPage(pageToCrawl, _crawlContext);

            if (shouldCrawlPageDecision.Allow)
            {
                shouldCrawlPageDecision = (_shouldCrawlPageDecisionMaker != null)
                    ? _shouldCrawlPageDecisionMaker.Invoke(pageToCrawl, _crawlContext)
                    : new CrawlDecision { Allow = true };
            }

            if (shouldCrawlPageDecision.Allow)
            {
                AddPageToContext(pageToCrawl);
            }
            else
            {
                _logger.DebugFormat("Page [{0}] not crawled, [{1}]", pageToCrawl.Uri.AbsoluteUri, shouldCrawlPageDecision.Reason);
                FirePageCrawlDisallowedEventAsync(pageToCrawl, shouldCrawlPageDecision.Reason);
                FirePageCrawlDisallowedEvent(pageToCrawl, shouldCrawlPageDecision.Reason);
            }

            SignalCrawlStopIfNeeded(shouldCrawlPageDecision);
            return shouldCrawlPageDecision.Allow;
        }
Example No. 10
        public void ShouldCrawlPage_NonHttpOrHttpsSchemes_ReturnsFalse()
        {
            CrawlDecision result = _unitUnderTest.ShouldCrawlPage(new PageToCrawl(new Uri("file:///C:/Users/")), _crawlContext);

            Assert.IsFalse(result.Allow);
            Assert.AreEqual("Scheme does not begin with http", result.Reason);
            Assert.IsFalse(result.ShouldHardStopCrawl);
            Assert.IsFalse(result.ShouldStopCrawl);

            result = _unitUnderTest.ShouldCrawlPage(new PageToCrawl(new Uri("mailto:[email protected]")), _crawlContext);
            Assert.IsFalse(result.Allow);
            Assert.AreEqual("Scheme does not begin with http", result.Reason);
            Assert.IsFalse(result.ShouldHardStopCrawl);
            Assert.IsFalse(result.ShouldStopCrawl);

            result = _unitUnderTest.ShouldCrawlPage(new PageToCrawl(new Uri("ftp://[email protected]")), _crawlContext);
            Assert.IsFalse(result.Allow);
            Assert.AreEqual("Scheme does not begin with http", result.Reason);
            Assert.IsFalse(result.ShouldHardStopCrawl);
            Assert.IsFalse(result.ShouldStopCrawl);

            result = _unitUnderTest.ShouldCrawlPage(new PageToCrawl(new Uri("callto:+1234567")), _crawlContext);
            Assert.IsFalse(result.Allow);
            Assert.AreEqual("Scheme does not begin with http", result.Reason);
            Assert.IsFalse(result.ShouldHardStopCrawl);
            Assert.IsFalse(result.ShouldStopCrawl);

            result = _unitUnderTest.ShouldCrawlPage(new PageToCrawl(new Uri("tel:+1234567")), _crawlContext);
            Assert.IsFalse(result.Allow);
            Assert.AreEqual("Scheme does not begin with http", result.Reason);
            Assert.IsFalse(result.ShouldHardStopCrawl);
            Assert.IsFalse(result.ShouldStopCrawl);
        }
Example No. 11
        public void ShouldDownloadPageContent_NullCrawledPage_ReturnsFalse()
        {
            CrawlDecision result = _unitUnderTest.ShouldDownloadPageContent(null, new CrawlContext());

            Assert.AreEqual(false, result.Allow);
            Assert.AreEqual("Null crawled page", result.Reason);
        }
Example No. 12
        public void ShouldDownloadPageContent_NullCrawlContext_ReturnsFalse()
        {
            CrawlDecision result = _unitUnderTest.ShouldDownloadPageContent(new CrawledPage(new Uri("http://a.com/a.html")), null);

            Assert.IsFalse(result.Allow);
            Assert.AreEqual("Null crawl context", result.Reason);
        }
Example No. 13
        public void Constructor_ValidUri_CreatesInstance()
        {
            CrawlDecision unitUnderTest = new CrawlDecision();

            Assert.AreEqual(false, unitUnderTest.Allow);
            Assert.AreEqual("", unitUnderTest.Reason);
        }
Example No. 14
        public void ShouldCrawlPageLinks_IsAboveMaxCrawlDepth_ReturnsFalse()
        {
            CrawlDecision result = _unitUnderTest.ShouldCrawlPageLinks(
                new CrawledPage(new Uri("http://b.com/a.html"))
            {
                Content = new PageContent
                {
                    Text = "aaaa"
                },
                IsInternal = true,
                CrawlDepth = 3
            },
                new CrawlContext
            {
                RootUri            = new Uri("http://a.com/"),
                CrawlConfiguration = new CrawlConfiguration
                {
                    MaxCrawlDepth = 2
                }
            });

            Assert.AreEqual(false, result.Allow);
            Assert.AreEqual("Crawl depth is above max", result.Reason);
            Assert.IsFalse(result.ShouldHardStopCrawl);
            Assert.IsFalse(result.ShouldStopCrawl);
        }
Example No. 15
        protected virtual bool ShouldCrawlPage(PageToCrawl pageToCrawl)
        {
            CrawlDecision shouldCrawlPageDecision = _crawlDecisionMaker.ShouldCrawlPage(pageToCrawl, _crawlContext);

            if (!shouldCrawlPageDecision.Allow &&
                shouldCrawlPageDecision.Reason.Contains("MaxPagesToCrawl limit of"))
            {
                _logger.LogInformation("MaxPagesToCrawlLimit has been reached or scheduled. No more pages will be scheduled.");
                return false;
            }

            if (shouldCrawlPageDecision.Allow)
            {
                shouldCrawlPageDecision = (_shouldCrawlPageDecisionMaker != null) ? _shouldCrawlPageDecisionMaker.Invoke(pageToCrawl, _crawlContext) : CrawlDecision.AllowCrawl();
            }

            if (!shouldCrawlPageDecision.Allow)
            {
                _logger.LogDebug("Page [{0}] not crawled, [{1}]", pageToCrawl.Uri.AbsoluteUri, shouldCrawlPageDecision.Reason);
                FirePageCrawlDisallowedEventAsync(pageToCrawl, shouldCrawlPageDecision.Reason);
                //FirePageCrawlDisallowedEvent(pageToCrawl, shouldCrawlPageDecision.Reason);
            }

            return shouldCrawlPageDecision.Allow;
        }
Example No. 16
        public void ShouldCrawlPageLinks_NullCrawledPage_ReturnsFalse()
        {
            CrawlDecision result = _unitUnderTest.ShouldCrawlPageLinks(null, new CrawlContext());

            Assert.IsFalse(result.Allow);
            Assert.AreEqual("Null crawled page", result.Reason);
        }
Example No. 17
        public void ShouldCrawlPage_OverMaxPagesToCrawlPerDomain_ReturnsFalse()
        {
            Uri uri = new Uri("http://a.com/");
            CrawlConfiguration config = new CrawlConfiguration
            {
                MaxPagesToCrawlPerDomain = 100
            };
            ConcurrentDictionary <string, int> countByDomain = new ConcurrentDictionary <string, int>();

            countByDomain.TryAdd(uri.Authority, 100);
            CrawlContext crawlContext = new CrawlContext
            {
                CrawlConfiguration = config,
                CrawlStartDate     = DateTime.Now,
                CrawlCountByDomain = countByDomain
            };

            CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
                new PageToCrawl(new Uri(uri.AbsoluteUri + "anotherpage"))
            {
                IsInternal = true
            },
                crawlContext);

            Assert.IsFalse(result.Allow);
            Assert.AreEqual("MaxPagesToCrawlPerDomain limit of [100] has been reached for domain [a.com]", result.Reason);
            Assert.IsFalse(crawlContext.IsCrawlStopRequested);
        }
Example No. 18
        public void ShouldCrawlPage_NullPageToCrawl_ReturnsFalse()
        {
            CrawlDecision result = _unitUnderTest.ShouldCrawlPage(null, _crawlContext);

            Assert.IsFalse(result.Allow);
            Assert.AreEqual("Null page to crawl", result.Reason);
            Assert.IsFalse(_crawlContext.IsCrawlStopRequested);
        }
Example No. 19
        /// <summary>
        /// Make an http web request to the url and download its content based on the param func decision
        /// </summary>
        public virtual CrawledPage MakeRequest(Uri uri, Func <CrawledPage, CrawlDecision> shouldDownloadContent)
        {
            if (uri == null)
            {
                throw new ArgumentNullException("uri");
            }

            CrawledPage crawledPage = new CrawledPage(uri);

            HttpWebRequest  request  = null;
            HttpWebResponse response = null;

            try
            {
                request  = BuildRequestObject(uri);
                response = (HttpWebResponse)request.GetResponse();
                ProcessResponseObject(response);
            }
            catch (WebException e)
            {
                crawledPage.WebException = e;

                if (e.Response != null)
                {
                    response = (HttpWebResponse)e.Response;
                }

                _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
                _logger.Debug(e);
            }
            catch (Exception e)
            {
                _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
                _logger.Debug(e);
            }
            finally
            {
                crawledPage.HttpWebRequest = request;

                if (response != null)
                {
                    crawledPage.HttpWebResponse = response;
                    CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
                    if (shouldDownloadContentDecision.Allow)
                    {
                        crawledPage.Content = _extractor.GetContent(response);
                    }
                    else
                    {
                        _logger.DebugFormat("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
                    }

                    response.Close();//Should already be closed by _extractor but just being safe
                }
            }

            return crawledPage;
        }
Example No. 20
        public void Constructor_ValidUri_CreatesInstance()
        {
            var unitUnderTest = new CrawlDecision();

            Assert.AreEqual(false, unitUnderTest.Allow);
            Assert.AreEqual("", unitUnderTest.Reason);
            Assert.IsFalse(unitUnderTest.ShouldHardStopCrawl);
            Assert.IsFalse(unitUnderTest.ShouldStopCrawl);
        }
Example No. 21
        public void ShouldDownloadPageContent_NonHtmlPage_ReturnsFalse()
        {
            Uri imageUrl = new Uri("http://localhost:1111/Content/themes/base/images/ui-bg_flat_0_aaaaaa_40x100.png");

            CrawlDecision result = _unitUnderTest.ShouldDownloadPageContent(new PageRequester(_crawlContext.CrawlConfiguration).MakeRequest(imageUrl), _crawlContext);

            Assert.AreEqual(false, result.Allow);
            Assert.AreEqual("Content type is not any of the following: text/html", result.Reason);
        }
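The "Content type is not any of the following: text/html" reason reflects CrawlConfiguration.DownloadableContentTypes, which defaults to text/html. A sketch of widening it, assuming the comma-separated format Abot expects:

        var config = new CrawlConfiguration
        {
            DownloadableContentTypes = "text/html, text/plain" //also download plain-text responses
        };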
Example No. 22
        /// <summary>
        /// Make an http web request to the url and download its content based on the param func decision
        /// </summary>
        public virtual CrawledPage MakeRequest(Uri uri, Func <CrawledPage, CrawlDecision> shouldDownloadContent)
        {
            if (uri == null)
            {
                throw new ArgumentNullException("uri");
            }

            CrawledPage crawledPage = new CrawledPage(uri);

            HttpWebRequest  request  = null;
            HttpWebResponse response = null;

            try
            {
                request  = BuildRequestObject(uri);
                response = (HttpWebResponse)request.GetResponse();
            }
            catch (WebException e)
            {
                crawledPage.WebException = e;

                if (e.Response != null)
                {
                    response = (HttpWebResponse)e.Response;
                }

                _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
                _logger.Debug(e);
            }
            catch (Exception e)
            {
                _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
                _logger.Debug(e);
            }
            finally
            {
                crawledPage.HttpWebRequest = request;

                if (response != null)
                {
                    crawledPage.HttpWebResponse = response;
                    CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
                    if (shouldDownloadContentDecision.Allow)
                    {
                        crawledPage.RawContent      = GetRawHtml(response, uri);
                        crawledPage.PageSizeInBytes = Encoding.UTF8.GetBytes(crawledPage.RawContent).Length;
                    }
                    else
                    {
                        _logger.DebugFormat("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
                    }
                    response.Close();
                }
            }

            return crawledPage;
        }
Example No. 23
        public void ShouldDownloadPageContent_HttpStatusNon200_ReturnsFalse()
        {
            Uri non200Uri = new Uri("http://localhost:1111/HttpResponse/Status403");

            CrawlDecision result = _unitUnderTest.ShouldDownloadPageContent(new PageRequester(_crawlContext.CrawlConfiguration).MakeRequest(non200Uri), new CrawlContext());

            Assert.AreEqual(false, result.Allow);
            Assert.AreEqual("HttpStatusCode is not 200", result.Reason);
        }
Example No. 24
        public void ShouldRecrawlPage_NullPageToCrawl_ReturnsFalse()
        {
            CrawlDecision result = _unitUnderTest.ShouldRecrawlPage(null, _crawlContext);

            Assert.IsFalse(result.Allow);
            Assert.AreEqual("Null crawled page", result.Reason);
            Assert.IsFalse(result.ShouldHardStopCrawl);
            Assert.IsFalse(result.ShouldStopCrawl);
        }
Example No. 25
        private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
        {
            IWebCrawler crawler = GetManuallyConfiguredWebCrawler(siteToCrawl);

            //Register a lambda expression that restricts the crawl to the pages we care about:
            //the url must contain the current category (spaces encoded as '+') and must not be a "_KG" or "_EA" variant.
            //If you set the log4net log level to "DEBUG" you will see a log message when any page is not allowed to be crawled.
            //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPage method is run.
            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                //if (!pageToCrawl.Uri.AbsoluteUri.Contains("chicken") && !pageToCrawl.Uri.AbsoluteUri.Contains("Chicken"))
                if (!pageToCrawl.Uri.AbsoluteUri.Contains(category.Replace(" ", "+")) || /*pageToCrawl.Uri.AbsoluteUri.Contains("navid")||*/ pageToCrawl.Uri.AbsoluteUri.Contains("_KG") || pageToCrawl.Uri.AbsoluteUri.Contains("_EA"))
                {
                    return new CrawlDecision {
                        Allow = false, Reason = "I only crawl the right pages"
                    };
                }

                return new CrawlDecision {
                    Allow = true
                };
            });

            //Register a lambda expression that will tell Abot to not download the page content for any page after the 5th.
            //Abot will still make the http request but will not read the raw content from the stream
            //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldDownloadPageContent method is run

            /*crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
             * {
             *  if (crawlContext.CrawledCount >= 5)
             *      return new CrawlDecision { Allow = false, Reason = "We already downloaded the raw page content for 5 pages" };
             *
             *  return new CrawlDecision { Allow = true };
             * });*/

            //Register a lambda expression that will tell Abot to not crawl links on any page whose content is smaller than 100 bytes.
            //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPageLinks method is run
            crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
            {
                CrawlDecision decision = new CrawlDecision {
                    Allow = true
                };
                if (crawledPage.Content.Bytes.Length < 100)
                {
                    return new CrawlDecision {
                        Allow = false, Reason = "Just crawl links in pages that have at least 100 bytes"
                    };
                }

                return decision;
            });

            return crawler;
        }
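A hedged usage sketch for the factory above; in the original program the seed presumably comes from the siteToCrawl field, so the uri here is illustrative:

            IWebCrawler crawler = GetCustomBehaviorUsingLambdaWebCrawler();
            CrawlResult result  = crawler.Crawl(new Uri("http://example.com/")); //illustrative seed

            Console.WriteLine("Crawl of {0} completed, error occurred: {1}", result.RootUri, result.ErrorOccurred);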
Example No. 26
        public void ShouldCrawlPageLinks_NullCrawlContext_ReturnsFalse()
        {
            CrawlDecision result = _unitUnderTest.ShouldCrawlPageLinks(new CrawledPage(new Uri("http://a.com/a.html"))
            {
                RawContent = "aaaa"
            }, null);

            Assert.IsFalse(result.Allow);
            Assert.AreEqual("Null crawl context", result.Reason);
        }
Example No. 27
        public void ShouldCrawlPageLinks_EmptyHtmlContent_ReturnsFalse()
        {
            CrawlDecision result = _unitUnderTest.ShouldCrawlPageLinks(new CrawledPage(new Uri("http://a.com/"))
            {
                RawContent = ""
            }, new CrawlContext());

            Assert.IsFalse(result.Allow);
            Assert.AreEqual("Page has no content", result.Reason);
        }
Example No. 28
        public void ShouldCrawlPage_Duplicate_ReturnsFalse()
        {
            _crawlContext.CrawledUrls = new ConcurrentDictionary <string, byte>();
            _crawlContext.CrawledUrls.TryAdd("http://a.com/", 0);

            CrawlDecision result = _unitUnderTest.ShouldCrawlPage(new PageToCrawl(new Uri("http://a.com/")), _crawlContext);

            Assert.IsFalse(result.Allow);
            Assert.AreEqual("Link already crawled", result.Reason);
            Assert.IsFalse(_crawlContext.IsCrawlStopRequested);
        }
Example No. 29
        public async Task ShouldDownloadPageContent_NonHtmlPage_ReturnsFalse()
        {
            Uri imageUrl = new Uri(string.Concat(unitTestConfig.SiteSimulatorBaseAddress, "themes/base/images/ui-bg_flat_0_aaaaaa_40x100.png"));

            CrawlDecision result = _unitUnderTest.ShouldDownloadPageContent(await new PageRequester(_crawlContext.CrawlConfiguration).MakeRequestAsync(imageUrl), _crawlContext);

            Assert.AreEqual(false, result.Allow);
            Assert.AreEqual("Content type is not any of the following: text/html", result.Reason);
            Assert.IsFalse(result.ShouldHardStopCrawl);
            Assert.IsFalse(result.ShouldStopCrawl);
        }
Example No. 30
        public async Task ShouldDownloadPageContent_HttpStatusNon200_ReturnsFalse()
        {
            Uri non200Uri = new Uri(string.Concat(unitTestConfig.SiteSimulatorBaseAddress, "/HttpResponse/Status403"));

            CrawlDecision result = _unitUnderTest.ShouldDownloadPageContent(await new PageRequester(_crawlContext.CrawlConfiguration).MakeRequestAsync(non200Uri), new CrawlContext());

            Assert.AreEqual(false, result.Allow);
            Assert.AreEqual("HttpStatusCode is not 200", result.Reason);
            Assert.IsFalse(result.ShouldHardStopCrawl);
            Assert.IsFalse(result.ShouldStopCrawl);
        }
Example No. 31
        protected virtual void SignalCrawlStopIfNeeded(CrawlDecision decision)
        {
            if (decision.ShouldHardStopCrawl)
            {
                _logger.InfoFormat("Decision marked crawl [Hard Stop] for site [{0}], [{1}]", _crawlContext.RootUri, decision.Reason);
                _crawlContext.IsCrawlHardStopRequested = decision.ShouldHardStopCrawl;
            }
            else if (decision.ShouldStopCrawl)
            {
                _logger.InfoFormat("Decision marked crawl [Stop] for site [{0}], [{1}]", _crawlContext.RootUri, decision.Reason);
                _crawlContext.IsCrawlStopRequested = decision.ShouldStopCrawl;
            }
        }
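Any decision can request a stop, including one returned from a user-registered lambda; SignalCrawlStopIfNeeded then flips the matching flag on the crawl context. A minimal sketch of a decision that ends the crawl gracefully:

        //Returned from e.g. a ShouldCrawlPage lambda, this makes SignalCrawlStopIfNeeded
        //set IsCrawlStopRequested; use ShouldHardStopCrawl = true to abort in-flight work instead.
        var stopDecision = new CrawlDecision
        {
            Allow           = false,
            Reason          = "Quota exhausted, stopping crawl",
            ShouldStopCrawl = true
        };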