protected override string GetBaseHrefValue(CrawledPage crawledPage)
        {
            string hrefValue = "";
            HtmlNode node = crawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//base");

            //Must use node.InnerHtml instead of node.InnerText since "aaa<br />bbb" will be returned as "aaabbb"
            if (node != null)
                hrefValue = node.GetAttributeValue("href", "").Trim();

            return hrefValue;
        }
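A quick stand-alone sketch of what this override extracts, using the same Html Agility Pack calls outside the crawler; the HTML string and the expected value are illustrative only:

        // Assumes a "using HtmlAgilityPack;" directive; this is the parser behind crawledPage.HtmlDocument above.
        var doc = new HtmlDocument();
        doc.LoadHtml("<html><head><base href=\"http://a.com/sub/\" /></head><body></body></html>");

        HtmlNode node = doc.DocumentNode.SelectSingleNode("//base");
        string hrefValue = node == null ? "" : node.GetAttributeValue("href", "").Trim(); // "http://a.com/sub/"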
Example #2
        protected override IEnumerable <string> GetHrefValues(CrawledPage crawledPage)
        {
            List <string> hrefValues = new List <string>();

            if (HasRobotsNoFollow(crawledPage))
            {
                return(hrefValues);
            }

            HtmlNodeCollection aTags    = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//a[@href]");
            HtmlNodeCollection areaTags = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//area[@href]");

            hrefValues.AddRange(GetLinks(aTags));
            hrefValues.AddRange(GetLinks(areaTags));

            return(hrefValues);
        }
        public async Task PageCrawledAsync(CrawledPage crawledPage)
        {
            string text = _textExtractor.ExtractText(crawledPage.HtmlDocument);

            if (text == null)
            {
                Console.WriteLine("No content for page {0}", crawledPage?.Uri.AbsoluteUri);
                return;
            }

            _queue.Add(new WebPage(crawledPage.Uri.AbsoluteUri, text));

            if (_queue.Count > IndexingBatchSize)
            {
                await IndexBatchIfNecessary();
            }
        }
Example #4
        private void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;
            string      result      = "";

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                result = string.Format("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            }
            else
            {
                result = string.Format("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);
            }

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                result = string.Format("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
            }

            log.Info("crawler_ProcessPageCrawlCompleted");
            log.Info(result);

            if (!string.IsNullOrEmpty(crawledPage.Content.Text) && crawledPage.Uri.AbsoluteUri.Contains("/jobs/view/"))
            {
                var doc = crawledPage.HtmlDocument; //Html Agility Pack parser

                //var angleSharpHtmlDocument = crawledPage.AngleSharpHtmlDocument; //AngleSharp parser

                try
                {
                    string positionTitle  = doc.DocumentNode.SelectSingleNode("//h1[@class='search_highlight']").InnerText.Trim();
                    string location       = doc.DocumentNode.SelectSingleNode("//div[@class='location search_highlight']").InnerText.Trim();
                    string companyName    = doc.DocumentNode.SelectSingleNode("//span[@class='search_highlight']//a").InnerText.Trim();
                    string postedDate     = doc.DocumentNode.SelectSingleNode("//div[@id='contentHeading']//div[@class='meta']").InnerText.Trim().Split(new char[] { '\n' })[0].Replace("Posted on : ", "");
                    string jobDescription = doc.DocumentNode.SelectSingleNode("//meta[@name='description']").Attributes["content"].Value;
                    string experience     = doc.DocumentNode.SelectSingleNode("//div[@class='field_experience_required']/div[@class='job-level']/span").InnerText.Trim();

                    log.Info(string.Format("Position: {0};Location: {1}; Company Name: {2}; Posted Date: {3}; Job Desc: {4}; Exp: {5}", positionTitle, location,
                                           companyName, postedDate, jobDescription, experience));
                }
                catch (Exception ex)
                {
                    log.Error(ex.Message);
                }
            }
        }
        public virtual CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
        {
            if (crawledPage == null)
            {
                return new CrawlDecision {
                    Allow = false, Reason = "Null crawled page"
                };
            }

            if (crawlContext == null)
            {
                return new CrawlDecision {
                    Allow = false, Reason = "Null crawl context"
                };
            }

            if (string.IsNullOrWhiteSpace(crawledPage.Content.Text))
            {
                return new CrawlDecision {
                    Allow = false, Reason = "Page has no content"
                };
            }

            if (!crawlContext.CrawlConfiguration.IsExternalPageLinksCrawlingEnabled && !crawledPage.IsInternal)
            {
                return new CrawlDecision {
                    Allow = false, Reason = "Link is external"
                };
            }

            if (crawledPage.CrawlDepth >= crawlContext.CrawlConfiguration.MaxCrawlDepth)
            {
                return new CrawlDecision {
                    Allow = false, Reason = "Crawl depth is above max"
                };
            }

            return(new CrawlDecision {
                Allow = true
            });
        }
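The decision above is gated by two CrawlConfiguration values: IsExternalPageLinksCrawlingEnabled and MaxCrawlDepth. A minimal sketch with illustrative values:

        var config = new CrawlConfiguration
        {
            IsExternalPageLinksCrawlingEnabled = false, // links found on external pages are not expanded
            MaxCrawlDepth = 3                           // links on pages at depth 3 or deeper are not crawled
        };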
Example #6
        protected virtual bool ShouldRecrawlPage(CrawledPage crawledPage)
        {
            //TODO No unit tests cover these lines
            var shouldRecrawlPageDecision = _crawlDecisionMaker.ShouldRecrawlPage(crawledPage, _crawlContext);

            if (shouldRecrawlPageDecision.Allow)
            {
                shouldRecrawlPageDecision = (_shouldRecrawlPageDecisionMaker != null)
                    ? _shouldRecrawlPageDecisionMaker.Invoke(crawledPage, _crawlContext)
                    : new CrawlDecision { Allow = true };
            }

            if (!shouldRecrawlPageDecision.Allow)
            {
                _logger.LogDebug($"Page [{crawledPage.Uri.AbsoluteUri}] not recrawled, [{shouldRecrawlPageDecision.Reason}]");
            }
            else
            {
                // Look for the Retry-After header in the response.
                crawledPage.RetryAfter = null;
                if (crawledPage.HttpWebResponse != null &&
                    crawledPage.HttpWebResponse.Headers != null)
                {
                    var value = crawledPage.HttpWebResponse.GetResponseHeader("Retry-After");
                    if (!String.IsNullOrEmpty(value))
                    {
                        // Try to convert the value to a DateTime first, then to a double.
                        DateTime date;
                        double   seconds;
                        if (crawledPage.LastRequest.HasValue && DateTime.TryParse(value, out date))
                        {
                            crawledPage.RetryAfter = (date - crawledPage.LastRequest.Value).TotalSeconds;
                        }
                        else if (double.TryParse(value, out seconds))
                        {
                            crawledPage.RetryAfter = seconds;
                        }
                    }
                }
            }

            SignalCrawlStopIfNeeded(shouldRecrawlPageDecision);
            return(shouldRecrawlPageDecision.Allow);
        }
Example #7
        private void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;

            if (_amazonHelper.IsCaptchaPage(crawledPage))
            {
                lock (_captchaLock)
                {
                    InvokeIfRequired(() =>
                    {
                        var form = new BrowserForm();
                        form.Browser.Navigate(crawledPage.Uri);
                        form.ShowDialog();
                    });
                }
            }

            var products = _amazonHelper.GetProductsFromDetailPage(crawledPage);

            foreach (var p in products)
            {
                if (!_products.Any(x => x.external_product_id == p.external_product_id))
                {
                    InvokeIfRequired(() =>
                    {
                        _products.Add(p);
                        Application.DoEvents();
                    });
                }
            }

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                _logger.InfoFormat("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            }
            else
            {
                _logger.InfoFormat("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);
            }

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                _logger.InfoFormat("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
            }
        }
Example #8
        public void SetUp()
        {
            _uut = new ConstantContactCrawlProcessor();
            _fakePrimaryPersistence = new Mock <IPersistenceProvider>();
            _fakeBackupPersistence  = new Mock <IPersistenceProvider>();

            _crawlContext = new CrawlContext();
            _crawlContext.CrawlBag.GoDaddyProcessorContext = new ProcessorContext
            {
                PrimaryPersistenceProvider = _fakePrimaryPersistence.Object,
                BackupPersistenceProvider  = _fakeBackupPersistence.Object,
                Domain = new Domain
                {
                    DomainId = 111
                }
            };
            _crawledPage = new CrawledPage(new Uri("http://a.com/"));
        }
Example #9
        /// <summary>
        /// Parses html to extract hyperlinks, converts each into an absolute url
        /// </summary>
        public virtual IEnumerable <HyperLink> GetLinks(CrawledPage crawledPage)
        {
            CheckParams(crawledPage);

            var timer = Stopwatch.StartNew();

            var links = GetUris(crawledPage, GetHrefValues(crawledPage))
                        .Select(hrv => new HyperLink()
            {
                HrefValue = hrv
            })
                        .ToList();

            timer.Stop();
            Log.DebugFormat("{0} parsed links from [{1}] in [{2}] milliseconds", ParserType, crawledPage.Uri, timer.ElapsedMilliseconds);

            return(links);
        }
        protected override string GetBaseHrefValue(CrawledPage crawledPage)
        {
            var baseTag = crawledPage.AngleSharpHtmlDocument.QuerySelector("base");

            if (baseTag == null)
            {
                return("");
            }

            var baseTagValue = baseTag.Attributes["href"];

            if (baseTagValue == null)
            {
                return("");
            }

            return(baseTagValue.Value.Trim());
        }
        protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
        {
            if (HasRobotsNoFollow(crawledPage))
                return null;

            IEnumerable<string> hrefValues = crawledPage.CsQueryDocument.Select("a, area")
            .Elements
            .Where(e => !HasRelNoFollow(e))
            .Select(y => y.GetAttribute("href"))
            .Where(a => !string.IsNullOrWhiteSpace(a));

            IEnumerable<string> canonicalHref = crawledPage.CsQueryDocument.
                Select("link").Elements.
                Where(e => HasRelCanonicalPointingToDifferentUrl(e, crawledPage.Uri.ToString())).
                Select(e => e.Attributes["href"]);

            return hrefValues.Concat(canonicalHref);
        }
Example #12
        protected virtual bool ShouldCrawlPageLinks(CrawledPage crawledPage)
        {
            CrawlDecision shouldCrawlPageLinksDecision = _crawlDecisionMaker.ShouldCrawlPageLinks(crawledPage, _crawlContext);

            if (shouldCrawlPageLinksDecision.Allow)
            {
                shouldCrawlPageLinksDecision = (_shouldCrawlPageLinksDecisionMaker != null) ? _shouldCrawlPageLinksDecisionMaker.Invoke(crawledPage, _crawlContext) : CrawlDecision.AllowCrawl();
            }

            if (!shouldCrawlPageLinksDecision.Allow)
            {
                _logger.LogDebug("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldCrawlPageLinksDecision.Reason);
                FirePageLinksCrawlDisallowedEventAsync(crawledPage, shouldCrawlPageLinksDecision.Reason);
                //FirePageLinksCrawlDisallowedEvent(crawledPage, shouldCrawlPageLinksDecision.Reason);
            }

            return(shouldCrawlPageLinksDecision.Allow);
        }
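The _shouldCrawlPageLinksDecisionMaker delegate used above is the caller-supplied hook. Depending on the Abot version it is registered through a ShouldCrawlPageLinks method on the crawler or exposed as a settable delegate property; the sketch below assumes the method form, and the rule itself is illustrative:

        var crawler = new PoliteWebCrawler(new CrawlConfiguration()); // constructor overload taking only the configuration is assumed

        crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
            crawledPage.Uri.AbsoluteUri.Contains("/archive/")
                ? new CrawlDecision { Allow = false, Reason = "Archive pages are not expanded" }
                : new CrawlDecision { Allow = true });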
Example #13
        public void SetUp()
        {
            _dummySemList           = new SemList();
            _uut                    = new SemKeywordCrawlProcessor(_dummySemList);
            _fakePrimaryPersistence = new Mock <IPersistenceProvider>();

            _crawlContext = new CrawlContext();
            _crawlContext.CrawlBag.GoDaddyProcessorContext = new ProcessorContext
            {
                PrimaryPersistenceProvider = _fakePrimaryPersistence.Object,
                BackupPersistenceProvider  = null,//no need since this is tested in another child class
                Domain = new Domain
                {
                    DomainId = 111
                }
            };
            _crawledPage = new CrawledPage(new Uri("http://a.com/"));
        }
        void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            }
            else
            {
                Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);
            }

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
            }
        }
        public void ProcessCrawledPage(CrawlContext crawlContext, CrawledPage crawledPage)
        {
            ProcessorResult result;

            bool isFound = false;

            if (!crawledPage.IsRoot)
            {
                return;
            }

            result = ProcessPage(crawlContext, crawledPage);

            if (result.IsAHit)
            {
                PageSave(crawlContext, crawledPage, result);
            }
        }
Example #16
        static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            }
            else
            {
                Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);
            }
            if (crawledPage.Uri.AbsoluteUri != "https://belsat.eu/ru/news/")
            {
                Parser.Parse(crawledPage.Content.Text, crawledPage.Uri);
            }
            //crawledPage.Content.Text //raw html
        }
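A minimal sketch of wiring a handler like the one above into a crawl, assuming the Abot 1.x-style API seen throughout these examples (PoliteWebCrawler, CrawlConfiguration, the PageCrawlCompletedAsync event and the blocking Crawl method); the start URI and settings are illustrative:

        var config = new CrawlConfiguration
        {
            MaxCrawlDepth = 2,
            IsExternalPageLinksCrawlingEnabled = false
        };

        var crawler = new PoliteWebCrawler(config); // constructor overload taking only the configuration is assumed
        crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;

        CrawlResult result = crawler.Crawl(new Uri("https://belsat.eu/ru/news/"));

        Console.WriteLine(result.ErrorOccurred
            ? string.Format("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message)
            : string.Format("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri));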
Example #17
        private void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;
            string      result      = "";

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                result = string.Format("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            }
            else
            {
                result = string.Format("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);
            }

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                result = string.Format("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
            }

            log.Info("crawler_ProcessPageCrawlCompleted");
            log.Info(result);

            if (!string.IsNullOrEmpty(crawledPage.Content.Text) && crawledPage.Uri.AbsoluteUri.Contains("/jobs/"))
            {
                var doc = crawledPage.HtmlDocument; //Html Agility Pack parser

                //var angleSharpHtmlDocument = crawledPage.AngleSharpHtmlDocument; //AngleSharp parser

                try
                {
                    string positionTitle  = doc.DocumentNode.SelectSingleNode("//section[@class='box box_r']/ul/li/p").InnerText.Trim();
                    string location       = doc.DocumentNode.SelectNodes("//div[@class='cm-12 box_i bWord']/ul/li")[1].InnerText.Trim();
                    string companyName    = doc.DocumentNode.SelectSingleNode("//a[@id='urlverofertas']").InnerText.Trim();
                    string postedDate     = doc.DocumentNode.SelectSingleNode("//div[@class='cm-12 box_i bWord']/ul/p/span[@class='info_pub']/span").InnerText.Trim();
                    string jobDescription = doc.DocumentNode.SelectNodes("//div[@class='cm-12 box_i bWord']/ul/li")[2].InnerHtml.Trim();

                    //log.Info(string.Format("Position Title: {0}\nLocation: {1}\nCompany Name: {2}\nPosted Date: {3}\nJob Desc: {4}", positionTitle, location, companyName, postedDate, jobDescription));
                }
                catch (Exception ex)
                {
                    log.Error(ex.Message);
                }
            }
        }
Example #18
        public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_CrawlDelayAboveMinDomainCrawlDelay_CallsDomainRateLimiter()
        {
            Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            CrawledPage homePage = new CrawledPage(_rootUri)
            {
                RawContent = "content here"
            };
            CrawledPage page1 = new CrawledPage(uri1);
            CrawledPage page2 = new CrawledPage(uri2);

            List <Uri> links = new List <Uri> {
                uri1, uri2
            };

            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(homePage);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(page1);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(page2);
            _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is <CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny <PageToCrawl>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny <CrawledPage>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });

            _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny <string>())).Returns(3);//this is more than the max configured crawl delay (should be ignored)
            _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny <string>(), It.IsAny <string>())).Returns(true);
            _fakeRobotsDotTextFinder.Setup(f => f.Find(It.IsAny <Uri>())).Returns(_fakeRobotsDotText.Object);

            _dummyConfiguration.IsRespectRobotsDotTextEnabled       = true; //By having this equal to true we expect the IDomainRateLimiter to be called
            _dummyConfiguration.MaxRobotsDotTextCrawlDelayInSeconds = 2;    //This is less than the crawl delay (should be used)
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            _unitUnderTest.Crawl(_rootUri);

            _fakeHttpRequester.VerifyAll();
            _fakeHyperLinkParser.VerifyAll();
            _fakeRobotsDotText.VerifyAll();
            _fakeRobotsDotTextFinder.VerifyAll();
            _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny <Uri>(), 2000), Times.Exactly(1));
            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny <Uri>()), Times.Exactly(3));//By having a crawl delay above zero we expect the IDomainRateLimiter to be called
        }
Example #19
        private void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;
            string      result      = "";

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                result = string.Format("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            }
            else
            {
                result = string.Format("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);
            }

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                result = string.Format("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
            }

            log.Info("crawler_ProcessPageCrawlCompleted");
            log.Info(result);

            if (!string.IsNullOrEmpty(crawledPage.Content.Text) && crawledPage.Uri.AbsoluteUri.Contains("/en/job/"))
            {
                var doc = crawledPage.HtmlDocument; //Html Agility Pack parser

                //var angleSharpHtmlDocument = crawledPage.AngleSharpHtmlDocument; //AngleSharp parser

                try
                {
                    string positionTitle  = doc.DocumentNode.SelectSingleNode("//h1[@id='position_title']").InnerText.Trim();
                    string location       = doc.DocumentNode.SelectSingleNode("//span[@id='single_work_location']").InnerText.Trim();
                    string companyName    = doc.DocumentNode.SelectSingleNode("//div[@id='company_name']/a").InnerText.Trim();
                    string postedDate     = doc.DocumentNode.SelectSingleNode("//p[@id='posting_date']/span").InnerText.Trim();
                    string jobDescription = doc.DocumentNode.SelectSingleNode("//div[@id='job_description']").InnerHtml.Trim();

                    //log.Info(string.Format("Position Title: {0}\nLocation: {1}\nCompany Name: {2}\nPosted Date: {3}\nJob Desc: {4}", positionTitle, location, companyName, postedDate, jobDescription));
                }
                catch (Exception ex)
                {
                    log.Error(ex.Message);
                }
            }
        }
Example #20
        static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;

            if (isContains(crawledPage.Uri.AbsoluteUri))
            {
                // Create a logger for use in this class
                //log4net.ILog log = log4net.LogManager.GetLogger(System.Reflection.MethodBase.GetCurrentMethod().DeclaringType);

                //log.Info("Page URL : " + crawledPage.Uri.AbsoluteUri);
                CrawledItems.Add(new Crawled()
                {
                    Url = crawledPage.Uri.AbsoluteUri, Description = crawledPage.Content.Text
                });
                if (CrawledItems.Count >= 10)
                {
                    AddToDatabase(CrawledItems.ToArray());
                    CrawledItems.Clear();
                    Console.WriteLine("Submit 10 new records");
                }
            }

            /*
             * int count = 0;
             *
             * foreach (var item in crawledPage.ParsedLinks)
             * {
             *  log.Info("link :"+ ++count +item.AbsoluteUri+", "+(item.IsFile?"ini file":"ini bukan file"));
             * }*/
            //log.Info(crawledPage.Content.Text);
            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            }

            //log.Info(string.Format("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri));

            Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
            }
        }
Example #21
        public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_UsesCorrectUserAgentString()
        {
            Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            CrawledPage homePage = new CrawledPage(_rootUri)
            {
                Content = new PageContent
                {
                    Text = "content here"
                }
            };
            CrawledPage page1 = new CrawledPage(uri1);
            CrawledPage page2 = new CrawledPage(uri2);

            List <Uri> links = new List <Uri> {
                uri1, uri2
            };

            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(homePage);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(page1);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny <Func <CrawledPage, CrawlDecision> >())).Returns(page2);
            _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is <CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny <PageToCrawl>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny <CrawledPage>(), It.IsAny <CrawlContext>())).Returns(new CrawlDecision {
                Allow = true
            });

            _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny <string>())).Returns(0);
            _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny <string>(), It.IsAny <string>())).Returns(true);
            _fakeRobotsDotTextFinder.Setup(f => f.Find(It.IsAny <Uri>())).Returns(_fakeRobotsDotText.Object);

            _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;
            _dummyConfiguration.RobotsDotTextUserAgentString  = "abcd";
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            _unitUnderTest.Crawl(_rootUri);

            _fakeRobotsDotText.Verify(f => f.GetCrawlDelay(_dummyConfiguration.RobotsDotTextUserAgentString), Times.Exactly(1));
            _fakeRobotsDotText.Verify(f => f.IsUrlAllowed(uri1.AbsoluteUri, _dummyConfiguration.RobotsDotTextUserAgentString), Times.Exactly(1));
            _fakeRobotsDotText.Verify(f => f.IsUrlAllowed(uri1.AbsoluteUri, _dummyConfiguration.RobotsDotTextUserAgentString), Times.Exactly(1));
        }
Example #22
        public static string FormatOutput(CrawledPage page)
        {
            string linksDisplay;

            if (page.PageLinks == null)
            {
                linksDisplay = "Could not download content";
            }
            else if (page.PageLinks.Count == 0)
            {
                linksDisplay = "No links found";
            }
            else
            {
                linksDisplay = $"{string.Join("\n", page.PageLinks)}\n[{page.PageLinks.Count} links]";
            }

            return($"Visited Page: {page.PageUri} ({page.FirstVisitedDepth})\n------------------\n{linksDisplay}\n");
        }
        public virtual CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
        {
            if (crawledPage == null)
                return new CrawlDecision{Allow = false, Reason = "Null crawled page"};

            if (crawlContext == null)
                return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

            if(string.IsNullOrWhiteSpace(crawledPage.Content.Text))
                return new CrawlDecision { Allow = false, Reason = "Page has no content" };

            if (!crawlContext.CrawlConfiguration.IsExternalPageLinksCrawlingEnabled && !crawledPage.IsInternal)
                return new CrawlDecision { Allow = false, Reason = "Link is external" };

            if (crawledPage.CrawlDepth >= crawlContext.CrawlConfiguration.MaxCrawlDepth)
                return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

            return new CrawlDecision{Allow = true};
        }
Example #24
        async void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;
            string      uri         = crawledPage.Uri.AbsoluteUri;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse?.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}: exception '{1}', response status {2}", uri, crawledPage.WebException?.Message, crawledPage.HttpWebResponse?.StatusCode);
                return;
            }

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                Console.WriteLine("Page had no content {0}", uri);
                return;
            }

            await _handler.PageCrawledAsync(crawledPage);
        }
Example #25
        public IRobotsDotText Find(Uri rootUri)
        {
            if (rootUri == null)
            {
                throw new ArgumentNullException("rootUri");
            }

            Uri         robotsUri = new Uri(rootUri, "/robots.txt");
            CrawledPage page      = _pageRequester.MakeRequest(robotsUri);

            if (page == null || page.WebException != null || page.HttpWebResponse == null || page.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                _logger.DebugFormat("Did not find robots.txt file at [{0}]", robotsUri);
                return(null);
            }

            _logger.DebugFormat("Found robots.txt file at [{0}]", robotsUri);
            return(new RobotsDotText(rootUri, page.Content.Text));
        }
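A sketch of consuming the result; IsUrlAllowed and GetCrawlDelay are the same members the test mocks configure elsewhere in these examples, while the finder instance, URLs and user agent string are illustrative:

        IRobotsDotText robotsDotText = finder.Find(new Uri("http://a.com/")); // finder: an instance of the class above

        if (robotsDotText != null)
        {
            bool allowed    = robotsDotText.IsUrlAllowed("http://a.com/a.html", "abcd");
            var  crawlDelay = robotsDotText.GetCrawlDelay("abcd"); // crawl delay in seconds declared by robots.txt
        }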
Example #26
        protected virtual bool HasRobotsNoFollow(CrawledPage crawledPage)
        {
            if (!IsRespectMetaRobotsNoFollowEnabled)
            {
                return(false);
            }

            string robotsMeta       = GetMetaRobotsValue(crawledPage);
            bool   isRobotsNoFollow = robotsMeta != null &&
                                      (robotsMeta.ToLower().Contains("nofollow") ||
                                       robotsMeta.ToLower().Contains("none"));

            if (isRobotsNoFollow)
            {
                _logger.InfoFormat("Robots NoFollow detected on uri [{0}], will not crawl links on this page.", crawledPage.Uri);
            }

            return(isRobotsNoFollow);
        }
Example #27
        protected override IEnumerable <string> GetHrefValues(CrawledPage crawledPage)
        {
            var hrefValues = new List <string>();

            if (HasRobotsNoFollow(crawledPage))
            {
                return(hrefValues);
            }

            var aTags      = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//a[@href]");
            var areaTags   = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//area[@href]");
            var canonicals = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//link[@rel='canonical'][@href]");

            hrefValues.AddRange(GetLinks(aTags));
            hrefValues.AddRange(GetLinks(areaTags));
            hrefValues.AddRange(GetLinks(canonicals));

            return(hrefValues);
        }
Example #28
        static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            }
            else
            {
                Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);
            }

            //if (string.IsNullOrEmpty(crawledPage.Content.Text))
            //Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);

            //var htmlAgilityPackDocument = crawledPage.HtmlDocument; //Html Agility Pack parser
            //var angleSharpHtmlDocument = crawledPage.AngleSharpHtmlDocument; //AngleSharp parser
        }
        protected override IEnumerable <string> GetHrefValues(CrawledPage crawledPage)
        {
            if (HasRobotsNoFollow(crawledPage))
            {
                return(null);
            }

            IEnumerable <string> hrefValues = crawledPage.AngleSharpHtmlDocument.QuerySelectorAll("a, area")
                                              .Where(e => !HasRelNoFollow(e))
                                              .Select(y => y.GetAttribute("href"))
                                              .Where(a => !string.IsNullOrWhiteSpace(a));

            IEnumerable <string> canonicalHref = crawledPage.AngleSharpHtmlDocument
                                                 .QuerySelectorAll("link")
                                                 .Where(e => HasRelCanonicalPointingToDifferentUrl(e, crawledPage.Uri.ToString()))
                                                 .Select(e => e.GetAttribute("href"));

            return(hrefValues.Concat(canonicalHref));
        }
Example #30
        protected override ProcessorResult ProcessPage(CrawlContext crawlContext, CrawledPage crawledPage)
        {
            nodeQueryList.Add(new KeyValuePair <string, string>("cart", "//a[contains(@href, 'cart')]"));
            nodeQueryList.Add(new KeyValuePair <string, string>("shoppingcart", "//a[contains(@href, 'shoppingcart')]"));
            nodeQueryList.Add(new KeyValuePair <string, string>("checkout", "//a[contains(@href, 'checkout')]"));

            ProcessorResult result = new ProcessorResult {
                UniqueAttributeId = 16
            };

            result.IsAHit = FindTags(crawledPage, crawlContext.RootUri.DnsSafeHost.ToLower());

            if (result.IsAHit)
            {
                result.Attributes.Add(result.UniqueAttributeId.ToString(), "true");
            }

            return(result);
        }
Example #31
        protected override IEnumerable <string> GetHrefValues(CrawledPage crawledPage)
        {
            List <string> hrefValues = new List <string>();

            if (HasRobotsNoFollow(crawledPage))
            {
                return(hrefValues);
            }

            //HtmlNodeCollection productLinkNodes = _helper.GetProductLinkNodes(crawledPage.HtmlDocument);
            //hrefValues.AddRange(GetLinks(productLinkNodes));
            var productLinks = _helper.GetProductLinksFromListPage(crawledPage);

            hrefValues.AddRange(productLinks);
            HtmlNodeCollection pageLinkNodes = _helper.GetPageLinkNodesFromListPage(crawledPage);

            hrefValues.AddRange(GetLinks(pageLinkNodes));
            return(hrefValues.Distinct().ToList());
        }
Example #32
        protected virtual void SchedulePageLinks(CrawledPage crawledPage)
        {
            IEnumerable <Uri> crawledPageLinks = _hyperLinkParser.GetLinks(crawledPage);

            foreach (Uri uri in crawledPageLinks)
            {
                //Added due to a bug in the Uri class related to this (http://stackoverflow.com/questions/2814951/system-uriformatexception-invalid-uri-the-hostname-could-not-be-parsed)
                try
                {
                    PageToCrawl page = new CrawledPage(uri);
                    page.ParentUri  = crawledPage.Uri;
                    page.CrawlDepth = crawledPage.CrawlDepth + 1;
                    page.IsInternal = _isInternalDecisionMaker(uri, _crawlContext.RootUri);
                    page.IsRoot     = false;
                    _scheduler.Add(page);
                }
                catch {}
            }
        }
Example #33
        protected virtual void ProcessPage(PageToCrawl pageToCrawl)
        {
            if (pageToCrawl == null)
            {
                return;
            }

            try
            {
                ThrowIfCancellationRequested();

                AddPageToContext(pageToCrawl);

                CrawledPage crawledPage = CrawlThePage(pageToCrawl);

                FirePageCrawlCompletedEventAsync(crawledPage);

                bool shouldCrawlPageLinks = ShouldCrawlPageLinks(crawledPage);
                if (shouldCrawlPageLinks)
                {
                    ParsePageDocument(crawledPage);
                    SchedulePageLinks(crawledPage);
                }

                if (crawledPage.IsRetry || ShouldRecrawlPage(crawledPage))
                {
                    crawledPage.IsRetry = true;
                    _scheduler.Add(crawledPage);
                }
            }
            catch (OperationCanceledException oce)
            {
                _logger.LogDebug("Thread cancelled while crawling/processing page [{0}]", pageToCrawl.Uri);
                throw oce;
            }
            catch (Exception e)
            {
                _logger.LogError(e, e.Message);
                _crawlResult.Error = e;
                _crawlContext.IsCrawlHardStopRequested = true;
            }
        }
Example #34
        protected virtual void FirePageCrawlCompletedEventAsync(CrawledPage crawledPage)
        {
            EventHandler<PageCrawlCompletedArgs> threadSafeEvent = PageCrawlCompletedAsync;

            if (threadSafeEvent == null)
                return;

            if (_scheduler.Count == 0)
            {
                //Must be fired synchronously to avoid main thread exiting before completion of event handler for first or last page crawled
                try
                {
                    threadSafeEvent(this, new PageCrawlCompletedArgs(_crawlContext, crawledPage));
                }
                catch (Exception e)
                {
                    _logger.Error("An unhandled exception was thrown by a subscriber of the PageCrawlCompleted event for url:" + crawledPage.Uri.AbsoluteUri);
                    _logger.Error(e);
                }
            }
            else
            {
                //Fire each subscriber's delegate async
                foreach (EventHandler<PageCrawlCompletedArgs> del in threadSafeEvent.GetInvocationList())
                {
                    del.BeginInvoke(this, new PageCrawlCompletedArgs(_crawlContext, crawledPage), null, null);
                }
            }
        }
Example #35
        protected virtual CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage)
        {
            CrawlDecision decision = _crawlDecisionMaker.ShouldDownloadPageContent(crawledPage, _crawlContext);
            if (decision.Allow)
                decision = (_shouldDownloadPageContentDecisionMaker != null) ? _shouldDownloadPageContentDecisionMaker.Invoke(crawledPage, _crawlContext) : new CrawlDecision { Allow = true };

            SignalCrawlStopIfNeeded(decision);
            return decision;
        }
 protected override string GetBaseHrefValue(CrawledPage crawledPage)
 {
     string baseTagValue = crawledPage.CsQueryDocument.Select("base").Attr("href") ?? "";
     return baseTagValue.Trim();
 }
Example #37
 protected virtual void FirePageLinksCrawlDisallowedEventAsync(CrawledPage crawledPage, string reason)
 {
     EventHandler<PageLinksCrawlDisallowedArgs> threadSafeEvent = PageLinksCrawlDisallowedAsync;
     if (threadSafeEvent != null)
     {
         //Fire each subscriber's delegate async
         foreach (EventHandler<PageLinksCrawlDisallowedArgs> del in threadSafeEvent.GetInvocationList())
         {
             del.BeginInvoke(this, new PageLinksCrawlDisallowedArgs(_crawlContext, crawledPage, reason), null, null);
         }
     }
 }
Example #38
 protected virtual void FirePageLinksCrawlDisallowedEvent(CrawledPage crawledPage, string reason)
 {
     try
     {
         EventHandler<PageLinksCrawlDisallowedArgs> threadSafeEvent = PageLinksCrawlDisallowed;
         if (threadSafeEvent != null)
             threadSafeEvent(this, new PageLinksCrawlDisallowedArgs(_crawlContext, crawledPage, reason));
     }
     catch (Exception e)
     {
         _logger.Error("An unhandled exception was thrown by a subscriber of the PageLinksCrawlDisallowed event for url:" + crawledPage.Uri.AbsoluteUri);
         _logger.Error(e);
     }
 }
Example #39
 protected virtual bool IsRedirect(CrawledPage crawledPage)
 {
     bool isRedirect = false;
     if (crawledPage.HttpWebResponse != null) {
         isRedirect = (_crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled &&
             crawledPage.HttpWebResponse.ResponseUri != null &&
             crawledPage.HttpWebResponse.ResponseUri.AbsoluteUri != crawledPage.Uri.AbsoluteUri) ||
             (!_crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled &&
             (int) crawledPage.HttpWebResponse.StatusCode >= 300 &&
             (int) crawledPage.HttpWebResponse.StatusCode <= 399);
     }
     return isRedirect;
 }
        protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
        {
            List<string> hrefValues = new List<string>();
            if (HasRobotsNoFollow(crawledPage))
                return hrefValues;

            HtmlNodeCollection aTags = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//a[@href]");
            HtmlNodeCollection areaTags = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//area[@href]");
            HtmlNodeCollection canonicals = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//link[@rel='canonical'][@href]");

            hrefValues.AddRange(GetLinks(aTags));
            hrefValues.AddRange(GetLinks(areaTags));
            hrefValues.AddRange(GetLinks(canonicals));
            hrefValues.AddRange(GetLinksByKeyword(crawledPage, "KeywordExternalLink"));
            hrefValues.AddRange(GetLinksByKeyword(crawledPage, "KeywordID"));

            return hrefValues;
        }
Example #41
        /// <summary>
        /// Retrieve the URI where the specified crawled page was redirected.
        /// </summary>
        /// <remarks>
        /// If HTTP auto-redirection is disabled, this value comes from the 'Location' header of the response.
        /// If auto-redirection is enabled, it comes from the response's ResponseUri property.
        /// </remarks>
        protected virtual Uri ExtractRedirectUri(CrawledPage crawledPage)
        {
            Uri locationUri;
            if (_crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled) {
                // For auto redirects, look for the response uri.
                locationUri = crawledPage.HttpWebResponse.ResponseUri;
            } else {
                // For manual redirects, we need to look for the location header.
                var location = crawledPage.HttpWebResponse.Headers["Location"];

                // Check if the location is absolute. If not, create an absolute uri.
                if (!Uri.TryCreate(location, UriKind.Absolute, out locationUri))
                {
                    Uri baseUri = new Uri(crawledPage.Uri.GetLeftPart(UriPartial.Authority));
                    locationUri = new Uri(baseUri, location);
                }
            }
            return locationUri;
        }
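Note that the manual branch resolves a relative Location value against the authority, not against the full page path. A stand-alone sketch of that resolution using the same Uri calls (values illustrative):

        var pageUri  = new Uri("http://a.com/old/section/page.html");
        var location = "/new/page.html"; // relative 'Location' header value

        Uri locationUri;
        if (!Uri.TryCreate(location, UriKind.Absolute, out locationUri))
        {
            Uri baseUri = new Uri(pageUri.GetLeftPart(UriPartial.Authority)); // http://a.com
            locationUri = new Uri(baseUri, location);                         // http://a.com/new/page.html
        }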
        private IEnumerable<string> GetLinksByKeyword(CrawledPage crawledPage, string keyword)
        {
            List<string> result = new List<string>();
            string keywordValue = string.Empty;

            if (Utility.GetConfigurationValue(_config, keyword, out keywordValue))
            {
                if (!string.IsNullOrEmpty(keywordValue))
                {
                    switch (keyword)
                    {
                        case "KeywordExternalLink":
                            MatchCollection matches = Regex.Matches(crawledPage.HtmlDocument.DocumentNode.OuterHtml, keywordValue + @"&quot;:&quot;[A-Za-z|-]+/\d{1,10}&quot;,", RegexOptions.IgnoreCase);
                            foreach (Match item in matches)
                            {
                                Match match = Regex.Match(item.Value, @"[A-Za-z|-]+/\d{1,10}", RegexOptions.IgnoreCase);
                                result.Add(Utility.ConvertRelativeUrl(crawledPage.ParentUri.AbsoluteUri, match.Value));
                            }
                            break;
                        case "KeywordID":
                            matches = Regex.Matches(crawledPage.HtmlDocument.DocumentNode.OuterHtml, @"],&quot;" + keywordValue + @"&quot;:\d{1,10},", RegexOptions.IgnoreCase);
                            foreach (Match item in matches)
                            {
                                Match match = Regex.Match(item.Value, @"\d{1,10}", RegexOptions.IgnoreCase);
                                result.Add(Utility.ConvertRelativeUrl(crawledPage.ParentUri.AbsoluteUri, "/hands-on-labs/" + match.Value));
                            }
                            break;
                        default:
                            break;
                    }
                }
            }

            return result;
        }
 protected override string GetMetaRobotsValue(CrawledPage crawledPage)
 {
     return crawledPage.CsQueryDocument["meta[name]"].Filter(d => d.Name.ToLowerInvariant() == "robots").Attr("content");
 }
Example #44
 protected virtual void ParsePageLinks(CrawledPage crawledPage)
 {
     crawledPage.ParsedLinks = _hyperLinkParser.GetLinks(crawledPage);
 }
Example #45
        protected virtual void ProcessRedirect(CrawledPage crawledPage)
        {
            if (crawledPage.RedirectPosition >= 20)
                _logger.WarnFormat("Page [{0}] is part of a chain of 20 or more consecutive redirects, redirects for this chain will now be aborted.", crawledPage.Uri);

            try
            {
                var uri = ExtractRedirectUri(crawledPage);

                PageToCrawl page = new PageToCrawl(uri);
                page.ParentUri = crawledPage.ParentUri;
                page.CrawlDepth = crawledPage.CrawlDepth;
                page.IsInternal = IsInternalUri(uri);
                page.IsRoot = false;
                page.RedirectedFrom = crawledPage;
                page.RedirectPosition = crawledPage.RedirectPosition + 1;

                crawledPage.RedirectedTo = page;
                _logger.DebugFormat("Page [{0}] is requesting that it be redirect to [{1}]", crawledPage.Uri, crawledPage.RedirectedTo.Uri);

                if (ShouldSchedulePageLink(page))
                {
                    if (_scheduler.IsUriKnown(uri))
                    {
                        _logger.InfoFormat("Page [{0}] is redirected to [{1}], which is a page already crawled.", crawledPage.Uri, crawledPage.RedirectedTo.Uri);
                    }
                    else
                    {
                        _logger.InfoFormat("Page [{0}] will be redirect to [{1}]", crawledPage.Uri, crawledPage.RedirectedTo.Uri);
                        _scheduler.Add(page);
                    }
                }
            }
            catch {}
        }
Example #46
        protected virtual void SchedulePageLinks(CrawledPage crawledPage)
        {
            foreach (Uri uri in crawledPage.ParsedLinks)
            {
                // First check that the link was not already visited or added to the list of pages to visit, so we don't
                // run the same validation and fire the same events twice.
                if (!_scheduler.IsUriKnown(uri) &&
                    (_shouldScheduleLinkDecisionMaker == null || _shouldScheduleLinkDecisionMaker.Invoke(uri, crawledPage, _crawlContext))) {
                    try //Added due to a bug in the Uri class related to this (http://stackoverflow.com/questions/2814951/system-uriformatexception-invalid-uri-the-hostname-could-not-be-parsed)
                    {
                        PageToCrawl page = new PageToCrawl(uri);
                        page.ParentUri = crawledPage.Uri;
                        page.CrawlDepth = crawledPage.CrawlDepth + 1;
                        page.IsInternal = IsInternalUri(uri);
                        page.IsRoot = false;

                        if (ShouldSchedulePageLink(page))
                        {
                            _scheduler.Add(page);
                        }
                    }
                    catch { }
                }

                // Add this link to the list of known Urls so validations are not duplicated in the future.
                _scheduler.AddKnownUri(uri);
            }
        }
Example #47
        protected virtual bool ShouldCrawlPageLinks(CrawledPage crawledPage)
        {
            CrawlDecision shouldCrawlPageLinksDecision = _crawlDecisionMaker.ShouldCrawlPageLinks(crawledPage, _crawlContext);
            if (shouldCrawlPageLinksDecision.Allow)
                shouldCrawlPageLinksDecision = (_shouldCrawlPageLinksDecisionMaker != null) ? _shouldCrawlPageLinksDecisionMaker.Invoke(crawledPage, _crawlContext) : new CrawlDecision { Allow = true };

            if (!shouldCrawlPageLinksDecision.Allow)
            {
                _logger.DebugFormat("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldCrawlPageLinksDecision.Reason);
                FirePageLinksCrawlDisallowedEventAsync(crawledPage, shouldCrawlPageLinksDecision.Reason);
                FirePageLinksCrawlDisallowedEvent(crawledPage, shouldCrawlPageLinksDecision.Reason);
            }

            SignalCrawlStopIfNeeded(shouldCrawlPageLinksDecision);
            return shouldCrawlPageLinksDecision.Allow;
        }
Example #48
        protected virtual bool ShouldRecrawlPage(CrawledPage crawledPage)
        {
            //TODO No unit tests cover these lines
            CrawlDecision shouldRecrawlPageDecision = _crawlDecisionMaker.ShouldRecrawlPage(crawledPage, _crawlContext);
            if (shouldRecrawlPageDecision.Allow)
                shouldRecrawlPageDecision = (_shouldRecrawlPageDecisionMaker != null) ? _shouldRecrawlPageDecisionMaker.Invoke(crawledPage, _crawlContext) : new CrawlDecision { Allow = true };

            if (!shouldRecrawlPageDecision.Allow)
            {
                _logger.DebugFormat("Page [{0}] not recrawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldRecrawlPageDecision.Reason);
            }
            else
            {
                // Look for the Retry-After header in the response.
                crawledPage.RetryAfter = null;
                if (crawledPage.HttpWebResponse != null &&
                    crawledPage.HttpWebResponse.Headers != null)
                {
                    string value = crawledPage.HttpWebResponse.GetResponseHeader("Retry-After");
                    if (!String.IsNullOrEmpty(value))
                    {
                        // Try to convert the value to a DateTime first, then to a double.
                        DateTime date;
                        double seconds;
                        if (crawledPage.LastRequest.HasValue && DateTime.TryParse(value, out date))
                        {
                            crawledPage.RetryAfter = (date - crawledPage.LastRequest.Value).TotalSeconds;
                        }
                        else if (double.TryParse(value, out seconds))
                        {
                            crawledPage.RetryAfter = seconds;
                        }
                    }
                }
            }

            SignalCrawlStopIfNeeded(shouldRecrawlPageDecision);
            return shouldRecrawlPageDecision.Allow;
        }
        public virtual CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage, CrawlContext crawlContext)
        {
            if (crawledPage == null)
                return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

            if (crawlContext == null)
                return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

            if (crawledPage.HttpWebResponse == null)
                return new CrawlDecision { Allow = false, Reason = "Null HttpWebResponse" };

            if (crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
                return new CrawlDecision { Allow = false, Reason = "HttpStatusCode is not 200" };

            string pageContentType = crawledPage.HttpWebResponse.ContentType.ToLower().Trim();
            bool isDownloadable = false;
            List<string> cleanDownloadableContentTypes = crawlContext.CrawlConfiguration.DownloadableContentTypes
                .Split(',')
                .Select(t => t.Trim())
                .Where(t => !string.IsNullOrEmpty(t))
                .ToList();

            foreach (string downloadableContentType in cleanDownloadableContentTypes)
            {
                if (pageContentType.Contains(downloadableContentType.ToLower().Trim()))
                {
                    isDownloadable = true;
                    break;
                }
            }
            if (!isDownloadable)
                return new CrawlDecision { Allow = false, Reason = "Content type is not any of the following: " + string.Join(",", cleanDownloadableContentTypes) };

            if (crawlContext.CrawlConfiguration.MaxPageSizeInBytes > 0 && crawledPage.HttpWebResponse.ContentLength > crawlContext.CrawlConfiguration.MaxPageSizeInBytes)
                return new CrawlDecision { Allow = false, Reason = string.Format("Page size of [{0}] bytes is above the max allowable of [{1}] bytes", crawledPage.HttpWebResponse.ContentLength, crawlContext.CrawlConfiguration.MaxPageSizeInBytes) };

            return new CrawlDecision { Allow = true };
        }
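The decision above is driven by two CrawlConfiguration values: DownloadableContentTypes, a comma-separated list matched against the response Content-Type, and MaxPageSizeInBytes, where 0 disables the size check. A sketch with illustrative values:

        var config = new CrawlConfiguration
        {
            DownloadableContentTypes = "text/html, text/plain",
            MaxPageSizeInBytes       = 1024 * 1024 // responses larger than 1 MB are not downloaded
        };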
Example #50
        /// <summary>
        /// Validate that the Root page was not redirected. If the root page is redirected, we assume that the root uri
        /// should be changed to the uri where it was redirected.
        /// </summary>
        protected virtual void ValidateRootUriForRedirection(CrawledPage crawledRootPage)
        {
            if (!crawledRootPage.IsRoot) {
                throw new ArgumentException("The crawled page must be the root page to be validated for redirection.");
            }

            if (IsRedirect(crawledRootPage)) {
                _crawlContext.RootUri = ExtractRedirectUri(crawledRootPage);
                _logger.InfoFormat("The root URI [{0}] was redirected to [{1}]. [{1}] is the new root.",
                    _crawlContext.OriginalRootUri,
                    _crawlContext.RootUri);
            }
        }
        protected override string GetMetaRobotsValue(CrawledPage crawledPage)
        {
            string robotsMeta = null;
            HtmlNode robotsNode = crawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//meta[translate(@name,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='robots']");
            if (robotsNode != null)
                robotsMeta = robotsNode.GetAttributeValue("content", "");

            return robotsMeta;
        }
Example #52
 protected virtual bool PageSizeIsAboveMax(CrawledPage crawledPage)
 {
     bool isAboveMax = false;
     if (_crawlContext.CrawlConfiguration.MaxPageSizeInBytes > 0 &&
         crawledPage.Content.Bytes != null &&
         crawledPage.Content.Bytes.Length > _crawlContext.CrawlConfiguration.MaxPageSizeInBytes)
     {
         isAboveMax = true;
         _logger.InfoFormat("Page [{0}] has a page size of [{1}] bytes which is above the [{2}] byte max, no further processing will occur for this page", crawledPage.Uri, crawledPage.Content.Bytes.Length, _crawlContext.CrawlConfiguration.MaxPageSizeInBytes);
     }
     return isAboveMax;
 }
        public virtual CrawlDecision ShouldRecrawlPage(CrawledPage crawledPage, CrawlContext crawlContext)
        {
            if (crawledPage == null)
                return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

            if (crawlContext == null)
                return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

            if (crawledPage.WebException == null)
                return new CrawlDecision { Allow = false, Reason = "WebException did not occur"};

            if (crawlContext.CrawlConfiguration.MaxRetryCount < 1)
                return new CrawlDecision { Allow = false, Reason = "MaxRetryCount is less than 1"};

            if (crawledPage.RetryCount >= crawlContext.CrawlConfiguration.MaxRetryCount)
                return new CrawlDecision {Allow = false, Reason = "MaxRetryCount has been reached"};

            return new CrawlDecision { Allow = true };
        }
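Recrawling is therefore opt-in: it only applies to pages whose request ended in a WebException, and only when MaxRetryCount is at least 1. A sketch with an illustrative value:

        var config = new CrawlConfiguration();
        config.MaxRetryCount = 3; // a failed page is re-queued until its RetryCount reaches 3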