public void ShouldCrawlPage_OverMaxPagesToCrawlPerDomain_IsRetry_ReturnsTrue()
        {
            Uri uri = new Uri("http://a.com/");
            CrawlConfiguration config = new CrawlConfiguration
            {
                MaxPagesToCrawlPerDomain = 100
            };
            ConcurrentDictionary<string, int> countByDomain = new ConcurrentDictionary<string, int>();

            countByDomain.TryAdd(uri.Authority, 100);
            CrawlContext crawlContext = new CrawlContext
            {
                CrawlConfiguration = config,
                CrawlStartDate     = DateTime.Now,
                CrawlCountByDomain = countByDomain
            };

            CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
                new PageToCrawl(new Uri(uri.AbsoluteUri + "anotherpage"))
            {
                IsRetry    = true,
                IsInternal = true
            },
                crawlContext);

            Assert.IsTrue(result.Allow);
            Assert.IsFalse(result.ShouldHardStopCrawl);
            Assert.IsFalse(result.ShouldStopCrawl);
        }
        public CrawlArgs(CrawlContext crawlContext)
        {
            if (crawlContext == null)
                throw new ArgumentNullException("crawlContext");

            CrawlContext = crawlContext;
        }
Example #3
        /// <summary>
        /// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
        /// </summary>
        /// <param name="threadManager">Distributes http requests over multiple threads</param>
        /// <param name="scheduler">Decides what link should be crawled next</param>
        /// <param name="pageRequester">Makes the raw http requests</param>
        /// <param name="hyperLinkParser">Parses a crawled page for it's hyperlinks</param>
        /// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
        /// <param name="crawlConfiguration">Configurable crawl values</param>
        /// <param name="memoryManager">Checks the memory usage of the host process</param>
        public WebCrawler(
            CrawlConfiguration crawlConfiguration,
            ICrawlDecisionMaker crawlDecisionMaker,
            IThreadManager threadManager,
            IScheduler scheduler,
            IPageRequester pageRequester,
            IHyperLinkParser hyperLinkParser,
            IMemoryManager memoryManager)
        {
            _crawlContext = new CrawlContext();
            _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile();
            CrawlBag = _crawlContext.CrawlBag;

            _threadManager      = threadManager ?? new TaskThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0 ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads : Environment.ProcessorCount);
            _scheduler          = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
            _pageRequester      = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
            _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

            if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 ||
                _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
            {
                _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));
            }

            _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser(_crawlContext.CrawlConfiguration.IsRespectMetaRobotsNoFollowEnabled, _crawlContext.CrawlConfiguration.IsRespectAnchorRelNoFollowEnabled);

            _crawlContext.Scheduler = _scheduler;
        }
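        // Usage sketch, not from the original source: passing null for every dependency
        // falls back to the defaults described in the summary above. The Crawl(Uri,
        // CancellationTokenSource) overload is assumed from the IWebCrawler mock used in
        // the tests elsewhere in this file.
        public static CrawlResult CrawlWithDefaults(Uri rootUri)
        {
            var crawler = new WebCrawler(null, null, null, null, null, null, null);
            var cancellationTokenSource = new CancellationTokenSource();

            return crawler.Crawl(rootUri, cancellationTokenSource);
        }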
        public void Consume_ValidDomain_CrawlerCrawlBagSet()
        {
            //Arrange
            Domain domain = new Domain {
                DomainId = 1, Uri = new Uri("http://a.com")
            };
            CrawlContext context    = GetCrawlContext(_dummyCrawlProcessors);
            CrawlResult  fakeResult = new CrawlResult {
                CrawlContext = context
            };

            _fakeWebCrawlerFactory.Setup(f => f.CreateInstance()).Returns(_fakeWebCrawler.Object);
            _fakeWebCrawler.Setup(f => f.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>())).Returns(fakeResult);
            _fakeProcessorProvider.Setup(f => f.GetProcessors()).Returns(_dummyCrawlProcessors);

            //Act
            DomainCrawlResult result = _uut.Consume(domain, _dummyCancellationToken);

            //Assert
            _fakeProcessorProvider.Verify(f => f.GetProcessors(), Times.Exactly(1));
            _fakeWebCrawlerFactory.Verify(f => f.CreateInstance(), Times.Exactly(1));
            _fakeWebCrawler.Verify(f => f.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>()), Times.Exactly(1));

            Assert.AreEqual(domain, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.Domain);
            Assert.AreEqual(_dummyProcessorContext.PrimaryPersistenceProvider, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.PrimaryPersistenceProvider);
            Assert.AreEqual(_dummyProcessorContext.BackupPersistenceProvider, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.BackupPersistenceProvider);
            Assert.AreEqual(_dummyCrawlProcessors, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.CrawlProcessors);
        }
Example #5
        private static Crawler CreateCrawler(CrawlContext context)
        {
            var crawler = new Crawler(context.Sequence);
            crawler.PauseInterval = context.Pause;

            if (context.QueryType == QueryType.Name)
            {
                ExtraSetupForNameBasedCrawler(crawler, context);
            }

            crawler.AfterCrawl += (sender, e) =>
            {
                var progress = e.Progress;
                var message = string.Format(GetMessageFormat(context.MaximumTry),
                    progress.Current, progress.Total, context.CancelRate, progress.CurrentKeyword, progress.Message);
                if (e.Error == null)
                {
                    PowerConsole.Info(message);
                }
                else
                {
                    PowerConsole.Error(message);
                }
            };

            return crawler;
        }
Example #6
        /// <summary>
        /// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
        /// </summary>
        /// <param name="threadManager">Distributes http requests over multiple threads</param>
        /// <param name="scheduler">Decides what link should be crawled next</param>
        /// <param name="httpRequester">Makes the raw http requests</param>
        /// <param name="hyperLinkParser">Parses a crawled page for it's hyperlinks</param>
        /// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
        /// <param name="crawlConfiguration">Configurable crawl values</param>
        public WebCrawler(
            CrawlConfiguration crawlConfiguration,
            ICrawlDecisionMaker crawlDecisionMaker,
            IThreadManager threadManager,
            IScheduler scheduler,
            IPageRequester httpRequester,
            IHyperLinkParser hyperLinkParser,
            IMemoryManager memoryManager)
        {
            _crawlContext = new CrawlContext();
            _crawlContext.CrawlConfiguration = crawlConfiguration ?? GetCrawlConfigurationFromConfigFile() ?? new CrawlConfiguration();
            CrawlBag = _crawlContext.CrawlBag;

            _threadManager      = threadManager ?? new ManualThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads);
            _scheduler          = scheduler ?? new FifoScheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled);
            _httpRequester      = httpRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
            _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

            if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 ||
                _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
            {
                _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));
            }

            _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser();

            _crawlContext.Scheduler = _scheduler;
        }
Example #7
        protected override ProcessorResult ProcessPage(CrawlContext crawlContext, CrawledPage crawledPage)
        {
            ProcessorResult result = new ProcessorResult
            {
                UniqueAttributeId = 222
            };

            Match regexResult = wordPressPattern.Match(crawledPage.RawContent);

            if (regexResult.Success)
            {
                result.Attributes.Add("siteBuilder", "BlogWordPress");
                result.IsAHit = true;
                return(result);
            }

            HtmlNodeCollection listhref = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//a[@href]") ?? new HtmlNodeCollection(null);

            if (listhref.Select(node => node.GetAttributeValue("href", "")).Any(content => content.Contains("wordpress.org")))
            {
                result.Attributes.Add("siteBuilder", "BlogWordPress");
                result.IsAHit = true;
                return(result);
            }

            return(result);
        }
        protected override ProcessorResult ProcessPage(CrawlContext crawlContext, CrawledPage crawledPage)
        {
            nodeQueryList.Add(new KeyValuePair<string, string>("login", "//a[contains(@href, 'login')]"));
            nodeQueryList.Add(new KeyValuePair<string, string>("signin", "//a[contains(@href, 'signin')]"));

            ProcessorResult result = new ProcessorResult {
                UniqueAttributeId = 17
            };

            //<input type="password"
            var pwdInputs = crawledPage.CsQueryDocument.Select("input[type='password']");

            if (pwdInputs.Length > 0)
            {
                result.IsAHit = true;
            }

            //check links
            if (!result.IsAHit)
            {
                result.IsAHit = FindTags(crawledPage, crawlContext.RootUri.DnsSafeHost.ToLower());
            }

            //if we found it, set it
            if (result.IsAHit)
            {
                result.Attributes.Add(result.UniqueAttributeId.ToString(), "true");
            }

            return(result);
        }
        public void ShouldCrawlPage_OverMaxPagesToCrawlPerDomain_ReturnsFalse()
        {
            Uri uri = new Uri("http://a.com/");
            CrawlConfiguration config = new CrawlConfiguration
            {
                MaxPagesToCrawlPerDomain = 100
            };
            ConcurrentDictionary<string, int> countByDomain = new ConcurrentDictionary<string, int>();

            countByDomain.TryAdd(uri.Authority, 100);
            CrawlContext crawlContext = new CrawlContext
            {
                CrawlConfiguration = config,
                CrawlStartDate     = DateTime.Now,
                CrawlCountByDomain = countByDomain
            };

            CrawlDecision result = _unitUnderTest.ShouldCrawlPage(
                new PageToCrawl(new Uri(uri.AbsoluteUri + "anotherpage"))
            {
                IsInternal = true
            },
                crawlContext);

            Assert.IsFalse(result.Allow);
            Assert.AreEqual("MaxPagesToCrawlPerDomain limit of [100] has been reached for domain [a.com]", result.Reason);
            Assert.IsFalse(crawlContext.IsCrawlStopRequested);
        }
Example #10
        /// <summary>
        /// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
        /// </summary>
        /// <param name="threadManager">Distributes http requests over multiple threads</param>
        /// <param name="scheduler">Decides what link should be crawled next</param>
        /// <param name="pageRequester">Makes the raw http requests</param>
        /// <param name="htmlParser">Parses a crawled page for it's hyperlinks</param>
        /// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
        /// <param name="crawlConfiguration">Configurable crawl values</param>
        /// <param name="memoryManager">Checks the memory usage of the host process</param>
        public WebCrawler(
            CrawlConfiguration crawlConfiguration,
            ICrawlDecisionMaker crawlDecisionMaker,
            IThreadManager threadManager,
            IScheduler scheduler,
            IPageRequester pageRequester,
            IHtmlParser htmlParser,
            IMemoryManager memoryManager)
        {
            _crawlContext = new CrawlContext
            {
                CrawlConfiguration = crawlConfiguration ?? new CrawlConfiguration()
            };
            CrawlBag = _crawlContext.CrawlBag;

            _threadManager      = threadManager ?? new TaskThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0 ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads : Environment.ProcessorCount);
            _scheduler          = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
            _pageRequester      = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration, new WebContentExtractor());
            _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

            if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0 ||
                _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
            {
                _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));
            }

            _htmlParser = htmlParser ?? new AngleSharpHyperlinkParser(_crawlContext.CrawlConfiguration, null);

            _crawlContext.Scheduler = _scheduler;
        }
        public void ProcessCrawledDomain(CrawlContext crawlContext)
        {
            string webhost = string.Empty;

            try
            {
                webhost = Dig.Instance.GetWebHostName(crawlContext.RootUri.DnsSafeHost);
            }
            catch (Exception e)
            {
                _logger.ErrorFormat("Exception occurred getting webhost name for [{0}]", crawlContext.RootUri.DnsSafeHost, e);
            }

            ProcessorResult result = new ProcessorResult {
                UniqueAttributeId = ATTRIB_TYPE_ID
            };                                                                                               //mask

            result.IsAHit = webhost != "None";
            result.Attributes.Add(ATTRIB_TYPE_ID.ToString(), webhost);

            if (result.IsAHit)
            {
                DomainSave(crawlContext, result);
            }
        }
        public virtual CrawlDecision ShouldRecrawlPage(CrawledPage crawledPage, CrawlContext crawlContext)
        {
            if (crawledPage == null)
            {
                return(CrawlDecision.DisallowCrawl("Null crawled page"));
            }

            if (crawlContext == null)
            {
                return(CrawlDecision.DisallowCrawl("Null crawl context"));
            }

            if (crawledPage.Exception == null)
            {
                return(CrawlDecision.DisallowCrawl("WebException did not occur"));
            }

            if (crawlContext.CrawlConfiguration.MaxRetryCount < 1)
            {
                return(CrawlDecision.AllowCrawl("无限次重试"));
            }

            if (crawledPage.RetryCount >= crawlContext.CrawlConfiguration.MaxRetryCount)
            {
                return(CrawlDecision.DisallowCrawl("MaxRetryCount has been reached"));
            }

            return(CrawlDecision.AllowCrawl());
        }
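        // Configuration sketch (an assumption, not from the original source): in the
        // method above a MaxRetryCount below 1 means retries are unlimited, so a
        // bounded-retry crawl would set an explicit value.
        private static CrawlConfiguration CreateBoundedRetryConfiguration()
        {
            return new CrawlConfiguration
            {
                MaxRetryCount = 3 // allow at most three retries per page
            };
        }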
        public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            if (pageToCrawl == null)
            {
                return(CrawlDecision.DisallowCrawl("Null crawled page"));
            }

            if (crawlContext == null)
            {
                return(CrawlDecision.DisallowCrawl("Null crawl context"));
            }

            if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
            {
                return(CrawlDecision.DisallowCrawl("Crawl depth is above max"));
            }

            if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
            {
                return(CrawlDecision.DisallowCrawl("Scheme does not begin with http"));
            }

            //TODO Do we want to ignore redirect chains (i.e., do not treat them as separate page crawls)?
            if (!pageToCrawl.IsRetry &&
                crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
                crawlContext.CrawledCount > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
            {
                return(CrawlDecision.DisallowCrawl(string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl)));
            }

            return(CrawlDecision.AllowCrawl());
        }
        protected void DomainSave(CrawlContext crawlContext, ProcessorResult processorResult)
        {
            ProcessorContext processorContext = crawlContext.CrawlBag.GoDaddyProcessorContext;
            DataComponent    dataComponent    = new DataComponent
            {
                ShopperId   = processorContext.Domain.ShopperId,
                AttributeId = processorResult.UniqueAttributeId,
                DomainId    = processorContext.Domain.DomainId,
                Attributes  = processorResult.Attributes,
                DomainUri   = crawlContext.RootUri,
                FoundOnUri  = null
            };

            try
            {
                processorContext.PrimaryPersistenceProvider.Save(dataComponent);
            }
            catch (Exception e)
            {
                _logger.ErrorFormat("Error while trying to save domain level data to primary IPersistenceProvider [{0}], will save to backup IPersistenceProvider [{1}]."
                                    , processorContext.PrimaryPersistenceProvider.ToString()
                                    , processorContext.BackupPersistenceProvider.ToString());
                _logger.Error(e);

                processorContext.BackupPersistenceProvider.Save(dataComponent);
            }
        }
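        // Minimal sketch (an assumption, not part of the original source): an in-memory
        // IPersistenceProvider that satisfies the Save(DataComponent) call made by
        // DomainSave above. The interface shape is inferred from the calls in this file;
        // real providers presumably persist to a database or queue.
        public class InMemoryPersistenceProvider : IPersistenceProvider
        {
            private readonly ConcurrentQueue<DataComponent> _saved = new ConcurrentQueue<DataComponent>();

            public void Save(DataComponent dataComponent)
            {
                if (dataComponent == null)
                    throw new ArgumentNullException("dataComponent");

                _saved.Enqueue(dataComponent);
            }
        }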
Example #15
        private CrawlDecision ShouldCrawlPageContent(CrawledPage page, CrawlContext context)
        {
            var result = new CrawlDecision();

            if (page.Uri.ToString().Contains("product") ||
                //page.Uri.ToString().Contains("lenovo") ||
                //page.Uri.ToString().Contains("laptop") ||
                page.Uri.ToString().Contains("productVariantGroup") ||
                page.Uri.ToString().Contains("-pc"))
            {
                result.Allow = true;
                if (page.Uri.ToString().Contains("-pch"))
                {
                    result.Reason = "Not a product";
                    result.Allow  = false;
                }
            }
            else
            {
                result.Reason = "Not a product";
                result.Allow  = false;
            }

            return(result);
        }
Example #16
        public static void Main(string[] args)
        {
            try
            {
                if (args.Length < 4)
                {
                    throw new ArgumentException("Invalid number of arguments!");
                }

                string sequenceType = args[0];
                string start = args[1];
                long max = long.Parse(args[2]);
                int pause = int.Parse(args[3]); // Pause interval

                CrawlContext context = GetCrawlContext(sequenceType, start, max, pause);
                Crawler crawler = CreateCrawler(context);

                crawler.Crawl(max, context.QueryType);
            }
            catch (Exception e)
            {
                Console.WriteLine(e);
                Trace.TraceError(e.ToString());
            }
        }
Example #17
        private const string WEBSITE = "http://volarenovels.com/release-that-witch/"; //include http://

        private static CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            if (pageToCrawl.Uri.AbsoluteUri.Contains("volarenovels.com/release-that-witch/"))
            {
                String segment = pageToCrawl.Uri.Segments[pageToCrawl.Uri.Segments.Length - 1];
                if (segment.Contains("rw-chapter-"))
                {
                    segment = segment.Replace("/", "").Substring("rw-chapter-".Length);
                    if (Convert.ToInt32(segment) > 186)
                    {
                        return(new CrawlDecision {
                            Allow = true, Reason = "Is a chapter"
                        });
                    }
                }
            }
            if (pageToCrawl.Uri.AbsoluteUri == "http://volarenovels.com/release-that-witch/")
            {
                return(new CrawlDecision {
                    Allow = true, Reason = "Is content"
                });
            }

            return(new CrawlDecision {
                Allow = false, Reason = "Is not a chapter"
            });
        }
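        // Usage sketch (an assumption): wiring the ShouldCrawlPage delegate above into an
        // Abot crawler via its delegate short-circuit registration. The exact registration
        // API can differ between Abot versions.
        private static void CrawlReleaseThatWitch()
        {
            var crawler = new PoliteWebCrawler();

            crawler.ShouldCrawlPage(ShouldCrawlPage);
            crawler.Crawl(new Uri(WEBSITE));
        }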
Example #18
 public void SetUp()
 {
     _crawlContext = new CrawlContext();
     _crawlContext.CrawlConfiguration = new CrawlConfiguration {
         UserAgentString = "aaa"
     };
     _unitUnderTest = new CrawlDecisionMaker();
 }
        public PageCrawlStartingArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl)
            : base(crawlContext)
        {
            if (pageToCrawl == null)
                throw new ArgumentNullException("pageToCrawl");

            PageToCrawl = pageToCrawl;
        }
        public void Constructor_ValidArg_SetsPublicProperty()
        {
            CrawledPage  page    = new CrawledPage(new Uri("http://aaa.com/"));
            CrawlContext context = new CrawlContext();
            CrawlArgs    args    = new CrawlArgs(context);

            Assert.AreSame(context, args.CrawlContext);
        }
Example #21
        public CrawlArgs(CrawlContext crawlContext)
        {
            if (crawlContext == null)
            {
                throw new ArgumentNullException("crawlContext");
            }

            CrawlContext = crawlContext;
        }
Example #22
        static void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
        {
            string       childUrl     = e.PageToCrawl.Uri.AbsoluteUri;
            string       parentUrl    = e.PageToCrawl.ParentUri.AbsoluteUri;
            CrawlContext context      = e.CrawlContext;
            CrawledLinks crawledLinks = context.CrawlBag.CrawledLinks;

            crawledLinks.AddRelation(parentUrl, childUrl);
        }
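        // Hypothetical sketch (not from the original source): a minimal CrawledLinks type
        // supporting the AddRelation(parentUrl, childUrl) call used by the handler above.
        // The real class stored in CrawlBag.CrawledLinks may track additional state.
        public class CrawledLinks
        {
            private readonly ConcurrentDictionary<string, HashSet<string>> _childrenByParent =
                new ConcurrentDictionary<string, HashSet<string>>();

            public void AddRelation(string parentUrl, string childUrl)
            {
                HashSet<string> children = _childrenByParent.GetOrAdd(parentUrl, _ => new HashSet<string>());

                lock (children)
                {
                    children.Add(childUrl);
                }
            }
        }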
Example #23
        public CrawlEventArgs(CrawlContext crawlContext)
        {
            if (crawlContext == null)
            {
                throw new ArgumentNullException(nameof(crawlContext));
            }

            CrawlContext = crawlContext;
        }
        public PageCrawlStartingArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl)
            : base(crawlContext)
        {
            if (pageToCrawl == null)
            {
                throw new ArgumentNullException("pageToCrawl");
            }

            PageToCrawl = pageToCrawl;
        }
        public PageCrawlDisallowedArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl, string disallowedReason)
            : base(crawlContext, pageToCrawl)
        {
            if (string.IsNullOrWhiteSpace(disallowedReason))
            {
                throw new ArgumentNullException(nameof(disallowedReason));
            }

            DisallowedReason = disallowedReason;
        }
        public PageLinksCrawlDisallowedArgs(CrawlContext crawlContext, CrawledPage crawledPage, string disallowedReason)
            : base(crawlContext, crawledPage)
        {
            if (string.IsNullOrWhiteSpace(disallowedReason))
            {
                throw new ArgumentNullException("disallowedReason");
            }

            DisallowedReason = disallowedReason;
        }
 public void SetUp()
 {
     _fakeScheduler = new Mock<IScheduler>();
     _crawlContext  = new CrawlContext();
     _crawlContext.CrawlConfiguration = new CrawlConfiguration {
         UserAgentString = "aaa"
     };
     _crawlContext.Scheduler = _fakeScheduler.Object;
     _unitUnderTest          = new CrawlDecisionMaker();
 }
Example #28
        public PageCrawlCompletedArgs(CrawlContext crawlContext, CrawledPage crawledPage)
            : base(crawlContext)
        {
            if (crawledPage == null)
            {
                throw new ArgumentNullException("crawledPage");
            }

            CrawledPage = crawledPage;
        }
Example #29
        /// <summary>
        /// Creates the event arguments for a completed crawl.
        /// </summary>
        /// <param name="crawlContext">The crawl context</param>
        /// <param name="crawlResult">The crawl result</param>
        public CrawlCompletedArgs(CrawlContext crawlContext, CrawlResult crawlResult)
            : base(crawlContext)
        {
            if (crawlResult == null)
            {
                throw new ArgumentNullException("crawlResult");
            }

            Result = crawlResult;
        }
        public PageCrawlEventStartingEventArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl)
            : base(crawlContext)
        {
            if (pageToCrawl == null)
            {
                throw new ArgumentNullException(nameof(pageToCrawl));
            }

            PageToCrawl = pageToCrawl;
        }
Example #31
        public PageCrawlEventCompletedEventArgs(CrawlContext crawlContext, CrawledPage crawledPage)
            : base(crawlContext)
        {
            if (crawledPage == null)
            {
                throw new ArgumentNullException(nameof(crawledPage));
            }

            CrawledPage = crawledPage;
        }
Example #32
        private void ProcessCrawledPage(CrawlContext crawlContext, CrawledPage crawledPage)
        {
            if (!IsHttpStatusInConfig(crawledPage))
            {
                return;
            }

            if (!IsMimeTypesToProcessInConfig(crawledPage))
            {
                return;
            }

            int timeoutInMilliSecs = _config.MaxPageProcessorTimeInMilliSecs;
            IEnumerable<ICrawlProcessor> processors =
                crawlContext.CrawlBag.GoDaddyProcessorContext.CrawlProcessors;

            //Did not use Parallel.ForEach because it would spawn too many threads and cause heavy thrashing; most processors can take up to 30 secs to finish
            foreach (ICrawlProcessor processor in processors)
            {
                Stopwatch timer = Stopwatch.StartNew();
                try
                {
                    processor.ProcessCrawledPage(crawlContext, crawledPage);
                    timer.Stop();

                    if (timer.ElapsedMilliseconds > timeoutInMilliSecs)
                    {
                        _logger.ErrorFormat(
                            "Crawled page processor [{0}] completed processing page [{1}] in [{2}] millisecs, which is above configuration value MaxPageProcessorTimeInMilliSecs",
                            processor.ToString(), crawledPage.Uri, timer.ElapsedMilliseconds);
                    }
                    else
                    {
                        _logger.DebugFormat(
                            "Crawled page processor [{0}] completed processing page [{1}] in [{2}] millisecs",
                            processor.ToString(), crawledPage.Uri, timer.ElapsedMilliseconds);
                    }
                }
                catch (Exception e)
                {
                    _logger.ErrorFormat(
                        "Crawled page processor [{0}] threw exception while processing page [{1}]",
                        processor.ToString(), crawledPage.Uri);
                    _logger.Error(e);
                }
                finally
                {
                    if (timer != null && timer.IsRunning)
                    {
                        timer.Stop();
                    }
                }
            }
        }
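        // Interface shape inferred from the calls in this file (an assumption, not the
        // original definition): processors are invoked once per crawled page, as in
        // ProcessCrawledPage above, and once per crawled domain, as in
        // ProcessCrawledDomain earlier in this file.
        public interface ICrawlProcessor
        {
            void ProcessCrawledPage(CrawlContext crawlContext, CrawledPage crawledPage);

            void ProcessCrawledDomain(CrawlContext crawlContext);
        }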
        public virtual CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
        {
            if (crawledPage == null)
                return new CrawlDecision{Allow = false, Reason = "Null crawled page"};

            if (crawlContext == null)
                return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

            if(string.IsNullOrWhiteSpace(crawledPage.Content.Text))
                return new CrawlDecision { Allow = false, Reason = "Page has no content" };

            if (!crawlContext.CrawlConfiguration.IsExternalPageLinksCrawlingEnabled && !crawledPage.IsInternal)
                return new CrawlDecision { Allow = false, Reason = "Link is external" };

            if (crawledPage.CrawlDepth >= crawlContext.CrawlConfiguration.MaxCrawlDepth)
                return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

            return new CrawlDecision{Allow = true};
        }
        public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            if(pageToCrawl == null)
                return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

            if (crawlContext == null)
                return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

            if (pageToCrawl.RedirectedFrom != null && pageToCrawl.RedirectPosition > crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects)
                return new CrawlDecision { Allow = false, Reason = string.Format("HttpRequestMaxAutoRedirects limit of [{0}] has been reached", crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects) };

            if(pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
                return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

            if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
                return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

            //TODO Do we want to ignore redirect chains (i.e., do not treat them as separate page crawls)?
            if (!pageToCrawl.IsRetry &&
                crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
                crawlContext.CrawledCount + crawlContext.Scheduler.Count + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
            {
                return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
            }

            int pagesCrawledInThisDomain = 0;
            if (!pageToCrawl.IsRetry &&
                crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
                crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
                pagesCrawledInThisDomain > 0)
            {
                if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
                    return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
            }

            if(!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
                return new CrawlDecision { Allow = false, Reason = "Link is external" };

            return new CrawlDecision { Allow = true };
        }
        /// <summary>
        /// Creates a crawler instance with custom settings or implementation. Passing in null for all params is the equivalent of the empty constructor.
        /// </summary>
        /// <param name="threadManager">Distributes http requests over multiple threads</param>
        /// <param name="scheduler">Decides what link should be crawled next</param>
        /// <param name="pageRequester">Makes the raw http requests</param>
        /// <param name="hyperLinkParser">Parses a crawled page for it's hyperlinks</param>
        /// <param name="crawlDecisionMaker">Decides whether or not to crawl a page or that page's links</param>
        /// <param name="crawlConfiguration">Configurable crawl values</param>
        /// <param name="memoryManager">Checks the memory usage of the host process</param>
        public WebCrawler(
            CrawlConfiguration crawlConfiguration,
            ICrawlDecisionMaker crawlDecisionMaker,
            IThreadManager threadManager,
            IScheduler scheduler,
            IPageRequester pageRequester,
            IHyperLinkParser hyperLinkParser,
            IMemoryManager memoryManager)
        {
            _crawlContext = new CrawlContext();
            _crawlContext.CrawlConfiguration = crawlConfiguration ?? new CrawlConfiguration();
            CrawlBag = _crawlContext.CrawlBag;

            _threadManager = threadManager ?? new TaskThreadManager(_crawlContext.CrawlConfiguration.MaxConcurrentThreads > 0 ? _crawlContext.CrawlConfiguration.MaxConcurrentThreads : Environment.ProcessorCount);
            _scheduler = scheduler ?? new Scheduler(_crawlContext.CrawlConfiguration.IsUriRecrawlingEnabled, null, null);
            _pageRequester = pageRequester ?? new PageRequester(_crawlContext.CrawlConfiguration);
            _crawlDecisionMaker = crawlDecisionMaker ?? new CrawlDecisionMaker();

            if (_crawlContext.CrawlConfiguration.MaxMemoryUsageInMb > 0
                || _crawlContext.CrawlConfiguration.MinAvailableMemoryRequiredInMb > 0)
                _memoryManager = memoryManager ?? new MemoryManager(new CachedMemoryMonitor(new GcMemoryMonitor(), _crawlContext.CrawlConfiguration.MaxMemoryUsageCacheTimeInSeconds));

            _hyperLinkParser = hyperLinkParser ?? new HapHyperLinkParser(_crawlContext.CrawlConfiguration, null);

            _crawlContext.Scheduler = _scheduler;
        }
        public virtual CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage, CrawlContext crawlContext)
        {
            if (crawledPage == null)
                return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

            if (crawlContext == null)
                return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

            if (crawledPage.HttpWebResponse == null)
                return new CrawlDecision { Allow = false, Reason = "Null HttpWebResponse" };

            if (crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
                return new CrawlDecision { Allow = false, Reason = "HttpStatusCode is not 200" };

            string pageContentType = crawledPage.HttpWebResponse.ContentType.ToLower().Trim();
            bool isDownloadable = false;
            List<string> cleanDownloadableContentTypes = crawlContext.CrawlConfiguration.DownloadableContentTypes
                .Split(',')
                .Select(t => t.Trim())
                .Where(t => !string.IsNullOrEmpty(t))
                .ToList();

            foreach (string downloadableContentType in cleanDownloadableContentTypes)
            {
                if (pageContentType.Contains(downloadableContentType.ToLower().Trim()))
                {
                    isDownloadable = true;
                    break;
                }
            }
            if (!isDownloadable)
                return new CrawlDecision { Allow = false, Reason = "Content type is not any of the following: " + string.Join(",", cleanDownloadableContentTypes) };

            if (crawlContext.CrawlConfiguration.MaxPageSizeInBytes > 0 && crawledPage.HttpWebResponse.ContentLength > crawlContext.CrawlConfiguration.MaxPageSizeInBytes)
                return new CrawlDecision { Allow = false, Reason = string.Format("Page size of [{0}] bytes is above the max allowable of [{1}] bytes", crawledPage.HttpWebResponse.ContentLength, crawlContext.CrawlConfiguration.MaxPageSizeInBytes) };

            return new CrawlDecision { Allow = true };
        }
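        // Configuration sketch: DownloadableContentTypes is parsed above as a
        // comma-separated list of substrings matched against the response Content-Type,
        // so a configuration like this would limit downloads to html and plain-text pages
        // no larger than 1 MB. (Values here are illustrative, not from the original source.)
        private static CrawlConfiguration CreateTextOnlyConfiguration()
        {
            return new CrawlConfiguration
            {
                DownloadableContentTypes = "text/html, text/plain",
                MaxPageSizeInBytes = 1048576
            };
        }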
        public virtual CrawlDecision ShouldRecrawlPage(CrawledPage crawledPage, CrawlContext crawlContext)
        {
            if (crawledPage == null)
                return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

            if (crawlContext == null)
                return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

            if (crawledPage.WebException == null)
                return new CrawlDecision { Allow = false, Reason = "WebException did not occur"};

            if (crawlContext.CrawlConfiguration.MaxRetryCount < 1)
                return new CrawlDecision { Allow = false, Reason = "MaxRetryCount is less than 1"};

            if (crawledPage.RetryCount >= crawlContext.CrawlConfiguration.MaxRetryCount)
                return new CrawlDecision {Allow = false, Reason = "MaxRetryCount has been reached"};

            return new CrawlDecision { Allow = true };
        }