public PageCrawlStartingArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl)
            : base(crawlContext)
        {
            if (pageToCrawl == null)
                throw new ArgumentNullException("pageToCrawl");

            PageToCrawl = pageToCrawl;
        }
        public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            if(pageToCrawl == null)
                return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

            if (crawlContext == null)
                return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

            if (pageToCrawl.RedirectedFrom != null && pageToCrawl.RedirectPosition > crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects)
                return new CrawlDecision { Allow = false, Reason = string.Format("HttpRequestMaxAutoRedirects limit of [{0}] has been reached", crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects) };

            if(pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
                return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

            if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
                return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

            //TODO Do we want to ignore redirect chains (i.e. do not treat them as separate page crawls)?
            if (!pageToCrawl.IsRetry &&
                crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
                crawlContext.CrawledCount + crawlContext.Scheduler.Count + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
            {
                return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
            }

            int pagesCrawledInThisDomain = 0;
            if (!pageToCrawl.IsRetry &&
                crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
                crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
                pagesCrawledInThisDomain > 0)
            {
                if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
                    return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
            }

            if(!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
                return new CrawlDecision { Allow = false, Reason = "Link is external" };

            return new CrawlDecision { Allow = true };
        }
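A hedged usage sketch (not part of the examples above): Abot-style crawlers typically let callers layer a custom decision on top of this default logic via a delegate, as seen later where _shouldCrawlPageDecisionMaker.Invoke(pageToCrawl, _crawlContext) is consulted. The registration call ShouldCrawlPage(...) and the /archive/ rule below are assumptions for illustration only.

// Sketch only: register a custom crawl decision on top of the default CrawlDecisionMaker.
// Requires System and the Abot namespaces (e.g. Abot.Crawler, Abot.Poco).
var crawler = new PoliteWebCrawler();
crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
{
    // Hypothetical policy: skip anything under /archive/.
    if (pageToCrawl.Uri.AbsolutePath.StartsWith("/archive/", StringComparison.OrdinalIgnoreCase))
        return new CrawlDecision { Allow = false, Reason = "Archived content excluded" };

    return new CrawlDecision { Allow = true };
});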
Example #3
 public void SetUp()
 {
     _unitUnderTest = GetInstance();
     _page1         = new PageToCrawl(new Uri("http://a.com"));
     _page2         = new PageToCrawl(new Uri("http://b.com"));
 }
 public void Add(PageToCrawl page)
 {
     _urlQueue.Enqueue(page);
 }
Example #5
        void PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
        {
            PageToCrawl pageToCrawl = e.PageToCrawl;

            Log.Logger.Error($"Did not crawl page {pageToCrawl.Uri.AbsoluteUri} due to {e.DisallowedReason}");
        }
        static void Crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
        {
            PageToCrawl pageToCrawl = e.PageToCrawl;

            _log.Info($"Did not crawl page {pageToCrawl.Uri.AbsoluteUri} due to {e.DisallowedReason}");
        }
Example #7
        /// <summary>
        /// Begins a synchronous crawl using the uri param; subscribe to events to process data as it becomes available.
        /// </summary>
        public virtual CrawlResult Crawl(Uri uri, CancellationTokenSource cancellationTokenSource)
        {
            if (uri == null)
            {
                throw new ArgumentNullException("uri");
            }

            _crawlContext.RootUri = uri;

            if (cancellationTokenSource != null)
            {
                _crawlContext.CancellationTokenSource = cancellationTokenSource;
            }

            _crawlResult              = new CrawlResult();
            _crawlResult.RootUri      = _crawlContext.RootUri;
            _crawlResult.CrawlContext = _crawlContext;
            _crawlComplete            = false;

            _logger.InfoFormat("About to crawl site [{0}]", uri.AbsoluteUri);
            PrintConfigValues(_crawlContext.CrawlConfiguration);

            if (_memoryManager != null)
            {
                _crawlContext.MemoryUsageBeforeCrawlInMb = _memoryManager.GetCurrentUsageInMb();
                _logger.InfoFormat("Starting memory usage for site [{0}] is [{1}mb]", uri.AbsoluteUri, _crawlContext.MemoryUsageBeforeCrawlInMb);
            }

            _crawlContext.CrawlStartDate = DateTime.Now;
            Stopwatch timer = Stopwatch.StartNew();

            if (_crawlContext.CrawlConfiguration.CrawlTimeoutSeconds > 0)
            {
                _timeoutTimer          = new Timer(_crawlContext.CrawlConfiguration.CrawlTimeoutSeconds * 1000);
                _timeoutTimer.Elapsed += HandleCrawlTimeout;
                _timeoutTimer.Start();
            }

            try
            {
                PageToCrawl rootPage = new PageToCrawl(uri)
                {
                    ParentUri = uri, IsInternal = true, IsRoot = true
                };
                if (ShouldSchedulePageLink(rootPage))
                {
                    _scheduler.Add(rootPage);
                }

                VerifyRequiredAvailableMemory();
                CrawlSite();
            }
            catch (Exception e)
            {
                _crawlResult.ErrorException = e;
                _logger.FatalFormat("An error occurred while crawling site [{0}]", uri);
                _logger.Fatal(e);
            }
            finally
            {
                if (_threadManager != null)
                {
                    _threadManager.Dispose();
                }
            }

            if (_timeoutTimer != null)
            {
                _timeoutTimer.Stop();
            }

            timer.Stop();

            if (_memoryManager != null)
            {
                _crawlContext.MemoryUsageAfterCrawlInMb = _memoryManager.GetCurrentUsageInMb();
                _logger.InfoFormat("Ending memory usage for site [{0}] is [{1}mb]", uri.AbsoluteUri, _crawlContext.MemoryUsageAfterCrawlInMb);
            }

            _crawlResult.Elapsed = timer.Elapsed;
            _logger.InfoFormat("Crawl complete for site [{0}]: Crawled [{1}] pages in [{2}]", _crawlResult.RootUri.AbsoluteUri, _crawlResult.CrawlContext.CrawledCount, _crawlResult.Elapsed);

            return(_crawlResult);
        }
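A minimal usage sketch for this overload, assuming an Abot-style PoliteWebCrawler and the PageCrawlStartingAsync / PageCrawlDisallowedAsync events seen in the handlers elsewhere in these examples; exact member names may vary by version.

// Sketch: drive the synchronous Crawl overload with an external cancellation source.
var cancellation = new CancellationTokenSource(TimeSpan.FromMinutes(10)); // hard stop after 10 minutes
var crawler = new PoliteWebCrawler();

crawler.PageCrawlStartingAsync += (s, e) =>
    Console.WriteLine("About to crawl {0}", e.PageToCrawl.Uri.AbsoluteUri);
crawler.PageCrawlDisallowedAsync += (s, e) =>
    Console.WriteLine("Skipped {0}: {1}", e.PageToCrawl.Uri.AbsoluteUri, e.DisallowedReason);

CrawlResult result = crawler.Crawl(new Uri("http://a.com"), cancellation);

if (result.ErrorException != null)
    Console.WriteLine("Crawl ended with error: {0}", result.ErrorException.Message);
else
    Console.WriteLine("Crawled {0} pages in {1}", result.CrawlContext.CrawledCount, result.Elapsed);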
Example #8
        //Failed to crawl the page
        public static void Disallowed(object sender, PageCrawlDisallowedArgs e)
        {
            PageToCrawl pageToCrawl = e.PageToCrawl;

            Outputer.Output(string.Format("由于产生错误:{1} 无法抓取页面{0}", pageToCrawl.Uri.AbsoluteUri, e.DisallowedReason));
        }
        protected virtual void WaitMinimumRetryDelay(PageToCrawl pageToCrawl)
        {
            //TODO No unit tests cover these lines
            if (pageToCrawl.LastRequest == null)
            {
                _logger.WarnFormat("pageToCrawl.LastRequest value is null for Url:{0}. Cannot retry without this value.", pageToCrawl.Uri.AbsoluteUri);
                return;
            }

            double milliSinceLastRequest = (DateTime.Now - pageToCrawl.LastRequest.Value).TotalMilliseconds;
            double milliToWait;
            if (pageToCrawl.RetryAfter.HasValue)
            {
                // Use the time to wait provided by the server instead of the config, if any.
                milliToWait = pageToCrawl.RetryAfter.Value*1000 - milliSinceLastRequest;
            }
            else
            {
                if (!(milliSinceLastRequest < _crawlContext.CrawlConfiguration.MinRetryDelayInMilliseconds)) return;
                milliToWait = _crawlContext.CrawlConfiguration.MinRetryDelayInMilliseconds - milliSinceLastRequest;
            }

            _logger.InfoFormat("Waiting [{0}] milliseconds before retrying Url:[{1}] LastRequest:[{2}] SoonestNextRequest:[{3}]",
                milliToWait,
                pageToCrawl.Uri.AbsoluteUri,
                pageToCrawl.LastRequest,
                pageToCrawl.LastRequest.Value.AddMilliseconds(_crawlContext.CrawlConfiguration.MinRetryDelayInMilliseconds));

            //TODO Cannot use RateLimiter since it currently cannot handle dynamic sleep times so using Thread.Sleep in the meantime
            if (milliToWait > 0)
                Thread.Sleep(TimeSpan.FromMilliseconds(milliToWait));
        }
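A small worked illustration of the delay math above, with hypothetical values: if the server sent Retry-After: 5 and the last request went out 1.2 seconds ago, the method sleeps roughly 5 * 1000 - 1200 = 3800 ms.

// Illustration only; the values are hypothetical, the arithmetic mirrors the method above.
DateTime lastRequest = DateTime.Now.AddMilliseconds(-1200); // request went out ~1.2s ago
double retryAfterSeconds = 5;                               // value from a Retry-After header

double milliSinceLastRequest = (DateTime.Now - lastRequest).TotalMilliseconds; // ~1200
double milliToWait = retryAfterSeconds * 1000 - milliSinceLastRequest;         // ~3800

if (milliToWait > 0)
    Thread.Sleep(TimeSpan.FromMilliseconds(milliToWait)); // same final step as the method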
        //protected virtual async Task ProcessPage(PageToCrawl pageToCrawl)
        protected virtual void ProcessPage(PageToCrawl pageToCrawl)
        {
            try
            {
                if (pageToCrawl == null)
                    return;

                ThrowIfCancellationRequested();

                AddPageToContext(pageToCrawl);

                //CrawledPage crawledPage = await CrawlThePage(pageToCrawl);
                CrawledPage crawledPage = CrawlThePage(pageToCrawl);

                // Validate the root uri in case of a redirection.
                if (crawledPage.IsRoot)
                    ValidateRootUriForRedirection(crawledPage);

                if (!IsRedirect(crawledPage) || (IsRedirect(crawledPage) && _crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled
                    && !_scheduler.IsUriKnown(crawledPage.HttpWebResponse.ResponseUri)))
                {
                    if (PageSizeIsAboveMax(crawledPage))
                        return;

                    ThrowIfCancellationRequested();

                    bool shouldCrawlPageLinks = ShouldCrawlPageLinks(crawledPage);
                    if (shouldCrawlPageLinks || _crawlContext.CrawlConfiguration.IsForcedLinkParsingEnabled)
                        ParsePageLinks(crawledPage);

                    ThrowIfCancellationRequested();

                    if (shouldCrawlPageLinks)
                        SchedulePageLinks(crawledPage);

                    ThrowIfCancellationRequested();

                    FirePageCrawlCompletedEventAsync(crawledPage);
                    FirePageCrawlCompletedEvent(crawledPage);

                    if (ShouldRecrawlPage(crawledPage))
                    {
                        crawledPage.IsRetry = true;
                        _scheduler.Add(crawledPage);
                    }

                    if (IsRedirect(crawledPage) && _crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled)
                    {
                        _scheduler.AddKnownUri(crawledPage.HttpWebResponse.ResponseUri);
                    }
                }
                else if (IsRedirect(crawledPage) && !_crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled)
                {
                    ProcessRedirect(crawledPage);
                }
            }
            catch (OperationCanceledException)
            {
                _logger.DebugFormat("Thread cancelled while crawling/processing page [{0}]", pageToCrawl.Uri);
                throw;
            }
            catch (Exception e)
            {
                _crawlResult.ErrorException = e;
                _logger.FatalFormat("Error occurred during processing of page [{0}]", pageToCrawl.Uri);
                _logger.Fatal(e);

                _crawlContext.IsCrawlHardStopRequested = true;
            }
        }
        protected virtual void ProcessRedirect(CrawledPage crawledPage)
        {
            if (crawledPage.RedirectPosition >= 20)
                _logger.WarnFormat("Page [{0}] is part of a chain of 20 or more consecutive redirects, redirects for this chain will now be aborted.", crawledPage.Uri);

            try
            {
                var uri = ExtractRedirectUri(crawledPage);

                PageToCrawl page = new PageToCrawl(uri);
                page.ParentUri = crawledPage.ParentUri;
                page.CrawlDepth = crawledPage.CrawlDepth;
                page.IsInternal = IsInternalUri(uri);
                page.IsRoot = false;
                page.RedirectedFrom = crawledPage;
                page.RedirectPosition = crawledPage.RedirectPosition + 1;

                crawledPage.RedirectedTo = page;
                _logger.DebugFormat("Page [{0}] is requesting that it be redirect to [{1}]", crawledPage.Uri, crawledPage.RedirectedTo.Uri);

                if (ShouldSchedulePageLink(page))
                {
                    if (_scheduler.IsUriKnown(uri))
                    {
                        _logger.InfoFormat("Page [{0}] is redirected to [{1}], which is a page already crawled.", crawledPage.Uri, crawledPage.RedirectedTo.Uri);
                    }
                    else
                    {
                        _logger.InfoFormat("Page [{0}] will be redirect to [{1}]", crawledPage.Uri, crawledPage.RedirectedTo.Uri);
                        _scheduler.Add(page);
                    }
                }
            }
            catch {}
        }
 protected virtual void FirePageCrawlStartingEvent(PageToCrawl pageToCrawl)
 {
     try
     {
         EventHandler<PageCrawlStartingArgs> threadSafeEvent = PageCrawlStarting;
         if (threadSafeEvent != null)
             threadSafeEvent(this, new PageCrawlStartingArgs(_crawlContext, pageToCrawl));
     }
     catch (Exception e)
     {
         _logger.Error("An unhandled exception was thrown by a subscriber of the PageCrawlStarting event for url:" + pageToCrawl.Uri.AbsoluteUri);
         _logger.Error(e);
     }
 }
 protected virtual void FirePageCrawlStartingEventAsync(PageToCrawl pageToCrawl)
 {
     EventHandler<PageCrawlStartingArgs> threadSafeEvent = PageCrawlStartingAsync;
     if (threadSafeEvent != null)
     {
         //Fire each subscriber's delegate asynchronously
         foreach (EventHandler<PageCrawlStartingArgs> del in threadSafeEvent.GetInvocationList())
         {
             del.BeginInvoke(this, new PageCrawlStartingArgs(_crawlContext, pageToCrawl), null, null);
         }
     }
 }
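Note that Delegate.BeginInvoke, used above to fan out to subscribers, is only supported on the .NET Framework; on .NET Core and .NET 5+ it throws PlatformNotSupportedException. A hedged alternative sketch (not the library's own code, method name is illustrative) using Task.Run:

 // Alternative async fan-out that also works on .NET Core (sketch only; requires System.Threading.Tasks).
 protected virtual void FirePageCrawlStartingEventTaskBased(PageToCrawl pageToCrawl)
 {
     EventHandler<PageCrawlStartingArgs> threadSafeEvent = PageCrawlStartingAsync;
     if (threadSafeEvent == null)
         return;

     var args = new PageCrawlStartingArgs(_crawlContext, pageToCrawl);
     foreach (EventHandler<PageCrawlStartingArgs> del in threadSafeEvent.GetInvocationList())
     {
         var handler = del;
         Task.Run(() => handler(this, args)); // fire-and-forget each subscriber on the thread pool
     }
 }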
        //protected virtual async Task<CrawledPage> CrawlThePage(PageToCrawl pageToCrawl)
        protected virtual CrawledPage CrawlThePage(PageToCrawl pageToCrawl)
        {
            _logger.DebugFormat("About to crawl page [{0}]", pageToCrawl.Uri.AbsoluteUri);
            FirePageCrawlStartingEventAsync(pageToCrawl);
            FirePageCrawlStartingEvent(pageToCrawl);

            if (pageToCrawl.IsRetry){ WaitMinimumRetryDelay(pageToCrawl); }

            pageToCrawl.LastRequest = DateTime.Now;

            CrawledPage crawledPage = _pageRequester.MakeRequest(pageToCrawl.Uri, ShouldDownloadPageContent);
            //CrawledPage crawledPage = await _pageRequester.MakeRequestAsync(pageToCrawl.Uri, ShouldDownloadPageContent);

            dynamic combinedPageBag = this.CombinePageBags(pageToCrawl.PageBag, crawledPage.PageBag);
            Mapper.CreateMap<PageToCrawl, CrawledPage>();
            Mapper.Map(pageToCrawl, crawledPage);
            crawledPage.PageBag = combinedPageBag;

            if (crawledPage.HttpWebResponse == null)
                _logger.InfoFormat("Page crawl complete, Status:[NA] Url:[{0}] Elapsed:[{1}] Parent:[{2}] Retry:[{3}]", crawledPage.Uri.AbsoluteUri, crawledPage.Elapsed, crawledPage.ParentUri, crawledPage.RetryCount);
            else
                _logger.InfoFormat("Page crawl complete, Status:[{0}] Url:[{1}] Elapsed:[{2}] Parent:[{3}] Retry:[{4}]", Convert.ToInt32(crawledPage.HttpWebResponse.StatusCode), crawledPage.Uri.AbsoluteUri, crawledPage.Elapsed, crawledPage.ParentUri, crawledPage.RetryCount);

            return crawledPage;
        }
        protected virtual void AddPageToContext(PageToCrawl pageToCrawl)
        {
            if (pageToCrawl.IsRetry)
            {
                pageToCrawl.RetryCount++;
                return;
            }

            int domainCount = 0;
            Interlocked.Increment(ref _crawlContext.CrawledCount);
            lock (_crawlContext.CrawlCountByDomain)
            {
                if (_crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out domainCount))
                    _crawlContext.CrawlCountByDomain[pageToCrawl.Uri.Authority] = domainCount + 1;
                else
                    _crawlContext.CrawlCountByDomain.TryAdd(pageToCrawl.Uri.Authority, 1);
            }
        }
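Since TryAdd is used, CrawlCountByDomain appears to be a ConcurrentDictionary; if so, the lock plus lookup above could be collapsed into one atomic call. A sketch under that assumption:

        // Sketch; assumes CrawlCountByDomain is a ConcurrentDictionary<string, int>.
        _crawlContext.CrawlCountByDomain.AddOrUpdate(
            pageToCrawl.Uri.Authority,
            1,                                  // first page seen for this authority
            (authority, count) => count + 1);   // otherwise increment atomically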
        /// <summary>
        /// Begins a synchronous crawl using the uri param; subscribe to events to process data as it becomes available.
        /// </summary>
        public virtual CrawlResult Crawl(Uri uri, CancellationTokenSource cancellationTokenSource)
        {
            if (uri == null)
                throw new ArgumentNullException("uri");

            _crawlContext.RootUri = _crawlContext.OriginalRootUri = uri;

            if (cancellationTokenSource != null)
                _crawlContext.CancellationTokenSource = cancellationTokenSource;

            _crawlResult = new CrawlResult();
            _crawlResult.RootUri = _crawlContext.RootUri;
            _crawlResult.CrawlContext = _crawlContext;
            _crawlComplete = false;

            _logger.InfoFormat("About to crawl site [{0}]", uri.AbsoluteUri);
            PrintConfigValues(_crawlContext.CrawlConfiguration);

            if (_memoryManager != null)
            {
                _crawlContext.MemoryUsageBeforeCrawlInMb = _memoryManager.GetCurrentUsageInMb();
                _logger.InfoFormat("Starting memory usage for site [{0}] is [{1}mb]", uri.AbsoluteUri, _crawlContext.MemoryUsageBeforeCrawlInMb);
            }

            _crawlContext.CrawlStartDate = DateTime.Now;
            Stopwatch timer = Stopwatch.StartNew();

            if (_crawlContext.CrawlConfiguration.CrawlTimeoutSeconds > 0)
            {
                _timeoutTimer = new Timer(_crawlContext.CrawlConfiguration.CrawlTimeoutSeconds * 1000);
                _timeoutTimer.Elapsed += HandleCrawlTimeout;
                _timeoutTimer.Start();
            }

            try
            {
                PageToCrawl rootPage = new PageToCrawl(uri) { ParentUri = uri, IsInternal = true, IsRoot = true };
                if (ShouldSchedulePageLink(rootPage))
                    _scheduler.Add(rootPage);

                VerifyRequiredAvailableMemory();
                CrawlSite();
            }
            catch (Exception e)
            {
                _crawlResult.ErrorException = e;
                _logger.FatalFormat("An error occurred while crawling site [{0}]", uri);
                _logger.Fatal(e);
            }
            finally
            {
                if (_threadManager != null)
                    _threadManager.Dispose();
            }

            if (_timeoutTimer != null)
                _timeoutTimer.Stop();

            timer.Stop();

            if (_memoryManager != null)
            {
                _crawlContext.MemoryUsageAfterCrawlInMb = _memoryManager.GetCurrentUsageInMb();
                _logger.InfoFormat("Ending memory usage for site [{0}] is [{1}mb]", uri.AbsoluteUri, _crawlContext.MemoryUsageAfterCrawlInMb);
            }

            _crawlResult.Elapsed = timer.Elapsed;
            _logger.InfoFormat("Crawl complete for site [{0}]: Crawled [{1}] pages in [{2}]", _crawlResult.RootUri.AbsoluteUri, _crawlResult.CrawlContext.CrawledCount, _crawlResult.Elapsed);

            return _crawlResult;
        }
Example #17
        static void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
        {
            PageToCrawl pageToCrawl = e.PageToCrawl;

            Console.WriteLine("About to crawl link {0} which was found on page {1}", pageToCrawl.Uri.AbsoluteUri, pageToCrawl.ParentUri.AbsoluteUri);
        }
 public override CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
 {
     return(base.ShouldCrawlPage(pageToCrawl, crawlContext));
 }
 private void _crawler_PageCrawlStartingAsync(object sender, PageCrawlStartingArgs e)
 {
     PageToCrawl pageToCrawl = e.PageToCrawl;
 }
Example #20
        public IEnumerable <PageToCrawl> GetLinks(CrawledPage crawledPage)
        {
            List <PageToCrawl> pages = new List <PageToCrawl>();
            List <string>      rules = GetDynamicData <List <string> >(crawledPage.PageBag, "Rules");

            if (rules == null)
            {
                rules = _thinkCrawlConfiguration.StartRule.GetRules();
            }

            var doc = crawledPage.HtmlDocument;

            foreach (var ruleName in rules)
            {
                var rule = _thinkCrawlConfiguration.GetRule(ruleName);
                if (rule == null)
                {
                    continue;
                }

                string xpath = rule.XPath;
                if (string.IsNullOrEmpty(xpath) || !rule.NeedCrawl)
                {
                    continue;
                }
                #region add link

                bool donotFilter = string.IsNullOrEmpty(rule.FilterText);
                var  links       = doc.DocumentNode.SelectNodes(xpath);

                if (links == null || links.Count < 1)
                {
                    continue;
                }

                foreach (var a in links)
                {
                    string text = StringUtil.RemoveHTML(a.InnerText);
                    string url  = FormatUrl(a.Attributes["href"].Value ?? "", crawledPage.Uri);
                    if (string.IsNullOrEmpty(url))
                    {
                        continue;
                    }

                    if (donotFilter || text == rule.FilterText)
                    {
                        _logger.LogInformation("文本:[{0}] , URL: {1}", text, url);

                        var uri = new Uri(url);

                        PageToCrawl page = new PageToCrawl(uri)
                        {
                            ParentUri  = crawledPage.Uri,
                            CrawlDepth = crawledPage.CrawlDepth + 1,
                            IsInternal = uri.Authority == crawledPage.Uri.Authority,
                            IsRoot     = false
                        };

                        page.PageBag.Rules = rule.GetNextRules();
                        pages.Add(page);
                    }
                }
                #endregion
            }
            return(pages);
        }
        protected virtual void SchedulePageLinks(CrawledPage crawledPage)
        {
            foreach (Uri uri in crawledPage.ParsedLinks)
            {
                // First validate that the link was not already visited or added to the list of pages to visit, so we don't
                // repeat the same validation and fire the same events twice.
                if (!_scheduler.IsUriKnown(uri) &&
                    (_shouldScheduleLinkDecisionMaker == null || _shouldScheduleLinkDecisionMaker.Invoke(uri, crawledPage, _crawlContext))) {
                    try //Added due to a bug in the Uri class related to this (http://stackoverflow.com/questions/2814951/system-uriformatexception-invalid-uri-the-hostname-could-not-be-parsed)
                    {
                        PageToCrawl page = new PageToCrawl(uri);
                        page.ParentUri = crawledPage.Uri;
                        page.CrawlDepth = crawledPage.CrawlDepth + 1;
                        page.IsInternal = IsInternalUri(uri);
                        page.IsRoot = false;

                        if (ShouldSchedulePageLink(page))
                        {
                            _scheduler.Add(page);
                        }
                    }
                    catch { }
                }

                // Add this link to the list of known Urls so validations are not duplicated in the future.
                _scheduler.AddKnownUri(uri);
            }
        }
        protected virtual bool ShouldSchedulePageLink(PageToCrawl page)
        {
            if ((page.IsInternal || _crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled) && (ShouldCrawlPage(page)))
                return true;

            return false;
        }
Example #23
        public void Add_NullPage()
        {
            PageToCrawl nullPage = null;

            _unitUnderTest.Add(nullPage);
        }
        protected virtual bool ShouldCrawlPage(PageToCrawl pageToCrawl)
        {
            CrawlDecision shouldCrawlPageDecision = _crawlDecisionMaker.ShouldCrawlPage(pageToCrawl, _crawlContext);
            if (shouldCrawlPageDecision.Allow)
                shouldCrawlPageDecision = (_shouldCrawlPageDecisionMaker != null) ? _shouldCrawlPageDecisionMaker.Invoke(pageToCrawl, _crawlContext) : new CrawlDecision { Allow = true };

            if (!shouldCrawlPageDecision.Allow)
            {
                _logger.DebugFormat("Page [{0}] not crawled, [{1}]", pageToCrawl.Uri.AbsoluteUri, shouldCrawlPageDecision.Reason);
                FirePageCrawlDisallowedEventAsync(pageToCrawl, shouldCrawlPageDecision.Reason);
                FirePageCrawlDisallowedEvent(pageToCrawl, shouldCrawlPageDecision.Reason);
            }

            SignalCrawlStopIfNeeded(shouldCrawlPageDecision);
            return shouldCrawlPageDecision.Allow;
        }
Example #25
        static void Crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
        {
            PageToCrawl pageToCrawl = e.PageToCrawl;

            _log.Info($"About to crawl link {pageToCrawl.Uri.AbsoluteUri} which was found on page {pageToCrawl.ParentUri.AbsoluteUri}");
        }
Example #26
 private void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
 {
     PageToCrawl pageToCrawl = e.PageToCrawl;
     var         result      = string.Format("About to crawl link {0} which was found on page {1}", pageToCrawl.Uri.AbsoluteUri, pageToCrawl.ParentUri.AbsoluteUri);
 }
Example #27
        protected virtual async Task ProcessPage(PageToCrawl pageToCrawl)
        {
            lock (_processingPageCountLock)
            {
                _processingPageCount++;
                Log.Debug($"Incrementing processingPageCount to [{_processingPageCount}]");
            }

            try
            {
                if (pageToCrawl == null)
                {
                    return;
                }

                ThrowIfCancellationRequested();

                AddPageToContext(pageToCrawl);

                var crawledPage = await CrawlThePage(pageToCrawl).ConfigureAwait(false);

                // Validate the root uri in case of a redirection.
                if (crawledPage.IsRoot)
                {
                    ValidateRootUriForRedirection(crawledPage);
                }

                if (IsRedirect(crawledPage) && !_crawlContext.CrawlConfiguration.IsHttpRequestAutoRedirectsEnabled)
                {
                    ProcessRedirect(crawledPage);
                }

                if (PageSizeIsAboveMax(crawledPage))
                {
                    return;
                }

                ThrowIfCancellationRequested();

                var shouldCrawlPageLinks = ShouldCrawlPageLinks(crawledPage);
                if (shouldCrawlPageLinks || _crawlContext.CrawlConfiguration.IsForcedLinkParsingEnabled)
                {
                    ParsePageLinks(crawledPage);
                }

                ThrowIfCancellationRequested();

                if (shouldCrawlPageLinks)
                {
                    SchedulePageLinks(crawledPage);
                }

                ThrowIfCancellationRequested();

                FirePageCrawlCompletedEvent(crawledPage);

                if (ShouldRecrawlPage(crawledPage))
                {
                    crawledPage.IsRetry = true;
                    _scheduler.Add(crawledPage);
                }
            }
            catch (OperationCanceledException)
            {
                Log.Debug("Thread cancelled while crawling/processing page [{0}]", pageToCrawl.Uri);
                throw;
            }
            catch (Exception e)
            {
                _crawlResult.ErrorException = e;
                Log.Fatal("Error occurred during processing of page [{0}]", pageToCrawl.Uri);
                Log.Fatal(e, "Exception details -->");

                _crawlContext.IsCrawlHardStopRequested = true;
            }
            finally
            {
                lock (_processingPageCountLock)
                {
                    _processingPageCount--;
                    Log.Debug($"Decrementing processingPageCount to [{_processingPageCount}]");
                }
            }
        }
Example #28
        private void crawler_CrawlerStart(object sender, PageCrawlStartingArgs e)
        {
            PageToCrawl page = e.PageToCrawl;

            Console.WriteLine("Starting with {0}", page.Uri.ToString());
        }
Example #29
        //protected virtual async Task ProcessPage(PageToCrawl pageToCrawl)
        protected virtual void ProcessPage(PageToCrawl pageToCrawl)
        {
            try
            {
                if (pageToCrawl == null)
                {
                    return;
                }

                ThrowIfCancellationRequested();

                AddPageToContext(pageToCrawl);

                //CrawledPage crawledPage = await CrawlThePage(pageToCrawl);
                CrawledPage crawledPage = CrawlThePage(pageToCrawl);

                if (IsRedirect(crawledPage))
                {
                    ProcessRedirect(crawledPage);
                }

                if (PageSizeIsAboveMax(crawledPage))
                {
                    return;
                }

                ThrowIfCancellationRequested();

                bool shouldCrawlPageLinks = ShouldCrawlPageLinks(crawledPage);
                if (shouldCrawlPageLinks || _crawlContext.CrawlConfiguration.IsForcedLinkParsingEnabled)
                {
                    ParsePageLinks(crawledPage);
                }

                ThrowIfCancellationRequested();

                if (shouldCrawlPageLinks)
                {
                    SchedulePageLinks(crawledPage);
                }

                ThrowIfCancellationRequested();

                FirePageCrawlCompletedEventAsync(crawledPage);
                FirePageCrawlCompletedEvent(crawledPage);

                if (ShouldRecrawlPage(crawledPage))
                {
                    crawledPage.IsRetry = true;
                    _scheduler.Add(crawledPage);
                }
            }
            catch (OperationCanceledException)
            {
                _logger.DebugFormat("Thread cancelled while crawling/processing page [{0}]", pageToCrawl.Uri);
                throw;
            }
            catch (Exception e)
            {
                _crawlResult.ErrorException = e;
                _logger.FatalFormat("Error occurred during processing of page [{0}]", pageToCrawl.Uri);
                _logger.Fatal(e);

                _crawlContext.IsCrawlHardStopRequested = true;
            }
        }
Example #30
        private void crawler_CrawlerDisalowed(object sender, PageCrawlDisallowedArgs e)
        {
            PageToCrawl page = e.PageToCrawl;

            Console.WriteLine("Disallowed: {0}", page.Uri.ToString());
        }
Example #31
        public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            if (pageToCrawl == null)
                return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

            if (crawlContext == null)
                return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

            if (pageToCrawl.RedirectedFrom != null && pageToCrawl.RedirectPosition > crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects)
                return new CrawlDecision { Allow = false, Reason = string.Format("HttpRequestMaxAutoRedirects limit of [{0}] has been reached", crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects) };

            if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
                return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

            if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
                return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

            //TODO Do we want to ignore redirect chains (i.e. do not treat them as separate page crawls)?
            if (!pageToCrawl.IsRetry &&
                crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
                crawlContext.CrawledCount + crawlContext.Scheduler.Count + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
            {
                return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
            }

            int pagesCrawledInThisDomain = 0;

            if (!pageToCrawl.IsRetry &&
                crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
                crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
                pagesCrawledInThisDomain > 0)
            {
                if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
                    return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
            }

            if (!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
                return new CrawlDecision { Allow = false, Reason = "Link is external" };

            return new CrawlDecision { Allow = true };
        }
        /// <summary>
        /// The reason this page was not crawled
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
        {
            PageToCrawl pageToCrawl = e.PageToCrawl;

            log.Info("不爬取此页面 " + pageToCrawl.Uri.AbsoluteUri + " 其原因为 " + e.DisallowedReason);
        }
Example #33
        void ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
        {
            PageToCrawl pageToCrawl = e.PageToCrawl;

            Log.Logger.Debug($"About to crawl link {pageToCrawl.Uri.AbsoluteUri} which was found on page {pageToCrawl.ParentUri.AbsoluteUri}");
        }
        /// <summary>
        /// Fired when the crawler is about to start crawling a page
        /// </summary>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
        {
            PageToCrawl pageToCrawl = e.PageToCrawl;

            log.Info("要爬取的链接 " + pageToCrawl.Uri.AbsoluteUri + " 在页面 " + pageToCrawl.ParentUri.AbsoluteUri);
        }
Example #35
        void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
        {
            PageToCrawl pageToCrawl = e.PageToCrawl;

            Console.WriteLine("Did not crawl page {0} due to {1}", pageToCrawl.Uri.AbsoluteUri, e.DisallowedReason);
        }
Example #36
        public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            if (pageToCrawl == null)
                return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

            if (crawlContext == null)
                return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

            if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
                return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

            if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
                return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

            if (!pageToCrawl.IsRetry &&
                crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
                crawlContext.CrawledCount + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
            {
                return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
            }

            int pagesCrawledInThisDomain = 0;

            if (!pageToCrawl.IsRetry &&
                crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
                crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
                pagesCrawledInThisDomain > 0)
            {
                if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
                    return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
            }

            if (!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
                return new CrawlDecision { Allow = false, Reason = "Link is external" };

            return new CrawlDecision { Allow = true };
        }
Example #37
        //Crawl starting
        public static void Starting(object sender, PageCrawlStartingArgs e)
        {
            PageToCrawl pageToCrawl = e.PageToCrawl;

            Outputer.Output(string.Format("关于抓取页面 {0} 上找到的链接 {1}", pageToCrawl.Uri.AbsoluteUri, pageToCrawl.ParentUri.AbsoluteUri));
        }
Example #38
        protected virtual List <PageToCrawl> GetCrawlPages(CrawledPage crawledPage, IEnumerable <string> hrefValues)
        {
            List <PageToCrawl> pages = new List <PageToCrawl>();

            if (hrefValues == null || hrefValues.Count() < 1)
            {
                return(pages);
            }

            //Use the uri of the page that actually responded to the request instead of crawledPage.Uri (Issue 82).
            //Using HttpWebRequest.Address instead of HttpWebResponse.ResponseUri since this is the best practice and mentioned on http://msdn.microsoft.com/en-us/library/system.net.httpwebresponse.responseuri.aspx
            Uri uriToUse = crawledPage.Uri;

            //If html base tag exists use it instead of page uri for relative links
            string baseHref = GetBaseHrefValue(crawledPage);

            if (!string.IsNullOrEmpty(baseHref))
            {
                if (baseHref.StartsWith("//"))
                {
                    baseHref = crawledPage.Uri.Scheme + ":" + baseHref;
                }

                try
                {
                    uriToUse = new Uri(baseHref);
                }
                catch { }
            }

            string href = "";

            foreach (string hrefValue in hrefValues)
            {
                try
                {
                    // Remove the url fragment part of the url if needed.
                    // This is the part after the # and is often not useful.
                    href = hrefValue.Split('#')[0];
                    Uri newUri = new Uri(uriToUse, href);

                    if (_cleanURLFunc != null)
                    {
                        newUri = new Uri(_cleanURLFunc(newUri.AbsoluteUri));
                    }

                    if (!pages.Exists(u => u.Uri.AbsoluteUri == newUri.AbsoluteUri))
                    {
                        PageToCrawl page = new PageToCrawl(newUri)
                        {
                            ParentUri  = crawledPage.Uri,
                            CrawlDepth = crawledPage.CrawlDepth + 1,
                            IsInternal = newUri.Authority == crawledPage.Uri.Authority,
                            IsRoot     = false
                        };
                        pages.Add(page);
                    }
                }
                catch (Exception e)
                {
                    _logger.LogDebug("Could not parse link [{0}] on page [{1}]", hrefValue, crawledPage.Uri);
                    _logger.LogError(e, e.Message);
                }
            }

            return(pages);
        }
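A hedged example of the kind of delegate _cleanURLFunc above appears to expect (a string-in, string-out URL normalizer); the rule shown is purely illustrative.

// Hypothetical URL cleaner: drop the query string and fragment before scheduling.
Func<string, string> cleanUrl = absoluteUri =>
{
    var uri = new Uri(absoluteUri);
    return uri.GetLeftPart(UriPartial.Path); // e.g. "http://a.com/page?id=3" -> "http://a.com/page"
};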