Exemple #1
0
        /// <summary>
        /// Begins a synchronous crawl using the uri param, subscribe to events to process data as it becomes available
        /// </summary>
        /// <param name="uri">Root uri of the site to crawl. Must not be null.</param>
        /// <param name="cancellationTokenSource">
        /// Optional token source stored on the crawl context. When null, the source already held by the context is left untouched.
        /// </param>
        /// <returns>
        /// The crawl result carrying the root uri, the crawl context and the elapsed time. Any exception raised
        /// while scheduling or crawling is caught, logged and exposed through <c>ErrorException</c> instead of being rethrown.
        /// </returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="uri"/> is null.</exception>
        public virtual CrawlResult Crawl(Uri uri, CancellationTokenSource cancellationTokenSource)
        {
            if (uri == null)
            {
                throw new ArgumentNullException("uri");
            }

            _crawlContext.RootUri = uri;

            if (cancellationTokenSource != null)
            {
                _crawlContext.CancellationTokenSource = cancellationTokenSource;
            }

            _crawlResult              = new CrawlResult();
            _crawlResult.RootUri      = _crawlContext.RootUri;
            _crawlResult.CrawlContext = _crawlContext;
            _crawlComplete            = false;

            _logger.InfoFormat("About to crawl site [{0}]", uri.AbsoluteUri);
            PrintConfigValues(_crawlContext.CrawlConfiguration);

            if (_memoryManager != null)
            {
                _crawlContext.MemoryUsageBeforeCrawlInMb = _memoryManager.GetCurrentUsageInMb();
                _logger.InfoFormat("Starting memory usage for site [{0}] is [{1}mb]", uri.AbsoluteUri, _crawlContext.MemoryUsageBeforeCrawlInMb);
            }

            _crawlContext.CrawlStartDate = DateTime.Now;
            Stopwatch timer = Stopwatch.StartNew();

            if (_crawlContext.CrawlConfiguration.CrawlTimeoutSeconds > 0)
            {
                // Convert to milliseconds in floating point so a large configured value cannot overflow
                // integer arithmetic, and clamp to the largest interval System.Timers.Timer accepts.
                double timeoutInMilliseconds = Math.Min(_crawlContext.CrawlConfiguration.CrawlTimeoutSeconds * 1000d, int.MaxValue);

                _timeoutTimer          = new Timer(timeoutInMilliseconds);
                _timeoutTimer.Elapsed += HandleCrawlTimeout;
                _timeoutTimer.Start();
            }

            try
            {
                // The root page is its own parent and is always treated as internal.
                PageToCrawl rootPage = new PageToCrawl(uri)
                {
                    ParentUri = uri, IsInternal = true, IsRoot = true
                };
                if (ShouldSchedulePageLink(rootPage))
                {
                    _scheduler.Add(rootPage);
                }

                VerifyRequiredAvailableMemory();
                CrawlSite();
            }
            catch (Exception e)
            {
                // Failures are reported through the result rather than propagated to the caller.
                _crawlResult.ErrorException = e;
                _logger.FatalFormat("An error occurred while crawling site [{0}]", uri);
                _logger.Fatal(e);
            }
            finally
            {
                if (_threadManager != null)
                {
                    _threadManager.Dispose();
                }
            }

            if (_timeoutTimer != null)
            {
                // Stop first so no further Elapsed events are raised, then release the timer's resources.
                _timeoutTimer.Stop();
                _timeoutTimer.Dispose();
            }

            timer.Stop();

            if (_memoryManager != null)
            {
                _crawlContext.MemoryUsageAfterCrawlInMb = _memoryManager.GetCurrentUsageInMb();
                _logger.InfoFormat("Ending memory usage for site [{0}] is [{1}mb]", uri.AbsoluteUri, _crawlContext.MemoryUsageAfterCrawlInMb);
            }

            _crawlResult.Elapsed = timer.Elapsed;
            _logger.InfoFormat("Crawl complete for site [{0}]: Crawled [{1}] pages in [{2}]", _crawlResult.RootUri.AbsoluteUri, _crawlResult.CrawlContext.CrawledCount, _crawlResult.Elapsed);

            return(_crawlResult);
        }
        /// <summary>
        /// Begins an asynchronous crawl using the uri param, subscribe to events to process data as it becomes available
        /// </summary>
        /// <param name="uri">Root uri of the site to crawl. Must not be null.</param>
        /// <param name="cancellationTokenSource">
        /// Optional token source stored on the crawl context. When null, the source already held by the context is left untouched.
        /// </param>
        /// <returns>
        /// A task producing the crawl result carrying the root uri, the crawl context and the elapsed time. Any exception
        /// raised while scheduling or crawling is caught, logged and exposed through <c>ErrorException</c> instead of being rethrown.
        /// </returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="uri"/> is null.</exception>
        public virtual async Task<CrawlResult> CrawlAsync(Uri uri, CancellationTokenSource cancellationTokenSource)
        {
            if (uri == null)
            {
                throw new ArgumentNullException(nameof(uri));
            }

            _crawlContext.RootUri = _crawlContext.OriginalRootUri = uri;

            if (cancellationTokenSource != null)
            {
                _crawlContext.CancellationTokenSource = cancellationTokenSource;
            }

            _crawlResult              = new CrawlResult();
            _crawlResult.RootUri      = _crawlContext.RootUri;
            _crawlResult.CrawlContext = _crawlContext;
            _crawlComplete            = false;

            _logger.LogInformation($"About to crawl site [{uri.AbsoluteUri}]");
            PrintConfigValues(_crawlContext.CrawlConfiguration);

            if (_memoryManager != null)
            {
                _crawlContext.MemoryUsageBeforeCrawlInMb = _memoryManager.GetCurrentUsageInMb();
                _logger.LogInformation($"Starting memory usage for site [{uri.AbsoluteUri}] is [{_crawlContext.MemoryUsageBeforeCrawlInMb}mb]");
            }

            _crawlContext.CrawlStartDate = DateTime.Now;
            var timer = Stopwatch.StartNew();

            if (_crawlContext.CrawlConfiguration.CrawlTimeoutSeconds > 0)
            {
                // Largest due time, in milliseconds, that System.Threading.Timer accepts.
                const long maxTimerDueTimeInMs = 4294967294L;

                // Long arithmetic avoids integer overflow for large configured timeouts.
                var timeoutInMs = Math.Min(_crawlContext.CrawlConfiguration.CrawlTimeoutSeconds * 1000L, maxTimerDueTimeInMs);

                // The callback must first fire once the timeout has elapsed (a due time of 0 would fire it
                // immediately), and only once: an infinite period disables periodic signaling.
                _timeoutTimer = new Timer(HandleCrawlTimeout, null, timeoutInMs, System.Threading.Timeout.Infinite);
            }

            try
            {
                // The root page is its own parent and is always treated as internal.
                var rootPage = new PageToCrawl(uri)
                {
                    ParentUri = uri, IsInternal = true, IsRoot = true
                };
                if (ShouldSchedulePageLink(rootPage))
                {
                    _scheduler.Add(rootPage);
                }

                VerifyRequiredAvailableMemory();
                await CrawlSite();
            }
            catch (Exception e)
            {
                // Failures are reported through the result rather than propagated to the caller.
                _crawlResult.ErrorException = e;

                // Pass the exception as the first argument so it is logged as an exception; passing it after
                // the message would treat it as an unused format argument.
                _logger.LogCritical(e, "An error occurred while crawling site [{Uri}]", uri);
            }
            finally
            {
                _threadManager?.Dispose();
            }

            _timeoutTimer?.Dispose();

            timer.Stop();

            if (_memoryManager != null)
            {
                _crawlContext.MemoryUsageAfterCrawlInMb = _memoryManager.GetCurrentUsageInMb();
                _logger.LogInformation($"Ending memory usage for site [{uri.AbsoluteUri}] is [{_crawlContext.MemoryUsageAfterCrawlInMb}mb]");
            }

            _crawlResult.Elapsed = timer.Elapsed;
            _logger.LogInformation($"Crawl complete for site [{_crawlResult.RootUri.AbsoluteUri}]: Crawled [{_crawlResult.CrawlContext.CrawledCount}] pages in [{_crawlResult.Elapsed}]");

            return(_crawlResult);
        }