Example #1
 public void TearDown()
 {
     if (_unitUnderTest != null)
     {
         _unitUnderTest.Dispose();
     }
 }
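A teardown like this typically lives in a unit-test fixture. A minimal NUnit-style sketch for context; the fixture name and the concrete unit under test are assumptions, not part of the original listing:

 using System;
 using NUnit.Framework;

 [TestFixture]
 public class CrawlerTests
 {
     private IDisposable _unitUnderTest;

     [SetUp]
     public void Setup()
     {
         // Hypothetical: create the disposable object the tests exercise.
         _unitUnderTest = new System.IO.MemoryStream();
     }

     [TearDown]
     public void TearDown()
     {
         if (_unitUnderTest != null)
         {
             _unitUnderTest.Dispose();
         }
     }
 }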
Example #2
        public virtual CrawlResult Crawl(Uri uri, CancellationTokenSource tokenSource)
        {
            _logger.LogInformation("Starting crawl");
            _crawlContext.RootUri = uri ?? throw new ArgumentNullException(nameof(uri));

            if (tokenSource != null)
            {
                _crawlContext.CancellationTokenSource = tokenSource;
            }

            _crawlResult = new CrawlResult()
            {
                Context = _crawlContext
            };

            _crawlContext.CrawlStartDate = DateTime.Now;
            Stopwatch timer = Stopwatch.StartNew();

            try
            {
                PageToCrawl rootPage = new PageToCrawl(uri)
                {
                    ParentUri = uri, IsInternal = true, IsRoot = true
                };
                if (ShouldCrawlPage(rootPage))
                {
                    _scheduler.Add(rootPage);
                }
                Crawl();
            }
            catch (Exception e)
            {
                _crawlResult.Error = e;
                _logger.LogError(e, "An error occurred while crawling");
            }
            finally
            {
                if (_threadManager != null)
                {
                    _threadManager.Dispose();
                }
            }

            timer.Stop();
            _crawlResult.Elapsed = timer.Elapsed;

            _logger.LogInformation("Crawl complete for site [{0}]: crawled [{1}] pages in [{2}]", uri.AbsoluteUri, _crawlResult.Context.CrawledCount, _crawlResult.Elapsed);

            // Fire-and-forget: the completion event is raised without awaiting handlers.
            FireCrawlCompletedEventAsync(_crawlResult);

            return _crawlResult;
        }
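A minimal caller for this synchronous overload, assuming the surrounding class (called WebCrawler here, a name not shown in the excerpt) can be constructed directly:

        var cts = new CancellationTokenSource();
        var crawler = new WebCrawler(); // hypothetical constructor; the listing only shows the method body

        CrawlResult result = crawler.Crawl(new Uri("https://example.com/"), cts);
        if (result.Error != null)
        {
            Console.WriteLine($"Crawl failed: {result.Error.Message}");
        }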
Example #3
 public virtual void Dispose()
 {
     if (_threadManager != null)
     {
         _threadManager.Dispose();
     }
     if (_scheduler != null)
     {
         _scheduler.Dispose();
     }
     if (_pageRequester != null)
     {
         _pageRequester.Dispose();
     }
     if (_memoryManager != null)
     {
         _memoryManager.Dispose();
     }
 }
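The repeated null checks above can be collapsed with the null-conditional operator (the same shorthand Example #4 uses for _threadManager), with equivalent behavior:

 public virtual void Dispose()
 {
     _threadManager?.Dispose();
     _scheduler?.Dispose();
     _pageRequester?.Dispose();
     _memoryManager?.Dispose();
 }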
Example #4
        public async Task<CrawlResult> Crawl(Uri uri, CancellationTokenSource cancellationTokenSource)
        {
            if (uri == null)
            {
                throw new ArgumentNullException(nameof(uri));
            }

            _crawlContext.OriginalRootUri = uri;
            if (cancellationTokenSource != null)
            {
                _crawlContext.CancellationTokenSource = cancellationTokenSource;
            }

            _crawlResult = new CrawlResult
            {
                RootUri      = _crawlContext.OriginalRootUri,
                CrawlContext = _crawlContext
            };
            var timer = Stopwatch.StartNew();

            try
            {
                var rootPage = new PageToCrawl(uri)
                {
                    ParentUri = uri, IsInternal = true, IsRoot = true
                };
                _scheduler.Add(rootPage);
                await CrawlSite().ConfigureAwait(false);
            }
            catch (Exception e)
            {
                _crawlResult.ErrorException = e;
            }
            finally
            {
                _threadManager?.Dispose();
            }
            timer.Stop();
            _crawlResult.Elapsed = timer.Elapsed;
            return _crawlResult;
        }
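This async overload is consumed with await. A sketch, assuming a crawler instance exposing the method; the timeout value is illustrative:

            // Cancel automatically if the crawl runs longer than five minutes.
            var cts = new CancellationTokenSource(TimeSpan.FromMinutes(5));
            CrawlResult result = await crawler.Crawl(new Uri("https://example.com/"), cts);
            Console.WriteLine($"Finished in {result.Elapsed}; error: {result.ErrorException?.Message ?? "none"}");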
Example #5
        /// <summary>
        /// Begins a synchronous crawl using the uri parameter; subscribe to events to process data as it becomes available
        /// </summary>
        public virtual CrawlResult Crawl(Uri uri, CancellationTokenSource cancellationTokenSource)
        {
            if (uri == null)
            {
                throw new ArgumentNullException(nameof(uri));
            }

            _crawlContext.RootUri = uri;

            if (cancellationTokenSource != null)
            {
                _crawlContext.CancellationTokenSource = cancellationTokenSource;
            }

            _crawlResult              = new CrawlResult();
            _crawlResult.RootUri      = _crawlContext.RootUri;
            _crawlResult.CrawlContext = _crawlContext;
            _crawlComplete            = false;

            _logger.InfoFormat("About to crawl site [{0}]", uri.AbsoluteUri);
            PrintConfigValues(_crawlContext.CrawlConfiguration);

            if (_memoryManager != null)
            {
                _crawlContext.MemoryUsageBeforeCrawlInMb = _memoryManager.GetCurrentUsageInMb();
                _logger.InfoFormat("Starting memory usage for site [{0}] is [{1}mb]", uri.AbsoluteUri, _crawlContext.MemoryUsageBeforeCrawlInMb);
            }

            _crawlContext.CrawlStartDate = DateTime.Now;
            Stopwatch timer = Stopwatch.StartNew();

            if (_crawlContext.CrawlConfiguration.CrawlTimeoutSeconds > 0)
            {
                _timeoutTimer          = new Timer(_crawlContext.CrawlConfiguration.CrawlTimeoutSeconds * 1000);
                _timeoutTimer.Elapsed += HandleCrawlTimeout;
                _timeoutTimer.Start();
            }

            try
            {
                PageToCrawl rootPage = new PageToCrawl(uri)
                {
                    ParentUri = uri, IsInternal = true, IsRoot = true
                };
                if (ShouldSchedulePageLink(rootPage))
                {
                    _scheduler.Add(rootPage);
                }

                VerifyRequiredAvailableMemory();
                CrawlSite();
            }
            catch (Exception e)
            {
                _crawlResult.ErrorException = e;
                _logger.FatalFormat("An error occurred while crawling site [{0}]", uri);
                _logger.Fatal(e);
            }
            finally
            {
                if (_threadManager != null)
                {
                    _threadManager.Dispose();
                }
            }

            if (_timeoutTimer != null)
            {
                _timeoutTimer.Stop();
            }

            timer.Stop();

            if (_memoryManager != null)
            {
                _crawlContext.MemoryUsageAfterCrawlInMb = _memoryManager.GetCurrentUsageInMb();
                _logger.InfoFormat("Ending memory usage for site [{0}] is [{1}mb]", uri.AbsoluteUri, _crawlContext.MemoryUsageAfterCrawlInMb);
            }

            _crawlResult.Elapsed = timer.Elapsed;
            _logger.InfoFormat("Crawl complete for site [{0}]: Crawled [{1}] pages in [{2}]", _crawlResult.RootUri.AbsoluteUri, _crawlResult.CrawlContext.CrawledCount, _crawlResult.Elapsed);

            return _crawlResult;
        }
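Per the summary above, data is processed by subscribing to events before the crawl starts. A sketch; the PageCrawlCompleted event and its CrawledPage argument are assumptions based on the Abot-style API these listings resemble, not shown in this excerpt:

        crawler.PageCrawlCompleted += (sender, e) =>
        {
            // Handle each page as it becomes available instead of waiting for the final CrawlResult.
            Console.WriteLine("Crawled " + e.CrawledPage.Uri);
        };

        CrawlResult result = crawler.Crawl(new Uri("https://example.com/"), null); // null token source keeps the context default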
Example #6
        /// <summary>
        /// Begins an asynchronous crawl using the uri parameter; subscribe to events to process data as it becomes available
        /// </summary>
        public virtual async Task<CrawlResult> CrawlAsync(Uri uri, CancellationTokenSource cancellationTokenSource)
        {
            if (uri == null)
            {
                throw new ArgumentNullException(nameof(uri));
            }

            _crawlContext.RootUri = _crawlContext.OriginalRootUri = uri;

            if (cancellationTokenSource != null)
            {
                _crawlContext.CancellationTokenSource = cancellationTokenSource;
            }

            _crawlResult              = new CrawlResult();
            _crawlResult.RootUri      = _crawlContext.RootUri;
            _crawlResult.CrawlContext = _crawlContext;
            _crawlComplete            = false;

            _logger.LogInformation($"About to crawl site [{uri.AbsoluteUri}]");
            PrintConfigValues(_crawlContext.CrawlConfiguration);

            if (_memoryManager != null)
            {
                _crawlContext.MemoryUsageBeforeCrawlInMb = _memoryManager.GetCurrentUsageInMb();
                _logger.LogInformation($"Starting memory usage for site [{uri.AbsoluteUri}] is [{_crawlContext.MemoryUsageBeforeCrawlInMb}mb]");
            }

            _crawlContext.CrawlStartDate = DateTime.Now;
            var timer = Stopwatch.StartNew();

            if (_crawlContext.CrawlConfiguration.CrawlTimeoutSeconds > 0)
            {
                // Fire once after the configured timeout instead of immediately and repeatedly (dueTime 0 invokes the callback at once).
                _timeoutTimer = new Timer(HandleCrawlTimeout, null, _crawlContext.CrawlConfiguration.CrawlTimeoutSeconds * 1000, Timeout.Infinite);
            }

            try
            {
                var rootPage = new PageToCrawl(uri)
                {
                    ParentUri = uri, IsInternal = true, IsRoot = true
                };
                if (ShouldSchedulePageLink(rootPage))
                {
                    _scheduler.Add(rootPage);
                }

                VerifyRequiredAvailableMemory();
                await CrawlSite();
            }
            catch (Exception e)
            {
                _crawlResult.ErrorException = e;
                _logger.LogCritical(e, "An error occurred while crawling site [{Uri}]", uri);
            }
            finally
            {
                _threadManager?.Dispose();
            }

            _timeoutTimer?.Dispose();

            timer.Stop();

            if (_memoryManager != null)
            {
                _crawlContext.MemoryUsageAfterCrawlInMb = _memoryManager.GetCurrentUsageInMb();
                _logger.LogInformation($"Ending memory usage for site [{uri.AbsoluteUri}] is [{_crawlContext.MemoryUsageAfterCrawlInMb}mb]");
            }

            _crawlResult.Elapsed = timer.Elapsed;
            _logger.LogInformation($"Crawl complete for site [{_crawlResult.RootUri.AbsoluteUri}]: Crawled [{_crawlResult.CrawlContext.CrawledCount}] pages in [{_crawlResult.Elapsed}]");

            return _crawlResult;
        }
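The CancellationTokenSource parameter also lets the caller stop the crawl from outside, e.g. on user request. A sketch using the same assumed crawler instance:

            var cts = new CancellationTokenSource();
            var crawlTask = crawler.CrawlAsync(new Uri("https://example.com/"), cts);

            // Elsewhere (a UI handler, a shutdown hook, ...) the same source cancels the crawl cooperatively:
            cts.Cancel();

            CrawlResult result = await crawlTask;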