// Test teardown: release the system under test, if one was created.
public void TearDown()
{
    _unitUnderTest?.Dispose();
}
/// <summary>
/// Synchronously crawls the site rooted at <paramref name="uri"/>. Progress is surfaced
/// through events; the aggregate outcome is returned as a <see cref="CrawlResult"/>.
/// </summary>
/// <param name="uri">Root uri to start crawling from.</param>
/// <param name="tokenSource">Optional source used to cancel the crawl; ignored when null.</param>
/// <returns>The crawl result, carrying the context, elapsed time, and any terminal error.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="uri"/> is null.</exception>
public virtual CrawlResult Crawl(Uri uri, CancellationTokenSource tokenSource)
{
    _logger.LogInformation("开始爬行");

    // FIX: nameof(uri) instead of the string literal "uri" so the parameter
    // name in the exception survives renames.
    _crawlContext.RootUri = uri ?? throw new ArgumentNullException(nameof(uri));
    if (tokenSource != null)
    {
        _crawlContext.CancellationTokenSource = tokenSource;
    }

    _crawlResult = new CrawlResult() { Context = _crawlContext };
    _crawlContext.CrawlStartDate = DateTime.Now;

    Stopwatch timer = Stopwatch.StartNew();
    try
    {
        PageToCrawl rootPage = new PageToCrawl(uri) { ParentUri = uri, IsInternal = true, IsRoot = true };
        if (ShouldCrawlPage(rootPage))
        {
            _scheduler.Add(rootPage);
        }
        Crawl();
    }
    catch (Exception e)
    {
        // A failed crawl is reported on the result rather than rethrown,
        // so callers always receive a CrawlResult.
        _crawlResult.Error = e;
        _logger.LogError(e, "爬行出错");
    }
    finally
    {
        _threadManager?.Dispose();
    }

    timer.Stop();
    _crawlResult.Elapsed = timer.Elapsed;
    _logger.LogInformation("爬行结束 [{0}]: 共爬行 [{1}] 页面 , 花费 [{2}]", uri.AbsoluteUri, _crawlResult.Context.CrawledCount, _crawlResult.Elapsed);
    FireCrawlCompletedEventAsync(_crawlResult);
    return _crawlResult;
}
/// <summary>
/// Releases every disposable collaborator owned by this crawler:
/// thread manager, scheduler, page requester, and memory manager.
/// </summary>
public virtual void Dispose()
{
    _threadManager?.Dispose();
    _scheduler?.Dispose();
    _pageRequester?.Dispose();
    _memoryManager?.Dispose();
}
/// <summary>
/// Asynchronously crawls the site rooted at <paramref name="uri"/>, returning the
/// aggregate <see cref="CrawlResult"/>. Any exception thrown during the crawl is
/// captured on the result rather than propagated.
/// </summary>
/// <param name="uri">Root uri to start crawling from.</param>
/// <param name="cancellationTokenSource">Optional source used to cancel the crawl; ignored when null.</param>
/// <returns>The crawl result, carrying the context, elapsed time, and any terminal error.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="uri"/> is null.</exception>
public async Task <CrawlResult> Crawl(Uri uri, CancellationTokenSource cancellationTokenSource)
{
    if (uri == null)
    {
        throw new ArgumentNullException(nameof(uri));
    }

    _crawlContext.OriginalRootUri = uri;
    if (cancellationTokenSource != null)
    {
        _crawlContext.CancellationTokenSource = cancellationTokenSource;
    }

    _crawlResult = new CrawlResult
    {
        RootUri = _crawlContext.OriginalRootUri,
        CrawlContext = _crawlContext
    };

    var stopwatch = Stopwatch.StartNew();
    try
    {
        var seedPage = new PageToCrawl(uri)
        {
            ParentUri = uri,
            IsInternal = true,
            IsRoot = true
        };
        _scheduler.Add(seedPage);

        await CrawlSite().ConfigureAwait(false);
    }
    catch (Exception e)
    {
        // Surface the failure on the result instead of rethrowing.
        _crawlResult.ErrorException = e;
    }
    finally
    {
        _threadManager?.Dispose();
    }
    stopwatch.Stop();

    _crawlResult.Elapsed = stopwatch.Elapsed;
    return _crawlResult;
}
/// <summary>
/// Begins a synchronous crawl using the uri param, subscribe to events to process data as it becomes available
/// </summary>
/// <param name="uri">Root uri to start crawling from.</param>
/// <param name="cancellationTokenSource">Optional source used to cancel the crawl; ignored when null.</param>
/// <returns>The crawl result, carrying the context, elapsed time, and any terminal error.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="uri"/> is null.</exception>
public virtual CrawlResult Crawl(Uri uri, CancellationTokenSource cancellationTokenSource)
{
    if (uri == null)
    {
        // FIX: nameof(uri) instead of the string literal "uri" (refactor-safe).
        throw new ArgumentNullException(nameof(uri));
    }

    _crawlContext.RootUri = uri;
    if (cancellationTokenSource != null)
    {
        _crawlContext.CancellationTokenSource = cancellationTokenSource;
    }

    _crawlResult = new CrawlResult();
    _crawlResult.RootUri = _crawlContext.RootUri;
    _crawlResult.CrawlContext = _crawlContext;
    _crawlComplete = false;

    _logger.InfoFormat("About to crawl site [{0}]", uri.AbsoluteUri);
    PrintConfigValues(_crawlContext.CrawlConfiguration);

    // Snapshot memory before the crawl for the usage report at the end.
    if (_memoryManager != null)
    {
        _crawlContext.MemoryUsageBeforeCrawlInMb = _memoryManager.GetCurrentUsageInMb();
        _logger.InfoFormat("Starting memory usage for site [{0}] is [{1}mb]", uri.AbsoluteUri, _crawlContext.MemoryUsageBeforeCrawlInMb);
    }

    _crawlContext.CrawlStartDate = DateTime.Now;
    Stopwatch timer = Stopwatch.StartNew();

    // Arm the crawl-timeout timer (System.Timers.Timer fires HandleCrawlTimeout
    // after the configured number of seconds).
    if (_crawlContext.CrawlConfiguration.CrawlTimeoutSeconds > 0)
    {
        _timeoutTimer = new Timer(_crawlContext.CrawlConfiguration.CrawlTimeoutSeconds * 1000);
        _timeoutTimer.Elapsed += HandleCrawlTimeout;
        _timeoutTimer.Start();
    }

    try
    {
        PageToCrawl rootPage = new PageToCrawl(uri) { ParentUri = uri, IsInternal = true, IsRoot = true };
        if (ShouldSchedulePageLink(rootPage))
        {
            _scheduler.Add(rootPage);
        }

        VerifyRequiredAvailableMemory();
        CrawlSite();
    }
    catch (Exception e)
    {
        // A failed crawl is reported on the result rather than rethrown,
        // so callers always receive a CrawlResult.
        _crawlResult.ErrorException = e;
        _logger.FatalFormat("An error occurred while crawling site [{0}]", uri);
        _logger.Fatal(e);
    }
    finally
    {
        if (_threadManager != null)
        {
            _threadManager.Dispose();
        }
    }

    // FIX: the timer was previously only stopped, never unsubscribed or
    // disposed — each crawl leaked a Timer with a live Elapsed handler.
    if (_timeoutTimer != null)
    {
        _timeoutTimer.Stop();
        _timeoutTimer.Elapsed -= HandleCrawlTimeout;
        _timeoutTimer.Dispose();
    }

    timer.Stop();

    if (_memoryManager != null)
    {
        _crawlContext.MemoryUsageAfterCrawlInMb = _memoryManager.GetCurrentUsageInMb();
        _logger.InfoFormat("Ending memory usage for site [{0}] is [{1}mb]", uri.AbsoluteUri, _crawlContext.MemoryUsageAfterCrawlInMb);
    }

    _crawlResult.Elapsed = timer.Elapsed;
    _logger.InfoFormat("Crawl complete for site [{0}]: Crawled [{1}] pages in [{2}]", _crawlResult.RootUri.AbsoluteUri, _crawlResult.CrawlContext.CrawledCount, _crawlResult.Elapsed);

    return _crawlResult;
}
/// <summary>
/// Begins an asynchronous crawl using the uri param, subscribe to events to process data as it becomes available
/// </summary>
/// <param name="uri">Root uri to start crawling from.</param>
/// <param name="cancellationTokenSource">Optional source used to cancel the crawl; ignored when null.</param>
/// <returns>The crawl result, carrying the context, elapsed time, and any terminal error.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="uri"/> is null.</exception>
public virtual async Task <CrawlResult> CrawlAsync(Uri uri, CancellationTokenSource cancellationTokenSource)
{
    if (uri == null)
    {
        throw new ArgumentNullException(nameof(uri));
    }

    _crawlContext.RootUri = _crawlContext.OriginalRootUri = uri;
    if (cancellationTokenSource != null)
    {
        _crawlContext.CancellationTokenSource = cancellationTokenSource;
    }

    _crawlResult = new CrawlResult();
    _crawlResult.RootUri = _crawlContext.RootUri;
    _crawlResult.CrawlContext = _crawlContext;
    _crawlComplete = false;

    _logger.LogInformation($"About to crawl site [{uri.AbsoluteUri}]");
    PrintConfigValues(_crawlContext.CrawlConfiguration);

    // Snapshot memory before the crawl for the usage report at the end.
    if (_memoryManager != null)
    {
        _crawlContext.MemoryUsageBeforeCrawlInMb = _memoryManager.GetCurrentUsageInMb();
        _logger.LogInformation($"Starting memory usage for site [{uri.AbsoluteUri}] is [{_crawlContext.MemoryUsageBeforeCrawlInMb}mb]");
    }

    _crawlContext.CrawlStartDate = DateTime.Now;
    var timer = Stopwatch.StartNew();

    if (_crawlContext.CrawlConfiguration.CrawlTimeoutSeconds > 0)
    {
        // FIX: the previous dueTime of 0 made System.Threading.Timer invoke
        // HandleCrawlTimeout immediately at crawl start. Fire once, after the
        // configured timeout, and never repeat (Timeout.Infinite period).
        _timeoutTimer = new Timer(HandleCrawlTimeout, null, _crawlContext.CrawlConfiguration.CrawlTimeoutSeconds * 1000, Timeout.Infinite);
    }

    try
    {
        var rootPage = new PageToCrawl(uri) { ParentUri = uri, IsInternal = true, IsRoot = true };
        if (ShouldSchedulePageLink(rootPage))
        {
            _scheduler.Add(rootPage);
        }

        VerifyRequiredAvailableMemory();
        // Library code: avoid capturing the caller's synchronization context.
        await CrawlSite().ConfigureAwait(false);
    }
    catch (Exception e)
    {
        // A failed crawl is reported on the result rather than rethrown,
        // so callers always receive a CrawlResult.
        _crawlResult.ErrorException = e;
        // FIX: pass the exception through the LogCritical(Exception, string)
        // overload; previously it was passed as an unused format argument and
        // the stack trace was never logged.
        _logger.LogCritical(e, $"An error occurred while crawling site [{uri}]");
    }
    finally
    {
        _threadManager?.Dispose();
    }

    _timeoutTimer?.Dispose();
    timer.Stop();

    if (_memoryManager != null)
    {
        _crawlContext.MemoryUsageAfterCrawlInMb = _memoryManager.GetCurrentUsageInMb();
        _logger.LogInformation($"Ending memory usage for site [{uri.AbsoluteUri}] is [{_crawlContext.MemoryUsageAfterCrawlInMb}mb]");
    }

    _crawlResult.Elapsed = timer.Elapsed;
    _logger.LogInformation($"Crawl complete for site [{_crawlResult.RootUri.AbsoluteUri}]: Crawled [{_crawlResult.CrawlContext.CrawledCount}] pages in [{_crawlResult.Elapsed}]");

    return _crawlResult;
}