/// <summary>
/// Crawls pages breadth-first, starting at <c>options.BaseUri</c>, and counts the
/// occurrences of <c>options.Expression</c> on every page that is loaded.
/// </summary>
/// <param name="options">
/// Crawl settings. <c>BaseUri</c> is the start page, <c>DomainUri</c> is the base used to
/// resolve hrefs found on a page, and <c>BaseUrlWithoutParameters</c> is what each resolved
/// link is tested against (via <c>IsSubPage</c>) to decide whether it is followed.
/// </param>
/// <param name="cancellationToken">Token checked before every page load and once more before returning.</param>
/// <returns>A dictionary mapping each crawled URL to the occurrence count found on that page.</returns>
/// <exception cref="ArgumentNullException"><paramref name="options"/> is <c>null</c>.</exception>
/// <exception cref="OperationCanceledException">Cancellation was requested through <paramref name="cancellationToken"/>.</exception>
public async Task<Dictionary<string, int>> CrawlAsync(CrawlingOptions options, CancellationToken cancellationToken)
{
    if (options == null)
    {
        throw new ArgumentNullException(nameof(options), "Options cannot be null.");
    }

    try
    {
        var processedUrls = new Dictionary<string, int>();
        var urlsToProcess = new Queue<string>();

        // Every URL that has ever been enqueued. Filtering at enqueue time keeps the queue
        // free of duplicates, so it cannot grow with the number of repeated links.
        var startUrl = options.BaseUri.ToString();
        var discoveredUrls = new HashSet<string> { startUrl };
        urlsToProcess.Enqueue(startUrl);

        while (urlsToProcess.Count > 0)
        {
            // Check before the page load so a cancelled token never triggers another request.
            cancellationToken.ThrowIfCancellationRequested();

            var url = urlsToProcess.Dequeue();
            var pageContent = await _pageLoaderService.LoadPageContentAsync(url);
            var count = _scrapingService.CountOccurrence(options.Expression, pageContent);
            processedUrls.Add(url, count);

            var hrefs = _scrapingService.GetRelativeHrefs(pageContent);
            foreach (var href in hrefs)
            {
                var absoluteUri = new Uri(options.DomainUri, href).ToString();
                var isSubPage = options.BaseUrlWithoutParameters.IsSubPage(absoluteUri);

                // HashSet.Add returns false for a URL that was already discovered.
                if (isSubPage && discoveredUrls.Add(absoluteUri))
                {
                    urlsToProcess.Enqueue(absoluteUri);
                }
            }
        }

        // A cancellation requested while the last page was being processed still surfaces
        // as an exception rather than a result, as it did when the check sat at the end of
        // each loop iteration.
        cancellationToken.ThrowIfCancellationRequested();

        return processedUrls;
    }
    catch (Exception e)
    {
        // Any failure, including cancellation, is logged here and then rethrown unchanged.
        _logger.LogError(e, "Error while crawling.");
        throw;
    }
}
/// <summary>
/// Recursive, depth-first crawl: loads one page, records how often
/// <c>crawlOptions.Expression</c> occurs on it, then recurses into every link on that page
/// that passes the <c>IsSubPage</c> test and has not been recorded yet.
/// </summary>
/// <param name="crawlOptions">Crawl settings (start URI, domain URI used to resolve hrefs, expression to count).</param>
/// <param name="cancellationToken">Token checked once on entry to every call.</param>
/// <param name="url">Page to load; when null, empty or whitespace, <c>crawlOptions.BaseUri</c> is used.</param>
/// <param name="processedUrls">Accumulator shared across the recursion; a new dictionary is created when null.</param>
/// <returns>The accumulator dictionary mapping each crawled URL to its occurrence count.</returns>
private async Task<Dictionary<string, int>> CrawlRecursive(
    CrawlingOptions crawlOptions,
    CancellationToken cancellationToken,
    string url = null,
    Dictionary<string, int> processedUrls = null)
{
    cancellationToken.ThrowIfCancellationRequested();

    // Resolve the optional arguments into locals instead of reassigning the parameters.
    var visited = processedUrls ?? new Dictionary<string, int>();
    var currentUrl = string.IsNullOrWhiteSpace(url)
        ? crawlOptions.BaseUri.ToString()
        : url;

    var content = await _pageLoaderService.LoadPageContentAsync(currentUrl);
    var occurrences = _scrapingService.CountOccurrence(crawlOptions.Expression, content);
    visited.Add(currentUrl, occurrences);

    foreach (var link in _scrapingService.GetRelativeHrefs(content))
    {
        var target = new Uri(crawlOptions.DomainUri, link).ToString();

        if (!crawlOptions.BaseUrlWithoutParameters.IsSubPage(target))
        {
            continue;
        }

        // Pages recorded by an earlier branch of the recursion are not loaded again.
        if (visited.ContainsKey(target))
        {
            continue;
        }

        await CrawlRecursive(crawlOptions, cancellationToken, target, visited);
    }

    return visited;
}