Esempio n. 1
0
        /// <summary>
        /// Crawls pages breadth-first, starting at <c>options.BaseUri</c>, and records for every
        /// visited URL the value returned by <c>_scrapingService.CountOccurrence</c> for
        /// <c>options.Expression</c> on that page's content.
        /// </summary>
        /// <param name="options">
        /// Crawl settings; this method reads <c>BaseUri</c>, <c>DomainUri</c>, <c>Expression</c> and
        /// <c>BaseUrlWithoutParameters</c>.
        /// </param>
        /// <param name="cancellationToken">
        /// Checked before every page load and once more after the last page has been processed.
        /// </param>
        /// <returns>A dictionary mapping each visited URL to its occurrence count.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
        /// <exception cref="OperationCanceledException">Cancellation was requested on <paramref name="cancellationToken"/>.</exception>
        public async Task <Dictionary <string, int> > CrawlAsync(CrawlingOptions options, CancellationToken cancellationToken)
        {
            if (options == null)
            {
                throw new ArgumentNullException(nameof(options), "Options cannot be null.");
            }

            try
            {
                var processedUrls = new Dictionary <string, int>();

                var startUrl = options.BaseUri.ToString();

                // Every URL that has ever been enqueued. Guarantees each URL enters the queue at
                // most once, so the queue is bounded by the number of distinct sub-pages rather
                // than by the total number of links encountered.
                var discoveredUrls = new HashSet <string> { startUrl };

                var urlsToProcess = new Queue <string>();
                urlsToProcess.Enqueue(startUrl);

                while (urlsToProcess.Count > 0)
                {
                    // Check before doing any work so that an already-cancelled token does not
                    // trigger another page load.
                    cancellationToken.ThrowIfCancellationRequested();

                    var url = urlsToProcess.Dequeue();

                    var pageContent = await _pageLoaderService.LoadPageContentAsync(url);

                    var count = _scrapingService.CountOccurrence(options.Expression, pageContent);
                    processedUrls.Add(url, count);

                    var hrefs = _scrapingService.GetRelativeHrefs(pageContent);

                    foreach (var href in hrefs)
                    {
                        // Resolve the link against the domain root to obtain an absolute URL.
                        var absoluteUri = new Uri(options.DomainUri, href).ToString();
                        var isSubPage   = options.BaseUrlWithoutParameters.IsSubPage(absoluteUri);

                        // HashSet.Add returns false when the URL was already discovered.
                        if (isSubPage && discoveredUrls.Add(absoluteUri))
                        {
                            urlsToProcess.Enqueue(absoluteUri);
                        }
                    }
                }

                // A cancellation requested while the final page was being processed must still
                // surface to the caller instead of returning a result.
                cancellationToken.ThrowIfCancellationRequested();

                return(processedUrls);
            }
            catch (Exception e)
            {
                _logger.LogError(e, "Error while crawling.");
                throw;
            }
        }
        /// <summary>
        /// Depth-first variant of the crawl: loads one page, stores its occurrence count, then
        /// recurses into every link on that page that qualifies as a sub-page and has not been
        /// recorded yet.
        /// </summary>
        /// <param name="crawlOptions">
        /// Crawl settings; this method reads <c>BaseUri</c>, <c>DomainUri</c>, <c>Expression</c> and
        /// <c>BaseUrlWithoutParameters</c>.
        /// </param>
        /// <param name="cancellationToken">Checked once on entry to every (recursive) call.</param>
        /// <param name="url">Page to load; when null or whitespace, <c>crawlOptions.BaseUri</c> is used.</param>
        /// <param name="processedUrls">
        /// Accumulator shared across the recursion; a new dictionary is created when null.
        /// </param>
        /// <returns>The accumulator, mapping each visited URL to its occurrence count.</returns>
        private async Task <Dictionary <string, int> > CrawlRecursive(
            CrawlingOptions crawlOptions,
            CancellationToken cancellationToken,
            string url = null,
            Dictionary <string, int> processedUrls = null)
        {
            cancellationToken.ThrowIfCancellationRequested();

            // Fall back to the crawl root and to a fresh accumulator on the outermost call.
            var currentUrl = string.IsNullOrWhiteSpace(url) ? crawlOptions.BaseUri.ToString() : url;
            var visited    = processedUrls ?? new Dictionary <string, int>();

            var html = await _pageLoaderService.LoadPageContentAsync(currentUrl);

            var occurrences = _scrapingService.CountOccurrence(crawlOptions.Expression, html);
            visited.Add(currentUrl, occurrences);

            foreach (var link in _scrapingService.GetRelativeHrefs(html))
            {
                // Resolve the link against the domain root to obtain an absolute URL.
                var target = new Uri(crawlOptions.DomainUri, link).ToString();

                if (!crawlOptions.BaseUrlWithoutParameters.IsSubPage(target))
                {
                    continue;
                }

                if (visited.ContainsKey(target))
                {
                    continue;
                }

                await CrawlRecursive(crawlOptions, cancellationToken, target, visited);
            }

            return visited;
        }