private async Task DoCrawling(
            Crawling crawling,
            ICrawlingRepository repository,
            CancellationToken stoppingToken)
        {
            try
            {
                crawling.Status = CrawlingStatus.InProgress;
                await repository.UpdateCrawlingAsync(crawling);

                var options        = new CrawlingOptions(crawling.Expression, crawling.Url);
                var crawlingResult = await _crawlingService.CrawlAsync(options, stoppingToken);

                crawling.CrawlingDetails = _mapper.MapDetails(crawling.Id, crawlingResult);
                crawling.Status          = CrawlingStatus.Completed;
                await repository.UpdateCrawlingAsync(crawling);
            }
            catch (Exception e)
            {
                crawling.Status     = CrawlingStatus.Failed;
                crawling.StatusText = e.ToString();
                await repository.UpdateCrawlingAsync(crawling);

                _logger.LogError(e, $"Error while crawling {crawling?.Url} in background service.");
            }
        }
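The log message and the stoppingToken parameter suggest DoCrawling runs inside a BackgroundService. Below is a minimal sketch of such a host loop; the GetPendingCrawlingsAsync query, the injected IServiceScopeFactory (_scopeFactory), and the poll interval are illustrative assumptions that do not appear in the original example.

        protected override async Task ExecuteAsync(CancellationToken stoppingToken)
        {
            while (!stoppingToken.IsCancellationRequested)
            {
                // Repositories are typically scoped, so resolve a fresh one
                // on every polling iteration.
                using var scope = _scopeFactory.CreateScope();
                var repository  = scope.ServiceProvider.GetRequiredService<ICrawlingRepository>();

                // Hypothetical query for crawlings that are still queued.
                foreach (var crawling in await repository.GetPendingCrawlingsAsync(stoppingToken))
                {
                    await DoCrawling(crawling, repository, stoppingToken);
                }

                // Illustrative poll interval.
                await Task.Delay(TimeSpan.FromSeconds(10), stoppingToken);
            }
        }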
Example #2
        public void Ctor_ExpressionAndUrlAreValid_SuccessfullySetAllTheProperties()
        {
            var options = new CrawlingOptions("expression", "http://url.com/testpage.html");

            options.Expression.Should().Be("expression");
            options.BaseUri.ToString().Should().Be("http://url.com/testpage.html");
            options.DomainUri.ToString().Should().Be("http://url.com/");
        }
Example #3
        public void Ctor_UrlContainsQueryParameters_SetsBaseUrlWithoutParametersWithoutQueryParameters(
            string url,
            string expectedResult)
        {
            var options = new CrawlingOptions("expression", url);

            options.BaseUrlWithoutParameters.Should().Be(expectedResult);
        }
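The scraped example omits the test attributes; since the method takes url and expectedResult parameters, it was presumably an xUnit [Theory]. A hypothetical data row consistent with the assertion:

        [Theory]
        [InlineData("http://url.com/page?id=1&sort=asc", "http://url.com/page")]
        // ...parameterised test method as shown above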
Example #4
        public async Task<Dictionary<string, int>> CrawlAsync(CrawlingOptions options, CancellationToken cancellationToken)
        {
            if (options == null)
            {
                throw new ArgumentNullException(nameof(options), "Options cannot be null.");
            }

            try
            {
                // Maps every visited URL to the number of occurrences of the
                // expression found on that page.
                var processedUrls = new Dictionary<string, int>();

                // Breadth-first traversal: start from the base URI and
                // enqueue qualifying sub-pages as they are discovered.
                var urlsToProcess = new Queue<string>();
                urlsToProcess.Enqueue(options.BaseUri.ToString());

                while (urlsToProcess.Count > 0)
                {
                    var url = urlsToProcess.Dequeue();

                    // The same link can be enqueued more than once before it
                    // is processed, so skip URLs that were already visited.
                    if (processedUrls.ContainsKey(url))
                    {
                        continue;
                    }

                    var pageContent = await _pageLoaderService.LoadPageContentAsync(url);

                    var count = _scrapingService.CountOccurrence(options.Expression, pageContent);
                    processedUrls.Add(url, count);

                    var hrefs = _scrapingService.GetRelativeHrefs(pageContent);

                    // Resolve each relative href against the domain and only
                    // follow links that stay under the base URL.
                    foreach (var href in hrefs)
                    {
                        var absoluteUri = new Uri(options.DomainUri, href).ToString();
                        var isSubPage   = options.BaseUrlWithoutParameters.IsSubPage(absoluteUri);

                        if (isSubPage)
                        {
                            urlsToProcess.Enqueue(absoluteUri);
                        }
                    }

                    cancellationToken.ThrowIfCancellationRequested();
                }

                return processedUrls;
            }
            catch (Exception e)
            {
                _logger.LogError(e, "Error while crawling.");
                throw;
            }
        }
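The queue-based method above walks the site breadth-first. The variant below produces the same dictionary depth-first: a thin public wrapper validates the options and handles logging, then delegates to a recursive helper that threads the accumulated results through its parameters.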
        public async Task<Dictionary<string, int>> CrawlAsync(CrawlingOptions options, CancellationToken cancellationToken)
        {
            if (options == null)
            {
                throw new ArgumentNullException(nameof(options), "Options cannot be null.");
            }

            try
            {
                return await CrawlRecursive(options, cancellationToken);
            }
            catch (Exception e)
            {
                _logger.LogError(e, "Error while crawling.");
                throw;
            }
        }
        // Depth-first variant: recursively visits each unvisited sub-page,
        // accumulating counts in the processedUrls dictionary that is
        // threaded through the recursive calls.
        private async Task<Dictionary<string, int>> CrawlRecursive(
            CrawlingOptions crawlOptions,
            CancellationToken cancellationToken,
            string url = null,
            Dictionary<string, int> processedUrls = null)
        {
            cancellationToken.ThrowIfCancellationRequested();

            if (string.IsNullOrWhiteSpace(url))
            {
                url = crawlOptions.BaseUri.ToString();
            }

            if (processedUrls == null)
            {
                processedUrls = new Dictionary<string, int>();
            }

            var pageContent = await _pageLoaderService.LoadPageContentAsync(url);

            var count = _scrapingService.CountOccurrence(crawlOptions.Expression, pageContent);

            processedUrls.Add(url, count);

            var hrefs = _scrapingService.GetRelativeHrefs(pageContent);

            foreach (var href in hrefs)
            {
                var absoluteUri = new Uri(crawlOptions.DomainUri, href).ToString();
                var isSubPage   = crawlOptions.BaseUrlWithoutParameters.IsSubPage(absoluteUri);

                if (isSubPage && !processedUrls.ContainsKey(absoluteUri))
                {
                    await CrawlRecursive(crawlOptions, cancellationToken, absoluteUri, processedUrls);
                }
            }

            return processedUrls;
        }
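Both traversals rely on an IsSubPage string extension that is not included in these examples. Below is a minimal sketch consistent with the behaviour the test that follows expects (only links under the base URL are followed); the prefix check is an assumption, not the original implementation:

using System;

public static class StringExtensions
{
    // Hypothetical implementation: treat absoluteUri as a sub-page when it
    // starts with the base URL (query parameters stripped).
    public static bool IsSubPage(this string baseUrlWithoutParameters, string absoluteUri) =>
        absoluteUri.StartsWith(baseUrlWithoutParameters, StringComparison.OrdinalIgnoreCase);
}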
        public async Task CrawlAsync_ValidOptionsPassed_PerformsCrawling()
        {
            // Arrange

            var expectedResult = new Dictionary<string, int>
            {
                { "http://rooturl.com/index", 1 },
                { "http://rooturl.com/index/home", 0 },
                { "http://rooturl.com/index/home/subhome", 2 }
            };

            var expression = "hello world!";
            var options    = new CrawlingOptions(expression, "http://rooturl.com/index");

            // First set of mocked data
            _pageLoaderServiceMock
                .Setup(x => x.LoadPageContentAsync("http://rooturl.com/index"))
                .Returns(Task.FromResult("root html"));

            _scrapingServiceMock
                .Setup(x => x.CountOccurrence(expression, "root html"))
                .Returns(1);

            _scrapingServiceMock
                .Setup(x => x.GetRelativeHrefs("root html"))
                .Returns(new[] { "/index/home", "/about" });

            // Second set of mocked data
            _pageLoaderServiceMock
                .Setup(x => x.LoadPageContentAsync("http://rooturl.com/index/home"))
                .Returns(Task.FromResult("home html"));

            _scrapingServiceMock
                .Setup(x => x.CountOccurrence(expression, "home html"))
                .Returns(0);

            _scrapingServiceMock
                .Setup(x => x.GetRelativeHrefs("home html"))
                .Returns(new[] { "index/home/subhome" });

            // Third set of mocked data
            _pageLoaderServiceMock
                .Setup(x => x.LoadPageContentAsync("http://rooturl.com/index/home/subhome"))
                .Returns(Task.FromResult("subhome html"));

            _scrapingServiceMock
                .Setup(x => x.CountOccurrence(expression, "subhome html"))
                .Returns(2);

            _scrapingServiceMock
                .Setup(x => x.GetRelativeHrefs("subhome html"))
                .Returns(new[] { "/home" });

            // Act
            var actualResult = await _sut.CrawlAsync(options, CancellationToken.None);

            // Assert

            actualResult.Should().BeEquivalentTo(expectedResult);
        }
Example #8
        public void Ctor_ExpressionIsPassed_TrimsExpression()
        {
            var options = new CrawlingOptions(" some test expression ", "http://www.someurl.com");

            options.Expression.Should().Be("some test expression");
        }
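Taken together, the constructor tests in Examples #2, #3 and #8 pin down most of CrawlingOptions: the expression is trimmed, BaseUri keeps the full URL, DomainUri is the scheme and host, and BaseUrlWithoutParameters drops the query string. The class itself is not part of these examples, so the following is only a sketch that satisfies those assertions; the Uri.GetLeftPart calls are an assumed implementation detail.

using System;

public class CrawlingOptions
{
    public CrawlingOptions(string expression, string url)
    {
        Expression = expression?.Trim()
            ?? throw new ArgumentNullException(nameof(expression));

        // BaseUri keeps the full URL; DomainUri is scheme + host;
        // BaseUrlWithoutParameters is the URL minus the query string.
        BaseUri                  = new Uri(url);
        DomainUri                = new Uri(BaseUri.GetLeftPart(UriPartial.Authority));
        BaseUrlWithoutParameters = BaseUri.GetLeftPart(UriPartial.Path);
    }

    public string Expression { get; }
    public Uri BaseUri { get; }
    public Uri DomainUri { get; }
    public string BaseUrlWithoutParameters { get; }
}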