/// <summary>
/// Executes a single crawling job in the background: marks it in progress,
/// runs the crawl, stores the mapped results, and records success or failure
/// on the entity so the status is always persisted.
/// </summary>
/// <param name="crawling">The job to execute; its <c>Status</c> (and on failure <c>StatusText</c>) are updated and persisted.</param>
/// <param name="repository">Repository used to persist each status transition.</param>
/// <param name="stoppingToken">Token that cancels the crawl when the host shuts down.</param>
private async Task DoCrawling(
    Crawling crawling,
    ICrawlingRepository repository,
    CancellationToken stoppingToken)
{
    try
    {
        crawling.Status = CrawlingStatus.InProgress;
        await repository.UpdateCrawlingAsync(crawling);

        var options = new CrawlingOptions(crawling.Expression, crawling.Url);
        var crawlingResult = await _crawlingService.CrawlAsync(options, stoppingToken);

        crawling.CrawlingDetails = _mapper.MapDetails(crawling.Id, crawlingResult);
        crawling.Status = CrawlingStatus.Completed;
        await repository.UpdateCrawlingAsync(crawling);
    }
    catch (Exception e)
    {
        // Persist the failure (full exception text for diagnostics) before logging.
        crawling.Status = CrawlingStatus.Failed;
        crawling.StatusText = e.ToString();
        await repository.UpdateCrawlingAsync(crawling);

        // Structured logging template (CA2254) instead of string interpolation.
        // 'crawling' is provably non-null here — it was dereferenced above —
        // so the original 'crawling?.Url' null-conditional was misleading.
        _logger.LogError(e, "Error while crawling {Url} in background service.", crawling.Url);
    }
}
/// <summary>
/// A valid expression/url pair should populate every property of
/// <see cref="CrawlingOptions"/>: the expression verbatim, the full base URI,
/// and the domain root derived from it.
/// </summary>
public void Ctor_ExpressionAndUrlAreValid_SuccessfullySetAllTheProperties()
{
    // Arrange & Act
    var sut = new CrawlingOptions("expression", "http://url.com/testpage.html");

    // Assert
    sut.Expression.Should().Be("expression");
    sut.BaseUri.ToString().Should().Be("http://url.com/testpage.html");
    sut.DomainUri.ToString().Should().Be("http://url.com/");
}
/// <summary>
/// When the URL carries query parameters, <see cref="CrawlingOptions.BaseUrlWithoutParameters"/>
/// should expose the URL with the query string stripped.
/// </summary>
/// <param name="url">Input URL, possibly containing a query string.</param>
/// <param name="expectedResult">Expected URL with any query parameters removed.</param>
public void Ctor_UrlContainsQueryParameters_SetRawBaseUrlWithoutParametersValueWithoutQueryParameters(
    string url, string expectedResult)  // fixed parameter-name typo: 'exoectedResult'
{
    var options = new CrawlingOptions("expression", url);

    options.BaseUrlWithoutParameters.Should().Be(expectedResult);
}
/// <summary>
/// Crawls the site described by <paramref name="options"/> breadth-first,
/// counting occurrences of the search expression on the base page and every
/// reachable sub-page of it.
/// </summary>
/// <param name="options">Crawl configuration (expression, base URI, domain). Must not be null.</param>
/// <param name="cancellationToken">Checked before each page is processed.</param>
/// <returns>A map of processed absolute URL → occurrence count.</returns>
/// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
/// <exception cref="OperationCanceledException">The token was cancelled mid-crawl.</exception>
public async Task<Dictionary<string, int>> CrawlAsync(CrawlingOptions options, CancellationToken cancellationToken)
{
    if (options == null)
    {
        throw new ArgumentNullException(nameof(options), "Options cannot be null.");
    }

    try
    {
        var processedUrls = new Dictionary<string, int>();
        var urlsToProcess = new Queue<string>();
        urlsToProcess.Enqueue(options.BaseUri.ToString());

        while (urlsToProcess.Count > 0)
        {
            // FIX: honor cancellation BEFORE loading/scraping the next page.
            // The original checked only at the end of the iteration, so a full
            // page load always happened before cancellation took effect.
            cancellationToken.ThrowIfCancellationRequested();

            var url = urlsToProcess.Dequeue();
            if (processedUrls.ContainsKey(url))
            {
                continue; // already visited (the queue may contain duplicates)
            }

            var pageContent = await _pageLoaderService.LoadPageContentAsync(url);
            var count = _scrapingService.CountOccurrence(options.Expression, pageContent);
            processedUrls.Add(url, count);

            // Enqueue any relative links that resolve to sub-pages of the base URL.
            var hrefs = _scrapingService.GetRelativeHrefs(pageContent);
            foreach (var href in hrefs)
            {
                var absoluteUri = new Uri(options.DomainUri, href).ToString();
                var isSubPage = options.BaseUrlWithoutParameters.IsSubPage(absoluteUri);
                if (isSubPage)
                {
                    urlsToProcess.Enqueue(absoluteUri);
                }
            }
        }

        return processedUrls;
    }
    catch (Exception e)
    {
        _logger.LogError(e, "Error while crawling.");
        throw; // rethrow preserving the stack trace
    }
}
/// <summary>
/// Public entry point for the recursive crawler: validates the options, then
/// delegates the traversal to <see cref="CrawlRecursive"/>, logging and
/// rethrowing any failure.
/// </summary>
/// <param name="options">Crawl configuration. Must not be null.</param>
/// <param name="cancellationToken">Propagated into the recursive traversal.</param>
/// <returns>A map of processed absolute URL → occurrence count.</returns>
/// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
public async Task<Dictionary<string, int>> CrawlAsync(CrawlingOptions options, CancellationToken cancellationToken)
{
    if (options == null)
    {
        throw new ArgumentNullException(nameof(options), "Options cannot be null.");
    }

    try
    {
        var results = await CrawlRecursive(options, cancellationToken);
        return results;
    }
    catch (Exception e)
    {
        _logger.LogError(e, "Error while crawling.");
        throw;
    }
}
/// <summary>
/// Depth-first crawl step: scrapes <paramref name="url"/> (or the base page on
/// the first call), records its occurrence count, then recurses into every
/// not-yet-visited link that is a sub-page of the base URL.
/// </summary>
/// <param name="crawlOptions">Crawl configuration shared across the recursion.</param>
/// <param name="cancellationToken">Checked at the start of every recursive call.</param>
/// <param name="url">Page to process; null/blank means "start at the base URI".</param>
/// <param name="processedUrls">Accumulator of visited URL → count; created on the first call.</param>
/// <returns>The shared accumulator, returned for the caller's convenience.</returns>
private async Task<Dictionary<string, int>> CrawlRecursive(
    CrawlingOptions crawlOptions,
    CancellationToken cancellationToken,
    string url = null,
    Dictionary<string, int> processedUrls = null)
{
    cancellationToken.ThrowIfCancellationRequested();

    // First call in the recursion: seed the start page and the visited map.
    if (string.IsNullOrWhiteSpace(url))
    {
        url = crawlOptions.BaseUri.ToString();
    }
    processedUrls = processedUrls ?? new Dictionary<string, int>();

    var content = await _pageLoaderService.LoadPageContentAsync(url);
    var occurrences = _scrapingService.CountOccurrence(crawlOptions.Expression, content);
    processedUrls.Add(url, occurrences);

    foreach (var relativeHref in _scrapingService.GetRelativeHrefs(content))
    {
        var absolute = new Uri(crawlOptions.DomainUri, relativeHref).ToString();

        // Skip links that leave the base URL's subtree or were already visited.
        if (!crawlOptions.BaseUrlWithoutParameters.IsSubPage(absolute) ||
            processedUrls.ContainsKey(absolute))
        {
            continue;
        }

        await CrawlRecursive(crawlOptions, cancellationToken, absolute, processedUrls);
    }

    return processedUrls;
}
/// <summary>
/// End-to-end crawl over three mocked pages: the root links to a sub-page
/// (followed) and an out-of-tree page (ignored); the sub-page links one level
/// deeper; the deepest page links back out of the tree. All three visited
/// pages and their occurrence counts must appear in the result.
/// </summary>
public async Task CrawlAsync_ValidOptionsPassed_PerformsCrawling()
{
    // Arrange
    var expression = "hello world!";
    var options = new CrawlingOptions(expression, "http://rooturl.com/index");

    var expected = new Dictionary<string, int>
    {
        { "http://rooturl.com/index", 1 },
        { "http://rooturl.com/index/home", 0 },
        { "http://rooturl.com/index/home/subhome", 2 }
    };

    // Wires up all three mocks for one page: its content, the expression
    // count found in that content, and the relative links it exposes.
    void SetupPage(string pageUrl, string html, int occurrences, string[] hrefs)
    {
        _pageLoaderServiceMock
            .Setup(x => x.LoadPageContentAsync(pageUrl))
            .ReturnsAsync(html);
        _scrapingServiceMock
            .Setup(x => x.CountOccurrence(expression, html))
            .Returns(occurrences);
        _scrapingServiceMock
            .Setup(x => x.GetRelativeHrefs(html))
            .Returns(hrefs);
    }

    SetupPage("http://rooturl.com/index", "root html", 1, new[] { "/index/home", "/about" });
    SetupPage("http://rooturl.com/index/home", "home html", 0, new[] { "index/home/subhome" });
    SetupPage("http://rooturl.com/index/home/subhome", "subhome html", 2, new[] { "/home" });

    // Act
    var actual = await _sut.CrawlAsync(options, CancellationToken.None);

    // Assert
    actual.Should().BeEquivalentTo(expected);
}
/// <summary>
/// Surrounding whitespace in the expression should be stripped by the
/// <see cref="CrawlingOptions"/> constructor.
/// </summary>
public void Ctor_ExpressionIsPassed_TrimsExpression()
{
    // Arrange & Act: expression padded with leading and trailing spaces.
    var sut = new CrawlingOptions("  some test expression  ", "http://www.someurl.com");

    // Assert: the stored expression is trimmed.
    sut.Expression.Should().Be("some test expression");
}