public async Task ShouldObserveMaxDepth()
{
    // Verifies that MaxDepth bounds the crawl: every page responds with
    // 10-25 fresh hyperlinks, so without a depth limit the crawl would
    // never terminate on its own.
    // NOTE(review): no [Fact] attribute is visible in this view of the
    // file — confirm the test is decorated for discovery.
    //
    // FIX: removed the write-only totalHyperlinkCount counter (it was
    // incremented but never read or asserted), and discarded the unused
    // HttpRequestMessage parameter to match the sibling tests.
    HttpContent getRandomHtml(HttpRequestMessage _)
    {
        var linkCount = new Random().Next(10, 25);
        return _html.GetHtmlWithHyperlinks(linkCount);
    }

    var messageHandler = new MockHttpMessageHandler();
    var httpClient = messageHandler.ToHttpClient();
    messageHandler.When("http://host.com/*")
        .Respond(m => getRandomHtml(m));

    var settings = new HyperlinkCrawlerSettings();
    var request = new HyperlinkCrawlRequest("http://host.com/page")
    {
        MaxDepth = 5
    };
    var hcs = new HyperlinkCrawlerStrategy(settings, httpClient, _logger, request);

    // act
    await hcs.ProcessRequestAsync(request, CancellationToken.None);
    await hcs.Completion;

    // With MaxDepth = 5 the crawl touches 6 URIs in total — presumably the
    // initial URI plus one per depth level; confirm against the strategy's
    // depth semantics.
    Assert.Equal(
        6,
        hcs.State.ProcessedUriCount + hcs.State.FailedUriCount);
}
public async Task ShouldProcessAllLinks()
{
    var totalHyperlinkCount = 1; // counts the initial URL up front
    var deterministicHyperlinkCountdown = 11;

    // Each response embeds a strictly decreasing hyperlink count
    // (10, 9, ..., 1) and then zero forever, making the grand total of
    // discovered links predictable.
    HttpContent getRandomHtml(HttpRequestMessage _)
    {
        var remaining = Interlocked.Decrement(ref deterministicHyperlinkCountdown);
        var linkCount = Math.Max(remaining, 0);
        Interlocked.Add(ref totalHyperlinkCount, linkCount);
        return _html.GetHtmlWithHyperlinks(linkCount);
    }

    var messageHandler = new MockHttpMessageHandler();
    messageHandler.When("http://host.com/*")
        .Respond(message => getRandomHtml(message));
    var httpClient = messageHandler.ToHttpClient();

    // The HttpClient timeout doubles as the auto-completion detection
    // window — how long the crawler waits for further heartbeats that
    // signal URIs are still being processed. Outside unit tests this must
    // be large enough to absorb DNS resolution and server response latency
    // over a real internet connection.
    httpClient.Timeout = TimeSpan.FromMilliseconds(300);

    var settings = new HyperlinkCrawlerSettings();
    var crawlRequest = new HyperlinkCrawlRequest("http://host.com/page");
    var hcs = new HyperlinkCrawlerStrategy(settings, httpClient, _logger, crawlRequest);

    // act
    await hcs.ProcessRequestAsync(crawlRequest, CancellationToken.None);
    await hcs.Completion;

    // The countdown pre-computes the total: 10 + 9 + ... + 1 = 55 links
    // discovered, plus the initial URI = 56.
    Assert.Equal(56, totalHyperlinkCount);
    Assert.Equal(
        totalHyperlinkCount,
        hcs.State.ProcessedUriCount + hcs.State.FailedUriCount);
}
public async Task ShouldObserveCompletionWindow()
{
    var totalHyperlinkCount = 1; // initial URI
    // Simulates a slow server: each page takes 100 or 200 ms to respond and
    // yields 3 or 4 links, so the crawl cannot drain every discovered URI
    // inside the completion window below.
    async Task<HttpResponseMessage> getRandomHtml(HttpRequestMessage message)
    {
        await Task.Delay(100 * new Random().Next(1, 3));
        var linkCount = new Random().Next(3, 5);
        Interlocked.Add(ref totalHyperlinkCount, linkCount);
        return new HttpResponseMessage(HttpStatusCode.OK)
        {
            Content = _html.GetHtmlWithHyperlinks(linkCount)
        };
    }

    var messageHandler = new MockHttpMessageHandler();
    var httpClient = messageHandler.ToHttpClient();
    messageHandler.When("http://host.com/*")
        .Respond(async m => await getRandomHtml(m));

    var settings = new HyperlinkCrawlerSettings();
    var request = new HyperlinkCrawlRequest("http://host.com/page")
    {
        CompletionWindow = TimeSpan.FromSeconds(3)
    };
    var hcs = new HyperlinkCrawlerStrategy(settings, httpClient, _logger, request);

    // Add on some additional time for ProcessRequestAsync to initialize the state
    // and dataflow. Crawler.cs does the same.
    // FIX: CancellationTokenSource owns a timer once CancelAfter is called;
    // dispose it deterministically instead of leaking it to the finalizer.
    using var cts = new CancellationTokenSource();
    cts.CancelAfter(request.CompletionWindow.Add(TimeSpan.FromMilliseconds(200)));

    await hcs.ProcessRequestAsync(request, cts.Token);
    // Wait past the completion window before inspecting the final state.
    await Task.Delay(request.CompletionWindow.Add(TimeSpan.FromSeconds(1)));

    // The window expired mid-crawl: not every discovered URI was processed,
    // yet the job should still have transitioned to Completed.
    Assert.True(hcs.State.ProcessedUriCount < totalHyperlinkCount);
    Assert.Equal(CrawlJobStatus.Completed, hcs.State.Status);
}
public async Task ShouldStopProcessingViaCancellation()
{
    var totalHyperlinkCount = 1; // initial URI
    // Simulates a slow server (100-400 ms per page, 1-4 links each) so the
    // crawl is still in flight when it is cancelled below.
    async Task<HttpResponseMessage> getRandomHtml(HttpRequestMessage message)
    {
        await Task.Delay(100 * new Random().Next(1, 5));
        var linkCount = new Random().Next(1, 5);
        Interlocked.Add(ref totalHyperlinkCount, linkCount);
        return new HttpResponseMessage(HttpStatusCode.OK)
        {
            Content = _html.GetHtmlWithHyperlinks(linkCount)
        };
    }

    var messageHandler = new MockHttpMessageHandler();
    var httpClient = messageHandler.ToHttpClient();
    messageHandler.When("http://host.com/*")
        .Respond(async m => await getRandomHtml(m));

    var settings = new HyperlinkCrawlerSettings();
    var request = new HyperlinkCrawlRequest("http://host.com/page");
    var hcs = new HyperlinkCrawlerStrategy(settings, httpClient, _logger, request);

    // FIX: dispose the CancellationTokenSource deterministically rather than
    // leaking it to the finalizer.
    using var cts = new CancellationTokenSource();

    await hcs.ProcessRequestAsync(request, cts.Token);
    // Give the crawl a head start so cancellation happens mid-flight.
    await Task.Delay(TimeSpan.FromSeconds(1));

    Assert.True(hcs.State.ProcessedUriCount > 1);

    cts.Cancel();
    await hcs.Completion;

    // Cancellation should stop the crawl short of the full link set and mark
    // the job as Cancelled rather than Completed.
    Assert.True(hcs.State.ProcessedUriCount < totalHyperlinkCount);
    Assert.Equal(CrawlJobStatus.Cancelled, hcs.State.Status);
}
public async Task ShouldProcessAllLinksIgnoringDuplicates()
{
    // Every page served contains six copies of the same hyperlink, so the
    // crawler's de-duplication should collapse them into one discovery.
    HttpContent serveDuplicateLinks(HttpRequestMessage _)
        => _html.GetHtmlWithDuplicateHyperlinks(6);

    var messageHandler = new MockHttpMessageHandler();
    messageHandler.When("http://host.com/*")
        .Respond(msg => serveDuplicateLinks(msg));
    var httpClient = messageHandler.ToHttpClient();

    // The HttpClient timeout doubles as the auto-completion detection
    // window — how long the crawler waits for further heartbeats that
    // signal URIs are still being processed. Outside unit tests this must
    // be large enough to absorb DNS resolution and server response latency
    // over a real internet connection.
    httpClient.Timeout = TimeSpan.FromMilliseconds(300);

    var settings = new HyperlinkCrawlerSettings();
    var request = new HyperlinkCrawlRequest("http://host.com/page");
    var hcs = new HyperlinkCrawlerStrategy(settings, httpClient, _logger, request);

    // act
    await hcs.ProcessRequestAsync(request, CancellationToken.None);
    await hcs.Completion;

    // All six links are identical, so only one distinct URI is discovered.
    Assert.Equal(1, hcs.State.DiscoveredUriCount);
    // Two URIs get processed in total: the seed plus the single discovery.
    Assert.Equal(2, hcs.State.ProcessedUriCount + hcs.State.FailedUriCount);
}