/// <summary>
/// Crawls a mocked host whose pages each contain between 10 and 24 hyperlinks,
/// with <c>MaxDepth = 5</c>, and expects exactly six URIs to have been handled
/// (processed or failed) once the crawl completes.
/// </summary>
public async Task ShouldObserveMaxDepth()
{
    // Builds the response body for any request to the mocked host. The request
    // itself is irrelevant, hence the discard parameter.
    HttpContent getRandomHtml(HttpRequestMessage _)
    {
        // Random.Next's upper bound is exclusive, so this yields 10..24 links.
        var linkCount = new Random().Next(10, 25);
        return _html.GetHtmlWithHyperlinks(linkCount);
    }

    var messageHandler = new MockHttpMessageHandler();
    var httpClient = messageHandler.ToHttpClient();
    messageHandler.When("http://host.com/*")
        .Respond(m => getRandomHtml(m));

    var settings = new HyperlinkCrawlerSettings();
    var request = new HyperlinkCrawlRequest("http://host.com/page")
    {
        MaxDepth = 5
    };
    var hcs = new HyperlinkCrawlerStrategy(settings, httpClient, _logger, request);

    await hcs.ProcessRequestAsync(request, CancellationToken.None);
    await hcs.Completion;

    // NOTE(review): 6 presumably corresponds to one URI per depth level 0..5;
    // confirm against the crawler's depth semantics.
    Assert.Equal(
        6,
        hcs.State.ProcessedUriCount + hcs.State.FailedUriCount);
}
/// <summary>
/// Enqueues a hyperlink crawl request with the crawl-request manager grain
/// that is keyed by the request's id, and maps the resulting job status to an
/// HTTP response.
/// </summary>
/// <param name="request">The crawl request; its URI must be absolute.</param>
/// <returns>
/// 400 when no request was supplied or the returned status is not handled;
/// 422 when the URI is missing or not absolute;
/// 202 with the "GetResult" route when the job is pending;
/// 201 with the "GetResult" route and the status when the job is running or completed;
/// 409 when the manager reports a conflict.
/// </returns>
public async Task<IActionResult> Post(HyperlinkCrawlRequest request)
{
    // A missing or unbindable body would otherwise surface as a
    // NullReferenceException on the next check.
    if (request == null)
    {
        return BadRequest();
    }

    if (request.Uri == null || !request.Uri.IsAbsoluteUri)
    {
        return UnprocessableEntity();
    }

    var manager = _client.GetGrain<IManageCrawlRequests>(request.Id);
    var status = await manager.Enqueue(request);

    if (status == CrawlJobStatus.Pending)
    {
        return Accepted(Url.RouteUrl("GetResult", new { id = request.Id }));
    }

    if (status == CrawlJobStatus.Running || status == CrawlJobStatus.Completed)
    {
        return Created(Url.RouteUrl("GetResult", new { id = request.Id }), status);
    }

    // if already running without specifying HyperlinkCrawlRequest.ForceNew
    if (status == CrawlJobStatus.Conflict)
    {
        return Conflict();
    }

    return BadRequest();
}
/// <summary>
/// Crawls a mocked host whose pages hand out a deterministic, shrinking number
/// of hyperlinks and verifies that every discovered URI ends up either
/// processed or failed.
/// </summary>
public async Task ShouldProcessAllLinks()
{
    // Starts at 1 to account for the initial URL.
    var totalHyperlinkCount = 1;

    // Each response atomically takes the next value off this counter, so the
    // pages contain 10, 9, 8, ... 1 links and then none at all.
    var deterministicHyperlinkCountdown = 11;

    HttpContent nextPage(HttpRequestMessage _)
    {
        // Once the countdown goes negative, pages simply carry zero links.
        var linksOnPage = Math.Max(
            0,
            Interlocked.Decrement(ref deterministicHyperlinkCountdown));

        Interlocked.Add(ref totalHyperlinkCount, linksOnPage);
        return _html.GetHtmlWithHyperlinks(linksOnPage);
    }

    var messageHandler = new MockHttpMessageHandler();
    messageHandler
        .When("http://host.com/*")
        .Respond(m => nextPage(m));

    var httpClient = messageHandler.ToHttpClient();

    // The client timeout determines the TimeSpan that auto-completion
    // detection awaits, leaving room for further heartbeats that signal URIs
    // are still being processed. Outside of unit tests it should be large
    // enough to cover DNS resolution latency and server response times.
    httpClient.Timeout = TimeSpan.FromMilliseconds(300);

    var request = new HyperlinkCrawlRequest("http://host.com/page");
    var hcs = new HyperlinkCrawlerStrategy(
        new HyperlinkCrawlerSettings(),
        httpClient,
        _logger,
        request);

    // act
    await hcs.ProcessRequestAsync(request, CancellationToken.None);
    await hcs.Completion;

    // The countdown makes the total predictable:
    // (10 + 9 + 8 + 7 + 6 + 5 + 4 + 3 + 2 + 1) + 1 (the initial URI) = 56
    Assert.Equal(56, totalHyperlinkCount);

    var handledUriCount = hcs.State.ProcessedUriCount + hcs.State.FailedUriCount;
    Assert.Equal(totalHyperlinkCount, handledUriCount);
}
/// <summary>
/// Crawls a slow mocked host with a three second <c>CompletionWindow</c> and
/// verifies that, once the window has elapsed, fewer URIs were processed than
/// were handed out and the job status is <c>Completed</c>.
/// </summary>
public async Task ShouldObserveCompletionWindow()
{
    // Starts at 1 to account for the initial URL.
    var totalHyperlinkCount = 1;

    // Responds after 100 or 200 ms with a page that contains 3 or 4 links
    // (Random.Next's upper bound is exclusive).
    async Task<HttpResponseMessage> getRandomHtml(HttpRequestMessage _)
    {
        var random = new Random();
        await Task.Delay(100 * random.Next(1, 3));

        var linkCount = random.Next(3, 5);
        Interlocked.Add(ref totalHyperlinkCount, linkCount);

        return new HttpResponseMessage(HttpStatusCode.OK)
        {
            Content = _html.GetHtmlWithHyperlinks(linkCount)
        };
    }

    var messageHandler = new MockHttpMessageHandler();
    var httpClient = messageHandler.ToHttpClient();
    messageHandler.When("http://host.com/*")
        .Respond(async m => await getRandomHtml(m));

    var settings = new HyperlinkCrawlerSettings();
    var request = new HyperlinkCrawlRequest("http://host.com/page")
    {
        CompletionWindow = TimeSpan.FromSeconds(3)
    };
    var hcs = new HyperlinkCrawlerStrategy(settings, httpClient, _logger, request);

    // The token source owns a timer once CancelAfter is called, so it is
    // disposed deterministically when the test finishes.
    using (var cts = new CancellationTokenSource())
    {
        // Add on some additional time for ProcessRequestAsync to initialize the
        // state and dataflow. Crawler.cs does the same.
        cts.CancelAfter(request.CompletionWindow.Add(TimeSpan.FromMilliseconds(200)));

        await hcs.ProcessRequestAsync(request, cts.Token);

        // Wait past the cancellation deadline before inspecting the state.
        await Task.Delay(request.CompletionWindow.Add(TimeSpan.FromSeconds(1)));

        Assert.True(hcs.State.ProcessedUriCount < totalHyperlinkCount);
        Assert.Equal(CrawlJobStatus.Completed, hcs.State.Status);
    }
}
/// <summary>
/// Starts a crawl against a slow mocked host, cancels it after one second and
/// verifies that the crawl stopped early and reports the <c>Cancelled</c>
/// status.
/// </summary>
public async Task ShouldStopProcessingViaCancellation()
{
    // Starts at 1 to account for the initial URL.
    var totalHyperlinkCount = 1;

    // Responds after 100 to 400 ms with a page that contains 1 to 4 links
    // (Random.Next's upper bound is exclusive).
    async Task<HttpResponseMessage> getRandomHtml(HttpRequestMessage _)
    {
        var random = new Random();
        await Task.Delay(100 * random.Next(1, 5));

        var linkCount = random.Next(1, 5);
        Interlocked.Add(ref totalHyperlinkCount, linkCount);

        return new HttpResponseMessage(HttpStatusCode.OK)
        {
            Content = _html.GetHtmlWithHyperlinks(linkCount)
        };
    }

    var messageHandler = new MockHttpMessageHandler();
    var httpClient = messageHandler.ToHttpClient();
    messageHandler.When("http://host.com/*")
        .Respond(async m => await getRandomHtml(m));

    var settings = new HyperlinkCrawlerSettings();
    var request = new HyperlinkCrawlRequest("http://host.com/page");
    var hcs = new HyperlinkCrawlerStrategy(settings, httpClient, _logger, request);

    // Dispose the token source once the crawl has completed and the
    // assertions have run.
    using (var cts = new CancellationTokenSource())
    {
        await hcs.ProcessRequestAsync(request, cts.Token);

        // Give the crawler time to get past the initial URI before cancelling.
        await Task.Delay(TimeSpan.FromSeconds(1));
        Assert.True(hcs.State.ProcessedUriCount > 1);

        cts.Cancel();
        await hcs.Completion;

        Assert.True(hcs.State.ProcessedUriCount < totalHyperlinkCount);
        Assert.Equal(CrawlJobStatus.Cancelled, hcs.State.Status);
    }
}
/// <summary>
/// Enqueues a crawl job on a started Hangfire test server and verifies that
/// the strategy's <c>ProcessRequestAsync</c> was invoked exactly once.
/// </summary>
public async Task ShouldExecuteEnqueuedCrawlRequest()
{
    var request = new HyperlinkCrawlRequest(_crawlUri)
    {
        MaxDepth = 5
    };
    var hangfire = HangfireTestUtil.SetupHangfire();

    // The server only lives for the duration of this block.
    using (hangfire.StartServer())
    {
        hangfire.BackgroundJobClient.Enqueue<CrawlJobPerformer>(
            performer => performer.Perform(request));

        // Leave the server running for two seconds before it is disposed.
        await Task.Delay(TimeSpan.FromSeconds(2));
    }

    hangfire.Strategy.Verify(
        strategy => strategy.ProcessRequestAsync(
            It.IsAny<CrawlRequestBase>(),
            It.IsAny<CancellationToken>()),
        Times.Once);
}
/// <summary>
/// Performs a crawl background job directly through the Hangfire test
/// harness and verifies that the strategy's <c>ProcessRequestAsync</c> was
/// invoked exactly once.
/// </summary>
public void ShouldExecuteCrawlRequest()
{
    var request = new HyperlinkCrawlRequest(_crawlUri)
    {
        MaxDepth = 5
    };
    var hangfire = HangfireTestUtil.SetupHangfire();

    // Build the background job by hand instead of going through a queue.
    var job = Job.FromExpression((CrawlJobPerformer p) => p.Perform(request));
    var jobId = Guid.NewGuid().ToString("N");
    var createdAt = DateTime.UtcNow;
    var backgroundJob = new BackgroundJob(jobId, job, createdAt);

    hangfire.Perform(backgroundJob);

    hangfire.Strategy.Verify(
        strategy => strategy.ProcessRequestAsync(
            It.IsAny<CrawlRequestBase>(),
            It.IsAny<CancellationToken>()),
        Times.Once);
}
/// <summary>
/// Crawls a mocked host whose pages each contain six duplicate hyperlinks and
/// verifies that only one distinct URI is discovered and two URIs in total
/// are handled.
/// </summary>
public async Task ShouldProcessAllLinksIgnoringDuplicates()
{
    // Every page served by the mocked host carries the same six duplicate links.
    HttpContent pageWithDuplicates(HttpRequestMessage _)
    {
        return _html.GetHtmlWithDuplicateHyperlinks(6);
    }

    var messageHandler = new MockHttpMessageHandler();
    messageHandler
        .When("http://host.com/*")
        .Respond(m => pageWithDuplicates(m));

    var httpClient = messageHandler.ToHttpClient();

    // The client timeout determines the TimeSpan that auto-completion
    // detection awaits, leaving room for further heartbeats that signal URIs
    // are still being processed. Outside of unit tests it should be large
    // enough to cover DNS resolution latency and server response times.
    httpClient.Timeout = TimeSpan.FromMilliseconds(300);

    var request = new HyperlinkCrawlRequest("http://host.com/page");
    var hcs = new HyperlinkCrawlerStrategy(
        new HyperlinkCrawlerSettings(),
        httpClient,
        _logger,
        request);

    // act
    await hcs.ProcessRequestAsync(request, CancellationToken.None);
    await hcs.Completion;

    // should only discover 1 distinct URI since all 6 are exact
    Assert.Equal(1, hcs.State.DiscoveredUriCount);

    // should process 2 URIs; the original and the 1 discovered
    var handledUriCount = hcs.State.ProcessedUriCount + hcs.State.FailedUriCount;
    Assert.Equal(2, handledUriCount);
}
/// <summary>
/// Deploys a single-silo test cluster with a mocked crawler strategy, starts
/// processing a crawl request without awaiting it, and verifies that the
/// grain can still be queried for its status while the strategy is running.
/// Also records which task scheduler the strategy and its offloaded work ran on.
/// </summary>
public async Task ShouldBeAbleToCheckStatus()
{
    var ranOnOrleansScheduler = false;
    var ranOnThreadPoolScheduler = false;

    // setup mocks for DI
    ConfigureContainer = builder =>
    {
        var strategyMock = new Mock<ICrawlerStrategy>();

        // NOTE(review): the setup below matches a two-argument call while the
        // Returns delegate declares three parameters; verify that Moq accepts
        // this against the actual ICrawlerStrategy signature.
        strategyMock
            .Setup(strategy => strategy.ProcessRequestAsync(
                It.IsAny<CrawlRequestBase>(),
                It.IsAny<CancellationToken>()))
            .Returns(async (CrawlRequestBase req, Func<CrawlStateSnapshot, Task> saveProgress, CancellationToken token) =>
            {
                // mock that we're now processing
                await saveProgress(new HyperlinkCrawlStateSnapshot(CrawlJobStatus.Running));

                // record whether we are on a scheduler other than the default
                // one (expected to be the Orleans TaskScheduler)
                ranOnOrleansScheduler = TaskScheduler.Default != TaskScheduler.Current;

                // simulates Dataflow blocks by running on ThreadPool threads;
                // Task.Run automatically uses the ThreadPool TaskScheduler
                await Task.Run(async () =>
                {
                    ranOnThreadPoolScheduler = TaskScheduler.Default == TaskScheduler.Current;
                    await Task.Delay(TimeSpan.FromSeconds(5));
                });
            });

        var strategyFactoryMock = new Mock<ICrawlerStrategyFactory>();
        strategyFactoryMock
            .Setup(factory => factory.Create(It.IsAny<CrawlRequestBase>()))
            .Returns(strategyMock.Object);

        builder.RegisterInstance(strategyMock.Object)
            .As<ICrawlerStrategy>();
        builder.RegisterInstance(strategyFactoryMock.Object)
            .As<ICrawlerStrategyFactory>();
    };

    var clusterBuilder = new TestClusterBuilder(1);
    clusterBuilder.AddClientBuilderConfigurator<TestSiloBuilder>();
    clusterBuilder.AddSiloBuilderConfigurator<TestSiloBuilder>();
    _cluster = clusterBuilder.Build();
    _cluster.Deploy();

    var request = new HyperlinkCrawlRequest("http://host.com/");

    // Enqueue request from client
    var manager = _cluster.Client.GetGrain<IManageCrawlRequests>(request.Id);
    var status = await manager.Enqueue(request);
    Assert.Equal(CrawlJobStatus.Pending, status);

    // simulate Silo/Hangfire Worker thread (ie. non-threadpool thread)
    // don't await since we want to test that we can re-enter the Grain to
    // check the status of the job
    var processing = Task.Factory
        .StartNew(
            async () =>
            {
                var crawler = _cluster.GrainFactory.GetGrain<IProcessCrawlRequests>(request.Id);
                await crawler.Process(request);
            },
            TaskCreationOptions.LongRunning)
        .Unwrap();

    // allow the processing task to start working
    await Task.Delay(TimeSpan.FromSeconds(1));

    var processor = manager.AsReference<IProcessCrawlRequests>();
    status = await processor.GetStatus();

    // ~4 seconds; see Task.Delay in mock strategy above - 1 second
    await processing;

    Assert.Equal(CrawlJobStatus.Running, status);

    // assert that ICrawlerStrategy.ProcessRequestAsync runs on Orleans TaskScheduler
    Assert.True(ranOnOrleansScheduler);

    // assert that Dataflow blocks use ThreadPool threads;
    // escaping single-threaded execution model
    Assert.True(ranOnThreadPoolScheduler);
}