public async Task ShouldObserveMaxDepth()
        {
            var totalHyperlinkCount = 1; // initial url

            HttpContent getRandomHtml(HttpRequestMessage message)
            {
                var linkCount = new Random().Next(10, 25);

                Interlocked.Add(ref totalHyperlinkCount, linkCount);

                return(_html.GetHtmlWithHyperlinks(linkCount));
            }

            var messageHandler = new MockHttpMessageHandler();
            var httpClient     = messageHandler.ToHttpClient();

            messageHandler.When("http://host.com/*")
            .Respond(m => getRandomHtml(m));

            var settings = new HyperlinkCrawlerSettings();
            var request  = new HyperlinkCrawlRequest("http://host.com/page")
            {
                MaxDepth = 5
            };
            var hcs = new HyperlinkCrawlerStrategy(settings, httpClient, _logger, request);
            await hcs.ProcessRequestAsync(request, CancellationToken.None);

            await hcs.Completion;

            // 6 == MaxDepth + 1: the initial URI plus one level per depth
            Assert.Equal(
                6,
                hcs.State.ProcessedUriCount + hcs.State.FailedUriCount);
        }
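
The _html fixture used throughout these tests is never shown in these snippets. A minimal sketch, assuming it simply renders one globally unique absolute anchor per requested link (the method body below is hypothetical; uses System, System.Linq, System.Net.Http, System.Text):

        public HttpContent GetHtmlWithHyperlinks(int linkCount)
        {
            // hypothetical: one globally unique link per count, so the crawler
            // discovers linkCount distinct URIs from this page
            var anchors = string.Join(
                Environment.NewLine,
                Enumerable.Range(0, linkCount).Select(i =>
                    $"<a href=\"http://host.com/{Guid.NewGuid():N}\">link {i}</a>"));

            return new StringContent(
                $"<html><body>{anchors}</body></html>",
                Encoding.UTF8,
                "text/html");
        }
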
Example #2
        public async Task <IActionResult> Post(HyperlinkCrawlRequest request)
        {
            if (request.Uri == null || !request.Uri.IsAbsoluteUri)
            {
                return(UnprocessableEntity());
            }

            var manager = _client.GetGrain <IManageCrawlRequests>(request.Id);
            var status  = await manager.Enqueue(request);

            if (status == CrawlJobStatus.Pending)
            {
                return(Accepted(Url.RouteUrl("GetResult", new { id = request.Id })));
            }

            if (status == CrawlJobStatus.Running || status == CrawlJobStatus.Completed)
            {
                return(Created(Url.RouteUrl("GetResult", new { id = request.Id }), status));
            }

            // a job is already running and HyperlinkCrawlRequest.ForceNew was not specified
            if (status == CrawlJobStatus.Conflict)
            {
                return(Conflict());
            }

            return(BadRequest());
        }
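
A hedged usage sketch of how a caller might drive this action and then follow the "GetResult" route it returns; the /api/crawl route and the JSON body shape are assumptions, not taken from the source (uses System, System.Net, System.Net.Http, System.Text, System.Threading.Tasks):

        public async Task EnqueueAndPoll(HttpClient client)
        {
            var response = await client.PostAsync(
                "/api/crawl", // hypothetical route for the Post action above
                new StringContent(
                    "{ \"uri\": \"http://host.com/page\" }",
                    Encoding.UTF8,
                    "application/json"));

            if (response.StatusCode == HttpStatusCode.Accepted ||
                response.StatusCode == HttpStatusCode.Created)
            {
                // both Accepted and Created carry the "GetResult" route in the
                // Location header, so the job's result can be polled from there
                var result = await client.GetAsync(response.Headers.Location);

                Console.WriteLine(await result.Content.ReadAsStringAsync());
            }
        }
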
        public async Task ShouldProcessAllLinks()
        {
            var totalHyperlinkCount             = 1; // initial url
            var deterministicHyperlinkCountdown = 11;

            HttpContent getRandomHtml(HttpRequestMessage _)
            {
                var linkCount = Interlocked.Decrement(ref deterministicHyperlinkCountdown);

                if (linkCount < 0)
                {
                    linkCount = 0;
                }

                Interlocked.Add(ref totalHyperlinkCount, linkCount);

                return(_html.GetHtmlWithHyperlinks(linkCount));
            }

            var messageHandler = new MockHttpMessageHandler();

            messageHandler.When("http://host.com/*")
            .Respond(m => getRandomHtml(m));

            var httpClient = messageHandler.ToHttpClient();

            // determines the TimeSpan that auto-completion detection awaits, allowing
            // for any additional heartbeats that signal there are still URIs being
            // processed. In a real (non-unit-test) crawler scenario, this should be
            // set to a value that allows for processing URIs over an internet
            // connection, i.e. one that accounts for latency in DNS resolution and
            // server response times
            httpClient.Timeout = TimeSpan.FromMilliseconds(300);

            var settings = new HyperlinkCrawlerSettings();
            var request  = new HyperlinkCrawlRequest("http://host.com/page");
            var hcs      = new HyperlinkCrawlerStrategy(settings, httpClient, _logger, request);

            // act
            await hcs.ProcessRequestAsync(request, CancellationToken.None);

            await hcs.Completion;

            // The deterministicHyperlinkCountdown lets us pre-calculate the total
            // number of hyperlinks that will be "found": it is atomically decremented
            // by 1 on each request until it reaches 0, so successive pages yield
            // 10, 9, 8, ... 1 links. Summing that series and adding the initial URI:
            // (10 + 9 + 8 + 7 + 6 + 5 + 4 + 3 + 2 + 1) + 1 = 56
            Assert.Equal(56, totalHyperlinkCount);

            Assert.Equal(
                totalHyperlinkCount,
                hcs.State.ProcessedUriCount + hcs.State.FailedUriCount);
        }
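
One way to read the httpClient.Timeout comment above: auto-completion detection treats the window as an idle timeout. A minimal sketch of that idea, assuming a heartbeat is signalled whenever a URI is still being processed (the IdleCompletionDetector type is hypothetical, not the library's actual implementation; uses System, System.Threading, System.Threading.Tasks.Dataflow):

        public sealed class IdleCompletionDetector : IDisposable
        {
            private readonly Timer    _timer;
            private readonly TimeSpan _window;

            public IdleCompletionDetector(TimeSpan window, IDataflowBlock head)
            {
                _window = window;

                // if the timer ever fires, no heartbeat arrived for a full
                // window, so the pipeline is told to complete and drain
                _timer = new Timer(_ => head.Complete(), null, window, Timeout.InfiniteTimeSpan);
            }

            // called whenever a URI is dequeued or a response arrives
            public void Heartbeat() => _timer.Change(_window, Timeout.InfiniteTimeSpan);

            public void Dispose() => _timer.Dispose();
        }
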
        public async Task ShouldObserveCompletionWindow()
        {
            var totalHyperlinkCount = 1;

            async Task <HttpResponseMessage> getRandomHtml(HttpRequestMessage message)
            {
                await Task.Delay(100 * new Random().Next(1, 3));

                var linkCount = new Random().Next(3, 5);

                Interlocked.Add(ref totalHyperlinkCount, linkCount);

                return(new HttpResponseMessage(HttpStatusCode.OK)
                {
                    Content = _html.GetHtmlWithHyperlinks(linkCount)
                });
            }

            var messageHandler = new MockHttpMessageHandler();
            var httpClient     = messageHandler.ToHttpClient();

            messageHandler.When("http://host.com/*")
            .Respond(async m => await getRandomHtml(m));

            var settings = new HyperlinkCrawlerSettings();
            var request  = new HyperlinkCrawlRequest("http://host.com/page")
            {
                CompletionWindow = TimeSpan.FromSeconds(3)
            };
            var hcs = new HyperlinkCrawlerStrategy(settings, httpClient, _logger, request);

            // Add on some additional time for ProcessRequestAsync to initialize the state
            // and dataflow. Crawler.cs does the same.
            var cts = new CancellationTokenSource();

            cts.CancelAfter(request.CompletionWindow.Add(TimeSpan.FromMilliseconds(200)));

            await hcs.ProcessRequestAsync(request, cts.Token);

            await Task.Delay(request.CompletionWindow.Add(TimeSpan.FromSeconds(1)));

            Assert.True(hcs.State.ProcessedUriCount < totalHyperlinkCount);

            Assert.Equal(CrawlJobStatus.Completed, hcs.State.Status);
        }
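
A plausible sketch (not the library's actual code) of how a strategy could honor CompletionWindow: link an internal CancellationTokenSource to the caller's token, arm it with the window, and stop accepting new URIs when either fires (uses System, System.Threading, System.Threading.Tasks.Dataflow):

        public static CancellationTokenSource ApplyCompletionWindow(
            TimeSpan completionWindow,
            ITargetBlock <Uri> head,
            CancellationToken external)
        {
            // combine the caller's token with the window timeout
            var cts = CancellationTokenSource.CreateLinkedTokenSource(external);

            cts.CancelAfter(completionWindow);

            // when either fires, stop accepting new URIs; in-flight work drains
            cts.Token.Register(() => head.Complete());

            return cts;
        }
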
        public async Task ShouldStopProcessingViaCancellation()
        {
            var totalHyperlinkCount = 1;

            async Task <HttpResponseMessage> getRandomHtml(HttpRequestMessage message)
            {
                await Task.Delay(100 * new Random().Next(1, 5));

                var linkCount = new Random().Next(1, 5);

                Interlocked.Add(ref totalHyperlinkCount, linkCount);

                return(new HttpResponseMessage(HttpStatusCode.OK)
                {
                    Content = _html.GetHtmlWithHyperlinks(linkCount)
                });
            }

            var messageHandler = new MockHttpMessageHandler();
            var httpClient     = messageHandler.ToHttpClient();

            messageHandler.When("http://host.com/*")
            .Respond(async m => await getRandomHtml(m));

            var settings = new HyperlinkCrawlerSettings();
            var request  = new HyperlinkCrawlRequest("http://host.com/page");
            var hcs      = new HyperlinkCrawlerStrategy(settings, httpClient, _logger, request);
            var cts      = new CancellationTokenSource();
            await hcs.ProcessRequestAsync(request, cts.Token);

            await Task.Delay(TimeSpan.FromSeconds(1));

            Assert.True(hcs.State.ProcessedUriCount > 1);

            cts.Cancel();
            await hcs.Completion;

            Assert.True(hcs.State.ProcessedUriCount < totalHyperlinkCount);

            Assert.Equal(CrawlJobStatus.Cancelled, hcs.State.Status);
        }
Example #6
        public async Task ShouldExecuteEnqueuedCrawlRequest()
        {
            var request = new HyperlinkCrawlRequest(_crawlUri)
            {
                MaxDepth = 5
            };
            var hangfire = HangfireTestUtil.SetupHangfire();

            using (var _ = hangfire.StartServer())
            {
                hangfire.BackgroundJobClient.Enqueue <CrawlJobPerformer>(p =>
                                                                         p.Perform(request));

                await Task.Delay(TimeSpan.FromSeconds(2));
            }

            hangfire.Strategy.Verify(s => s.ProcessRequestAsync(
                                         It.IsAny <CrawlRequestBase>(),
                                         It.IsAny <CancellationToken>()),
                                     Times.Once);
        }
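
A hedged sketch of what CrawlJobPerformer.Perform might do, assuming it resolves an ICrawlerStrategy from the factory and blocks until the crawl finishes, since Hangfire invokes the job method synchronously (the constructor injection and the blocking call are assumptions; uses System.Threading):

        public class CrawlJobPerformer
        {
            private readonly ICrawlerStrategyFactory _factory;

            public CrawlJobPerformer(ICrawlerStrategyFactory factory) =>
                _factory = factory;

            public void Perform(CrawlRequestBase request)
            {
                var strategy = _factory.Create(request);

                // Hangfire's worker calls Perform synchronously, so block on the
                // async crawl rather than returning a dangling Task
                strategy.ProcessRequestAsync(request, CancellationToken.None)
                        .GetAwaiter()
                        .GetResult();
            }
        }
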
Example #7
        public void ShouldExecuteCrawlRequest()
        {
            var request = new HyperlinkCrawlRequest(_crawlUri)
            {
                MaxDepth = 5
            };
            var hangfire = HangfireTestUtil.SetupHangfire();
            var job      = Job.FromExpression((CrawlJobPerformer p) =>
                                              p.Perform(request));
            var backgroundJob = new BackgroundJob(
                Guid.NewGuid().ToString("N"),
                job,
                DateTime.UtcNow);

            hangfire.Perform(backgroundJob);

            hangfire.Strategy.Verify(s => s.ProcessRequestAsync(
                                         It.IsAny <CrawlRequestBase>(),
                                         It.IsAny <CancellationToken>()),
                                     Times.Once);
        }
        public async Task ShouldProcessAllLinksIgnoringDuplicates()
        {
            HttpContent getRandomHtml(HttpRequestMessage _)
            {
                return(_html.GetHtmlWithDuplicateHyperlinks(6));
            }

            var messageHandler = new MockHttpMessageHandler();

            messageHandler.When("http://host.com/*")
            .Respond(m => getRandomHtml(m));

            var httpClient = messageHandler.ToHttpClient();

            // determines the TimeSpan that auto-completion detection awaits, allowing
            // for any additional heartbeats that signal there are still URIs being
            // processed. In a real (non-unit-test) crawler scenario, this should be
            // set to a value that allows for processing URIs over an internet
            // connection, i.e. one that accounts for latency in DNS resolution and
            // server response times
            httpClient.Timeout = TimeSpan.FromMilliseconds(300);

            var settings = new HyperlinkCrawlerSettings();
            var request  = new HyperlinkCrawlRequest("http://host.com/page");
            var hcs      = new HyperlinkCrawlerStrategy(settings, httpClient, _logger, request);

            // act
            await hcs.ProcessRequestAsync(request, CancellationToken.None);

            await hcs.Completion;

            // should only discover 1 distinct URI since all 6 links are identical
            Assert.Equal(1, hcs.State.DiscoveredUriCount);

            // should process 2 URIs; the original and the 1 discovered
            Assert.Equal(2, hcs.State.ProcessedUriCount + hcs.State.FailedUriCount);
        }
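
A minimal sketch of the duplicate filtering this test exercises, assuming the strategy tracks seen URIs in a concurrent set keyed on the absolute URI string (the SeenUriFilter type is hypothetical; uses System, System.Collections.Concurrent):

        public class SeenUriFilter
        {
            private readonly ConcurrentDictionary <string, byte> _seen =
                new ConcurrentDictionary <string, byte>();

            // returns true only the first time a URI is offered, so the six
            // identical links above collapse into a single discovered URI
            public bool TryAdd(Uri uri) => _seen.TryAdd(uri.AbsoluteUri, 0);
        }
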
Example #9
        public async Task ShouldBeAbleToCheckStatus()
        {
            bool onOrleansTaskScheduler    = false,
                 onThreadPoolTaskScheduler = false;

            // setup mocks for DI
            ConfigureContainer = builder =>
            {
                var strategyMock = new Mock <ICrawlerStrategy>();
                // the saveProgress parameter is matched in the Setup so that Moq
                // can bind the three-parameter Returns callback below
                strategyMock.Setup(strategy => strategy.ProcessRequestAsync(
                                       It.IsAny <CrawlRequestBase>(),
                                       It.IsAny <Func <CrawlStateSnapshot, Task> >(),
                                       It.IsAny <CancellationToken>()))
                .Returns(async(CrawlRequestBase req, Func <CrawlStateSnapshot, Task> saveProgress, CancellationToken token) =>
                {
                    // mock that we're now processing
                    await saveProgress(new HyperlinkCrawlStateSnapshot(CrawlJobStatus.Running));

                    // record that we're on the Orleans TaskScheduler (not the default)
                    onOrleansTaskScheduler = TaskScheduler.Default != TaskScheduler.Current;

                    // simulates Dataflow blocks by running on ThreadPool threads
                    // Task.Run automatically uses the ThreadPool TaskScheduler
                    await Task.Run(async() =>
                    {
                        onThreadPoolTaskScheduler = TaskScheduler.Default == TaskScheduler.Current;
                        await Task.Delay(TimeSpan.FromSeconds(5));
                    });
                });

                var strategyFactoryMock = new Mock <ICrawlerStrategyFactory>();
                strategyFactoryMock.Setup(factory => factory.Create(It.IsAny <CrawlRequestBase>()))
                .Returns(strategyMock.Object);

                builder.RegisterInstance(strategyMock.Object)
                .As <ICrawlerStrategy>();

                builder.RegisterInstance(strategyFactoryMock.Object)
                .As <ICrawlerStrategyFactory>();
            };

            var clusterBuilder = new TestClusterBuilder(1);

            clusterBuilder.AddClientBuilderConfigurator <TestSiloBuilder>();
            clusterBuilder.AddSiloBuilderConfigurator <TestSiloBuilder>();
            _cluster = clusterBuilder.Build();
            _cluster.Deploy();

            var request = new HyperlinkCrawlRequest("http://host.com/");

            // Enqueue request from client
            var manager = _cluster.Client.GetGrain <IManageCrawlRequests>(request.Id);
            var status  = await manager.Enqueue(request);

            Assert.Equal(CrawlJobStatus.Pending, status);

            // simulate a Silo/Hangfire worker thread (i.e. a non-ThreadPool thread);
            // don't await, since we want to test that we can re-enter the Grain to
            // check the status of the job while it is still processing
            var processing = Task.Factory.StartNew(async() =>
            {
                var crawler = _cluster.GrainFactory.GetGrain <IProcessCrawlRequests>(request.Id);
                await crawler.Process(request);
            },
                                                   TaskCreationOptions.LongRunning).Unwrap();

            // allow the processing task to start working
            await Task.Delay(TimeSpan.FromSeconds(1));

            status = await manager.AsReference <IProcessCrawlRequests>().GetStatus();

            await processing; // ~4 seconds: the mock strategy's 5-second Task.Delay minus the 1 second already elapsed

            Assert.Equal(CrawlJobStatus.Running, status);

            // assert that ICrawlerStrategy.ProcessRequestAsync runs on Orleans TaskScheduler
            Assert.True(onOrleansTaskScheduler);

            // assert that Dataflow blocks use ThreadPool threads;
            // escaping single-threaded execution model
            Assert.True(onThreadPoolTaskScheduler);
        }