/// <summary>
/// Checks that <c>urlFilter.RemoveUnnecessary</c>, given a mix of root-relative links,
/// returns exactly the three links this test treats as new pages. The empty entry and
/// the "#description" variant are expected to be absent from the result.
/// </summary>
public void RelativePathsToNewPages()
{
    // Arrange
    var siteRoot = new Uri("http://www.example.com");
    var candidateLinks = new List<string>
    {
        "",
        "/books/5",
        "/books/5&page=5",
        "/books/5/page/10",
        "/books/5#description"
    };
    string[] expectedPaths =
    {
        "/books/5",
        "/books/5&page=5",
        "/books/5/page/10"
    };

    // Act
    var filtered = urlFilter.RemoveUnnecessary(candidateLinks, siteRoot);

    // Assert: same number of entries, and each entry matches positionally.
    Assert.Equal(expectedPaths.Length, filtered.Count);
    Assert.Collection(
        filtered,
        first => Assert.Equal(expectedPaths[0], first.PathAndQuery),
        second => Assert.Equal(expectedPaths[1], second.PathAndQuery),
        third => Assert.Equal(expectedPaths[2], third.PathAndQuery));
}
/// <summary>
/// Crawls a site starting at <paramref name="startUrl"/>, visiting pages in FIFO order.
/// For every page fetched, a <see cref="RecordItemDto"/> holding the request path and the
/// elapsed fetch time is appended to the returned record and also published to
/// <paramref name="blockingCollection"/>.
/// </summary>
/// <param name="startUrl">Absolute URL at which crawling begins.</param>
/// <param name="blockingCollection">
/// Receives each record item as soon as it is produced. It is marked as complete for
/// adding once the crawl loop ends, whether it finished, was cancelled, or threw.
/// </param>
/// <param name="cancellationToken">
/// Checked before each page is dequeued; when cancellation is requested the loop stops
/// and the items gathered so far are returned.
/// </param>
/// <returns>The record describing the crawl, including all visited pages.</returns>
/// <exception cref="UriFormatException">
/// <paramref name="startUrl"/> is not a well-formed absolute URI.
/// </exception>
public async Task<RecordDto> CrawlWebsiteAsync(string startUrl, BlockingCollection<RecordItemDto> blockingCollection, CancellationToken cancellationToken)
{
    if (!Uri.IsWellFormedUriString(startUrl, UriKind.Absolute))
    {
        throw new UriFormatException("Url in not valid");
    }

    var startUri = new Uri(startUrl, UriKind.Absolute);
    var stopwatch = new Stopwatch();
    var pagesToBeCalled = new Queue<Uri>();

    // Every URI that has ever been enqueued (visited or still pending). A single hash set
    // replaces the linear scans over a visited list and the pending queue.
    var knownUris = new HashSet<Uri> { startUri };
    pagesToBeCalled.Enqueue(startUri);

    // TODO: Extract to factory service
    var result = new RecordDto
    {
        RequestedUrl = startUrl,
        RecordCreated = DateTime.UtcNow,
        Items = new List<RecordItemDto>()
    };

    try
    {
        while (pagesToBeCalled.Count > 0 && !cancellationToken.IsCancellationRequested)
        {
            var currentUri = pagesToBeCalled.Dequeue();

            // The measured time covers both the request and reading the body.
            stopwatch.Restart();
            var requestResult = await client.GetAsync(currentUri.AbsoluteUri);
            var content = await requestResult.Content.ReadAsStringAsync();
            stopwatch.Stop();

            var recordItem = new RecordItemDto
            {
                RequestUrl = currentUri.AbsolutePath,
                RequestTime = stopwatch.Elapsed
            };
            result.Items.Add(recordItem);
            blockingCollection.Add(recordItem);

            // NOTE(review): presumably extracts link targets from anchor tags and then
            // drops links that should not be followed - confirm against the helpers.
            var parsedUrls = htmlParser.GetUrlsFromHtmlATag(content);
            var filteredPaths = urlFilter.RemoveUnnecessary(parsedUrls, currentUri);

            foreach (var parsedUri in filteredPaths)
            {
                // Add returns false when the URI is already known, so each page is
                // enqueued at most once.
                if (knownUris.Add(parsedUri))
                {
                    pagesToBeCalled.Enqueue(parsedUri);
                }
            }
        }
    }
    finally
    {
        // Always signal completion so a consumer blocked on the collection is released
        // even when a request or parsing step throws.
        blockingCollection.CompleteAdding();
    }

    return result;
}