private async Task ScanPageAsync(
    Uri currentUrl,
    int currentDepth,
    Uri baseUrl,
    ConcurrentDictionary<Uri, byte> urlsNotToScan,
    ConcurrentDictionary<Uri, int> urlsToScan,
    ConcurrentDictionary<Uri, byte> scannedUrls,
    int maxLinksOnPageCount)
{
    var page = await _downloader.DownloadPageAsync(currentUrl);
    var parsedHtml = HtmlParser.ParseHtml(page, baseUrl);
    var filteredLinks = _urlFilter.Filter(parsedHtml.Links, baseUrl);

    // Queue every link we haven't already scanned or excluded, capped at
    // maxLinksOnPageCount per page, recorded one level deeper than this page.
    filteredLinks
        .Where(url => !scannedUrls.ContainsKey(url))
        .Where(url => !urlsNotToScan.ContainsKey(url))
        .Take(maxLinksOnPageCount)
        .ForEach(uri => urlsToScan.TryAdd(uri, currentDepth + 1));

    // Fire-and-forget: persist the page without awaiting, so scanning isn't
    // blocked on the database write. The pragma suppresses the CS4014
    // "call is not awaited" warning.
#pragma warning disable 4014
    _database.InsertAsync(new ScannedPage(currentUrl, parsedHtml.Text));
#pragma warning restore 4014

    scannedUrls.TryAdd(currentUrl, default);
}
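// Note: standard LINQ has no ForEach over IEnumerable<T> (only List<T>.ForEach),
// so the chain above assumes an extension method, e.g. from MoreLINQ. A minimal
// sketch of such an extension, if you'd rather not take the dependency:

using System;
using System.Collections.Generic;

public static class EnumerableExtensions
{
    // Applies the given action to each element of the sequence in order.
    public static void ForEach<T>(this IEnumerable<T> source, Action<T> action)
    {
        foreach (var item in source)
        {
            action(item);
        }
    }
}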
public async Task<ScannedPage[]> SearchAsync(Uri domainUrl)
{
    var pages = await _database.GetAllAsync();
    var urls = pages.Select(page => page.Url);
    var filteredUrls = _urlFilter.Filter(urls, domainUrl, true);

    return pages
        .Where(page => filteredUrls.Contains(page.Url))
        .ToArray();
}
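// One design note: filteredUrls.Contains(page.Url) scans the filtered
// collection once per page, so the final Where is O(pages x urls). If the
// index grows large, one option (a sketch, not part of the original) is to
// materialise the filter result into a HashSet<Uri> so each lookup is O(1):

public async Task<ScannedPage[]> SearchAsync(Uri domainUrl)
{
    var pages = await _database.GetAllAsync();
    var urls = pages.Select(page => page.Url);

    // HashSet<Uri> gives constant-time Contains; Uri implements value equality.
    var filteredUrls = new HashSet<Uri>(_urlFilter.Filter(urls, domainUrl, true));

    return pages
        .Where(page => filteredUrls.Contains(page.Url))
        .ToArray();
}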
public void SanitisesUrls()
{
    List<string> inputUrls = new List<string>
    {
        "/first/",
        "/SECOND",
        "http://MoNZO.com/third",
        "https://MoNZO.com/fourth",
        "http://WwW.MoNZO.com/fifth",
        "https://WwW.MoNZO.com/sixth"
    };

    var filteredUrls = _urlFilter.Filter("monzo.com", inputUrls);

    // Every URL should come back lower-cased, relative to the domain,
    // and without a trailing slash.
    filteredUrls.Count.ShouldBe(6);
    filteredUrls[0].ShouldBe("/first");
    filteredUrls[1].ShouldBe("/second");
    filteredUrls[2].ShouldBe("/third");
    filteredUrls[3].ShouldBe("/fourth");
    filteredUrls[4].ShouldBe("/fifth");
    filteredUrls[5].ShouldBe("/sixth");
}
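// For illustration only: a hypothetical implementation consistent with the
// expectations the test pins down (lower-case, strip scheme/www/domain, trim
// the trailing slash). The real UrlFilter may well differ; this is just a
// sketch of the contract, with UrlFilterSketch being an invented name.

using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;

public class UrlFilterSketch
{
    public List<string> Filter(string domain, List<string> urls)
    {
        // Matches an optional http(s)://, an optional www., then the domain,
        // all case-insensitively, at the start of the URL.
        var prefix = new Regex($"^https?://(www\\.)?{Regex.Escape(domain)}",
            RegexOptions.IgnoreCase);

        return urls
            .Select(url => prefix.Replace(url, string.Empty))
            .Select(url => url.TrimEnd('/').ToLowerInvariant())
            .ToList();
    }
}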