Пример #1
0
        /// <summary>
        /// Downloads and parses a single page, queues the links found on it for
        /// later scanning, starts storing the page, and marks the page as scanned.
        /// </summary>
        /// <param name="currentUrl">Address of the page to download.</param>
        /// <param name="currentDepth">Depth of <paramref name="currentUrl"/>; queued links are recorded with this value plus one.</param>
        /// <param name="baseUrl">Passed to both the HTML parser and the URL filter.</param>
        /// <param name="urlsNotToScan">Links present as keys here are never queued.</param>
        /// <param name="urlsToScan">Receives each newly queued link together with its depth.</param>
        /// <param name="scannedUrls">Links present as keys here are never queued; <paramref name="currentUrl"/> is added at the end.</param>
        /// <param name="maxLinksOnPageCount">Upper bound on the number of links queued from this page.</param>
        private async Task ScanPageAsync(
            Uri currentUrl,
            int currentDepth,
            Uri baseUrl,
            ConcurrentDictionary<Uri, byte> urlsNotToScan,
            ConcurrentDictionary<Uri, int> urlsToScan,
            ConcurrentDictionary<Uri, byte> scannedUrls,
            int maxLinksOnPageCount)
        {
            var html = await _downloader.DownloadPageAsync(currentUrl);
            var document = HtmlParser.ParseHtml(html, baseUrl);

            var nextDepth = currentDepth + 1;

            // Only links that are neither already scanned nor excluded are queued,
            // and at most maxLinksOnPageCount of them.
            _urlFilter
                .Filter(document.Links, baseUrl)
                .Where(link => !scannedUrls.ContainsKey(link) && !urlsNotToScan.ContainsKey(link))
                .Take(maxLinksOnPageCount)
                .ForEach(link => urlsToScan.TryAdd(link, nextDepth));

            // Deliberately not awaited: the insert is started and the method moves on.
            // The discard makes that explicit to the compiler (no CS4014 warning).
            _ = _database.InsertAsync(new ScannedPage(currentUrl, document.Text));

            scannedUrls.TryAdd(currentUrl, default);
        }
Пример #2
0
        /// <summary>
        /// Loads every stored page and returns those whose URL is kept by the
        /// URL filter for the given domain.
        /// </summary>
        /// <param name="domainUrl">Domain address handed to the URL filter.</param>
        /// <returns>
        /// The stored pages whose <c>Url</c> is contained in the filter result,
        /// in the order they were returned by the database.
        /// </returns>
        public async Task<ScannedPage[]> SearchAsync(Uri domainUrl)
        {
            var pages = await _database.GetAllAsync();

            // Run the filter exactly once and keep the result in a set. Testing
            // membership against the raw filter result for every page costs a
            // linear scan per page and, if the filter is lazily evaluated,
            // re-runs the filter each time.
            // NOTE(review): the meaning of the third argument (true) is defined by
            // the filter implementation, which is not visible here.
            var matchingUrls = _urlFilter
                .Filter(pages.Select(page => page.Url), domainUrl, true)
                .ToHashSet();

            return pages
                .Where(page => matchingUrls.Contains(page.Url))
                .ToArray();
        }
        /// <summary>
        /// Checks that filtering relative and absolute URLs with mixed casing,
        /// optional "www." prefix and trailing slash against "monzo.com" yields
        /// six lower-case paths without host or trailing slash, in input order.
        /// </summary>
        public void SanitisesUrls()
        {
            // Arrange
            var rawUrls = new List<string>
            {
                "/first/",
                "/SECOND",
                "http://MoNZO.com/third",
                "https://MoNZO.com/fourth",
                "http://WwW.MoNZO.com/fifth",
                "https://WwW.MoNZO.com/sixth"
            };

            var expectedPaths = new[]
            {
                "/first",
                "/second",
                "/third",
                "/fourth",
                "/fifth",
                "/sixth"
            };

            // Act
            var sanitised = _urlFilter.Filter("monzo.com", rawUrls);

            // Assert: the count is verified first so the indexed checks below stay in range.
            sanitised.Count.ShouldBe(6);

            for (var index = 0; index < expectedPaths.Length; index++)
            {
                sanitised[index].ShouldBe(expectedPaths[index]);
            }
        }