Exemplo n.º 1
0
        /// <summary>
        /// Processes a single page:
        /// - maps the URI and registers it with the tracker; nothing further happens when
        ///   no trackable URI results (invalid, e.g. leaves the main domain, already
        ///   visited, or in progress)
        /// - downloads the page content
        /// - archives the content and processes the page's links, awaiting both together
        /// </summary>
        /// <param name="uri">Address of the page to process.</param>
        /// <returns>A task that completes once both archiving and link processing have finished.</returns>
        public async Task ProcessPage(Uri uri)
        {
            var mappedUri = UriMapper.MapUri(uri);
            var trackedUri = UriTracker.TrackUri(mappedUri);

            if (trackedUri == null)
            {
                // Tracker handed back nothing: the uri is invalid, already visited or in progress.
                Logger.LogDebug($"skipped uri: {uri}");
                return;
            }

            var content = await PageLoader.DownloadPage(trackedUri);

            // Both operations are started before either is awaited (archive first,
            // then link processing); the method finishes when both have finished.
            await Task.WhenAll(
                PageArchive.SavePage(trackedUri, content),
                ProcessLinks(trackedUri, content));
        }
Exemplo n.º 2
0
 /// <summary>
 /// Convenience overload: converts the link with <c>UriMapper.MapLink</c> and
 /// forwards the result to the <c>ProcessPage</c> overload that accepts it.
 /// </summary>
 /// <param name="link">Link of the page to process, as a string.</param>
 /// <returns>The task returned by the overload that handles the mapped link.</returns>
 public Task ProcessPage(string link)
 {
     var mappedLink = UriMapper.MapLink(link);
     return ProcessPage(mappedLink);
 }