/// <summary>
/// Handles a single page of the crawl:
/// - does nothing if the URI was already visited or is currently in progress
/// - does nothing if the URI is invalid (i.e. leaves the main domain)
/// - archives the downloaded contents
/// - recursively processes the links of the page
/// </summary>
/// <param name="uri">The URI of the page to process; it is mapped and tracked before any download happens.</param>
/// <returns>A task that completes once both saving the page and processing its links have completed.</returns>
public async Task ProcessPage(Uri uri)
{
    var mappedUri = UriMapper.MapUri(uri);
    var trackedUri = UriTracker.TrackUri(mappedUri);

    if (trackedUri == null)
    {
        // The tracker rejected the uri: invalid, already visited or in progress.
        Logger.LogDebug($"skipped uri: {uri}");
    }
    else
    {
        // Download first; both follow-up steps need the content.
        var content = await PageLoader.DownloadPage(trackedUri);

        // Start archiving and link processing without awaiting either one yet,
        // so that the two can overlap.
        var archiving = PageArchive.SavePage(trackedUri, content);
        var linkProcessing = ProcessLinks(trackedUri, content);

        // Complete only when both operations are done.
        await Task.WhenAll(archiving, linkProcessing);
    }
}
/// <summary>
/// Convenience overload: maps <paramref name="link"/> via <c>UriMapper.MapLink</c>
/// and forwards the result to another <c>ProcessPage</c> overload.
/// </summary>
/// <param name="link">The link to process, in textual form.</param>
/// <returns>The task returned by the <c>ProcessPage</c> overload that receives the mapped link.</returns>
public Task ProcessPage(string link)
{
    // NOTE(review): presumably MapLink yields a Uri, so this resolves to ProcessPage(Uri) - confirm.
    var mappedLink = UriMapper.MapLink(link);
    return ProcessPage(mappedLink);
}