/// <summary>Downloads the sampled pages one by one, sleeping a random
/// delay between requests, and enqueues each downloaded file.</summary>
private void SlowSequentialDownload(Dictionary<int, Uri> htmlMap, Settings settings, BlockingCollection<string> queue)
{
    var random = new Random();
    var samples = this.Take(htmlMap, settings.Count);

    for (int i = 0; (i < samples.Count) && !this.cancellationToken.IsCancellationRequested; ++i)
    {
        var uri = samples[i];

        // Sleep for a random delay (in seconds), waking early on user cancellation
        var delay = random.Next(settings.MinDelay, settings.MaxDelay + 1);
        bool isCancelled = this.cancellationToken.WaitHandle.WaitOne(delay * 1000);
        if (isCancelled)
        {
            break;
        }

        var file = Path.Join(this.htmlDownloadPath, uri.LocalPath);
        Directory.CreateDirectory(Path.GetDirectoryName(file));
        if (UriDownload.Download(uri, file))
        {
            // Producer: hand the downloaded file to the consumer side
            queue.Add(file);
        }

        // Report progress against the actual number of samples, which can be
        // smaller than settings.Count when fewer pages are available
        this.reportProgress?.Invoke((i + 1) / (double)samples.Count);
    }
}
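// The method above is the producer half of a producer/consumer pipeline:
// downloaded file paths are fed into the BlockingCollection<string>.
// Below is a minimal consumer sketch, not the project's actual consumer;
// ScrapeFile is a hypothetical per-file processing step.
private void ConsumeDownloads(BlockingCollection<string> queue)
{
    // GetConsumingEnumerable blocks until items arrive, ends once the
    // producer calls queue.CompleteAdding(), and throws
    // OperationCanceledException if the token is cancelled.
    foreach (var file in queue.GetConsumingEnumerable(this.cancellationToken))
    {
        this.ScrapeFile(file); // hypothetical processing step
    }
}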
/// <summary>Downloads a sitemap file, parses it, and extracts the https links
/// to scrape and/or further sitemap files.</summary>
private HashSet<Uri>[] DownloadAndParse(Uri sitemap)
{
    // A sitemap file can be an index file or a sitemap file. An index file
    // contains URLs of other sitemap files, while a sitemap file contains the
    // site's page URLs. The logic below also accounts for hybrid/mixed files.
    var urls = new HashSet<Uri>[]
    {
        new HashSet<Uri>(), // URLs of further index files
        new HashSet<Uri>(), // Concrete page URLs
    };

    // File on disk to download the sitemap file to
    var fileOnDisk = Path.Join(this.rootPath, sitemap.LocalPath);
    Directory.CreateDirectory(Path.GetDirectoryName(fileOnDisk));
    try
    {
        if (!UriDownload.Download(sitemap, fileOnDisk))
        {
            // Nothing could be retrieved
            return urls;
        }

        // The sitemap file may be gzip-compressed
        if (GZip.IsGZip(fileOnDisk))
        {
            string tmp = fileOnDisk + "." + Guid.NewGuid().ToString();
            GZip.Decompress(fileOnDisk, tmp);
            File.Move(tmp, fileOnDisk, true);
        }

        this.Parse(fileOnDisk, urls[0], urls[1]);
    }
    finally
    {
        if (!this.saveSitemapFiles && File.Exists(fileOnDisk))
        {
            // No need to keep the sitemap file on disk after processing
            File.Delete(fileOnDisk);
        }
    }

    return urls;
}
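// The compression check above depends on the GZip helper. As a hedged
// sketch (the project's actual implementation may differ), detection can
// test for the gzip magic bytes 0x1F 0x8B and decompression can stream
// through System.IO.Compression.GZipStream.
// Requires: using System.IO; using System.IO.Compression;
internal static class GZipSketch
{
    public static bool IsGZip(string path)
    {
        using var stream = File.OpenRead(path);

        // Every gzip stream starts with the two magic bytes 0x1F 0x8B
        int b0 = stream.ReadByte();
        int b1 = stream.ReadByte();
        return b0 == 0x1F && b1 == 0x8B;
    }

    public static void Decompress(string source, string destination)
    {
        using var input = File.OpenRead(source);
        using var gzip = new GZipStream(input, CompressionMode.Decompress);
        using var output = File.Create(destination);
        gzip.CopyTo(output);
    }
}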