Code Example #1
File: Scraper.cs Project: ParalaxRus/WebCrawler
        private void SlowSequentialDownload(Dictionary<int, Uri> htmlMap,
                                            Settings settings,
                                            BlockingCollection<string> queue)
        {
            var random = new Random();

            var samples = this.Take(htmlMap, settings.Count);

            for (int i = 0; (i < samples.Count) && !this.cancellationToken.IsCancellationRequested; ++i)
            {
                var uri = samples[i];

                // Sleep for the randomly chosen delay (in seconds) or until the user cancels;
                // WaitOne() returns true if the cancellation token was signalled first
                var  delay       = random.Next(settings.MinDelay, settings.MaxDelay + 1);
                bool isCancelled = this.cancellationToken.WaitHandle.WaitOne(delay * 1000);
                if (isCancelled)
                {
                    break;
                }

                var file = Path.Join(this.htmlDownloadPath, uri.LocalPath);
                Directory.CreateDirectory(Path.GetDirectoryName(file));

                if (UriDownload.Download(uri, file))
                {
                    // Producer side: hand the downloaded file to the consumer
                    queue.Add(file);
                }

                this.reportProgress?.Invoke((i + 1) / (double)settings.Count);
            }
        }
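The queue.Add call above is the producer half of a producer/consumer pipeline built on BlockingCollection<string>. The consumer side is not part of this example; below is a minimal sketch of what it might look like, assuming System.Collections.Concurrent is imported, that the producer calls CompleteAdding() when it is finished, and a hypothetical ProcessFile handler:

        // Consumer sketch (not from the original project): GetConsumingEnumerable()
        // blocks while the queue is empty and completes once the producer has called
        // CompleteAdding() and the remaining items have been taken
        private void ConsumeDownloads(BlockingCollection<string> queue)
        {
            foreach (var file in queue.GetConsumingEnumerable())
            {
                this.ProcessFile(file); // hypothetical per-file handler
            }
        }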
Code Example #2
        /// <summary>Downloads a sitemap file, parses it, and extracts the https links
        /// to scrape and/or further sitemap files.</summary>
        private HashSet<Uri>[] DownloadAndParse(Uri sitemap)
        {
            // A sitemap file can be either an index file or a regular sitemap file: an index file
            // contains uris of further sitemap files, while a regular sitemap file contains the
            // site's structure urls. The logic below also accounts for hybrid/mixed files.

            var urls = new HashSet<Uri>[]
            {
                new HashSet<Uri>(), // Urls to more index files
                new HashSet<Uri>(), // Concrete urls
            };

            // File on disk to download sitemap file to
            var fileOnDisk = Path.Join(this.rootPath, sitemap.LocalPath);

            Directory.CreateDirectory(Path.GetDirectoryName(fileOnDisk));

            try
            {
                if (!UriDownload.Download(sitemap, fileOnDisk))
                {
                    // Nothing could be retrieved
                    return urls;
                }

                // The sitemap file may be gzip-compressed (detectable by the 0x1F 0x8B magic bytes)
                if (GZip.IsGZip(fileOnDisk))
                {
                    string tmp = fileOnDisk + "." + Guid.NewGuid().ToString();

                    GZip.Decompress(fileOnDisk, tmp);
                    File.Move(tmp, fileOnDisk, true);
                }

                this.Parse(fileOnDisk, urls[0], urls[1]);
            }
            finally
            {
                if (!this.saveSitemapFiles && File.Exists(fileOnDisk))
                {
                    // No need to keep sitemap file on disk after processing
                    File.Delete(fileOnDisk);
                }
            }

            return urls;
        }
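DownloadAndParse returns two sets: urls[0] holds links to further sitemap index files and urls[1] holds concrete page urls, so a caller has to keep following index links until none remain. Below is a minimal breadth-first driver sketch under those assumptions; the method name and the visited-set bookkeeping are not from the original project:

        // Driver sketch: walks the sitemap tree breadth-first, following index files
        // and collecting concrete page urls. Names and bookkeeping are assumptions.
        private HashSet<Uri> CollectSiteUrls(Uri rootSitemap)
        {
            var pageUrls = new HashSet<Uri>();
            var visited  = new HashSet<Uri>();
            var pending  = new Queue<Uri>();
            pending.Enqueue(rootSitemap);

            while (pending.Count > 0)
            {
                var sitemap = pending.Dequeue();
                if (!visited.Add(sitemap))
                {
                    continue; // this index file has already been processed
                }

                var urls = this.DownloadAndParse(sitemap);

                foreach (var index in urls[0]) // further index files to follow
                {
                    pending.Enqueue(index);
                }
                pageUrls.UnionWith(urls[1]); // concrete page urls
            }

            return pageUrls;
        }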