Esempio n. 1
1
		/// <summary>
		/// Runs the crawl: starts a scrape task for <c>options.Uri</c>, then keeps starting scrape
		/// tasks for queued links until the queue is empty and no task is still in flight.
		/// </summary>
		/// <param name="progress">Progress sink passed unchanged to every scrape task.</param>
		/// <returns>A task that completes once no scrape tasks remain in flight.</returns>
		public async Task ScrapeAsync (IProgress <IProgressReportable> progress)
		{
			Uri host;
			var inFlight = new List <Task> ();

			lock (options)
			{
				host = new Uri (options.Uri);

				// Seed the crawl: the start address is recorded as handled and counted before its task starts.
				var seed = new LinkItem (options.Uri, null);
				finishedLinkItems.Add (seed);
				Interlocked.Increment (ref totalLinkItems);

				var seedTask = DoScrapeAsync (seed, options.HRefXPathExpression, options.ImageXPathExpression, host, progress);
				inFlight.Add (seedTask);
			}

			// The seed task guarantees at least one entry, so the body always runs at least once.
			do
			{
				// Wait until any one task finishes and drop it from the in-flight list to free a slot.
				// NOTE(review): the finished task is removed without being awaited, so an exception it
				// faulted with is not observed here — confirm that this best-effort behavior is intended.
				var finished = await Task.WhenAny (inFlight);
				inFlight.Remove (finished);

				lock (options)
				{
					// Top the in-flight list back up from the queue, bounded by MaxConcurrentOperations.
					while (queuedLinkItems.Any () && inFlight.Count < options.MaxConcurrentOperations)
					{
						LinkItem next;
						if (queuedLinkItems.TryDequeue (out next))
						{
							// Record the link in the finished bag right away, before its task is started.
							finishedLinkItems.Add (next);
							inFlight.Add (DoScrapeAsync (next, options.HRefXPathExpression, options.ImageXPathExpression, host, progress));
						}
					}
				}
			}
			while (inFlight.Count > 0);
		}
Esempio n. 2
0
		/// <summary>
		/// Downloads the page behind <paramref name="link"/>, extracts its link and image items,
		/// queues the extracted links for scraping and reports progress.
		/// </summary>
		/// <param name="link">Link whose <c>HRef</c> is downloaded.</param>
		/// <param name="hRefXPath">XPath expression used to select the link nodes.</param>
		/// <param name="imageXPath">XPath expression used to select the image nodes.</param>
		/// <param name="host">Host URI forwarded to the link and image item scrapers.</param>
		/// <param name="progress">Progress sink; when null, no report is made.</param>
		/// <returns>
		/// A <see cref="Page"/> built from the scraped link and image items, or <c>null</c> when a
		/// <see cref="WebException"/> is thrown while loading the page.
		/// </returns>
		protected async Task <Page> ScrapePageAsync (LinkItem link, string hRefXPath, string imageXPath, Uri host, IProgress <IProgressReportable> progress)
		{
			var document = new HtmlDocument ();
			try
			{
				using (var client = new WebClient ())
				{
					var address = new Uri (link.HRef);
					var markup = await client.DownloadStringTaskAsync (address);
					document.LoadHtml (markup);
				}
			}
			catch (WebException)
			{
				// A failed download yields no page; the caller treats null as "nothing to process".
				return null;
			}

			var root = document.DocumentNode;
			var scrapedLinks = ScrapeLinkItems (root.SelectNodes (hRefXPath), host);
			var scrapedImages = ScrapeImageItems (root.SelectNodes (imageXPath), host);

			QueueLinksForScraping (scrapedLinks);

			if (progress != null)
			{
				var report = new PageProgressReport (finishedLinkItems.Count, Interlocked.Read (ref totalLinkItems));
				progress.Report (report);
			}

			return new Page (scrapedLinks, scrapedImages);
		}
Esempio n. 3
0
		/// <summary>
		/// Scrapes a single link end to end: scrapes the page, then scrapes its images and saves them.
		/// </summary>
		/// <param name="link">Link to scrape.</param>
		/// <param name="hRefXPath">XPath expression forwarded to the page scraper for links.</param>
		/// <param name="imageXPath">XPath expression forwarded to the page scraper for images.</param>
		/// <param name="host">Host URI forwarded to the page and image scrapers.</param>
		/// <param name="progress">Progress sink forwarded to every step.</param>
		/// <returns>A task that completes when the images are saved, or right away when the page scrape returns null.</returns>
		protected async Task DoScrapeAsync (LinkItem link, string hRefXPath, string imageXPath, Uri host, IProgress <IProgressReportable> progress)
		{
			var scrapedPage = await ScrapePageAsync (link, hRefXPath, imageXPath, host, progress);
			if (scrapedPage == null)
			{
				// No page was produced, so there are no images to fetch.
				return;
			}

			var scrapedImages = await ScrapeImagesAsync (scrapedPage, host, progress);
			await SaveImagesAsync (scrapedImages, progress);
		}