public async Task ScrapeAsync (IProgress<IProgressReportable> progress)
{
    Uri host;
    var runningTasks = new List<Task> ();

    lock (options) {
        host = new Uri (options.Uri);
        var start = new LinkItem (options.Uri, null); // Provide a starting scrape task

        finishedLinkItems.Add (start);
        Interlocked.Increment (ref totalLinkItems);
        runningTasks.Add (DoScrapeAsync (start, options.HRefXPathExpression, options.ImageXPathExpression, host, progress));
    }

    // Use a do-while loop so that the loop executes at least once
    do {
        runningTasks.Remove (await Task.WhenAny (runningTasks));

        lock (options) {
            while (queuedLinkItems.Any () && (runningTasks.Count < options.MaxConcurrentOperations)) {
                LinkItem link;
                if (!queuedLinkItems.TryDequeue (out link))
                    continue;

                finishedLinkItems.Add (link); // Add the dequeued link to the finished item bag as soon as possible
                runningTasks.Add (DoScrapeAsync (link, options.HRefXPathExpression, options.ImageXPathExpression, host, progress));
            }
        }
    } while (runningTasks.Any ());
}
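ScrapeAsync leans on several members that are not shown in this listing. Below is a minimal sketch of plausible declarations, inferred from usage: TryDequeue points to a ConcurrentQueue, Interlocked.Read requires a long field, and the "finished item bag" comment suggests a ConcurrentBag. The ScraperOptions type name is an assumption.

using System.Collections.Concurrent;

// Hypothetical backing fields, inferred from how ScrapeAsync and ScrapePageAsync use them
readonly ScraperOptions options;   // assumed type: carries Uri, the XPath expressions, and MaxConcurrentOperations
readonly ConcurrentQueue<LinkItem> queuedLinkItems = new ConcurrentQueue<LinkItem> ();
readonly ConcurrentBag<LinkItem> finishedLinkItems = new ConcurrentBag<LinkItem> ();
long totalLinkItems;               // long, so Interlocked.Increment and Interlocked.Read can be used on it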
protected async Task<Page> ScrapePageAsync (LinkItem link, string hRefXPath, string imageXPath, Uri host, IProgress<IProgressReportable> progress)
{
    var document = new HtmlDocument ();

    // Download the page; treat any network failure as an unscrapeable page
    try {
        using (var client = new WebClient ())
            document.LoadHtml (await client.DownloadStringTaskAsync (new Uri (link.HRef)));
    } catch (WebException) {
        return null;
    }

    // Extract links and images via the configured XPath expressions
    var linkItems = ScrapeLinkItems (document.DocumentNode.SelectNodes (hRefXPath), host);
    var imageItems = ScrapeImageItems (document.DocumentNode.SelectNodes (imageXPath), host);

    QueueLinksForScraping (linkItems);
    progress?.Report (new PageProgressReport (finishedLinkItems.Count, Interlocked.Read (ref totalLinkItems)));

    return new Page (linkItems, imageItems);
}
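WebClient works here, but it is marked obsolete on modern .NET; on newer runtimes the download could be swapped for HttpClient. A sketch of the equivalent try block, assuming the rest of the method is unchanged (in practice, a single HttpClient instance shared across the scraper avoids socket exhaustion):

// Same download expressed with HttpClient (System.Net.Http); failures surface as HttpRequestException
try {
    using (var client = new HttpClient ())
        document.LoadHtml (await client.GetStringAsync (new Uri (link.HRef)));
} catch (HttpRequestException) {
    return null;
}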
protected async Task DoScrapeAsync (LinkItem link, string hRefXPath, string imageXPath, Uri host, IProgress<IProgressReportable> progress)
{
    var page = await ScrapePageAsync (link, hRefXPath, imageXPath, host, progress);

    // A null page means the download failed; there is nothing to save
    if (page == null)
        return;

    await SaveImagesAsync (await ScrapeImagesAsync (page, host, progress), progress);
}
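A caller might drive the scraper as in the sketch below; the scraper variable and the console handler are illustrative only. Note that Progress<T> posts each report to the SynchronizationContext captured at construction, which is what lets ScrapePageAsync call progress?.Report safely from worker tasks.

// Hypothetical caller: print each progress report as the scrape runs
var progress = new Progress<IProgressReportable> (report => Console.WriteLine (report));
await scraper.ScrapeAsync (progress);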