// by Noseratio - http://stackoverflow.com/a/22262976/1768303

// main logic
/// <summary>
/// Navigates to each URL in <paramref name="urls"/> sequentially, using a single
/// WebBrowser created inside a <c>MessageLoopApartment</c>, and writes each URL
/// and the string returned by the navigation to the console.
/// </summary>
/// <param name="urls">The URLs to visit, in order.</param>
/// <param name="token">
/// Cancellation token; each navigation is additionally cancelled after 30 seconds.
/// </param>
/// <returns>A task that completes once all URLs have been processed.</returns>
static async Task ScrapSitesAsync(string[] urls, CancellationToken token)
{
    using (var apartment = new MessageLoopApartment())
    {
        // create WebBrowser inside MessageLoopApartment
        var webBrowser = apartment.Invoke(() => new WebBrowser());
        try
        {
            foreach (var url in urls)
            {
                Console.WriteLine("URL:\n" + url);

                // cancel in 30s or when the main token is signalled;
                // the linked source is disposed per iteration so its timer and its
                // registration on the parent token do not accumulate across URLs
                using (var navigationCts = CancellationTokenSource.CreateLinkedTokenSource(token))
                {
                    navigationCts.CancelAfter(TimeSpan.FromSeconds(30));
                    var navigationToken = navigationCts.Token;

                    // run the navigation task inside MessageLoopApartment
                    string html = await apartment.Run(
                        () => webBrowser.NavigateAsync(url, navigationToken),
                        navigationToken);

                    Console.WriteLine("HTML:\n" + html);
                }
            }
        }
        finally
        {
            // dispose of WebBrowser inside MessageLoopApartment
            apartment.Invoke(() => webBrowser.Dispose());
        }
    }
}
// Navigate to a site and get a snapshot of its DOM HTML
/// <summary>
/// Waits for a free slot on the semaphore, takes a WebBrowser from the pool,
/// navigates it to <paramref name="url"/> on the apartment thread and returns
/// the string produced by the navigation.
/// </summary>
/// <param name="url">The address to navigate to.</param>
/// <param name="timeout">
/// Timeout in milliseconds, or <see cref="Timeout.Infinite"/> for no timeout.
/// The timer is started only after the semaphore has been acquired, so time
/// spent waiting for a free slot does not count against it.
/// </param>
/// <param name="token">Optional caller-supplied cancellation token.</param>
/// <returns>The result of the navigation task.</returns>
/// <exception cref="OperationCanceledException">
/// Thrown by the semaphore wait if the combined token is cancelled while waiting.
/// </exception>
public async Task<string> ScrapSiteAsync(string url, int timeout, CancellationToken token = default(CancellationToken))
{
    // the linked source is disposed on every exit path (including a cancelled
    // semaphore wait) so its timer and its registrations on the parent tokens
    // are released
    using (var navigationCts = CancellationTokenSource.CreateLinkedTokenSource(token, _cts.Token))
    {
        var combinedToken = navigationCts.Token;

        // we have a limited number of WebBrowser objects available, so await the semaphore
        await _semaphore.WaitAsync(combinedToken);
        try
        {
            if (timeout != Timeout.Infinite)
            {
                navigationCts.CancelAfter(timeout);
            }

            // run the main logic on the STA thread
            return await _apartment.Run(async () =>
            {
                // acquire the 1st available WebBrowser from the pool
                var webBrowser = _browsers.Dequeue();
                try
                {
                    var task = webBrowser.NavigateAsync(url, combinedToken);
                    _pendingTasks.Add(task); // register the pending task
                    try
                    {
                        return await task;
                    }
                    finally
                    {
                        // unregister the completed task
                        _pendingTasks.Remove(task);
                    }
                }
                finally
                {
                    // return the WebBrowser to the pool
                    _browsers.Enqueue(webBrowser);
                }
            }, combinedToken);
        }
        finally
        {
            _semaphore.Release();
        }
    }
}