public void StartWorkers(
    PauseToken pauseToken,
    WorkerRelevantJobData jobData,
    ConcurrentQueue<Uri> queue,
    ConcurrentQueue<string> htmlQueue,
    ConcurrentDictionary<Uri, CrawlData> crawled)
{
    foreach (var worker in _workers)
    {
        worker.Start(_browser, _cancellationTokenSource.Token, pauseToken, jobData, queue, htmlQueue, crawled);
    }
}
public void Start(
    Browser browser,
    CancellationToken cancellationToken,
    PauseToken pauseToken,
    WorkerRelevantJobData jobData,
    ConcurrentQueue<Uri> queue,
    ConcurrentQueue<string> htmlQueue,
    ConcurrentDictionary<Uri, CrawlData> crawled)
{
    // open this worker's tab before the thread starts, so WorkAction never races a null _tab
    _tab = browser.NewPageAsync().ConfigureAwait(false).GetAwaiter().GetResult();

    // an async lambda handed to Thread becomes async void and the thread exits at the
    // first await; block on the task instead so the dedicated thread hosts the work
    Thread = new Thread(
        () => WorkAction(
            browser,
            cancellationToken,
            pauseToken,
            jobData,
            queue,
            htmlQueue,
            crawled).GetAwaiter().GetResult());
    Thread.Start();
}
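PauseToken is not a BCL type, and the code above only passes it through. What follows is a minimal sketch of the cooperative pause primitive this code appears to assume (the PauseTokenSource pattern, where the owner flips IsPaused and workers await WaitWhilePausedAsync); all member names beyond PauseToken itself are assumptions, not the project's actual implementation.

using System.Threading;
using System.Threading.Tasks;

public class PauseTokenSource
{
    private TaskCompletionSource<bool> _paused;

    public bool IsPaused
    {
        get { return _paused != null; }
        set
        {
            if (value)
            {
                // begin a pause if one is not already in progress
                Interlocked.CompareExchange(ref _paused, new TaskCompletionSource<bool>(), null);
            }
            else
            {
                // end the pause and release every waiting worker
                Interlocked.Exchange(ref _paused, null)?.TrySetResult(true);
            }
        }
    }

    public PauseToken Token { get { return new PauseToken(this); } }

    internal Task WaitWhilePausedAsync()
    {
        var current = _paused;
        return current != null ? current.Task : Task.CompletedTask;
    }
}

public struct PauseToken
{
    private readonly PauseTokenSource _source;

    internal PauseToken(PauseTokenSource source) { _source = source; }

    // completes immediately when not paused, otherwise when IsPaused flips back to false
    public Task WaitWhilePausedAsync()
    {
        return _source == null ? Task.CompletedTask : _source.WaitWhilePausedAsync();
    }
}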
private async Task WorkAction(
    Browser browser,
    CancellationToken cancellationToken,
    PauseToken pauseToken,
    WorkerRelevantJobData jobData,
    ConcurrentQueue<Uri> queue,
    ConcurrentQueue<string> htmlQueue,
    ConcurrentDictionary<Uri, CrawlData> crawled)
{
    // the worker keeps pulling URIs until cancellation is requested
    while (!cancellationToken.IsCancellationRequested)
    {
        // honour pause requests before taking more work
        await pauseToken.WaitWhilePausedAsync();

        var nextUri = DequeueOrRetry(queue, cancellationToken);
        if (cancellationToken.IsCancellationRequested)
        {
            return;
        }

        // navigate this worker's tab to the page (AbsoluteUri, not AbsolutePath,
        // so the scheme and host are included)
        var response = await _tab.GoToAsync(nextUri.AbsoluteUri);
        if (response.Ok)
        {
            // perform the job's actions here
            // each action could yield a return value, such as extracted data,
            // and the url should be added to the crawled collection
        }
        else
        {
            // record the failure and its reason in the crawled collection
            crawled.TryAdd(nextUri, CrawlData.CreateFailureData(response.Status, response.StatusText));
        }

        // if we should look for links on the page, hand the raw HTML to the
        // parser group via htmlQueue; it will sort through and enqueue the links
        if (jobData.LinkEnqueueType != LinkEnqueueType.None)
        {
            htmlQueue.Enqueue(await _tab.GetContentAsync());
        }
    }
}
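DequeueOrRetry is referenced above but not shown. Given that the caller re-checks the cancellation token immediately after it returns, a plausible shape is a TryDequeue spin with a short backoff; the interval and the null return are assumptions for illustration.

private Uri DequeueOrRetry(ConcurrentQueue<Uri> queue, CancellationToken cancellationToken)
{
    Uri uri;
    while (!queue.TryDequeue(out uri))
    {
        if (cancellationToken.IsCancellationRequested)
        {
            return null; // the caller re-checks the token before using the result
        }

        Thread.Sleep(10); // hypothetical backoff; tune, or replace with a BlockingCollection
    }

    return uri;
}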
public Task<ProgressResult> Start(
    JobDTO job,
    CancellationToken cancellationToken,
    PauseToken pauseToken,
    ProgressToken progressToken)
{
    if (IsCrawling)
    {
        throw new CrawlerAlreadyRunningException();
    }

    return Task.Run(() =>
    {
        var workerRelevantJobData = WorkerRelevantJobData.FromJobDTO(job);

        // create worker groups (max 64 threads; each group could get its own
        // browser, or its own set of domains to work on)
        var group = new WorkerGroup(_browser, WorkersPerGroup);
        var uriQueue = new ConcurrentQueue<Uri>(job.Seeds);
        var htmlQueue = new ConcurrentQueue<string>();
        var crawled = new ConcurrentDictionary<Uri, CrawlData>();

        group.StartWorkers(pauseToken, workerRelevantJobData, uriQueue, htmlQueue, crawled);
        IsCrawling = true;

        // the main thread spins here, checking for cancellation and progress requests;
        // don't quit just because the queue is momentarily empty, since a worker may be
        // about to add more links and is just taking a long time
        while (!cancellationToken.IsCancellationRequested /* TODO add other stop conditions such as max crawl time */)
        {
            // cancellation and pauses are passed down to the worker groups;
            // progress is read from the shared collections on request
            if (progressToken.ProgressIsRequested)
            {
                progressToken.State = GetState(uriQueue, crawled);
            }

            Thread.Sleep(10);
        }

        // tell the worker group to stop
        group.Cancel();

        // after cancellation is requested, wait for the workers to wind down,
        // but only up to AbortWorkersTimeout
        var start = DateTime.Now;
        while (!group.AllDone && (DateTime.Now - start) < AbortWorkersTimeout)
        {
            // TODO notify which threads are holding us up
            Thread.Sleep(10);
        }

        // join the threads properly if they finished in time, otherwise abort them
        if (group.AllDone)
        {
            group.DisposeWorkers();
        }
        else
        {
            group.AbortWorkers();
        }

        IsCrawling = false;
        return GetResult(uriQueue, crawled);
    });
}
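For context, here is a sketch of how a caller might drive this entry point. The Crawler type name is a hypothetical stand-in for the class hosting Start, PauseTokenSource comes from the earlier sketch, JobDTO is assumed to expose a settable Seeds collection (Start reads job.Seeds), and the ProgressToken members follow the usage in the main loop above.

var job = new JobDTO { Seeds = new[] { new Uri("https://example.com/") } }; // assumed shape

var cts = new CancellationTokenSource();
var pauseSource = new PauseTokenSource(); // from the sketch above
var progressToken = new ProgressToken();

var crawler = new Crawler(); // hypothetical name for the class hosting Start(...)
var crawlTask = crawler.Start(job, cts.Token, pauseSource.Token, progressToken);

// pause the workers, ask the main loop for a snapshot, then resume
pauseSource.IsPaused = true;
progressToken.ProgressIsRequested = true;
// ... read progressToken.State once the main loop has populated it ...
pauseSource.IsPaused = false;

// stop the crawl after a while and collect the final result
cts.CancelAfter(TimeSpan.FromMinutes(10));
var result = await crawlTask;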