Example #1
 public void StartWorkers(
     PauseToken pauseToken,
     WorkerRelevantJobData jobData,
     ConcurrentQueue<Uri> queue,
     ConcurrentQueue<string> htmlQueue,
     ConcurrentDictionary<Uri, CrawlData> crawled)
 {
     foreach (var worker in _workers)
     {
         worker.Start(
             _browser,
             _cancellationTokenSource.Token,
             pauseToken,
             jobData,
             queue,
             htmlQueue,
             crawled);
     }
 }
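For context, StartWorkers assumes the group already holds its workers, the shared Browser, and a CancellationTokenSource, none of which are shown in the post. The skeleton below is a guess at that surrounding class; only the constructor shape is confirmed (Example #4 calls new WorkerGroup(_browser, WorkersPerGroup)), everything else is an assumption.

    // Hypothetical sketch of the WorkerGroup state that StartWorkers relies on;
    // requires System.Collections.Generic, System.Linq, System.Threading.
    public class WorkerGroup
    {
        private readonly List<Worker> _workers;
        private readonly Browser _browser;
        private readonly CancellationTokenSource _cancellationTokenSource =
            new CancellationTokenSource();

        public WorkerGroup(Browser browser, int workerCount)
        {
            _browser = browser;
            // assumption: Worker has a parameterless constructor
            _workers = Enumerable.Range(0, workerCount)
                                 .Select(_ => new Worker())
                                 .ToList();
        }

        // assumption: Cancel (used in Example #4) just forwards to the token source
        public void Cancel() => _cancellationTokenSource.Cancel();

        // StartWorkers (Example #1) and the AllDone / DisposeWorkers / AbortWorkers
        // members used in Example #4 would live here as well.
    }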
Example #2
        public void Start(Browser browser,
                          CancellationToken cancellationToken,
                          PauseToken pauseToken,
                          WorkerRelevantJobData jobData,
                          ConcurrentQueue<Uri> queue,
                          ConcurrentQueue<string> htmlQueue,
                          ConcurrentDictionary<Uri, CrawlData> crawled)
        {
            // create the tab before starting the thread so WorkAction never
            // observes _tab uninitialized (assigning it after Thread.Start()
            // races with the worker)
            _tab = browser.NewPageAsync().ConfigureAwait(false).GetAwaiter().GetResult();

            // note: an async lambda on a Thread compiles to async void, so the
            // dedicated thread exits at the first await and the remainder of
            // WorkAction continues on the thread pool
            Thread = new Thread(
                async () => await WorkAction(
                    browser,
                    cancellationToken,
                    pauseToken,
                    jobData,
                    queue,
                    crawled));

            Thread.Start();
        }
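Because the lambda is async, wrapping it in a Thread buys little (see the comment above). If a Task-based worker is acceptable, a sketch of the same Start using Task.Run could look like this; WorkerTask is a hypothetical Task-typed property standing in for the Thread property, and everything else matches the original signature.

        // Sketch only: the worker runs as a long-lived Task instead of a Thread.
        public void Start(Browser browser,
                          CancellationToken cancellationToken,
                          PauseToken pauseToken,
                          WorkerRelevantJobData jobData,
                          ConcurrentQueue<Uri> queue,
                          ConcurrentQueue<string> htmlQueue,
                          ConcurrentDictionary<Uri, CrawlData> crawled)
        {
            _tab = browser.NewPageAsync().ConfigureAwait(false).GetAwaiter().GetResult();

            // Task.Run hands the async method to the thread pool and gives us a
            // Task we can await or observe for exceptions (async void swallows them)
            WorkerTask = Task.Run(() => WorkAction(
                browser,
                cancellationToken,
                pauseToken,
                jobData,
                queue,
                crawled));
        }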
Example #3
        private async Task WorkAction(
            Browser browser,
            CancellationToken cancellationToken,
            PauseToken pauseToken,
            WorkerRelevantJobData jobData,
            ConcurrentQueue<Uri> queue,
            ConcurrentDictionary<Uri, CrawlData> crawled)
        {
            // the tab for this worker was created in Start and stored in _tab

            var nextUri = DequeueOrRetry(queue, cancellationToken);

            if (cancellationToken.IsCancellationRequested)
            {
                return;
            }

            // go to the page; use AbsoluteUri (the full URL) rather than
            // AbsolutePath, which drops the scheme and host
            var response = await _tab.GoToAsync(nextUri.AbsoluteUri);

            if (response.Ok)
            {
                // perform the job's actions
                // each action could yield a return value, such as extracted data
                // the url should be added to the crawled collection
            }
            else
            {
                // indicate in the crawled collection this was a failure + reason
                crawled.TryAdd(nextUri, CrawlData.CreateFailureData(response.Status, response.StatusText));
            }

            // if we should look for some links on the page
            if (jobData.LinkEnqueueType != LinkEnqueueType.None)
            {
                // get the page content and just put it in a collection
                // parser group will sort through and add the links
            }
        }
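DequeueOrRetry is referenced above but not shown in the post; a minimal sketch consistent with how WorkAction uses it (it returns once cancellation is requested, and the caller's cancellation check then bails out) might be:

        // Hypothetical sketch of DequeueOrRetry: spin on TryDequeue with a short
        // back-off until an item arrives or cancellation is requested, in which
        // case null is returned and the caller's cancellation check takes over.
        private Uri DequeueOrRetry(ConcurrentQueue<Uri> queue, CancellationToken cancellationToken)
        {
            Uri uri;
            while (!queue.TryDequeue(out uri))
            {
                if (cancellationToken.IsCancellationRequested)
                {
                    return null;
                }
                Thread.Sleep(10); // don't spin hot on an empty queue
            }
            return uri;
        }

Note that WorkAction as posted processes a single URI and returns; presumably the finished version wraps the body in a loop until cancellation, and would also honor pauseToken between iterations.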
Example #4
        public Task<ProgressResult> Start(
            JobDTO job,
            CancellationToken cancellationToken,
            PauseToken pauseToken,
            ProgressToken progressToken)
        {
            if (IsCrawling)
            {
                throw new CrawlerAlreadyRunningException();
            }

            return Task.Run(() =>
            {
                var workerRelevantJobData = WorkerRelevantJobData.FromJobDTO(job);
                // create a worker group (max 64 threads; each group could get its own
                // browser and its own set of domains to work on)
                var group = new WorkerGroup(
                    _browser,
                    WorkersPerGroup);

                var uriQueue = new ConcurrentQueue<Uri>(job.Seeds);
                var htmlQueue = new ConcurrentQueue<string>();
                var crawled = new ConcurrentDictionary<Uri, CrawlData>();

                group.StartWorkers(
                    pauseToken,
                    workerRelevantJobData,
                    uriQueue,
                    htmlQueue,
                    crawled);

                IsCrawling = true;

                // the main thread spins here, checking for cancellation and progress requests
                // we can't simply stop when the queue is empty: a worker may be about to add
                // more links and is just taking a long time, so an empty queue alone is not
                // a reliable stop condition
                while (!cancellationToken.IsCancellationRequested /* TODO add other stop conditions such as max crawl time */)
                {
                    // cancellation and pauses are passed down to worker groups
                    // progress is checked via the collections and then updated
                    if (progressToken.ProgressIsRequested)
                    {
                        progressToken.State = GetState(uriQueue, crawled);
                    }
                    Thread.Sleep(10);
                }

                // tell the worker group to stop
                group.Cancel();
                var start = DateTime.Now;

                // after cancellation is requested we fall into a second, time-limited loop,
                // so that workers which never finish cannot block shutdown indefinitely
                while (!group.AllDone && (DateTime.Now - start) < AbortWorkersTimeout)
                {
                    // notify of which threads are holding us up
                    Thread.Sleep(10);
                }

                // attempt to properly join the threads
                if (group.AllDone)
                {
                    group.DisposeWorkers();
                }
                // abort them if not done
                else
                {
                    group.AbortWorkers();
                }

                IsCrawling = false;

                return GetResult(uriQueue, crawled);
            });
        }
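For completeness, here is a hypothetical caller wiring the tokens together. None of this setup appears in the post, so crawler, job, and the token constructors are placeholders, and it assumes PauseToken / ProgressToken are simple mutable token types with a settable ProgressIsRequested flag.

        // Hypothetical usage sketch of the Start method above.
        var cts = new CancellationTokenSource();
        var pauseToken = new PauseToken();
        var progressToken = new ProgressToken();

        var crawlTask = crawler.Start(job, cts.Token, pauseToken, progressToken);

        // ask the spinning main loop for a progress snapshot
        progressToken.ProgressIsRequested = true;

        // request shutdown and collect the final result
        cts.Cancel();
        ProgressResult result = await crawlTask;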