Пример #1
0
        /// <summary>
        /// Runs a crawl for the supplied job and returns the resulting report.
        /// </summary>
        /// <remarks>
        /// After the robots.txt rules have been fetched, this method blocks the calling
        /// thread until every worker thread has signalled its done event.
        /// </remarks>
        /// <param name="job">The job to run; its seed URIs, cookies and domain host are read here.</param>
        /// <returns>The value of <c>GetCrawlReport()</c> once the crawl has ended (or immediately,
        /// when the whole domain is disallowed).</returns>
        /// <exception cref="ArgumentNullException"><paramref name="job"/> is null.</exception>
        /// <exception cref="YACException">Wraps any exception raised inside the crawl itself.</exception>
        public async Task <CrawlReport> Crawl(CrawlJob job)
        {
            if (job == null)
            {
                throw new ArgumentNullException(nameof(job));
            }

            IsRunning = true;

            try
            {
                Setup(job.SeedUris, job.Cookies);

                // try and parse the robots.txt file of the domain and keep any disallowed links
                _disallowedUrls = await RobotParser.GetDisallowedUrls(_webAgent, job.Domain.Host);

                // quit early as we are not allowed to go on this domain
                if (_disallowedUrls.Contains("/"))
                {
                    return GetCrawlReport();
                }

                // create the allowed amount of threads for the job
                var threadsAndDoneEvents = CreateThreads(job);
                var workerThreads        = threadsAndDoneEvents.Item1;
                var doneEvents           = threadsAndDoneEvents.Item2;

                _startTime  = DateTime.Now;
                _lastUpdate = _startTime;

                // every thread is started; all but the first hold themselves back
                // until there is work for them (see the worker action)
                foreach (var thread in workerThreads)
                {
                    thread.Start();
                }

                // Wait for each done event in turn rather than WaitHandle.WaitAll:
                // WaitAll rejects more than 64 handles and is not supported on STA
                // threads, while waiting one by one has neither restriction and
                // still returns only when all events are set.
                foreach (var doneEvent in doneEvents)
                {
                    doneEvent.WaitOne();
                }

                // Join unconditionally: every thread has been started, and joining an
                // already-finished thread returns immediately. Comparing ThreadState
                // (a flags enum) for equality with Running is racy and can skip
                // threads that are still winding down.
                foreach (var thread in workerThreads)
                {
                    thread.Join();
                }

                // All workers have signalled, so release the kernel handles.
                // NOTE(review): assumes nothing touches a worker's done event after it was set.
                foreach (var doneEvent in doneEvents)
                {
                    doneEvent.Dispose();
                }

                return GetCrawlReport();
            }
            catch (Exception e)
            {
                throw new YACException("Exception thrown from YAC", e);
            }
            finally
            {
                IsRunning = false;
            }
        }
Пример #2
0
        /// <summary>
        /// Creates the (unstarted) worker threads for a job together with one done event per thread.
        /// </summary>
        /// <param name="job">The job whose <c>ThreadAllowance</c> decides how many workers are created;
        /// at least one worker is always created.</param>
        /// <returns>
        /// A tuple of the threads (Item1) and their done events (Item2), in matching order.
        /// </returns>
        private Tuple <List <Thread>, List <ManualResetEvent> > CreateThreads(CrawlJob job)
        {
            var threads = new List <Thread>();
            var events  = new List <ManualResetEvent>();

            // evaluate the allowance once instead of on every loop iteration
            var workerCount = Math.Max(1, job.ThreadAllowance);

            for (int i = 0; i < workerCount; i++)
            {
                var doneEvent = new ManualResetEvent(false);

                // copy the loop variable so each closure sees its own id
                var id = i;

                // ThreadStart returns void, so an async lambda here would become
                // "async void": the thread would exit at the first incomplete await
                // and any fault would be unobservable. Blocking on the task keeps the
                // thread alive for the whole worker action; this is a dedicated thread,
                // so the blocking wait does not tie up a thread pool thread.
                var thread = new Thread(() => ThreadAction(new Worker(id, doneEvent), job).GetAwaiter().GetResult())
                {
                    Name = "YAC Worker"
                };

                events.Add(doneEvent);
                threads.Add(thread);
            }

            return new Tuple <List <Thread>, List <ManualResetEvent> >(threads, events);
        }
Пример #3
0
        /// <summary>
        /// The work loop of a single crawl worker: takes URIs off the queue, requests them,
        /// enqueues newly found links and records extracted data until the crawl ends.
        /// </summary>
        /// <remarks>
        /// The loop ends when any completion condition of the job is met, cancellation is
        /// requested, or any worker has marked the crawl as complete. On exit (normal or
        /// exceptional) the worker marks the crawl as complete and sets its done event, so
        /// other workers and whoever waits on the event are always released.
        /// </remarks>
        /// <param name="worker">The worker identity; worker 0 starts immediately and also drives progress updates.</param>
        /// <param name="job">The job supplying completion conditions, enqueue conditions, domain and regex.</param>
        private async Task ThreadAction(IWorker worker, CrawlJob job)
        {
            try
            {
                // holding pattern: every worker but the first waits until the queue holds
                // enough items for it (or the crawl is cancelled / already complete)
                if (worker.Id != 0)
                {
                    while (_queue.Count < (worker.Id + 1) && !_cancelSource.Token.IsCancellationRequested && !_aThreadIsComplete)
                    {
                        Thread.Sleep(100);
                    }
                }

                // The policies do not depend on anything that changes per iteration,
                // so they are built once instead of on every pass through the loop.

                // when all retries are exhausted: flag the crawl as complete and yield no URI
                var fallback = Policy <Uri> .Handle <CrawlQueueEmptyException>()
                               .Fallback((cToken) =>
                {
                    _aThreadIsComplete = true;
                    return(null);
                });

                // an empty queue is retried up to 10 times, waiting 200ms longer each attempt
                var retry = Policy <Uri> .Handle <CrawlQueueEmptyException>()
                            .WaitAndRetry(10, tryNum => TimeSpan.FromMilliseconds(tryNum * 200));

                var dequeuePolicy = Policy.Wrap(fallback, retry);

                while (job.CompletionConditions.All(cc => !cc.ConditionMet(GetCrawlProgress())) &&
                       !_cancelSource.Token.IsCancellationRequested &&
                       !_aThreadIsComplete)
                {
                    // only the first worker reports progress
                    if (worker.Id == 0 && NeedsUpdate())
                    {
                        _updateAction(GetCrawlProgress());
                    }

                    // will attempt to get a new item from the queue, retrying as per above policies
                    var next = dequeuePolicy.Execute(() =>
                    {
                        var n = GetNext();

                        if (n == null)
                        {
                            throw new CrawlQueueEmptyException();
                        }

                        return(n);
                    });

                    // fallback will set this if we failed to get a new link (this will end the crawl);
                    // the loop condition is re-checked and terminates the loop
                    if (_aThreadIsComplete)
                    {
                        continue;
                    }

                    try
                    {
                        // start the request
                        var responseTask = _webAgent.ExecuteRequest(next);

                        // log that we've crawled it (done before the response arrives)
                        _crawled.Add(next);

                        var response = await responseTask;

                        if (response != null)
                        {
                            var html = HTMLRetriever.GetHTML(_webAgent.GetCompressedStream(response));

                            // parse the contents for new links and data user wants
                            var data = DataExtractor.Extract(html, job.Domain, job.Regex);

                            // add each of the links extracted if:
                            // the queue is not too large
                            // the link is not disallowed by the domain's robots.txt file
                            // the link is not already in the queue
                            // the link has not already been crawled
                            // each of the user defined enqueue conditions returns true
                            foreach (var link in data.Links)
                            {
                                if (_queue.Count < QUEUE_MAX &&
                                    RobotParser.UriIsAllowed(_disallowedUrls, link) &&
                                    !_queue.Contains(link) &&
                                    !_crawled.Contains(link) &&
                                    job.EnqueueConditions.All(ec => ec.ConditionMet(link)))
                                {
                                    _queue.Enqueue(link);
                                }
                            }

                            // add data matching the regex to the return list
                            foreach (var foundData in data.Data)
                            {
                                _results.Add(foundData);
                            }
                        }
                    }
                    catch (WebException e)
                    {
                        // a failed request is recorded and the crawl carries on
                        _errors.Add(e);
                    }
                }
            }
            finally
            {
                // Runs on every exit path, including exceptions other than WebException:
                // tell the other workers to stop and release whoever waits on this
                // worker's done event, so a fault cannot leave them waiting forever.
                _aThreadIsComplete = true;
                worker.DoneEvent.Set();
            }
        }