Example 1
        public static async Task<IEnumerable<string>> GetDisallowedUrls(IWebAgent webAgent, string domain)
        {
            var uri  = new Uri($"http://{domain}/robots.txt");
            var list = new List<string>();
            var text = "";

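            // download the raw robots.txt (the response stream may be compressed)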
            using (var response = await webAgent.ExecuteRequest(uri))
            {
                // a 4xx/5xx status means robots.txt is unavailable; return no rules
                if ((int)response.StatusCode >= 400 && (int)response.StatusCode <= 599)
                {
                    return list;
                }

                using (var stream = webAgent.GetCompressedStream(response))
                using (var reader = new StreamReader(stream, Encoding.Default))
                {
                    text = await reader.ReadToEndAsync();
                }
            }

            // keep the original casing: robots.txt paths are case-sensitive, so only
            // directive matching (below) is done on a lowered copy of each line
            var lines = text.Split(Environment.NewLine.ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

            var name       = webAgent.AgentName.ToLower();
            var applicable = false;

            foreach (var line in lines)
            {
                // lowercase a copy for directive matching only
                var lower = line.ToLower();

                if (lower.Contains("user-agent"))
                {
                    applicable = lower.Contains("*") || lower.Contains(name);
                }

                if (lower.Contains("disallow") && applicable)
                {
                    // split on the first ':' only so rule values containing ':' stay intact
                    var split = line.Split(new[] { ':' }, 2);
                    if (split.Length < 2)
                    {
                        continue;
                    }

                    var rule = split[1].Trim();

                    list.Add(rule);
                }
            }

            return list;
        }
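A minimal usage sketch for the helper above, assuming a hypothetical ConsoleWebAgent class implementing IWebAgent (the interface itself is not shown in these examples):

        public static async Task PrintDisallowedUrls()
        {
            // ConsoleWebAgent is a placeholder IWebAgent implementation
            IWebAgent agent = new ConsoleWebAgent();

            var disallowed = await GetDisallowedUrls(agent, "example.com");

            foreach (var rule in disallowed)
            {
                Console.WriteLine(rule);
            }
        }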
Example 2
        private async Task ThreadAction(IWorker worker, CrawlJob job)
        {
            // multi-threading holding pattern: non-primary workers wait until the
            // queue has grown enough to give each of them something to do
            if (worker.Id != 0)
            {
                while (_queue.Count < (worker.Id + 1) && !_cancelSource.Token.IsCancellationRequested && !_aThreadIsComplete)
                {
                    await Task.Delay(100);
                }
            }

            while (job.CompletionConditions.All(cc => !cc.ConditionMet(GetCrawlProgress())) &&
                   !_cancelSource.Token.IsCancellationRequested &&
                   !_aThreadIsComplete)
            {
                if (worker.Id == 0 && NeedsUpdate())
                {
                    _updateAction(GetCrawlProgress());
                }

                // set up fallback and retry policies
                var fallback = Policy<Uri>.Handle<CrawlQueueEmptyException>()
                               .Fallback((cToken) =>
                {
                    _aThreadIsComplete = true;
                    return null;
                });

                var retry = Policy<Uri>.Handle<CrawlQueueEmptyException>()
                            .WaitAndRetry(10, tryNum => TimeSpan.FromMilliseconds(tryNum * 200));

                // will attempt to get a new item from the queue, retrying as per above policies
                var next = Policy.Wrap(fallback, retry).Execute(() =>
                {
                    var n = GetNext();

                    if (n == null)
                    {
                        throw new CrawlQueueEmptyException();
                    }

                    return n;
                });

                // fallback will set this if we failed to get a new link (this will end the crawl)
                if (_aThreadIsComplete)
                {
                    continue;
                }

                try
                {
                    // access it
                    var responseTask = _webAgent.ExecuteRequest(next);

                    // log that we've crawled it
                    _crawled.Add(next);

                    var response = await responseTask;

                    if (response != null)
                    {
                        var html = HTMLRetriever.GetHTML(_webAgent.GetCompressedStream(response));

                        // parse the contents for new links and data user wants
                        var data = DataExtractor.Extract(html, job.Domain, job.Regex);

                        // add each of the links extracted if:
                        // the queue is not too large
                        // the link is not disallowed by the domain's robots.txt file
                        // the link is not already in the queue
                        // the link has not already been crawled
                        // each of the user defined enqueue conditions returns true
                        foreach (var link in data.Links)
                        {
                            if (_queue.Count < QUEUE_MAX &&
                                RobotParser.UriIsAllowed(_disallowedUrls, link) &&
                                !_queue.Contains(link) &&
                                !_crawled.Contains(link) &&
                                job.EnqueueConditions.All(ec => ec.ConditionMet(link)))
                            {
                                _queue.Enqueue(link);
                            }
                        }

                        // add data matching the regex to the return list
                        foreach (var foundData in data.Data)
                        {
                            _results.Add(foundData);
                        }
                    }
                }
                catch (WebException e)
                {
                    _errors.Add(e);
                }
            }

            // signal the remaining workers that the crawl is complete
            _aThreadIsComplete = true;

            worker.DoneEvent.Set();
        }
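The fallback-wrapped retry used above to drain the queue is a general Polly composition; below is a self-contained sketch of the same pattern with placeholder names (the crawler's own types are not involved):

        // sketch only: the retry policy (inner) runs first, and once its 10
        // attempts are exhausted the fallback (outer) catches the exception
        // and returns null to signal "nothing left to take"
        private static string TryDequeue(Queue<string> queue)
        {
            var fallback = Policy<string>.Handle<InvalidOperationException>()
                           .Fallback((cToken) => null);

            var retry = Policy<string>.Handle<InvalidOperationException>()
                        .WaitAndRetry(10, tryNum => TimeSpan.FromMilliseconds(tryNum * 200));

            // Queue<T>.Dequeue throws InvalidOperationException when the queue is empty
            return Policy.Wrap(fallback, retry).Execute(() => queue.Dequeue());
        }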