public static async Task<IEnumerable<string>> GetDisallowedUrls(IWebAgent webAgent, string domain)
{
    var uri = new Uri($"http://{domain}/robots.txt");
    var list = new List<string>();
    var text = "";

    using (var response = await webAgent.ExecuteRequest(uri))
    {
        // treat any 4xx/5xx response as "no robots.txt available" and disallow nothing
        if ((int)response.StatusCode >= 400 && (int)response.StatusCode <= 599)
        {
            return list;
        }

        using (var stream = webAgent.GetCompressedStream(response))
        using (var reader = new StreamReader(stream, Encoding.Default))
        {
            text = reader.ReadToEnd();
        }
    }

    var lines = text.ToLower().Split(Environment.NewLine.ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
    var name = webAgent.AgentName.ToLower();
    var applicable = false;

    foreach (var line in lines)
    {
        // track whether the current user-agent block applies to this crawler
        if (line.Contains("user-agent"))
        {
            applicable = line.Contains("*") || line.Contains(name);
        }

        // collect the rule portion of any "disallow:" line in an applicable block
        if (line.Contains("disallow") && applicable)
        {
            var split = line.Split(':');

            if (split.Length < 2)
            {
                continue;
            }

            var rule = split[1].Trim();
            list.Add(rule);
        }
    }

    return list;
}
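// A minimal sketch (not part of the original source) of the companion check that
// ThreadAction below calls as RobotParser.UriIsAllowed: a link is treated as allowed
// unless its path starts with one of the disallow rules gathered above. The real
// project's matching logic may differ (e.g. wildcard handling); this assumes simple
// prefix matching only.
public static bool UriIsAllowed(IEnumerable<string> disallowedUrls, Uri uri)
{
    // rules were lower-cased when parsed, so compare against a lower-cased path;
    // ignore empty rules ("Disallow:" with no value means nothing is disallowed)
    var path = uri.AbsolutePath.ToLower();
    return !disallowedUrls.Where(rule => rule.Length > 0).Any(rule => path.StartsWith(rule));
}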
private async Task ThreadAction(IWorker worker, CrawlJob job)
{
    // sort out multi threading holding pattern
    if (worker.Id != 0)
    {
        while (_queue.Count < (worker.Id + 1)
            && !_cancelSource.Token.IsCancellationRequested
            && !_aThreadIsComplete)
        {
            Thread.Sleep(100);
        }
    }

    while (job.CompletionConditions.All(cc => !cc.ConditionMet(GetCrawlProgress()))
        && !_cancelSource.Token.IsCancellationRequested
        && !_aThreadIsComplete)
    {
        if (worker.Id == 0 && NeedsUpdate())
        {
            _updateAction(GetCrawlProgress());
        }

        // set up fallback and retry policies
        var fallback = Policy<Uri>
            .Handle<CrawlQueueEmptyException>()
            .Fallback((cToken) =>
            {
                _aThreadIsComplete = true;
                return null;
            });

        var retry = Policy<Uri>
            .Handle<CrawlQueueEmptyException>()
            .WaitAndRetry(10, tryNum => TimeSpan.FromMilliseconds(tryNum * 200));

        // will attempt to get a new item from the queue, retrying as per above policies
        var next = Policy.Wrap(fallback, retry).Execute(() =>
        {
            var n = GetNext();

            if (n == null)
            {
                throw new CrawlQueueEmptyException();
            }

            return n;
        });

        // fallback will set this if we failed to get a new link (this will end the crawl)
        if (_aThreadIsComplete)
        {
            continue;
        }

        try
        {
            // access it
            var responseTask = _webAgent.ExecuteRequest(next);

            // log that we've crawled it
            _crawled.Add(next);

            var response = await responseTask;

            if (response != null)
            {
                var html = HTMLRetriever.GetHTML(_webAgent.GetCompressedStream(response));

                // parse the contents for new links and data user wants
                var data = DataExtractor.Extract(html, job.Domain, job.Regex);

                // add each of the links extracted if:
                // the queue is not too large
                // the link is not disallowed by the domain's robots.txt file
                // the link is not already in the queue
                // the link has not already been crawled
                // each of the user defined enqueue conditions returns true
                foreach (var link in data.Links)
                {
                    if (_queue.Count < QUEUE_MAX
                        && RobotParser.UriIsAllowed(_disallowedUrls, link)
                        && !_queue.Contains(link)
                        && !_crawled.Contains(link)
                        && job.EnqueueConditions.All(ec => ec.ConditionMet(link)))
                    {
                        _queue.Enqueue(link);
                    }
                }

                // add data matching the regex to the return list
                foreach (var foundData in data.Data)
                {
                    _results.Add(foundData);
                }
            }
        }
        catch (WebException e)
        {
            _errors.Add(e);
        }
    }

    if (!_aThreadIsComplete)
    {
        _aThreadIsComplete = true;
    }

    worker.DoneEvent.Set();
}
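// A minimal sketch (not part of the original source) of the exception type the
// fallback/retry policies in ThreadAction handle; assumed here to be a plain
// custom exception carrying no extra state.
public class CrawlQueueEmptyException : Exception
{
    public CrawlQueueEmptyException() { }
    public CrawlQueueEmptyException(string message) : base(message) { }
}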