public async Task Disallows_URIs_That_Are_Relevant_To_All_User_Agents()
{
    var (client, mockMessageHandler) = GetMockableClient();
    var parser = new RobotParser(client);
    var testUri = new Uri("http://source.com/forbidden-path/resource.html");
    const string USER_AGENT = "USER AGENT";

    mockMessageHandler
        .Protected()
        .Setup<Task<HttpResponseMessage>>(
            "SendAsync",
            ItExpr.IsAny<HttpRequestMessage>(),
            ItExpr.IsAny<CancellationToken>())
        .ReturnsAsync(new HttpResponseMessage()
        {
            StatusCode = HttpStatusCode.OK,
            Content = new StringContent(@"
User-Agent: *
Disallow: /forbidden-path/
")
        });

    var result = await parser.UriForbidden(testUri, USER_AGENT);

    Assert.IsTrue(result);
}
public async Task Allows_URIs_That_Match_A_Forbidden_Path_For_A_User_Agent_That_Is_Not_The_Crawlers()
{
    var (client, mockMessageHandler) = GetMockableClient();
    var parser = new RobotParser(client);
    var testUri = new Uri("http://source.com/forbidden-for-some/not-for-others");
    const string USER_AGENT = "USER AGENT";

    mockMessageHandler
        .Protected()
        .Setup<Task<HttpResponseMessage>>(
            "SendAsync",
            ItExpr.IsAny<HttpRequestMessage>(),
            ItExpr.IsAny<CancellationToken>())
        .ReturnsAsync(new HttpResponseMessage()
        {
            StatusCode = HttpStatusCode.OK,
            Content = new StringContent(@"
User-Agent: some user agent
Disallow: /forbidden-for-some/
")
        });

    var result = await parser.UriForbidden(testUri, USER_AGENT);

    Assert.IsFalse(result);
}
public async Task Makes_Request_To_URIs_Host_Robots_File_If_Nothing_Found_In_Cache()
{
    var (client, mockMessageHandler) = GetMockableClient();
    var parser = new RobotParser(client);
    var testUri = new Uri("http://source.com/forbidden-path/resource.html");
    const string USER_AGENT = "USER AGENT";

    mockMessageHandler
        .Protected()
        .Setup<Task<HttpResponseMessage>>(
            "SendAsync",
            ItExpr.IsAny<HttpRequestMessage>(),
            ItExpr.IsAny<CancellationToken>())
        .ReturnsAsync(new HttpResponseMessage()
        {
            StatusCode = HttpStatusCode.OK,
            Content = new StringContent(@"
User-Agent: user agent
Disallow: /forbidden-path/
")
        });

    await parser.UriForbidden(testUri, USER_AGENT);

    mockMessageHandler
        .Protected()
        .Verify(
            "SendAsync",
            Times.Once(),
            ItExpr.IsAny<HttpRequestMessage>(),
            ItExpr.IsAny<CancellationToken>());
}
public async Task Handles_A_Null_User_Agent()
{
    var (client, mockMessageHandler) = GetMockableClient();
    var parser = new RobotParser(client);
    var testUri = new Uri("http://source.com/forbidden-path/something");

    mockMessageHandler
        .Protected()
        .Setup<Task<HttpResponseMessage>>(
            "SendAsync",
            ItExpr.IsAny<HttpRequestMessage>(),
            ItExpr.IsAny<CancellationToken>())
        .ReturnsAsync(new HttpResponseMessage()
        {
            StatusCode = HttpStatusCode.OK,
            Content = new StringContent(@"
User-Agent: *
Disallow: /forbidden-path/
")
        });

    var result = await parser.UriForbidden(testUri, null);

    Assert.IsTrue(result);
}
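// The four tests above all rely on a GetMockableClient helper that is not shown in this
// listing. A minimal sketch, assuming Moq (with Moq.Protected) supplies the mocked handler
// and that RobotParser accepts an HttpClient:
private static (HttpClient client, Mock<HttpMessageHandler> handler) GetMockableClient()
{
    // Mock HttpMessageHandler so each test can stub the protected SendAsync that serves
    // the fake robots.txt content, then hand the mocked handler to a real HttpClient.
    var mockMessageHandler = new Mock<HttpMessageHandler>();
    var client = new HttpClient(mockMessageHandler.Object);
    return (client, mockMessageHandler);
}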
public async Task<CrawlReport> Crawl(CrawlJob job)
{
    if (job == null)
    {
        throw new ArgumentNullException(nameof(job));
    }

    IsRunning = true;

    try
    {
        Setup(job.SeedUris, job.Cookies);

        // try to parse the robots.txt file of the domain and add any disallowed links to a read only collection
        _disallowedUrls = await RobotParser.GetDisallowedUrls(_webAgent, job.Domain.Host);

        // quit early as we are not allowed to go on this domain
        if (_disallowedUrls.Contains("/"))
        {
            return GetCrawlReport();
        }

        // create the allowed number of threads for the job
        var threadsAndDoneEvents = CreateThreads(job);

        _startTime = DateTime.Now;
        _lastUpdate = _startTime;

        // hold all but one thread in a pattern until there is work for them;
        // start the first thread off, with the job of parsing the domain page provided
        foreach (var thread in threadsAndDoneEvents.Item1)
        {
            thread.Start();
        }

        // wait for done events
        WaitHandle.WaitAll(threadsAndDoneEvents.Item2.ToArray());

        // flush queues and return the list of data found during the crawl
        foreach (var thread in threadsAndDoneEvents.Item1)
        {
            if (thread.ThreadState == ThreadState.Running)
            {
                thread.Join();
            }
        }

        return GetCrawlReport();
    }
    catch (Exception e)
    {
        throw new YACException("Exception thrown from YAC", e);
    }
    finally
    {
        IsRunning = false;
    }
}
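// A hypothetical call site for Crawl. The CrawlJob members referenced above (SeedUris,
// Cookies, Domain, CompletionConditions, EnqueueConditions, Regex) are assumed to be
// settable here, and "Crawler" stands in for whatever class hosts Crawl; the real
// construction of a job may differ.
public static async Task RunExampleCrawl(Crawler crawler)
{
    var job = new CrawlJob
    {
        Domain = new Uri("http://source.com"),                  // host whose robots.txt is fetched
        SeedUris = new List<Uri> { new Uri("http://source.com/") }
    };

    // Crawl wraps any failure in a YACException and always resets IsRunning in its finally block.
    CrawlReport report = await crawler.Crawl(job);
}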
/// <summary>
/// Handles putting together a robot GameObject. Overwrites existing prefabs.
/// </summary>
/// <param name="robotDescription">Contents of URDF robot description from robot or file</param>
/// <returns>GameObject representation of robot generated from description</returns>
public static GameObject GenerateRobotGameObjectFromDescription(string robotDescription)
{
#if UNITY_EDITOR
    XmlDocument xmlDoc = new XmlDocument();
    RobotParser parser = new RobotParser();
    xmlDoc.Load(XmlReader.Create(new StringReader(robotDescription)));
    Robot robot = parser.Parse(xmlDoc.DocumentElement);
    return GenerateRobotGameObject(robot);
#else
    Debug.LogError("Script can only be run from editor");
    return null;
#endif
}
private async Task ThreadAction(IWorker worker, CrawlJob job)
{
    // sort out multi threading holding pattern
    if (worker.Id != 0)
    {
        while (_queue.Count < (worker.Id + 1) &&
               !_cancelSource.Token.IsCancellationRequested &&
               !_aThreadIsComplete)
        {
            Thread.Sleep(100);
        }
    }

    while (job.CompletionConditions.All(cc => !cc.ConditionMet(GetCrawlProgress())) &&
           !_cancelSource.Token.IsCancellationRequested &&
           !_aThreadIsComplete)
    {
        if (worker.Id == 0 && NeedsUpdate())
        {
            _updateAction(GetCrawlProgress());
        }

        // set up fallback and retry policies
        var fallback = Policy<Uri>
            .Handle<CrawlQueueEmptyException>()
            .Fallback((cToken) =>
            {
                _aThreadIsComplete = true;
                return null;
            });

        var retry = Policy<Uri>
            .Handle<CrawlQueueEmptyException>()
            .WaitAndRetry(10, tryNum => TimeSpan.FromMilliseconds(tryNum * 200));

        // will attempt to get a new item from the queue, retrying as per the above policies
        var next = Policy.Wrap(fallback, retry).Execute(() =>
        {
            var n = GetNext();

            if (n == null)
            {
                throw new CrawlQueueEmptyException();
            }

            return n;
        });

        // fallback will set this if we failed to get a new link (this will end the crawl)
        if (_aThreadIsComplete)
        {
            continue;
        }

        try
        {
            // access it
            var responseTask = _webAgent.ExecuteRequest(next);

            // log that we've crawled it
            _crawled.Add(next);

            var response = await responseTask;

            if (response != null)
            {
                var html = HTMLRetriever.GetHTML(_webAgent.GetCompressedStream(response));

                // parse the contents for new links and data the user wants
                var data = DataExtractor.Extract(html, job.Domain, job.Regex);

                // add each of the links extracted if:
                //   the queue is not too large
                //   the link is not disallowed by the domain's robots.txt file
                //   the link is not already in the queue
                //   the link has not already been crawled
                //   each of the user defined enqueue conditions returns true
                foreach (var link in data.Links)
                {
                    if (_queue.Count < QUEUE_MAX &&
                        RobotParser.UriIsAllowed(_disallowedUrls, link) &&
                        !_queue.Contains(link) &&
                        !_crawled.Contains(link) &&
                        job.EnqueueConditions.All(ec => ec.ConditionMet(link)))
                    {
                        _queue.Enqueue(link);
                    }
                }

                // add data matching the regex to the return list
                foreach (var foundData in data.Data)
                {
                    _results.Add(foundData);
                }
            }
        }
        catch (WebException e)
        {
            _errors.Add(e);
        }
    }

    if (!_aThreadIsComplete)
    {
        _aThreadIsComplete = true;
    }

    worker.DoneEvent.Set();
}
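// ThreadAction above assumes a GetNext helper and a CrawlQueueEmptyException type that are
// not part of this listing. A minimal sketch, assuming _queue is a ConcurrentQueue<Uri>:
private class CrawlQueueEmptyException : Exception { }

private Uri GetNext()
{
    // Return the next queued link, or null so the calling Polly policies can decide whether
    // to retry (the queue may fill up shortly) or fall back and end the crawl.
    return _queue.TryDequeue(out var next) ? next : null;
}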
public override void Run()
{
    RobotParser parser = new RobotParser();
    HtmlCrawler htmlCrawlerCNN = new HtmlCrawler(new HashSet<string>());
    HtmlCrawler htmlCrawlerNBA = new HtmlCrawler(new HashSet<string>());
    bool loading = false;
    bool crawling = false;
    bool idle = true;

    Trace.TraceInformation("WorkerRole1 is running");

    while (true)
    {
        Thread.Sleep(50);

        string status = "";
        if (idle == true)
        {
            status = "Idle";
        }
        else if (crawling == true)
        {
            status = "Crawling";
        }
        else if (loading == true)
        {
            status = "Loading";
        }

        // add performance with no changes in queue size, index size, or number crawled
        var crawled = 0;
        var sizeQueue = 0;
        var sizeIndex = 0;
        TableQuery<Performance> query3 = new TableQuery<Performance>().Take(1);
        foreach (Performance item in StorageManager.getPerformanceTable().ExecuteQuery(query3))
        {
            crawled = item.NumCrawled;
            sizeQueue = item.SizeQueue;
            sizeIndex = item.SizeIndex;
        }
        Performance.insertPerformance(status, crawled, sizeQueue, sizeIndex);

        // Handle command queue
        CloudQueueMessage commandMessage = StorageManager.getCommandQueue().GetMessage(TimeSpan.FromMinutes(5));

        // When there are no more URLs to crawl, or at the very beginning, a command message is handled here
        if (commandMessage != null)
        {
            StorageManager.getCommandQueue().DeleteMessage(commandMessage);

            // command message is stop
            if (commandMessage.AsString == "stop")
            {
                // clear queues and tables
                StorageManager.deleteAllQueues();
                StorageManager.deleteTables();

                // reset parser and crawlers
                parser = new RobotParser("");
                htmlCrawlerCNN.crawlable = false;
                htmlCrawlerCNN.Visited = new HashSet<string>();
                htmlCrawlerCNN.Disallow = new HashSet<string>();
                htmlCrawlerNBA.crawlable = false;
                htmlCrawlerNBA.Visited = new HashSet<string>();
                htmlCrawlerNBA.Disallow = new HashSet<string>();
                loading = false;
                crawling = false;
                idle = true;

                // add performance, clear queue sizes
                Performance.insertPerformance("Idle", 0, 0, 0);
            }

            // command message is start
            if (commandMessage.AsString.StartsWith("start:"))
            {
                crawling = false;
                idle = false;
                loading = true;

                // add performance with no changes in queue size, index size, or number crawled
                TableQuery<Performance> queryStart = new TableQuery<Performance>().Take(1);
                foreach (Performance item in StorageManager.getPerformanceTable().ExecuteQuery(queryStart))
                {
                    crawled = item.NumCrawled;
                    sizeQueue = item.SizeQueue;
                    sizeIndex = item.SizeIndex;
                }
                Performance.insertPerformance("Loading", crawled, sizeQueue, sizeIndex);

                ServicePointManager.Expect100Continue = true;
                ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12;

                var robotFile = commandMessage.AsString.Substring(6);
                string contents;
                using (var wc = new System.Net.WebClient())
                {
                    contents = wc.DownloadString(robotFile);
                }

                // create and parse through robots.txt
                parser = new RobotParser(contents);
                foreach (string filepath in parser.XMLFiles)
                {
                    // only XMLs from cnn and nba
                    if (filepath.Contains("cnn") || filepath.Contains("nba"))
                    {
                        CloudQueueMessage filepathMessage = new CloudQueueMessage(filepath);
                        StorageManager.getXMLQueue().AddMessage(filepathMessage);
                    }
                }

                // set the crawlers up with the disallows
                if (robotFile.Contains("cnn"))
                {
                    htmlCrawlerCNN = new HtmlCrawler(parser.Disallow);
                }
                if (robotFile.Contains("bleacherreport"))
                {
                    htmlCrawlerNBA = new HtmlCrawler(parser.Disallow);
                }

                Performance.insertPerformance("Idle", crawled, sizeQueue, sizeIndex);
            }
        }

        // Handle XML queue
        CloudQueueMessage XML = StorageManager.getXMLQueue().GetMessage(TimeSpan.FromMinutes(5));
        while (XML != null)
        {
            if (XML.AsString.Contains("cnn.com"))
            {
                htmlCrawlerCNN.readXMLUrl(XML.AsString);
            }
            if (XML.AsString.Contains("bleacherreport.com"))
            {
                htmlCrawlerNBA.readXMLUrl(XML.AsString);
            }
            StorageManager.getXMLQueue().DeleteMessage(XML);
            XML = StorageManager.getXMLQueue().GetMessage(TimeSpan.FromMinutes(5));
        }

        // Handle HTML queue
        CloudQueueMessage HTML = StorageManager.getUrlQueue().GetMessage(TimeSpan.FromMinutes(5));
        if (HTML != null)
        {
            // handle performance
            if (htmlCrawlerCNN.crawlable || htmlCrawlerNBA.crawlable)
            {
                idle = false;
                loading = false;
                crawling = true;

                // add performance, reduce queue size
                TableQuery<Performance> queryCNN = new TableQuery<Performance>().Take(1);
                foreach (Performance item in StorageManager.getPerformanceTable().ExecuteQuery(queryCNN))
                {
                    crawled = item.NumCrawled;
                    sizeQueue = item.SizeQueue - 1;
                    sizeIndex = item.SizeIndex;
                }
                Performance.insertPerformance("Crawling", crawled, sizeQueue, sizeIndex);
            }

            // handles if it is a cnn article
            if (htmlCrawlerCNN.crawlable == true && HTML.AsString.Contains("cnn.com"))
            {
                htmlCrawlerCNN.parseHTML(HTML.AsString);
                StorageManager.getUrlQueue().DeleteMessage(HTML);
            }
            // handles if it is a bleacher report article
            else if (htmlCrawlerNBA.crawlable == true && HTML.AsString.Contains("bleacherreport.com"))
            {
                htmlCrawlerNBA.parseHTML(HTML.AsString);
                StorageManager.getUrlQueue().DeleteMessage(HTML);
            }
        }
    }
}