Example No. 1
        public async Task Disallows_URIs_That_Are_Relevant_To_All_User_Agents()
        {
            var (client, mockMessageHandler) = GetMockableClient();
            var          parser     = new RobotParser(client);
            var          testUri    = new Uri("http://source.com/forbidden-path/resource.html");
            const string USER_AGENT = "USER AGENT";

            mockMessageHandler
            .Protected()
            .Setup<Task<HttpResponseMessage>>(
                "SendAsync",
                ItExpr.IsAny<HttpRequestMessage>(),
                ItExpr.IsAny<CancellationToken>())
            .ReturnsAsync(new HttpResponseMessage()
            {
                StatusCode = HttpStatusCode.OK,
                Content    = new StringContent(@"
                        User-Agent: *
                        Disallow: /forbidden-path/
                    ")
            });

            var result = await parser.UriForbidden(testUri, USER_AGENT);

            Assert.IsTrue(result);
        }
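The GetMockableClient helper these tests deconstruct is not shown here. A minimal sketch of what it presumably returns, assuming Moq: an HttpClient wired to a mock HttpMessageHandler, whose protected SendAsync method is what the tests stub via .Protected().

        private static (HttpClient, Mock<HttpMessageHandler>) GetMockableClient()
        {
            // HttpMessageHandler.SendAsync is protected, which is why the
            // tests reach it through Moq's .Protected() API.
            var mockMessageHandler = new Mock<HttpMessageHandler>();
            var client             = new HttpClient(mockMessageHandler.Object);

            return (client, mockMessageHandler);
        }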
Example No. 2
        public async Task Allows_URIs_That_Match_A_Forbidden_Path_For_A_User_Agent_That_Is_Not_The_Crawlers()
        {
            var (client, mockMessageHandler) = GetMockableClient();
            var          parser     = new RobotParser(client);
            var          testUri    = new Uri("http://source.com/forbidden-for-some/not-for-others");
            const string USER_AGENT = "USER AGENT";

            mockMessageHandler
            .Protected()
            .Setup<Task<HttpResponseMessage>>(
                "SendAsync",
                ItExpr.IsAny<HttpRequestMessage>(),
                ItExpr.IsAny<CancellationToken>())
            .ReturnsAsync(new HttpResponseMessage()
            {
                StatusCode = HttpStatusCode.OK,
                Content    = new StringContent(@"
                        User-Agent: some user agent
                        Disallow: /forbidden-for-some/
                    ")
            });

            var result = await parser.UriForbidden(testUri, USER_AGENT);

            Assert.IsFalse(result);
        }
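Taken together with Example No. 1, this test pins down the matching rule: a Disallow group applies only when its User-Agent line is the wildcard "*" or matches the crawler's own identity. Purely as an illustration (not the parser's actual code), that check could look like:

        private static bool GroupAppliesTo(string groupUserAgent, string crawlerUserAgent)
        {
            // Wildcard groups bind every crawler; named groups bind only a
            // matching identity, compared case-insensitively here.
            return groupUserAgent == "*" ||
                   string.Equals(groupUserAgent, crawlerUserAgent, StringComparison.OrdinalIgnoreCase);
        }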
Example No. 3
        public async Task Makes_Request_To_URIs_Host_Robots_File_If_Nothing_Found_In_Cache()
        {
            var (client, mockMessageHandler) = GetMockableClient();
            var          parser     = new RobotParser(client);
            var          testUri    = new Uri("http://source.com/forbidden-path/resource.html");
            const string USER_AGENT = "USER AGENT";

            mockMessageHandler
            .Protected()
            .Setup<Task<HttpResponseMessage>>(
                "SendAsync",
                ItExpr.IsAny<HttpRequestMessage>(),
                ItExpr.IsAny<CancellationToken>())
            .ReturnsAsync(new HttpResponseMessage()
            {
                StatusCode = HttpStatusCode.OK,
                Content    = new StringContent(@"
                        User-Agent: user agent
                        Disallow: /forbidden-path/
                    ")
            });

            await parser.UriForbidden(testUri, USER_AGENT);

            mockMessageHandler
            .Protected()
            .Verify(
                "SendAsync",
                Times.Once(),
                ItExpr.IsAny<HttpRequestMessage>(),
                ItExpr.IsAny<CancellationToken>());
        }
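The Times.Once() verification only proves that a single request was made for a cold cache. A natural companion assertion, sketched here under the same setup, would call UriForbidden again and check that the cached robots.txt spares a second request:

            // Hypothetical follow-up: a second lookup against the same host
            // should be served from the cache, so SendAsync stays at one call.
            await parser.UriForbidden(testUri, USER_AGENT);

            mockMessageHandler
            .Protected()
            .Verify(
                "SendAsync",
                Times.Once(),
                ItExpr.IsAny<HttpRequestMessage>(),
                ItExpr.IsAny<CancellationToken>());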
Example No. 4
        public async Task Handles_A_Null_User_Agent()
        {
            var (client, mockMessageHandler) = GetMockableClient();
            var parser  = new RobotParser(client);
            var testUri = new Uri("http://source.com/forbidden-path/something");

            mockMessageHandler
            .Protected()
            .Setup<Task<HttpResponseMessage>>(
                "SendAsync",
                ItExpr.IsAny<HttpRequestMessage>(),
                ItExpr.IsAny<CancellationToken>())
            .ReturnsAsync(new HttpResponseMessage()
            {
                StatusCode = HttpStatusCode.OK,
                Content    = new StringContent(@"
                        User-Agent: *
                        Disallow: /forbidden-path/
                    ")
            });

            var result = await parser.UriForbidden(testUri, null);

            Assert.IsTrue(result);
        }
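Since the stubbed file only contains a wildcard group, this test implies that a null user agent still matches "*". One plausible normalization inside UriForbidden, shown only as a sketch:

            // Hypothetical: with no crawler identity to compare against, only
            // the wildcard "*" group can ever apply.
            var effectiveUserAgent = string.IsNullOrEmpty(userAgent) ? "*" : userAgent;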
Example No. 5
        public async Task<CrawlReport> Crawl(CrawlJob job)
        {
            if (job == null)
            {
                throw new ArgumentNullException(nameof(job));
            }

            IsRunning = true;

            try
            {
                Setup(job.SeedUris, job.Cookies);

                // try to parse the robots.txt file of the domain and add any disallowed links to a read-only collection
                _disallowedUrls = await RobotParser.GetDisallowedUrls(_webAgent, job.Domain.Host);

                // quit early, as we are not allowed to crawl this domain at all
                if (_disallowedUrls.Contains("/"))
                {
                    return GetCrawlReport();
                }

                // create the allowed amount of threads for the job
                var threadsAndDoneEvents = CreateThreads(job);

                _startTime  = DateTime.Now;
                _lastUpdate = _startTime;

                // hold all but one thread in a holding pattern until there is work for them;
                // start the first thread off with the job of parsing the domain page provided
                foreach (var thread in threadsAndDoneEvents.Item1)
                {
                    thread.Start();
                }

                // wait for done events
                WaitHandle.WaitAll(threadsAndDoneEvents.Item2.ToArray());

                // flush queues and return the list of data found during the crawl
                foreach (var thread in threadsAndDoneEvents.Item1)
                {
                    if (thread.ThreadState == ThreadState.Running)
                    {
                        thread.Join();
                    }
                }

                return GetCrawlReport();
            }
            catch (Exception e)
            {
                throw new YACException("Exception thrown from YAC", e);
            }
            finally
            {
                IsRunning = false;
            }
        }
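CreateThreads is not shown; the Item1/Item2 access above suggests it returns the worker threads paired with their done events. A plausible shape, assuming a job.ThreadCount setting and a Worker type implementing IWorker with a ManualResetEvent (both are assumptions, named here for illustration):

        private (List<Thread>, List<ManualResetEvent>) CreateThreads(CrawlJob job)
        {
            var threads    = new List<Thread>();
            var doneEvents = new List<ManualResetEvent>();

            for (var i = 0; i < job.ThreadCount; i++)
            {
                // Each worker signals its done event when ThreadAction exits.
                var worker = new Worker(i, new ManualResetEvent(false));
                doneEvents.Add(worker.DoneEvent);
                threads.Add(new Thread(() => ThreadAction(worker, job).GetAwaiter().GetResult()));
            }

            return (threads, doneEvents);
        }

Returning named tuple elements such as (threads, doneEvents) instead of relying on Item1/Item2 would also make the call site in Crawl easier to read.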
Example No. 6
    /// <summary>
    /// Handles putting together a robot GameObject. Overwrites existing prefabs.
    /// </summary>
    /// <param name="robotDescription">Contents of the URDF robot description, from a robot or a file</param>
    /// <returns>GameObject representation of the robot generated from the description</returns>
    public static GameObject GenerateRobotGameObjectFromDescription(string robotDescription)
    {
#if UNITY_EDITOR
        XmlDocument xmlDoc = new XmlDocument();
        RobotParser parser = new RobotParser();

        xmlDoc.Load(XmlReader.Create(new StringReader(robotDescription)));
        Robot robot = parser.Parse(xmlDoc.DocumentElement);

        return GenerateRobotGameObject(robot);
#else
        // An #else branch guarantees a return value on every build target;
        // with two separate #if blocks the method fails to compile when
        // neither UNITY_EDITOR nor UNITY_STANDALONE is defined.
        Debug.LogError("Script can only be run from editor");
        return null;
#endif
    }
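A hedged usage sketch (the asset path is made up for illustration): read a URDF description from disk inside the editor and hand it to the generator.

        // Editor-only, illustrative usage of the method above.
        string robotDescription = File.ReadAllText("Assets/URDF/robot.urdf");
        GameObject robotObject  = GenerateRobotGameObjectFromDescription(robotDescription);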
Example No. 7
        private async Task ThreadAction(IWorker worker, CrawlJob job)
        {
            // sort out the multi-threading holding pattern
            if (worker.Id != 0)
            {
                while (_queue.Count < (worker.Id + 1) && !_cancelSource.Token.IsCancellationRequested && !_aThreadIsComplete)
                {
                    Thread.Sleep(100);
                }
            }

            while (job.CompletionConditions.All(cc => !cc.ConditionMet(GetCrawlProgress())) &&
                   !_cancelSource.Token.IsCancellationRequested &&
                   !_aThreadIsComplete)
            {
                if (worker.Id == 0 && NeedsUpdate())
                {
                    _updateAction(GetCrawlProgress());
                }

                // set up fallback and retry policies
                var fallback = Policy<Uri>.Handle<CrawlQueueEmptyException>()
                               .Fallback((cToken) =>
                {
                    _aThreadIsComplete = true;
                    return null;
                });

                var retry = Policy<Uri>.Handle<CrawlQueueEmptyException>()
                            .WaitAndRetry(10, tryNum => TimeSpan.FromMilliseconds(tryNum * 200));

                // will attempt to get a new item from the queue, retrying as per above policies
                var next = Policy.Wrap(fallback, retry).Execute(() =>
                {
                    var n = GetNext();

                    if (n == null)
                    {
                        throw new CrawlQueueEmptyException();
                    }

                    return n;
                });

                // fallback will set this if we failed to get a new link (this will end the crawl)
                if (_aThreadIsComplete)
                {
                    continue;
                }

                try
                {
                    // access it
                    var responseTask = _webAgent.ExecuteRequest(next);

                    // log that we've crawled it
                    _crawled.Add(next);

                    var response = await responseTask;

                    if (response != null)
                    {
                        var html = HTMLRetriever.GetHTML(_webAgent.GetCompressedStream(response));

                        // parse the contents for new links and data user wants
                        var data = DataExtractor.Extract(html, job.Domain, job.Regex);

                        // add each of the links extracted if:
                        // the queue is not too large
                        // the link is not disallowed by the domain's robots.txt file
                        // the link is not already in the queue
                        // the link has not already been crawled
                        // each of the user defined enqueue conditions returns true
                        foreach (var link in data.Links)
                        {
                            if (_queue.Count < QUEUE_MAX &&
                                RobotParser.UriIsAllowed(_disallowedUrls, link) &&
                                !_queue.Contains(link) &&
                                !_crawled.Contains(link) &&
                                job.EnqueueConditions.All(ec => ec.ConditionMet(link)))
                            {
                                _queue.Enqueue(link);
                            }
                        }

                        // add data matching the regex to the return list
                        foreach (var foundData in data.Data)
                        {
                            _results.Add(foundData);
                        }
                    }
                }
                catch (WebException e)
                {
                    _errors.Add(e);
                }
            }

            if (!_aThreadIsComplete)
            {
                _aThreadIsComplete = true;
            }

            worker.DoneEvent.Set();
        }
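The Polly composition in ThreadAction is order-sensitive: Policy.Wrap nests left to right, so the fallback is outermost and only runs once the inner retry policy has exhausted all ten waits. A stripped-down restatement of just that composition:

        // Retry up to 10 times with a growing delay; only when every retry
        // has failed does the outer fallback fire and return null.
        var retry = Policy<Uri>.Handle<CrawlQueueEmptyException>()
                    .WaitAndRetry(10, tryNum => TimeSpan.FromMilliseconds(tryNum * 200));

        var fallback = Policy<Uri>.Handle<CrawlQueueEmptyException>()
                       .Fallback((cToken) => null);

        var wrapped = Policy.Wrap(fallback, retry); // fallback wraps retry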
Example No. 8
        public override void Run()
        {
            RobotParser parser = new RobotParser();

            HtmlCrawler htmlCrawlerCNN = new HtmlCrawler(new HashSet<string>());

            HtmlCrawler htmlCrawlerNBA = new HtmlCrawler(new HashSet<string>());

            bool loading = false;

            bool crawling = false;

            bool idle = true;

            Trace.TraceInformation("WorkerRole1 is running");

            while (true)
            {
                Thread.Sleep(50);
                string status = "";
                if (idle)
                {
                    status = "Idle";
                }
                else if (crawling)
                {
                    status = "Crawling";
                }
                else if (loading)
                {
                    status = "Loading";
                }

                //add performance with no changes in queue size, index size, or number crawled
                var crawled   = 0;
                var sizeQueue = 0;
                var sizeIndex = 0;
                TableQuery<Performance> query3 = new TableQuery<Performance>().Take(1);

                foreach (Performance item in StorageManager.getPerformanceTable().ExecuteQuery(query3))
                {
                    crawled   = item.NumCrawled;
                    sizeQueue = item.SizeQueue;
                    sizeIndex = item.SizeIndex;
                }

                Performance.insertPerformance(status, crawled, sizeQueue, sizeIndex);


                //Handle Command Queue
                CloudQueueMessage commandMessage = StorageManager.getCommandQueue().GetMessage(TimeSpan.FromMinutes(5));

                //In the case there is no more Urls to crawl, or at the beginning, this command message will be called
                if (commandMessage != null)
                {
                    StorageManager.getCommandQueue().DeleteMessage(commandMessage);

                    //command message is stop
                    if (commandMessage.AsString == "stop")
                    {
                        //clear queue and table
                        StorageManager.deleteAllQueues();
                        StorageManager.deleteTables();
                        //reset parser and crawler
                        parser = new RobotParser("");
                        htmlCrawlerCNN.crawlable = false;
                        htmlCrawlerCNN.Visited   = new HashSet <string>();
                        htmlCrawlerCNN.Disallow  = new HashSet <string>();

                        htmlCrawlerNBA.crawlable = false;
                        htmlCrawlerNBA.Visited   = new HashSet <string>();
                        htmlCrawlerNBA.Disallow  = new HashSet <string>();

                        loading  = false;
                        crawling = false;
                        idle     = true;

                        //add performance, clear queue sizes

                        Performance.insertPerformance("Idle", 0, 0, 0);
                    }

                    //command message is start
                    if (commandMessage.AsString.StartsWith("start:"))
                    {
                        crawling = false;
                        idle     = false;
                        loading  = true;

                        //add performance with no changes in queue size, index size, or number crawled
                        TableQuery<Performance> queryStart = new TableQuery<Performance>().Take(1);

                        foreach (Performance item in StorageManager.getPerformanceTable().ExecuteQuery(queryStart))
                        {
                            crawled   = item.NumCrawled;
                            sizeQueue = item.SizeQueue;
                            sizeIndex = item.SizeIndex;
                        }

                        Performance.insertPerformance("Loading", crawled, sizeQueue, sizeIndex);


                        ServicePointManager.Expect100Continue = true;
                        ServicePointManager.SecurityProtocol  = SecurityProtocolType.Tls12;

                        var robotFile = commandMessage.AsString.Substring(6);

                        string contents;
                        using (var wc = new System.Net.WebClient())
                        {
                            contents = wc.DownloadString(robotFile);
                        }

                        //create and parse through robots.txt
                        parser = new RobotParser(contents);

                        foreach (string filepath in parser.XMLFiles)
                        {
                            //only XMLs from cnn and nba
                            if (filepath.Contains("cnn") || filepath.Contains("nba"))
                            {
                                CloudQueueMessage filepathMessage = new CloudQueueMessage(filepath);
                                StorageManager.getXMLQueue().AddMessage(filepathMessage);
                            }
                        }

                        if (robotFile.Contains("cnn"))
                        {
                            htmlCrawlerCNN = new HtmlCrawler(parser.Disallow);
                        }

                        if (robotFile.Contains("bleacherreport"))
                        {
                            htmlCrawlerNBA = new HtmlCrawler(parser.Disallow);
                        }
                        //set the crawler with the disallows

                        Performance.insertPerformance("Idle", crawled, sizeQueue, sizeIndex);
                    }
                }


                //Handle XML Queue
                CloudQueueMessage XML = StorageManager.getXMLQueue().GetMessage(TimeSpan.FromMinutes(5));
                while (XML != null)
                {
                    if (XML.AsString.Contains("cnn.com"))
                    {
                        htmlCrawlerCNN.readXMLUrl(XML.AsString);
                    }
                    if (XML.AsString.Contains("bleacherreport.com"))
                    {
                        htmlCrawlerNBA.readXMLUrl(XML.AsString);
                    }

                    StorageManager.getXMLQueue().DeleteMessage(XML);
                    XML = StorageManager.getXMLQueue().GetMessage(TimeSpan.FromMinutes(5));
                }

                //Handle HTML Queue
                CloudQueueMessage HTML = StorageManager.getUrlQueue().GetMessage(TimeSpan.FromMinutes(5));
                if (HTML != null)
                {
                    //handle performance
                    if (htmlCrawlerCNN.crawlable || htmlCrawlerNBA.crawlable)
                    {
                        idle     = false;
                        loading  = false;
                        crawling = true;

                        //add performance, reduce queue size

                        TableQuery<Performance> queryCNN = new TableQuery<Performance>().Take(1);

                        foreach (Performance item in StorageManager.getPerformanceTable().ExecuteQuery(queryCNN))
                        {
                            crawled   = item.NumCrawled;
                            sizeQueue = item.SizeQueue - 1;
                            sizeIndex = item.SizeIndex;
                        }

                        Performance.insertPerformance("Crawling", crawled, sizeQueue, sizeIndex);
                    }
                    //handles if it is a cnn article
                    if (htmlCrawlerCNN.crawlable && HTML.AsString.Contains("cnn.com"))
                    {
                        htmlCrawlerCNN.parseHTML(HTML.AsString);
                        StorageManager.getUrlQueue().DeleteMessage(HTML);
                    }
                    //handles if it is a bleacher report article
                    else if (htmlCrawlerNBA.crawlable && HTML.AsString.Contains("bleacherreport.com"))
                    {
                        htmlCrawlerNBA.parseHTML(HTML.AsString);
                        StorageManager.getUrlQueue().DeleteMessage(HTML);
                    }
                }
            }
        }
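The read-the-latest-row-then-insert Performance pattern repeats three times in Run; a hypothetical helper that factors it out (recordPerformance and its queueDelta parameter are invented names, following the example's casing conventions; requires System.Linq for FirstOrDefault):

        private static void recordPerformance(string status, int queueDelta = 0)
        {
            // Grab the most recent snapshot, then write a new row with the
            // updated status and an optionally adjusted queue size.
            var latest = StorageManager.getPerformanceTable()
                         .ExecuteQuery(new TableQuery<Performance>().Take(1))
                         .FirstOrDefault();

            Performance.insertPerformance(
                status,
                latest?.NumCrawled ?? 0,
                (latest?.SizeQueue ?? 0) + queueDelta,
                latest?.SizeIndex ?? 0);
        }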