Пример #1
0
        /// <summary>
        /// Main worker loop. Each iteration first checks the command queue for a state
        /// change (load / idle / crawl) and then, unless the worker is idle, takes one
        /// message from the URL queue and hands it to the worker state machine.
        /// </summary>
        /// <param name="cancellationToken">
        /// Checked once per iteration; the loop ends when cancellation has been requested.
        /// </param>
        private async Task RunAsync(CancellationToken cancellationToken)
        {
            CloudStorageAccount storageAccount = CloudStorageAccount.Parse(
                ConfigurationManager.AppSettings["StorageConnectionString"]
                );
            CloudQueueClient queueClient = storageAccount.CreateCloudQueueClient();

            CloudQueue commandQueue = queueClient.GetQueueReference(CommandMessage.QUEUE_COMMAND);
            commandQueue.CreateIfNotExists();

            // Make sure the URL queue exists as well; previously the command queue was
            // created twice and the URL queue was never created.
            CloudQueue urlQueue = queueClient.GetQueueReference(UrlMessage.QUEUE_URL);
            urlQueue.CreateIfNotExists();

            while (!cancellationToken.IsCancellationRequested)
            {
                // Commands are fetched with a five-minute visibility timeout and always
                // deleted afterwards, including commands that match none of the known values.
                CloudQueueMessage commandMessage = commandQueue.GetMessage(TimeSpan.FromMinutes(5));
                if (commandMessage != null)
                {
                    string command = commandMessage.AsString;

                    if (command == CommandMessage.COMMAND_LOAD)
                    {
                        workerStateMachine.setState(WorkerStateMachine.STATE_LOADING);
                        webLoader = new WebLoader();
                    }
                    else if (command == CommandMessage.COMMAND_IDLE)
                    {
                        workerStateMachine.setState(WorkerStateMachine.STATE_IDLE);
                    }
                    else if (command == CommandMessage.COMMAND_CRAWL)
                    {
                        workerStateMachine.setState(WorkerStateMachine.STATE_CRAWLING);
                        webCrawler = new WebCrawler(statsManager);
                    }

                    commandQueue.DeleteMessage(commandMessage);
                }

                if (workerStateMachine.getState() != WorkerStateMachine.STATE_IDLE) // in a loading or crawling state
                {
                    CloudQueueMessage urlMessage = urlQueue.GetMessage();
                    if (urlMessage != null) // got url from queue of sitemap or urlset
                    {
                        // Load or crawl with the parsed message depending on the current state.
                        // The state machine decides whether the queue message may be removed.
                        UrlMessage urlEntity     = UrlMessage.Parse(urlMessage.AsString);
                        bool       deleteMessage = workerStateMachine.Act(urlEntity);
                        if (deleteMessage)
                        {
                            urlQueue.DeleteMessage(urlMessage);
                        }
                    }
                    else
                    {
                        workerStateMachine.Act(null); // need to call Act(null) to finish crawling one day
                    }
                }

                // The delay is intentionally not tied to the token, so cancellation ends
                // the loop through the condition above rather than through an exception.
                await Task.Delay(100);
            }
        }
Пример #2
0
        /// <summary>
        /// Worker entry point. Processes messages from the load queue, then from the
        /// crawl queue, until a message shows up on the stop queue; after that the
        /// worker stays in the "Idle" state.
        /// </summary>
        public override void Run()
        {
            CloudQueue    loadQueue           = CloudConfiguration.GetLoadingQueue();
            CloudQueue    crawlQueue          = CloudConfiguration.GetCrawlingQueue();
            CloudQueue    stopQueue           = CloudConfiguration.GetStopQueue();
            // Not used below; the call is kept in case GetTable() has side effects
            // (NOTE(review): confirm whether it can be removed).
            CloudTable    table               = CloudConfiguration.GetTable();
            List <string> cnnRules            = ProcessRobots("http://www.cnn.com/robots.txt");
            List <string> bleacherReportRules = ProcessRobots("http://www.bleacherreport.com/robots.txt");
            WebCrawler    crawler             = new WebCrawler(cnnRules, bleacherReportRules);

            State = "Idle";
            Thread.Sleep(10000);

            CloudQueueMessage stopMessage = stopQueue.GetMessage();

            CPUCount = new PerformanceCounter("Processor", "% Processor Time", "_Total");
            MemCount = new PerformanceCounter("Memory", "Available MBytes");

            while (true)
            {
                while (stopMessage == null)
                {
                    // Load messages take priority over crawl messages.
                    CloudQueueMessage loadMessage = loadQueue.GetMessage();

                    if (loadMessage != null)
                    {
                        State = "Loading";
                        string message = loadMessage.AsString;
                        crawler.ProcessURL(message);
                        loadQueue.DeleteMessage(loadMessage);
                    }
                    else if (State.Equals("Loading") || State.Equals("Crawling"))
                    {
                        // Crawling only starts once at least one load message has been handled.
                        CloudQueueMessage crawlMessage = crawlQueue.GetMessage();
                        if (crawlMessage != null)
                        {
                            State = "Crawling";
                            crawler.ProcessURL(crawlMessage.AsString);
                            crawlQueue.DeleteMessage(crawlMessage);
                        }
                    }
                    stopMessage = stopQueue.GetMessage();
                }

                // A stop message was received: stay idle. Sleep between passes so the
                // stopped worker does not spin a CPU core in an empty loop.
                State = "Idle";
                Thread.Sleep(1000);
            }
        }
Пример #3
0
        /// <summary>
        /// Role start-up hook: raises the connection limit, runs the base start-up logic,
        /// derives a numeric instance index from the role instance id and creates the
        /// crawler and the worker state machine.
        /// </summary>
        /// <returns>The value returned by <c>base.OnStart()</c>.</returns>
        public override bool OnStart()
        {
            // Set the maximum number of concurrent connections
            ServicePointManager.DefaultConnectionLimit = 12;

            // For information on handling configuration changes
            // see the MSDN topic at https://go.microsoft.com/fwlink/?LinkId=166357.

            bool result = base.OnStart();

            Trace.TraceInformation("WorkerRole1 has been started");
            int    instanceID;
            string instanceId = RoleEnvironment.CurrentRoleInstance.Id;

            // Try the text after the last '.' first; only when that is not a number fall
            // back to the text after the last '_'. The fallback must not run after a
            // successful parse, because a failed TryParse resets instanceID to 0.
            // If neither form is numeric, instanceID ends up as 0.
            if (!int.TryParse(instanceId.Substring(instanceId.LastIndexOf(".") + 1), out instanceID))
            {
                int.TryParse(instanceId.Substring(instanceId.LastIndexOf("_") + 1), out instanceID);
            }

            webCrawler         = new WebCrawler(statsManager);
            workerStateMachine = new WorkerStateMachine(instanceID.ToString());
            return(result);
        }
Пример #4
0
        /// <summary>
        /// Worker entry point. Initializes storage, queues, tables and performance
        /// counters, then loops: load messages are processed first, crawl messages
        /// otherwise, and the dashboard is updated after every pass. A message on the
        /// stop queue switches the state to "Idle".
        /// </summary>
        public override void Run()
        {
            Storage = new AzureStorage();

            LoadQueue        = CloudConfiguration.GetLoadingQueue();
            CrawlQueue       = CloudConfiguration.GetCrawlingQueue();
            StopQueue        = CloudConfiguration.GetStopQueue();
            SiteDataTable    = CloudConfiguration.GetSiteDataTable();
            AdminStatusTable = CloudConfiguration.GetAdminStatusTable();
            StateQueue       = CloudConfiguration.GetStateQueue();

            State = "Idle";

            CPUCount = new PerformanceCounter("Processor", "% Processor Time", "_Total");
            MemCount = new PerformanceCounter("Memory", "Available MBytes");

            Status = new AdminStatus(State, (int)CPUCount.NextValue(), (int)MemCount.NextValue());

            string[] robots = { "http://www.cnn.com/robots.txt", "http://www.bleacherreport.com/robots.txt" };
            Crawler = new WebCrawler(robots, Storage);

            Thread.Sleep(10000);

            string url = "";

            while (true)
            {
                // NOTE(review): the stop message is never deleted here, so how long the
                // worker stays idle depends on the queue's visibility timeout — confirm intent.
                CloudQueueMessage stopMessage = StopQueue.GetMessage();

                while (stopMessage == null)
                {
                    // Get the next message
                    CloudQueueMessage loadMessage = LoadQueue.GetMessage();

                    // NOTE(review): the state is forced to "Loading" on every pass, so the
                    // crawl branch below is always eligible when no load message is
                    // available — confirm this is intended.
                    State = "Loading";
                    if (loadMessage != null)
                    {
                        url = loadMessage.AsString;
                        if (url.Contains("robots.txt"))
                        {
                            // A robots message may carry several whitespace-separated links.
                            // Empty entries (repeated or trailing whitespace) are skipped so
                            // the crawler is never handed an empty URL.
                            string[] robotLinks = url.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
                            foreach (string link in robotLinks)
                            {
                                Crawler.ProcessURL(link);
                            }
                        }
                        else
                        {
                            Crawler.ProcessURL(url);
                        }

                        // Remove the message in both cases; otherwise a non-robots message
                        // reappears after its visibility timeout and is processed again.
                        LoadQueue.DeleteMessage(loadMessage);
                    }
                    else if (State.Equals("Loading") || State.Equals("Crawling"))
                    {
                        // No load work pending: take one crawl message instead.
                        CloudQueueMessage crawlMessage = CrawlQueue.GetMessage();
                        if (crawlMessage != null)
                        {
                            State = "Crawling";
                            url   = crawlMessage.AsString;
                            Crawler.ProcessURL(url);
                            CrawlQueue.DeleteMessage(crawlMessage);
                        }
                    }
                    stopMessage = StopQueue.GetMessage();
                    UpdateDashboard(url);
                }
                State = "Idle";
            }
        }