Exemplo n.º 1
0
        /// <summary>
        /// Worker entry point. Builds the crawler from the queue and tables named in the
        /// application settings, marks the worker as stopped, and then loops forever:
        /// pull one command (if any), execute it, remove it from the command queue,
        /// and publish the current state to the worker monitor.
        /// </summary>
        public override void Run()
        {
            Trace.TraceInformation("WorkerRole::Run(): started");

            // Resolve the storage references in the same order the crawler constructor takes them.
            var urlQueueRef = Azure.GetInstance().getQueueReference(ConfigurationManager.AppSettings["urlQ"]);
            var pageInfoTable = Azure.GetInstance().getTableReference(ConfigurationManager.AppSettings["pageInfo"]);
            var errorInfoTable = Azure.GetInstance().getTableReference(ConfigurationManager.AppSettings["errorInfo"]);
            webCrawler = new Crawler(urlQueueRef, pageInfoTable, errorInfoTable);

            updateState(WorkerInfo.State.Stopped);

            for (;;)
            {
                CMsg pending = CommandQueue.GetInstance().getMessage();
                if (pending != null)
                {
                    executeCommand(pending);

                    // The command is removed only after it has been handled.
                    CommandQueue.GetInstance().removeMessage(pending);
                }

                // While the crawler reports it is not stopped, the worker is reported as crawling.
                if (!webCrawler.stopped())
                {
                    updateState(WorkerInfo.State.Crawling);
                }

                WorkerMonitor.GetInstance().updateProfile(currentState);
            }
        }

        /// <summary>
        /// Applies a single command message to the crawler and updates the worker state
        /// around each step. Commands other than the four listed are ignored.
        /// </summary>
        /// <param name="msg">The command message taken from the command queue.</param>
        private void executeCommand(CMsg msg)
        {
            switch (msg.command)
            {
                case Command.AddDomain:
                    webCrawler.addAuthority((new Urls(msg.str).getAuthority()));
                    break;

                case Command.Start:
                    updateState(WorkerInfo.State.Loading, true);
                    webCrawler.start();
                    break;

                case Command.Stop:
                    updateState(WorkerInfo.State.Stopping, true);
                    webCrawler.clearAuthorities();
                    webCrawler.stop(true);
                    updateState(WorkerInfo.State.Stopped, true);
                    break;

                case Command.Reset:
                    // Stop first, then wipe the stored data.
                    updateState(WorkerInfo.State.Stopping, true);
                    webCrawler.stop(true);
                    updateState(WorkerInfo.State.Resetting, true);
                    webCrawler.resetDatabase();
                    updateState(WorkerInfo.State.Reset, true);
                    break;
            }
        }
        /// <summary>
        /// Worker entry point. Creates the "datatable" table and the "urlqueue" /
        /// "adminqueue" queues if they do not exist, sets up the memory and CPU
        /// performance counters, and then polls both queues every 50 ms.
        /// </summary>
        /// <remarks>
        /// Admin messages are deleted here after <c>handleAdminMessage</c> returns.
        /// URL messages are handed to <c>crawlingPhase</c> and are not deleted in this method.
        /// NOTE(review): presumably <c>crawlingPhase</c> deletes them itself — confirm.
        /// </remarks>
        public override void Run()
        {
            CloudStorageAccount storageAccount = CloudStorageAccount.Parse(ConfigurationManager.AppSettings["StorageConnectionString"]);
            CloudTableClient tableClient = storageAccount.CreateCloudTableClient();
            datatable = tableClient.GetTableReference("datatable");
            datatable.CreateIfNotExists();

            CloudQueueClient queueClient = storageAccount.CreateCloudQueueClient();
            urlQueue = queueClient.GetQueueReference("urlqueue");
            urlQueue.CreateIfNotExists();
            adminQueue = queueClient.GetQueueReference("adminqueue");
            adminQueue.CreateIfNotExists();

            ramCounter = new System.Diagnostics.PerformanceCounter("Memory", "Available MBytes");
            cpuCounter = new System.Diagnostics.PerformanceCounter();
            cpuCounter.CategoryName = "Processor";
            cpuCounter.CounterName = "% Processor Time";
            cpuCounter.InstanceName = "_Total";

            crawler = new Crawler();

            while (true)
            {
                Thread.Sleep(50);

                // Dequeue directly instead of Peek-then-Get: the message seen by a peek can be
                // taken by another consumer before the get, which would hand null to the
                // handler and to DeleteMessage. A single GetMessage also halves the round trips.
                CloudQueueMessage adminMessage = adminQueue.GetMessage();
                if (adminMessage != null)
                {
                    crawler.handleAdminMessage(adminMessage);
                    adminQueue.DeleteMessage(adminMessage);
                }

                CloudQueueMessage urlMessage = urlQueue.GetMessage();
                if (urlMessage != null)
                {
                    crawler.crawlingPhase(urlMessage);
                }
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Worker entry point. Runs the start-up routine, then loops forever with a 50 ms
        /// pause: reads one start/stop message (if any) to update <c>state</c>, clears the
        /// URL queue while the state is "clearing", and crawls one queued URL per iteration
        /// while the state is "started".
        /// </summary>
        public override void Run()
        {
            //put visited urls in a hash set
            startingCode();

            while (true)
            {
                Trace.TraceInformation("Working");

                //Sleep 50ms
                Thread.Sleep(50);

                //Check and handle admin messages
                CloudQueueMessage startStopMessage = startStopQueue.GetMessage();
                if (startStopMessage != null)
                {
                    state = startStopMessage.AsString;
                    Update("info", "state", state);
                    startStopQueue.DeleteMessage(startStopMessage);
                    if (string.Equals(state, "started"))
                    {
                        crawl = new Crawler();
                        crawl.StartLoader();
                    }
                }

                // string.Equals(a, b) is null-safe, so an unset state simply matches neither branch
                // instead of throwing and taking the worker down.
                if (string.Equals(state, "clearing"))
                {
                    //clear queue
                    queue.Clear();
                    startingCode();
                }
                else if (string.Equals(state, "started")) //keepGoing
                {
                    //get message from url queue
                    CloudQueueMessage message = queue.GetMessage();
                    if (message != null)
                    {
                        urlsCrawled++;
                        Update("info", "total", urlsCrawled.ToString());
                        queue.FetchAttributes();
                        queueSize = (int)queue.ApproximateMessageCount;
                        Update("info", "queue", queueSize.ToString());

                        string url = message.AsString;
                        if (!acceptedURLs.Contains(url))
                        {
                            crawlAndRecord(url);
                        }

                        // The message is removed whether or not the crawl succeeded.
                        queue.DeleteMessage(message);
                    }
                }
            }
        }

        /// <summary>
        /// Crawls one URL, stores every returned entity in the table, and updates the
        /// "info" counters. Any exception is recorded in the error table rather than
        /// propagated.
        /// </summary>
        /// <param name="url">The URL taken from the queue message.</param>
        private void crawlAndRecord(string url)
        {
            try
            {
                List<WebCrawlerEntity> entities = crawl.startCrawler(url); //Store dates
                if (entities == null)
                {
                    return;
                }

                numTitles++;
                Update("info", "numTitles", numTitles.ToString());

                // Space-prefixed partition keys of all stored entities, reported as "lastTitle".
                var keys = new System.Text.StringBuilder();
                foreach (WebCrawlerEntity w in entities)
                {
                    keys.Append(' ').Append(w.PartitionKey);
                    urlsAccepted++;
                    w.num = urlsAccepted;
                    Update("info", "accepted", urlsAccepted.ToString());

                    // Synchronous on purpose: a discarded ExecuteAsync task hides insert
                    // failures from the catch below, so they would never reach the error table.
                    table.Execute(TableOperation.InsertOrReplace(w));
                }

                // Mark the URL as accepted once, and only when at least one entity was stored.
                if (entities.Count > 0)
                {
                    acceptedURLs.Add(url);
                }

                TextInfo myTI = new CultureInfo("en-US", false).TextInfo;
                Update("info", "lastTitle", myTI.ToTitleCase(keys.ToString()));
            }
            catch (Exception e)
            { //put errors in error table
                numErrors++;
                infoEntity newError = new infoEntity(numErrors.ToString(), "url: " + url + " Error: " + e.Message);
                TableOperation insertErrorOperation = TableOperation.InsertOrReplace(newError);
                errorTable.Execute(insertErrorOperation);
            }
        }
Exemplo n.º 4
0
        /// <summary>
        /// Worker entry point. Binds the "htmlURLs" table and the "urls", "status" and
        /// "visited" queues, then loops forever with a 10 ms pause. When the status queue
        /// shows "Start" and the robots files have not been parsed yet, it seeds the URL
        /// queue from the crawler's robots results and posts "Crawling". Afterwards, while
        /// the status is anything other than "Stop", it crawls one queued URL per iteration
        /// and writes the page entry and the running counters to the table.
        /// </summary>
        public override void Run()
        {
            Trace.TraceInformation("WorkerRole1 is running");
            CloudStorageAccount storageAccount = CloudStorageAccount.Parse(ConfigurationManager.AppSettings["StorageConnectionString"]);
            CloudTableClient    tableClient    = storageAccount.CreateCloudTableClient();
            CloudQueueClient    queueClient    = storageAccount.CreateCloudQueueClient();

            table       = tableClient.GetTableReference("htmlURLs");
            HTMLs       = queueClient.GetQueueReference("urls");
            messages    = queueClient.GetQueueReference("status");
            visitedURLs = queueClient.GetQueueReference("visited");
            bool robotsparsed        = false;
            CloudQueueMessage status = null;

            lastTen = new List <String>();
            while (true)
            {
                Thread.Sleep(10);

                // When the status queue does not exist, the previously peeked status is kept.
                if (messages.Exists())
                {
                    status = messages.PeekMessage();
                }

                if (status != null && status.AsString.Equals("Start") && HTMLs != null && !robotsparsed)
                {
                    spider = new Crawler();
                    List <string> urls  = spider.crawlRobots();
                    List <string> urls2 = spider.cnnRobotsCrawl();
                    index       = 0;
                    urlsCrawled = 0;
                    foreach (string url in urls)
                    {
                        HTMLs.AddMessage(new CloudQueueMessage(url));
                    }
                    foreach (string url in urls2)
                    {
                        HTMLs.AddMessage(new CloudQueueMessage(url));
                    }
                    robotsparsed = true;

                    // The peeked message may already be gone or hidden by the time it is dequeued;
                    // GetMessage then returns null and DeleteMessage must not be called with it.
                    CloudQueueMessage startMessage = messages.GetMessage(TimeSpan.FromMinutes(5));
                    if (startMessage != null)
                    {
                        messages.DeleteMessage(startMessage);
                    }
                    messages.AddMessage(new CloudQueueMessage("Crawling"));
                }
                else if (status != null && !status.AsString.Equals("Stop") && HTMLs != null && robotsparsed)
                {
                    CloudQueueMessage message = HTMLs.GetMessage(TimeSpan.FromMinutes(5));
                    if (message != null)
                    {
                        // The URL is removed from the queue before it is processed.
                        HTMLs.DeleteMessage(message);
                        string messageString = message.AsString;
                        if (spider.isAllowed(messageString))
                        {
                            List <string> newLinks = spider.crawlLink(messageString);
                            if (newLinks == null || newLinks.Count == 0)
                            {
                                // Nothing to index; indexing newLinks[0] here would throw and stop the worker.
                                Trace.TraceWarning("crawlLink returned no data for " + messageString);
                            }
                            else
                            {
                                // First element is either the literal "Error" or the value stored with the page.
                                string partition = newLinks[0] == "Error" ? "Error" : "Partition";
                                URL    entry     = new URL(messageString, newLinks[0], partition);

                                // Keep only the ten most recent URLs, oldest first.
                                if (lastTen.Count >= 10)
                                {
                                    lastTen.RemoveAt(0);
                                }
                                lastTen.Add(messageString);

                                if (spider.visited().Contains(messageString))
                                {
                                    index++;
                                }
                                urlsCrawled++;
                                TableOperation insertOperation = TableOperation.InsertOrReplace(entry);
                                table.ExecuteAsync(insertOperation);

                                // NOTE(review): this enqueues elements 2 .. Count-2 only, skipping
                                // index 1 and the last element. Kept as in the original because the
                                // layout of the list returned by crawlLink is not visible here — confirm.
                                for (int i = 2; i < newLinks.Count - 1; i++)
                                {
                                    HTMLs.AddMessageAsync(new CloudQueueMessage(newLinks[i]));
                                }
                            }
                        }

                        // Publish the running counters after every dequeued URL, allowed or not.
                        URL            tableEntry     = new URL("Index", index.ToString(), "IndexCount");
                        TableOperation indexOperation = TableOperation.InsertOrReplace(tableEntry);
                        table.Execute(indexOperation);
                        URL            tableEntry2       = new URL("urls", urlsCrawled.ToString(), "URLs");
                        TableOperation urlCountOperation = TableOperation.InsertOrReplace(tableEntry2);
                        table.Execute(urlCountOperation);
                    }
                }
            }
        }