Exemplo n.º 1
0
        /// <summary>
        /// Worker-role main loop. Binds the storage table and queues, then polls the
        /// "status" queue forever: a "Start" message triggers a one-time seeding of the
        /// "urls" queue, after which URLs are dequeued and crawled one per iteration
        /// until the peeked status message reads "Stop".
        /// </summary>
        /// <remarks>
        /// This method never returns normally; it only exits if an exception escapes
        /// one of the storage or crawler calls.
        /// </remarks>
        public override void Run()
        {
            Trace.TraceInformation("WorkerRole1 is running");
            CloudStorageAccount storageAccount = CloudStorageAccount.Parse(ConfigurationManager.AppSettings["StorageConnectionString"]);
            CloudTableClient    tableClient    = storageAccount.CreateCloudTableClient();
            CloudQueueClient    queueClient    = storageAccount.CreateCloudQueueClient();

            table       = tableClient.GetTableReference("htmlURLs");
            HTMLs       = queueClient.GetQueueReference("urls");
            messages    = queueClient.GetQueueReference("status");
            visitedURLs = queueClient.GetQueueReference("visited");
            lastTen     = new List<String>();

            bool              robotsparsed = false;
            CloudQueueMessage status       = null;

            while (true)
            {
                Thread.Sleep(10);

                // When the status queue does not exist the previously peeked message
                // is kept, so the worker continues in its last known state.
                if (messages.Exists())
                {
                    status = messages.PeekMessage();
                }

                if (status == null || HTMLs == null)
                {
                    continue;
                }

                if (!robotsparsed)
                {
                    // Nothing is crawled until a "Start" command has seeded the queue.
                    if (status.AsString.Equals("Start"))
                    {
                        SeedUrlQueueFromRobots();
                        robotsparsed = true;
                    }
                }
                else if (!status.AsString.Equals("Stop"))
                {
                    ProcessNextQueuedUrl();
                }
            }
        }

        /// <summary>
        /// Creates a fresh crawler, resets the counters, enqueues every URL returned by
        /// the two robots crawls, and replaces the head status message with "Crawling".
        /// </summary>
        private void SeedUrlQueueFromRobots()
        {
            spider = new Crawler();
            List<string> urls  = spider.crawlRobots();
            List<string> urls2 = spider.cnnRobotsCrawl();
            index       = 0;
            urlsCrawled = 0;

            foreach (string url in urls)
            {
                HTMLs.AddMessage(new CloudQueueMessage(url));
            }
            foreach (string url in urls2)
            {
                HTMLs.AddMessage(new CloudQueueMessage(url));
            }

            // GetMessage returns null when no message is retrievable (e.g. it was
            // dequeued elsewhere after our peek); DeleteMessage rejects a null argument,
            // so only delete when a message was actually obtained.
            CloudQueueMessage consumed = messages.GetMessage(TimeSpan.FromMinutes(5));
            if (consumed != null)
            {
                messages.DeleteMessage(consumed);
            }
            messages.AddMessage(new CloudQueueMessage("Crawling"));
        }

        /// <summary>
        /// Dequeues one URL (if any), crawls it when the crawler allows it, and then
        /// writes the current counters to the table.
        /// </summary>
        private void ProcessNextQueuedUrl()
        {
            CloudQueueMessage message = HTMLs.GetMessage(TimeSpan.FromMinutes(5));
            if (message == null)
            {
                return;
            }

            // NOTE(review): the message is deleted before it is processed, so a failure
            // below drops this URL instead of letting it reappear on the queue.
            // Confirm this is intended.
            HTMLs.DeleteMessage(message);

            string messageString = message.AsString;
            if (spider.isAllowed(messageString))
            {
                CrawlAndRecord(messageString);
            }
            PersistCrawlCounters();
        }

        /// <summary>
        /// Crawls <paramref name="messageString"/>, stores a table row for it, updates
        /// the recent-URL list and counters, and enqueues links from the crawl result.
        /// </summary>
        /// <param name="messageString">The URL text taken from the queue message.</param>
        private void CrawlAndRecord(string messageString)
        {
            List<string> newLinks = spider.crawlLink(messageString);
            if (newLinks == null || newLinks.Count == 0)
            {
                // Without a first element there is nothing to store; skip this URL
                // rather than letting an out-of-range index take down the whole loop.
                Trace.TraceWarning("crawlLink returned no data for {0}", messageString);
                return;
            }

            // The first element doubles as an error marker: the literal "Error" routes
            // the row to the "Error" partition instead of "Partition".
            string partition = newLinks[0] == "Error" ? "Error" : "Partition";
            URL    entry     = new URL(messageString, newLinks[0], partition);

            // lastTen is capped at ten entries; evict the oldest before appending.
            if (lastTen.Count >= 10)
            {
                lastTen.RemoveAt(0);
            }
            lastTen.Add(messageString);

            if (spider.visited().Contains(messageString))
            {
                index++;
            }
            urlsCrawled++;

            // NOTE(review): fire-and-forget; the returned Task is not observed, so a
            // failed insert goes unnoticed here.
            table.ExecuteAsync(TableOperation.InsertOrReplace(entry));

            // NOTE(review): enqueues elements 2 .. Count-2 only, i.e. it skips the
            // first two and the last element. Presumably those are metadata rather
            // than links; verify against Crawler.crawlLink before changing the bounds.
            for (int i = 2; i < newLinks.Count - 1; i++)
            {
                HTMLs.AddMessageAsync(new CloudQueueMessage(newLinks[i]));
            }
        }

        /// <summary>
        /// Upserts the two counter rows ("Index" and "urls") with the current values
        /// of <c>index</c> and <c>urlsCrawled</c>.
        /// </summary>
        private void PersistCrawlCounters()
        {
            URL indexRow = new URL("Index", index.ToString(), "IndexCount");
            table.Execute(TableOperation.InsertOrReplace(indexRow));

            URL urlCountRow = new URL("urls", urlsCrawled.ToString(), "URLs");
            table.Execute(TableOperation.InsertOrReplace(urlCountRow));
        }