Example #1
        private void OnCrawlerStateCrawling()
        {
            WebPageEntity webpageEntity = new WebPageEntity();
            List<string>  qualifiedUrls = new List<string>();
            XmlDocument   doc           = new XmlDocument();

            // Retrieve storage account from connection string
            CloudStorageAccount storageAccount = CloudStorageAccount.Parse(
                CloudConfigurationManager.GetSetting("StorageConnectionString"));
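            // (GetSetting reads from the role's ServiceConfiguration when running in
            // Azure, falling back to app.config/web.config otherwise)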

            // Create the queue and table clients
            CloudQueueClient queueClient = storageAccount.CreateCloudQueueClient();
            CloudTableClient tableClient = storageAccount.CreateCloudTableClient();

            // Retrieve a reference to CrawlingQueue and CrawlingTable
            CloudQueue crawlingQueue = queueClient.GetQueueReference(WebRole.AZURE_CRAWLING_QUEUE);
            CloudTable crawlingTable = tableClient.GetTableReference(WebRole.AZURE_CRAWLING_TABLE);

            // Create the queue and table if they don't already exist
            crawlingQueue.CreateIfNotExists();
            crawlingTable.CreateIfNotExists();

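            // GetMessage dequeues one message and hides it from other consumers for the
            // default visibility timeout (30 seconds); the DeleteMessage call at the end
            // of this method removes it permanently once processing succeeds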
            CloudQueueMessage message = crawlingQueue.GetMessage();

            if (message == null)
            {
                // No more webpages to crawl; switch the crawler to idle
                statsEntity.CrawlerState = CrawlerStates.Idle;
                return;
            }

            // Store URL to webpage entity
            webpageEntity.URL = message.AsString;

            // Load the URL as an XML document (XmlDocument.Load accepts a URL;
            // pages that are not well-formed XML will throw)
            try
            {
                doc.Load(message.AsString);
            }
            catch (Exception)
            {
                // Page could not be retrieved or parsed: remove it from the queue and skip
                crawlingQueue.DeleteMessage(message);
                return;
            }

            // Store Title to webpage entity
            webpageEntity.Title = GetPageTitle(doc);

            // Put the webpage entity into the crawling table
            // (InsertOrReplace requires the entity's PartitionKey and RowKey to be set)
            crawlingTable.Execute(TableOperation.InsertOrReplace(webpageEntity));

            // Add this URL to the crawled hashset
            crawledUrls.Add(webpageEntity.URL);

            // Add the qualified links in this URL to the CrawlingQueue
            qualifiedUrls = GetPageLinks(doc);
            foreach (string url in qualifiedUrls)
            {
                crawlingQueue.AddMessage(new CloudQueueMessage(url));
            }

            // Update stats
            statsEntity.NumUrlsCrawled++;
            statsEntity.Last10UrlsCrawled.Enqueue(webpageEntity.URL);
            while (statsEntity.Last10UrlsCrawled.Count > 10)
            {
                statsEntity.Last10UrlsCrawled.Dequeue();
            }
            // Net queue change: links just enqueued minus the message being consumed
            statsEntity.SizeOfQueue        += (uint)qualifiedUrls.Count - 1;
            statsEntity.SizeOfIndexInMByte += webpageEntity.GetEntitySizeMByte();

            // Persist the updated stats entity to the table
            crawlingTable.Execute(TableOperation.InsertOrReplace(statsEntity));

            // The message has been fully processed; delete it so it is not dequeued again
            crawlingQueue.DeleteMessage(message);
        }
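
For context, the method also depends on members that this sample does not show: the crawledUrls hash set, the statsEntity field, the WebPageEntity class, and the GetPageTitle/GetPageLinks helpers. Below is a minimal, illustrative sketch of the two helpers and the entity class, assuming pages parse as well-formed XHTML and that table entities derive from TableEntity; the partition/row key scheme and the filtering against crawledUrls are assumptions, not part of the original code:

        // Hypothetical helper: return the text of the first <title> element, if any
        private string GetPageTitle(XmlDocument doc)
        {
            XmlNodeList titles = doc.GetElementsByTagName("title");
            return titles.Count > 0 ? titles[0].InnerText : string.Empty;
        }

        // Hypothetical helper: collect absolute http(s) links not yet crawled
        private List<string> GetPageLinks(XmlDocument doc)
        {
            List<string> links = new List<string>();
            foreach (XmlNode anchor in doc.GetElementsByTagName("a"))
            {
                XmlAttribute href = anchor.Attributes == null ? null : anchor.Attributes["href"];
                if (href != null && href.Value.StartsWith("http") && !crawledUrls.Contains(href.Value))
                {
                    links.Add(href.Value);
                }
            }
            return links;
        }

        // Hypothetical entity: InsertOrReplace fails unless PartitionKey and RowKey are
        // set, so this sketch derives both from the URL (Azure Table Storage forbids
        // / \ # ? in key values, hence the escaping)
        public class WebPageEntity : TableEntity
        {
            private string url;

            public string URL
            {
                get { return url; }
                set
                {
                    url          = value;
                    PartitionKey = "webpage";
                    RowKey       = Uri.EscapeDataString(value);
                }
            }

            public string Title { get; set; }
        }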