Esempio n. 1
0
        //public IQueryable<WebPageEntity> WebPageEntities()
        //{
        //    using (EntityContext _context = new EntityContext())
        //    {
        //         _context.WebPageEntities;
        //    }
        //}

        /// <summary>
        /// Builds a <see cref="WebPageEntity"/> from the supplied view model and saves it
        /// through a freshly created <see cref="EntityContext"/>.
        /// </summary>
        /// <param name="page">View model whose Title, HtmlSource and Image are copied onto the new entity.</param>
        /// <returns>A task that completes once the context has finished saving.</returns>
        public async Task Create(PageViewModel page)
        {
            // Copy the view-model values before any context is opened.
            var newPage = new WebPageEntity();
            newPage.Title      = page.Title;
            newPage.HtmlSource = page.HtmlSource;
            newPage.Image      = page.Image;

            // The context lives only for this call; it is disposed whether the save succeeds or throws.
            using (var context = new EntityContext())
            {
                context.WebPageEntities.Add(newPage);
                await context.SaveChangesAsync();
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Processes a single URL taken from the crawling queue: loads it as an XML document,
        /// stores a <see cref="WebPageEntity"/> (URL and title) in the crawling table, enqueues
        /// the links returned by <c>GetPageLinks</c>, updates <c>statsEntity</c> and writes it
        /// to the same table, then deletes the processed queue message.
        /// </summary>
        /// <remarks>
        /// When the queue yields no message, the crawler state is set to
        /// <c>CrawlerStates.Idle</c> and the method returns without writing stats to the table.
        /// When the URL cannot be loaded, the message is deleted and the method returns
        /// without touching the table or the stats.
        /// </remarks>
        private void OnCrawlerStateCrawling()
        {
            WebPageEntity webpageEntity = new WebPageEntity();
            XmlDocument   doc           = new XmlDocument();

            // Retrieve storage account from connection string
            CloudStorageAccount storageAccount = CloudStorageAccount.Parse(
                CloudConfigurationManager.GetSetting("StorageConnectionString"));

            // Create the queue and table client
            CloudQueueClient queueClient = storageAccount.CreateCloudQueueClient();
            CloudTableClient tableClient = storageAccount.CreateCloudTableClient();

            // Retrieve a reference to CrawlingQueue and CrawlingTable
            CloudQueue crawlingQueue = queueClient.GetQueueReference(WebRole.AZURE_CRAWLING_QUEUE);
            CloudTable crawlingTable = tableClient.GetTableReference(WebRole.AZURE_CRAWLING_TABLE);

            // Create the queue and table if they don't already exist
            crawlingQueue.CreateIfNotExists();
            crawlingTable.CreateIfNotExists();

            CloudQueueMessage message = crawlingQueue.GetMessage();

            if (message == null)
            {
                // No more webpages to crawl
                statsEntity.CrawlerState = CrawlerStates.Idle;
                return;
            }

            // Store URL to webpage entity
            webpageEntity.URL = message.AsString;

            // Load URL as XML document.
            // NOTE(review): the URL comes straight from the queue and is parsed with the
            // XmlDocument defaults; confirm whether DTD / external-entity resolution should
            // be restricted for untrusted pages.
            try
            {
                doc.Load(message.AsString);
            }
            catch (Exception)
            {
                // Deliberate best effort: any load failure drops the URL from the queue
                // so that it is not retried, and crawling continues with the next call.
                crawlingQueue.DeleteMessage(message);
                return;
            }

            // Store Title to webpage entity
            webpageEntity.Title = GetPageTitle(doc);

            // Put webpage entity to the crawling table
            crawlingTable.Execute(TableOperation.InsertOrReplace(webpageEntity));

            // Remember this URL as crawled
            crawledUrls.Add(webpageEntity.URL);

            // Add the qualified links in this URL to the CrawlingQueue
            List<string> qualifiedUrls = GetPageLinks(doc);
            foreach (string url in qualifiedUrls)
            {
                crawlingQueue.AddMessage(new CloudQueueMessage(url));
            }

            // Update stats
            statsEntity.NumUrlsCrawled = statsEntity.NumUrlsCrawled + 1;
            statsEntity.Last10UrlsCrawled.Enqueue(webpageEntity.URL);
            while (statsEntity.Last10UrlsCrawled.Count > 10)
            {
                // Keep only the ten most recently crawled URLs
                statsEntity.Last10UrlsCrawled.Dequeue();
            }

            // One message leaves the queue and qualifiedUrls.Count messages join it.
            // The arithmetic is done in a signed long and clamped at zero: a plain unsigned
            // "size + count - 1" wraps around to a huge value when the counter is 0 and the
            // page contributed no links (assumes SizeOfQueue is unsigned, as the uint
            // arithmetic on this counter suggests).
            long queueSize = (long)statsEntity.SizeOfQueue + qualifiedUrls.Count - 1;
            statsEntity.SizeOfQueue        = queueSize > 0 ? (uint)queueSize : 0u;
            statsEntity.SizeOfIndexInMByte = statsEntity.SizeOfIndexInMByte + webpageEntity.GetEntitySizeMByte();

            // Put stats to table (same table as the webpage entities)
            crawlingTable.Execute(TableOperation.InsertOrReplace(statsEntity));

            // After processing the URL, the client should delete it
            crawlingQueue.DeleteMessage(message);
        }
Esempio n. 3
0
 /// <summary>
 /// Converts the given <see cref="WebPage"/> into a <see cref="WebPageEntity"/> and adds it
 /// to the context's <c>WebPages</c> set.
 /// </summary>
 /// <param name="webPage">Source page; its Query, Title, Link, Snippet and Engine are passed to the entity constructor.</param>
 /// <returns>A task that completes when the add operation has finished.</returns>
 /// <remarks>
 /// Nothing is saved here; presumably the caller commits the change afterwards (to be confirmed).
 /// </remarks>
 public async Task AddWebPageAsync(WebPage webPage)
 {
     WebPageEntity entity = new WebPageEntity(
         webPage.Query,
         webPage.Title,
         webPage.Link,
         webPage.Snippet,
         webPage.Engine);

     await _context.WebPages.AddAsync(entity);
 }