/// <summary>
/// Persists a new web page record built from the supplied view model.
/// Opens a short-lived <c>EntityContext</c>, stages the entity, and saves asynchronously.
/// </summary>
/// <param name="page">View model carrying the title, HTML source, and image to store.</param>
public async Task Create(PageViewModel page)
{
    // Map only the fields the entity actually persists.
    WebPageEntity entity = new WebPageEntity()
    {
        Title = page.Title,
        HtmlSource = page.HtmlSource,
        Image = page.Image
    };

    // One context per operation; disposed as soon as the save completes.
    using (EntityContext context = new EntityContext())
    {
        context.WebPageEntities.Add(entity);
        await context.SaveChangesAsync();
    }
}
/// <summary>
/// Runs one crawl step: dequeues a URL from the Azure crawling queue, loads it as an XML
/// document, writes the page's URL/title into the crawling table, enqueues the page's
/// qualified links for later crawling, and updates the running crawler statistics.
/// Switches the crawler state to <c>Idle</c> when the queue is empty.
/// </summary>
private void OnCrawlerStateCrawling()
{
    WebPageEntity webpageEntity = new WebPageEntity();
    List <string> qualifiedUrls = new List <string>();
    XmlDocument doc = new XmlDocument();

    // Retrieve storage account from connection string
    CloudStorageAccount storageAccount = CloudStorageAccount.Parse(
        CloudConfigurationManager.GetSetting("StorageConnectionString"));

    // Create the queue and table client
    CloudQueueClient queueClient = storageAccount.CreateCloudQueueClient();
    CloudTableClient tableClient = storageAccount.CreateCloudTableClient();

    // Retrieve a reference to CrawlingQueue and CrawlingTable
    CloudQueue crawlingQueue = queueClient.GetQueueReference(WebRole.AZURE_CRAWLING_QUEUE);
    CloudTable crawlingTable = tableClient.GetTableReference(WebRole.AZURE_CRAWLING_TABLE);

    // Create the queue and table if it doesn't already exist
    // NOTE(review): done on every invocation — incurs a round-trip per step; could be
    // hoisted to startup, but left as-is to preserve behavior.
    crawlingQueue.CreateIfNotExists();
    crawlingTable.CreateIfNotExists();

    CloudQueueMessage message = crawlingQueue.GetMessage();
    if (message == null)
    {
        // No more webpage to crawl
        statsEntity.CrawlerState = CrawlerStates.Idle;
        return;
    }

    // Store URL to webpage entity
    webpageEntity.URL = message.AsString;

    // Load URL as XML document
    try
    {
        doc.Load(message.AsString);
    }
    catch (Exception)
    {
        // Remove problematic webpage and skip
        // NOTE(review): deliberately best-effort — any failure (bad URL, non-XML content,
        // network error) silently drops the message; consider logging the exception.
        crawlingQueue.DeleteMessage(message);
        return;
    }

    // Store Title to webpage entity
    webpageEntity.Title = GetPageTitle(doc);

    // Put webpage entity to the crawling table
    crawlingTable.Execute(TableOperation.InsertOrReplace(webpageEntity));

    // Add this URL to the crawled hashset
    crawledUrls.Add(webpageEntity.URL);

    // Add the qualified links in this URL to the CrawlingQueue
    // NOTE(review): links are enqueued without checking crawledUrls first — presumably
    // dedup happens when the message is dequeued; verify, otherwise URLs may be re-crawled.
    qualifiedUrls = GetPageLinks(doc);
    foreach (string url in qualifiedUrls)
    {
        crawlingQueue.AddMessage(new CloudQueueMessage(url));
    }

    // Update stats
    statsEntity.NumUrlsCrawled = statsEntity.NumUrlsCrawled + 1;
    statsEntity.Last10UrlsCrawled.Enqueue(webpageEntity.URL);
    while (statsEntity.Last10UrlsCrawled.Count > 10)
    {
        statsEntity.Last10UrlsCrawled.Dequeue();
    }
    // Net queue-size change: +Count newly enqueued links, -1 for the message just processed.
    // NOTE(review): unsigned arithmetic — if SizeOfQueue + Count is ever 0 this underflows
    // to a huge value; confirm SizeOfQueue is kept >= 1 while messages are in flight.
    statsEntity.SizeOfQueue = statsEntity.SizeOfQueue + (uint)qualifiedUrls.Count - 1;
    statsEntity.SizeOfIndexInMByte = statsEntity.SizeOfIndexInMByte + webpageEntity.GetEntitySizeMByte();

    // Put stats to table
    TableOperation insertOrReplaceOperation = TableOperation.InsertOrReplace(statsEntity);
    TableResult tr = crawlingTable.Execute(insertOrReplaceOperation);

    // After processing the URL, the client should delete it
    crawlingQueue.DeleteMessage(message);
}
/// <summary>
/// Maps a <see cref="WebPage"/> search result onto its storage entity and stages it
/// on the context's <c>WebPages</c> set.
/// </summary>
/// <param name="webPage">Search result to stage for insertion.</param>
/// <remarks>
/// NOTE(review): does not call SaveChanges — presumably the caller commits the unit of
/// work; verify a save happens downstream.
/// </remarks>
public async Task AddWebPageAsync(WebPage webPage)
{
    // Project the incoming model onto the persistence entity, field by field.
    var entity = new WebPageEntity(
        webPage.Query,
        webPage.Title,
        webPage.Link,
        webPage.Snippet,
        webPage.Engine);

    await _context.WebPages.AddAsync(entity);
}