/// <summary>
/// Starts a page-load task for every configured parser, renders the results
/// of whichever parser finishes first, and records each found URL in the
/// database.
/// </summary>
/// <param name="searchingText">The text to search for.</param>
/// <returns>
/// A view over the list of found URLs; when an <see cref="AggregateException"/>
/// is thrown, a view over the collected inner-exception messages instead.
/// </returns>
public ViewResult Index(String searchingText)
{
    try
    {
        // One page-load task per configured parser. A plain for-loop replaces
        // the previous ForEach lambda that mutated a captured counter.
        Task[] tasks = new Task[preferences.parsers.Count];
        for (int i = 0; i < tasks.Length; i++)
        {
            tasks[i] = preferences.parsers[i].LoadPage(searchingText);
        }

        // Block until the fastest parser has loaded its page, then parse it.
        // NOTE(review): the remaining tasks keep running and any exceptions
        // they throw stay unobserved; consider an async action with
        // Task.WhenAny so faults are surfaced.
        int index = Task.WaitAny(tasks);
        List<String> result = preferences.parsers[index].Parse();

        // Persist every result for later querying.
        foreach (var r in result)
        {
            context.Add(new FoundUrl
            {
                Url = r,
                Engine = preferences.parsers[index].searchEngineUrl,
                // TODO(review): DateTime.UtcNow is usually preferable for
                // stored timestamps — confirm before changing stored data.
                DateFound = DateTime.Now
            });
        }
        context.SaveChanges();

        return View(result);
    }
    catch (AggregateException ex)
    {
        // Show every inner exception message to the user instead of failing.
        List<String> result = new List<String>();
        foreach (var e in ex.InnerExceptions)
        {
            result.Add(e.Message);
        }
        return View(result);
    }
}
/// <summary>
/// Crawls a given URL, queueing all found URLs and storing information about
/// the given URL for later querying. Any per-URL failure is recorded in the
/// error table so the overall crawl can continue.
/// </summary>
/// <param name="data">
/// Crawler data helper. Ref.
/// </param>
/// <param name="storage">
/// Crawler azure storage helper. Ref.
/// </param>
/// <param name="url">
/// The given URL to crawl.
/// </param>
public static void CrawlUrl(ref CrawlrDataHelper data, ref CrawlrStorageManager storage, string url)
{
    // Guard clause: skip URLs the crawler is not allowed to visit.
    if (!data.ChkIfUriAllowed(url))
    {
        return;
    }

    try
    {
        var web = new HtmlWeb();
        var currDoc = web.Load(url);

        // First() throws when the page has no <title>; the catch below
        // records that in the error table, preserving existing behavior.
        var urlPageTitle = currDoc.DocumentNode
            .Descendants("title")
            .First()
            .InnerText;

        // Optional <meta name="pubdate" content="..."> → last-modified stamp.
        var urlLastModNode = currDoc.DocumentNode
            .Descendants("meta")
            .Where(y => y.Attributes.Contains("name"))
            .Where(y => y.Attributes["name"].Value == "pubdate")
            .ToList();
        DateTime? urlLastMod = null;
        if (urlLastModNode.Count > 0)
        {
            // NOTE(review): culture-sensitive parse; consider
            // DateTime.Parse(..., CultureInfo.InvariantCulture) — confirm
            // the pubdate format emitted by crawled pages first.
            urlLastMod = DateTime.Parse(
                urlLastModNode.First().Attributes["content"].Value);
        }

        // Queue every outgoing link (<a href="...">) found on the page.
        foreach (var urlNode in currDoc.DocumentNode.Descendants("a"))
        {
            if (urlNode.Attributes.Contains("href"))
            {
                ChkAndAddUrl(urlNode.Attributes["href"].Value, url,
                             urlLastMod, ref data, ref storage);
            }
        }

        // Count each distinct URL once as indexed; every visit as crawled.
        if (!data.AddedUrls.Contains(url))
        {
            data.AddedUrls.Add(url);
            data.NumUrlsIndexed++;
        }
        data.NumUrlsCrawled++;

        // Persist the crawled URL and the updated counters to table storage.
        FoundUrl finishedUrl = new FoundUrl(
            urlPageTitle,
            urlLastMod != null ? urlLastMod.ToString() : "NULL",
            url);
        UrlTableCount newCount = new UrlTableCount(
            data.NumUrlsCrawled, data.NumUrlsIndexed);
        TableOperation insertUrl = TableOperation.InsertOrReplace(finishedUrl);
        TableOperation insertCount = TableOperation.InsertOrReplace(newCount);
        storage.UrlTable.Execute(insertUrl);
        storage.UrlTable.Execute(insertCount);

        // Keep a rolling window of the ten most recently crawled URLs.
        if (data.LastTenUrls.Count == 10)
        {
            data.LastTenUrls.Dequeue();
        }
        data.LastTenUrls.Enqueue(url);
    }
    catch (Exception ex)
    {
        // Record the failure (network, parse, missing <title>, bad pubdate)
        // against this URL and let the crawl move on.
        ErrorUrl errorUrl = new ErrorUrl(url, ex.ToString());
        TableOperation insertErrorUrl = TableOperation.InsertOrReplace(errorUrl);
        storage.ErrorTable.Execute(insertErrorUrl);
    }
}