/// <summary>
/// Searches the crawl table for pages whose titles contain the words of
/// <paramref name="title"/> and returns up to 20 results ranked by how many
/// query words each page matched. Successful results are memoised for
/// 30 minutes in <c>searchcache</c>.
/// </summary>
/// <param name="title">Raw search phrase; normalised via Robotom.CleanWord.</param>
/// <returns>
/// Ranked list of "Title Site" strings, or a single diagnostic entry when the
/// table is unreachable or nothing matched.
/// </returns>
public List <string> getUrl(string title)
{
    title = Robotom.CleanWord(title);

    // Crude size bound: once the cache exceeds 100 entries, discard the whole
    // thing rather than evicting individual entries.
    if (searchcache == null || searchcache.Count > 100)
    {
        searchcache = new Dictionary <string, Tuple <List <string>, DateTime> >();
    }

    // Serve a cached result if it is younger than 30 minutes; otherwise evict.
    // TryGetValue avoids the ContainsKey + indexer double lookup.
    if (searchcache.TryGetValue(title, out Tuple <List <string>, DateTime> cached))
    {
        if (cached.Item2.AddMinutes(30) > DateTime.Now)
        {
            return cached.Item1;
        }
        searchcache.Remove(title);
    }

    List <string> returnthis = new List <string>();
    CloudStorageAccount storageAccount = CloudStorageAccount.Parse(
        ConfigurationManager.AppSettings["StorageConnectionString"]);
    CloudTableClient tableClient = storageAccount.CreateCloudTableClient();
    CloudTable statsTable = tableClient.GetTableReference("crawltable");

    try
    {
        // Count, per page ("Title Site"), how many distinct query words hit it.
        var results = new Dictionary <string, int>();
        foreach (string word in title.Split(' '))
        {
            TableQuery <UriEntity> query = new TableQuery <UriEntity>()
                .Where(TableQuery.GenerateFilterCondition(
                    "PartitionKey", QueryComparisons.Equal, word));
            var stuffs = statsTable.ExecuteQuery(query);
            foreach (UriEntity stuff in stuffs)
            {
                string key = stuff.Title + " " + stuff.Site;
                // Single-lookup increment (missing key yields count == 0).
                results.TryGetValue(key, out int count);
                results[key] = count + 1;
            }
        }
        returnthis = results.OrderByDescending(x => x.Value)
                            .Select(x => x.Key)
                            .Take(20)
                            .ToList();
    }
    catch (Exception e)
    {
        // BUG FIX: previously this error message fell through and was cached
        // for 30 minutes, poisoning the query after one transient failure.
        // Report it and return WITHOUT caching.
        returnthis.Add("Could not access table: " + e.Message);
        return returnthis;
    }

    if (returnthis.Count == 0)
    {
        returnthis.Add("Could not find any results...");
    }

    searchcache.Add(title, new Tuple <List <string>, DateTime>(returnthis, DateTime.Now));
    return returnthis;
}
/// <summary>
/// Downloads the page at <paramref name="uri"/>, queues every in-scope link
/// (cnn.com hosts and Bleacher Report article URLs) that has not been visited
/// or parsed yet, and indexes the page title word-by-word into the results
/// table. Also records the per-page parse time in <c>timer</c> (milliseconds).
/// </summary>
/// <param name="uri">Absolute URI of the page to crawl.</param>
/// <returns>1 for success, 0 for fail (disallowed, download error, or no links).</returns>
public int ParseHtml(Uri uri)
{
    // Respect crawl exclusions before doing any work.
    if (isDisallowed(uri))
    {
        queueCount--;
        return 0;
    }

    long start = DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond;

    string sitedata = "";
    try
    {
        WebClient downloader = new WebClient();
        sitedata = downloader.DownloadString(uri);
    }
    catch (Exception e)
    {
        // Log the failure to the error table, mark the page handled, bail out.
        // NOTE(review): fire-and-forget async insert -- faults are unobserved.
        UriEntity error = new UriEntity(uri, e.Message, DateTime.Now, e.Message);
        errorTable.ExecuteAsync(TableOperation.Insert(error));
        parsed.Add(uri.AbsoluteUri);
        visited.Remove(uri.AbsoluteUri);
        queueCount--;
        return 0;
    }

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(sitedata);

    HtmlNodeCollection hrefs = doc.DocumentNode.SelectNodes("//a[@href]");
    if (hrefs == null)
    {
        queueCount--;
        return 0;
    }

    // Queue every in-scope, not-yet-seen link for crawling.
    foreach (HtmlNode node in hrefs)
    {
        string url = node.Attributes["href"].Value;
        try
        {
            // Resolves relative hrefs against the current page.
            Uri newsite = new Uri(uri, url);
            string host = newsite.Host;
            bool inScope = host.Equals("cnn.com")
                || host.Equals("www.cnn.com")
                || newsite.AbsoluteUri.StartsWith("http://bleacherreport.com/articles");
            if (inScope
                && !visited.Contains(newsite.AbsoluteUri)
                && !parsed.Contains(newsite.AbsoluteUri))
            {
                htmlQ.AddMessageAsync(new CloudQueueMessage(newsite.AbsoluteUri));
                visited.Add(newsite.AbsoluteUri);
                queueCount++;
            }
        }
        catch (Exception)
        {
            // Best-effort: a malformed href must not abort the whole page.
        }
    }

    long stop = DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond;
    timer = stop - start;

    // Extract the page title.
    HtmlNode titleNode = doc.DocumentNode.SelectSingleNode("//title");
    string title = titleNode != null ? titleNode.InnerText : "";

    // Extract the modification/publication date; Bleacher Report uses
    // meta[name='pubdate'] instead of meta[name='lastmod'].
    HtmlNode lastmod = doc.DocumentNode.SelectSingleNode("//meta[@name='lastmod']");
    if (uri.Host.Equals("bleacherreport.com"))
    {
        lastmod = doc.DocumentNode.SelectSingleNode("//meta[@name='pubdate']");
    }
    string date = lastmod != null ? lastmod.GetAttributeValue("content", "") : "";
    DateTime converteddate = date.Equals("") ? new DateTime() : Convert.ToDateTime(date);

    // One entity per distinct, non-blank title word, keyed for word search.
    HashSet <UriEntity> words = new HashSet <UriEntity>();
    foreach (string word in Robotom.CleanWord(title).Split(' '))
    {
        if (!word.Trim().Equals(""))
        {
            words.Add(new UriEntity(uri, title, converteddate, word));
        }
    }

    try
    {
        if (!parsed.Contains(uri.AbsoluteUri))
        {
            foreach (UriEntity add in words)
            {
                // NOTE(review): fire-and-forget insert; faults are unobserved.
                resultTable.ExecuteAsync(TableOperation.Insert(add));
                tableCount++;
            }
            lastTen.Enqueue(uri + " - \"" + title + "\"");
            if (lastTen.Count > 10)
            {
                lastTen.Dequeue();
            }
        }
    }
    catch (Exception)
    {
        // Best-effort indexing: a table failure must not abort the crawl.
    }

    parsed.Add(uri.AbsoluteUri);
    visited.Remove(uri.AbsoluteUri);
    queueCount--;
    return 1;
}