/// <summary>
/// Produces the title and body-text sentences of the Google context for a word.
/// The result is loaded from <c>GoogleContextCache</c> when an entry exists;
/// otherwise the raw context string is fetched, parsed with regular
/// expressions, and the parsed lists are added to the cache.
/// </summary>
/// <param name="word">Word forwarded to the cache and to the context retrieval call.</param>
/// <param name="langCode">Language code forwarded together with <paramref name="word"/>.</param>
/// <param name="titleSentences">Receives one entry per matched result title.</param>
/// <param name="textSentences">Receives one entry per matched result body text.</param>
public static void RetrieveGoogleContextSentences(
    string word,
    string langCode,
    out List<string> titleSentences,
    out List<string> textSentences)
{
    // Cache hit: both lists come straight from the cache, nothing is parsed.
    if (GoogleContextCache.IsInCache(word, langCode))
    {
        GoogleContextCache.LoadFromCache(word, langCode, out titleSentences, out textSentences);
        Console.WriteLine("Loaded cached Google context: {0} ({1})", word, langCode);
        return;
    }

    Console.WriteLine("Finding Google context: {0} ({1})", word, langCode);

    // Raw (unparsed) Google context for the word.
    string rawContext = RetrieveGoogleContextAsString(word, langCode);

    // Titles: capture group 1 is the content of each anchor matched by "class=l".
    titleSentences = new List<string>();
    for (Match titleHit = Regex.Match(rawContext, @"class=l[^>]*?>(.*?)</a>", RegexOptions.Singleline);
         titleHit.Success;
         titleHit = titleHit.NextMatch())
    {
        string decodedTitle = HttpUtility.HtmlDecode(titleHit.Groups[1].Value);
        titleSentences.Add(HtmlParsingUtils.RemoveHtmlTags(decodedTitle));
    }

    // Body texts: for PDF/DOC/PPT results the optional first group consumes the
    // leading <span class=f> ... </a> part, so only group 2 (the text after it)
    // is kept.
    textSentences = new List<string>();
    for (Match textHit = Regex.Match(rawContext, @"<div class=""s"">(<span class=f>.*?</a>)?(.*?)<br><cite>", RegexOptions.Singleline);
         textHit.Success;
         textHit = textHit.NextMatch())
    {
        string decodedText = HttpUtility.HtmlDecode(textHit.Groups[2].Value);
        textSentences.Add(HtmlParsingUtils.RemoveHtmlTags(decodedText));
    }

    // Remember the freshly parsed lists for subsequent calls.
    GoogleContextCache.AddToCache(word, langCode, titleSentences, textSentences);
}
/// <summary>
/// Produces the title and body-text sentences of the MSN context for a word.
/// The result is loaded from <c>MSNContextCache</c> when an entry exists;
/// otherwise the raw HTML string is fetched, parsed with regular expressions,
/// and the parsed lists are added to the cache.
/// </summary>
/// <param name="word">Word forwarded to the cache and to the context retrieval call.</param>
/// <param name="langCode">Language code forwarded together with <paramref name="word"/>.</param>
/// <param name="titleSentences">Receives one entry per matched result title.</param>
/// <param name="textSentences">Receives one entry per matched result body text.</param>
public static void RetrieveMSNContextSentences(
    string word,
    string langCode,
    out List<string> titleSentences,
    out List<string> textSentences)
{
    // Cache hit: both lists come straight from the cache, nothing is parsed.
    if (MSNContextCache.IsInCache(word, langCode))
    {
        MSNContextCache.LoadFromCache(word, langCode, out titleSentences, out textSentences);
        Console.WriteLine("Loaded cached MSN context: {0} ({1})", word, langCode);
        return;
    }

    Console.WriteLine("Finding MSN context: {0} ({1})", word, langCode);

    // Raw HTML of the MSN context for the word.
    string rawHtml = RetrieveMSNContextAsString(word, langCode);

    // Titles: capture group 1 is the content of the <h3> that opens a list item.
    titleSentences = new List<string>();
    for (Match titleHit = Regex.Match(rawHtml, @"<li[^>]*?>\s*<h3>(.*?)</h3>", RegexOptions.Singleline);
         titleHit.Success;
         titleHit = titleHit.NextMatch())
    {
        string decodedTitle = HttpUtility.HtmlDecode(titleHit.Groups[1].Value);
        titleSentences.Add(HtmlParsingUtils.RemoveHtmlTags(decodedTitle));
    }

    // Body texts: capture group 1 is the paragraph directly following a closing </h3>.
    textSentences = new List<string>();
    for (Match textHit = Regex.Match(rawHtml, @"</h3>\s*<p>(.*?)</p>", RegexOptions.Singleline);
         textHit.Success;
         textHit = textHit.NextMatch())
    {
        string decodedText = HttpUtility.HtmlDecode(textHit.Groups[1].Value);
        textSentences.Add(HtmlParsingUtils.RemoveHtmlTags(decodedText));
    }

    // Remember the freshly parsed lists for subsequent calls.
    MSNContextCache.AddToCache(word, langCode, titleSentences, textSentences);
}