/// <summary>
/// Matches every character that is not an ASCII letter or digit. Each match is
/// replaced with a single space before the text is tokenized.
/// </summary>
private static readonly Regex NonAlphanumericPattern = new Regex("[^a-zA-Z0-9]");

/// <summary>
/// Matches one word token: a word character that is not a digit, followed by
/// any run of word characters, apostrophes or hyphens.
/// </summary>
private static readonly Regex WordTokenPattern = new Regex(@"\w(?<!\d)[\w'-]*");

/// <summary>
/// Counts how often each distinct (lower-cased, stemmed) word occurs in a string.
/// </summary>
/// <remarks>
/// Processing steps, in order:
/// <list type="number">
/// <item><description>Every character outside <c>[a-zA-Z0-9]</c> is replaced with a space.</description></item>
/// <item><description>Tokens that start with a non-digit are extracted.</description></item>
/// <item><description>Each token is lower-cased with the invariant culture and passed through the stemmer.</description></item>
/// <item><description>Stems found in <c>StopWords</c>, or that parse as a number, are discarded.</description></item>
/// </list>
/// Lower-casing and number parsing are culture-invariant so the result does not
/// depend on the culture of the calling thread.
/// </remarks>
/// <param name="str">The text to break into a distinct word count. May be null or empty.</param>
/// <returns>
/// A dictionary mapping each remaining stem to its number of occurrences; an
/// empty dictionary when <paramref name="str"/> is null or empty.
/// </returns>
public static Dictionary<string, double> GetWordCount(this string str)
{
    // Nothing to count: hand back an empty dictionary rather than null.
    if (String.IsNullOrEmpty(str))
    {
        return new Dictionary<string, double>();
    }

    // Stemming folds inflected forms of a word onto one key so their counts
    // are merged. A fresh stemmer is created per call; whether an instance
    // could safely be shared is not visible from this method.
    var stemmer = new PorterStemmerAlgorithm.PorterStemmer();

    // Required 'out' target for Double.TryParse; the parsed value is never read.
    double ignored;

    // Punctuation, underscores and any non-ASCII characters become separators.
    string cleaned = NonAlphanumericPattern.Replace(str, " ");

    return WordTokenPattern.Matches(cleaned)
        .Cast<Match>()
        // Invariant lower-casing: the culture-sensitive ToLower() maps 'I' to a
        // dotless 'ı' under Turkish/Azeri cultures, which changes the stems.
        .Select(m => stemmer.stemTerm(m.Value.ToLowerInvariant()))
        .GroupBy(word => word)
        // Drop stop words and anything that parses as a number. Filtering
        // happens before sorting so discarded entries are never sorted.
        .Where(g => !StopWords.ContainsKey(g.Key)
            && !Double.TryParse(
                g.Key,
                System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                System.Globalization.CultureInfo.InvariantCulture,
                out ignored))
        // Sorting only affects the order in which entries are inserted into
        // the dictionary; kept so existing callers observe the same order.
        .OrderBy(g => g.Key)
        .ToDictionary(g => g.Key, g => Convert.ToDouble(g.Count()));
}