コード例 #1
0
        /// <summary>
        /// Counts how often each distinct word stem occurs in a string.
        /// </summary>
        /// <remarks>
        /// Every character other than an ASCII letter or digit is treated as a separator. The
        /// remaining tokens (which must start with a letter) are lower-cased using the invariant
        /// culture, stemmed, grouped and counted. Stems present in <c>StopWords</c>, and stems
        /// that parse as a number, are left out of the result.
        /// </remarks>
        /// <param name="str">The text to analyse; may be <c>null</c> or empty.</param>
        /// <returns>
        /// A dictionary mapping each stemmed word to its number of occurrences, or an empty
        /// dictionary when <paramref name="str"/> is <c>null</c> or empty.
        /// </returns>
        public static Dictionary<string, double> GetWordCount(this string str)
        {
            // Nothing to count: hand back an empty dictionary rather than null.
            if (String.IsNullOrEmpty(str))
            {
                return new Dictionary<string, double>();
            }

            // Stemming folds inflected forms of a word onto a common stem so that they are
            // counted together.
            var stemmer = new PorterStemmerAlgorithm.PorterStemmer();

            // Scratch target for Double.TryParse; only its boolean result is used.
            double num;

            // Turn every non-alphanumeric character (punctuation, '_', non-ASCII letters, ...)
            // into a space so it acts as a word separator. The static Regex methods are used so
            // the parsed pattern is taken from the framework's regex cache instead of being
            // rebuilt on every call.
            str = Regex.Replace(str, "[^a-zA-Z0-9]", " ");

            // A token is a word character that is not a digit, followed by any further word
            // characters; leading digits are therefore skipped.
            return Regex.Matches(str, @"\w(?<!\d)[\w'-]*")
                   .Cast<Match>()
                   // Invariant lower-casing keeps the keys identical on every machine; the
                   // culture-sensitive ToLower() maps 'I' to a dotless 'ı' under Turkish cultures.
                   .Select(m => stemmer.stemTerm(m.Value.ToLowerInvariant()))
                   .GroupBy(word => word)
                   // Drop stop words and anything that parses as a number. The number check uses
                   // the default parsing styles but a fixed culture, so the outcome does not
                   // depend on the current thread's regional settings.
                   .Where(g => !StopWords.ContainsKey(g.Key)
                               && !Double.TryParse(
                                   g.Key,
                                   System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                                   System.Globalization.CultureInfo.InvariantCulture,
                                   out num))
                   // Sort only the surviving entries so they are added to the dictionary in
                   // word order.
                   .OrderBy(g => g.Key)
                   .ToDictionary(g => g.Key, g => (double)g.Count());
        }