/// <summary>
/// Delimiters that separate words: space, tab, newline and the punctuation
/// characters <c>{ } ( ) : ; . ,</c>. Built once and shared, because constructing
/// a <see cref="Regex"/> inside the method re-parses the pattern on every call.
/// </summary>
private static readonly Regex s_WordDelimiters = new Regex("([ \\t{}():;., \n])");

/// <summary>
/// Splits <paramref name="input"/> into lower-cased words, dropping delimiters,
/// blank tokens and stop words, and optionally stemming the words that remain.
/// </summary>
/// <param name="input">Text to tokenize. A null value yields an empty list.</param>
/// <param name="stopWords">
/// Filter consulted through <c>IsStopword</c> for every candidate word.
/// When null, no word is filtered out.
/// </param>
/// <param name="useWordStemmer">
/// When true, every kept word is replaced by the result of <c>PorterStemmer.stemTerm</c>.
/// </param>
/// <returns>The kept words in the order they appear in the input; never null.</returns>
public List<string> Partition(string input, StopWordsHandler stopWords, bool useWordStemmer = false)
{
    List<string> words = new List<string>();
    if (input == null)
    {
        return words;
    }

    // The pattern is a capturing group, so Split returns the delimiters themselves
    // as tokens alongside the words; they are filtered out again below.
    string[] tokens = s_WordDelimiters.Split(input.ToLower());

    foreach (string token in tokens)
    {
        // Skip delimiter tokens.
        if (s_WordDelimiters.IsMatch(token))
        {
            continue;
        }

        // Skip empty / whitespace-only tokens (e.g. between two adjacent delimiters).
        if (token.Trim().Length == 0)
        {
            continue;
        }

        if (stopWords != null && stopWords.IsStopword(token))
        {
            continue;
        }

        words.Add(token);
    }

    if (useWordStemmer)
    {
        // Process the word list with an implementation of Martin Porter's word stemmer algorithm.
        // This reduces the words contained in the list to their "root" forms.
        PorterStemmer stemmer = new PorterStemmer();
        for (int i = 0; i < words.Count; i++)
        {
            words[i] = stemmer.stemTerm(words[i]);
        }
    }

    return words;
}
/// <summary>
/// Trains this Category from a word or phrase.
/// </summary>
/// <param name="rawPhrase">
/// Word or phrase to learn. It is lower-cased before use. A null value, or a phrase
/// the exclusion list reports as a stop word, is ignored and leaves all counts untouched.
/// </param>
/// <param name="useWordStemmer">
/// Forwarded to <c>GetWordStem</c>; intended to reduce words to their "stems"
/// so that something like "birds" matches "bird".
/// </param>
/// <seealso cref="DePhrase(string)">See DePhrase</seealso>
public void TeachPhrase(string rawPhrase, bool useWordStemmer = false)
{
    if (rawPhrase == null)
    {
        return;
    }

    // Eliminate case-sensitivity.
    string loweredPhrase = rawPhrase.ToLower();

    if ((null != m_Excluded) && m_Excluded.IsStopword(loweredPhrase))
    {
        return;
    }

    // The dictionary key is the stemmed DePhrase() form; the stored entry is created
    // from the stemmed form of the lower-cased phrase itself.
    string phraseKey = GetWordStem(DePhrase(loweredPhrase), useWordStemmer);
    string stemmedPhrase = GetWordStem(loweredPhrase, useWordStemmer);

    BayesPhraseCount pc;
    if (!m_Phrases.TryGetValue(phraseKey, out pc))
    {
        pc = new BayesPhraseCount(stemmedPhrase);
        m_Phrases.Add(phraseKey, pc);
    }

    pc.Count++;
    m_TotalWords++;
}