/// <summary>
/// Trains this category with a single word or phrase.
/// </summary>
/// <param name="rawPhrase">
/// The word or phrase to learn. It is lower-cased before any further processing,
/// so training is case-insensitive.
/// </param>
/// <param name="useWordStemmer">
/// Forwarded to <c>GetWordStem</c> for both the lookup key and the stored raw phrase.
/// </param>
/// <seealso cref="DePhrase(string)"/>
public void TeachPhrase(string rawPhrase, bool useWordStemmer = false)
{
    // Eliminate case-sensitivity before the stop-word test and the key lookup.
    string lowered = rawPhrase.ToLower();

    // Stop-words (when an exclusion list is configured) are never learned or counted.
    if (m_Excluded != null && m_Excluded.IsStopword(lowered))
    {
        return;
    }

    // The dictionary key is built from the DePhrase'd text; the stored raw phrase is not.
    // Both go through GetWordStem so that something like "birds" can match "bird".
    string key = GetWordStem(DePhrase(lowered), useWordStemmer);
    string stemmedRaw = GetWordStem(lowered, useWordStemmer);

    BayesPhraseCount entry;
    bool alreadyKnown = m_Phrases.TryGetValue(key, out entry);
    if (!alreadyKnown)
    {
        // First sighting of this key: register a fresh counter for it.
        entry = new BayesPhraseCount(stemmedRaw);
        m_Phrases.Add(key, entry);
    }

    // Every accepted phrase bumps both its own counter and the category-wide total.
    entry.Count++;
    m_TotalWords++;
}
/// <summary>
/// Classifies a text by scoring it against every known category.
/// </summary>
/// <param name="tr">Reader supplying the text to classify.</param>
/// <param name="useWordStemmer">
/// Forwarded to <c>TeachCategory</c> when the text is broken into phrases.
/// </param>
/// <returns>
/// A map from category name to its score (a sum of logarithms); the higher the value,
/// the better the match. A category with no training data (<c>TotalWords</c> of zero)
/// always scores <see cref="double.NegativeInfinity"/> instead of an infinite or NaN value.
/// </returns>
public Dictionary<string, double> Classify(System.IO.StreamReader tr, bool useWordStemmer = false)
{
    Dictionary<string, double> score = new Dictionary<string, double>();
    foreach (KeyValuePair<string, IBayesCategory> kvp in m_Categories)
    {
        score.Add(kvp.Value.Name, 0.0);
    }

    // Collect the phrases of the input by teaching them to a throwaway category
    // that shares this classifier's exclusion list.
    BayesEnumerableCategory wordsInFile = new BayesEnumerableCategory("", m_ExcludedWords);
    wordsInFile.TeachCategory(tr, useWordStemmer);

    foreach (KeyValuePair<string, BayesPhraseCount> phraseEntry in wordsInFile)
    {
        BayesPhraseCount phraseInFile = phraseEntry.Value;
        foreach (KeyValuePair<string, IBayesCategory> kvp in m_Categories)
        {
            IBayesCategory cat = kvp.Value;

            // An untrained category would divide by zero here (Log(+Infinity));
            // its score is fixed to NegativeInfinity in the final loop instead.
            if (cat.TotalWords <= 0)
            {
                continue;
            }

            int count = cat.GetPhraseCount(phraseInFile.RawPhrase);

            // Phrases the category has never seen contribute a small stand-in
            // count (0.01) so the logarithm stays finite.
            double numerator = (0 < count) ? (double)count : 0.01;
            score[cat.Name] += System.Math.Log(numerator / (double)cat.TotalWords);
        }
    }

    // Weight each category by its share of all trained words.
    int totalWordsInCats = this.CountTotalWordsInCategories();
    foreach (KeyValuePair<string, IBayesCategory> kvp in m_Categories)
    {
        IBayesCategory cat = kvp.Value;
        if (cat.TotalWords <= 0)
        {
            // Zero share of the training data: Log(0) is -Infinity. Assign it directly
            // so the result can never become NaN (e.g. from 0/0 when nothing is trained).
            score[cat.Name] = double.NegativeInfinity;
            continue;
        }

        score[cat.Name] += System.Math.Log((double)cat.TotalWords / (double)totalWordsInCats);
    }

    return score;
}