Esempio n. 1
0
        /// <summary>
        /// Splits <paramref name="input"/> into lower-cased word tokens, dropping delimiters,
        /// blank tokens and stop words, and optionally stemming the words that remain.
        /// </summary>
        /// <param name="input">Text to tokenize. A null value yields an empty list.</param>
        /// <param name="stopWords">
        /// Filter consulted through <c>IsStopword</c>; tokens it reports as stop words are dropped.
        /// When null, no stop-word filtering is applied.
        /// </param>
        /// <param name="useWordStemmer">
        /// When true, every kept word is replaced by the result of <c>PorterStemmer.stemTerm</c>.
        /// </param>
        /// <returns>The kept words, in the order they appear in <paramref name="input"/>.</returns>
        public List<string> Partition(string input, StopWordsHandler stopWords, bool useWordStemmer = false)
        {
            List<string> words = new List<string>();

            if (input == null)
            {
                return words;
            }

            // The delimiter class is intentionally NOT wrapped in a capturing group: Regex.Split
            // then returns only the text between delimiters, so there is no need to re-scan every
            // token just to throw the delimiters away again. The static Split overload also lets
            // the framework reuse its cached parsed pattern instead of building a Regex per call.
            // NOTE(review): '\r' is not in the delimiter set, so CRLF input leaves a trailing
            // '\r' on the last word of each line - confirm whether that is intended.
            const string delimiterPattern = "[ \\t{}():;., \n]";

            foreach (string token in Regex.Split(input.ToLower(), delimiterPattern))
            {
                // Adjacent delimiters produce empty pieces; skip those and whitespace-only pieces.
                if (token.Trim().Length == 0)
                {
                    continue;
                }

                if (stopWords != null && stopWords.IsStopword(token))
                {
                    continue;
                }

                words.Add(token);
            }

            if (useWordStemmer)
            {
                // Reduce each word to its "root" form with the Porter stemmer implementation.
                PorterStemmer stemmer = new PorterStemmer();
                for (int i = 0; i < words.Count; i++)
                {
                    words[i] = stemmer.stemTerm(words[i]);
                }
            }

            return words;
        }
        /// <summary>
        /// Trains this category from a word or phrase by recording one more occurrence of it.
        /// </summary>
        /// <remarks>
        /// The phrase is lower-cased first. If an exclusion handler is set and reports the
        /// lower-cased phrase as a stop word, nothing is recorded. Otherwise the phrase is looked
        /// up in the phrase table under the key <c>GetWordStem(DePhrase(phrase))</c>; a missing
        /// entry is created from the stemmed phrase, and both the entry's count and the total
        /// word count are incremented.
        /// </remarks>
        /// <param name="rawPhrase">The word or phrase to learn; matched case-insensitively.</param>
        /// <param name="useWordStemmer">
        /// Passed through to <c>GetWordStem</c> for both the lookup key and the stored phrase.
        /// </param>
        /// <seealso cref="DePhrase(string)"/>
        public void TeachPhrase(string rawPhrase, bool useWordStemmer = false)
        {
            // Work on a lower-cased copy so training is case-insensitive.
            string lowered = rawPhrase.ToLower();

            bool isExcluded = (m_Excluded != null) && m_Excluded.IsStopword(lowered);
            if (isExcluded)
            {
                return;
            }

            // Lookup key: the de-phrased text, run through the stemming helper.
            string key = GetWordStem(DePhrase(lowered), useWordStemmer);

            // Stored text: the words reduced to their "stems" so that e.g. "birds" matches "bird".
            string stemmedPhrase = GetWordStem(lowered, useWordStemmer);

            BayesPhraseCount entry;
            bool alreadyKnown = m_Phrases.TryGetValue(key, out entry);
            if (!alreadyKnown)
            {
                entry = new BayesPhraseCount(stemmedPhrase);
                m_Phrases.Add(key, entry);
            }

            entry.Count++;
            m_TotalWords++;
        }