/// <summary>
		/// Builds a Hashtable that maps each distinct token of <paramref name="input"/> to the
		/// number of times it occurs, as reported by CountWords.
		/// </summary>
		/// <param name="input">The string to get the word frequency of. A null value is not lower-cased; it is handed to the tokenizer unchanged in both casing modes.</param>
		/// <param name="caseSensitive">True if words should be treated as separate if they have different casing; false lower-cases the input before tokenizing.</param>
		/// <param name="tokenizer">The ITokenizer used to split the input into words. Must not be null.</param>
		/// <param name="stopWordProvider">An IStopWordProvider used to filter tokens, or null. When null, every unique token is counted and the IsWord check is skipped as well.</param>
		/// <returns>A Hashtable whose keys are the counted words (string) and whose values are their counts (boxed int).</returns>
		/// <exception cref="ArgumentNullException">Thrown when <paramref name="tokenizer"/> is null.</exception>
		public static Hashtable GetWordFrequency(string input, bool caseSensitive, ITokenizer tokenizer, IStopWordProvider stopWordProvider)
		{
			if (tokenizer == null)
			{
				throw new ArgumentNullException("tokenizer");
			}

			// Only lower-case a non-null input so that a null input behaves the same way
			// regardless of caseSensitive (it is left for the tokenizer to deal with).
			// NOTE(review): ToLower() uses the current culture; confirm whether
			// ToLowerInvariant() is what the stop word list and stored data expect.
			string convertedInput = (caseSensitive || input == null) ? input : input.ToLower();

			string[] words = tokenizer.Tokenize(convertedInput);
			Array.Sort(words);

			string[] uniqueWords = GetUniqueWords(words);

			Hashtable result = new Hashtable();
			foreach (string word in uniqueWords)
			{
				// Without a stop word provider nothing is filtered out.
				bool keep = stopWordProvider == null || (IsWord(word) && !stopWordProvider.IsStopWord(word));
				if (!keep)
				{
					continue;
				}

				// One lookup instead of ContainsKey + indexer: the Hashtable indexer yields
				// null for a missing key, and stored values are never null (always boxed ints).
				object existing = result[word];
				int count = CountWords(word, words);
				result[word] = existing == null ? count : (int)existing + count;
			}

			return result;
		}
        /// <summary>
        /// Creates a classifier that uses the supplied words data source, tokenizer and
        /// stop word provider.
        /// </summary>
        /// <param name="wd">The words data source; stored in <c>_wordsData</c>.</param>
        /// <param name="tokenizer">The tokenizer; stored in <c>_tokenizer</c>.</param>
        /// <param name="swp">The stop word provider; stored in <c>_stopWordProvider</c>.</param>
        /// <exception cref="ArgumentNullException">Thrown when any of the arguments is null.</exception>
        public BayesianClassifier(IWordsDataSource wd, ITokenizer tokenizer, IStopWordProvider swp)
        {
            // The single-string ArgumentNullException constructor interprets its argument as
            // the parameter name, not as the message. Use the (paramName, message) overload so
            // both ParamName and Message are reported correctly.
            if (wd == null)
            {
                throw new ArgumentNullException("wd", "IWordsDataSource cannot be null.");
            }

            _wordsData = wd;

            if (tokenizer == null)
            {
                throw new ArgumentNullException("tokenizer", "ITokenizer cannot be null.");
            }

            _tokenizer = tokenizer;

            if (swp == null)
            {
                throw new ArgumentNullException("swp", "IStopWordProvider cannot be null.");
            }

            _stopWordProvider = swp;
        }
        /// <summary>
        /// Creates a classifier from its three collaborators, rejecting any null argument.
        /// </summary>
        /// <param name="wordsDataSource">The words data source; stored in <c>_wordsData</c>.</param>
        /// <param name="tokenizer">The tokenizer; stored in <c>_tokenizer</c>.</param>
        /// <param name="stopWordProvider">The stop word provider; stored in <c>_stopWordProvider</c>.</param>
        /// <exception cref="ArgumentNullException">
        /// Thrown for the first null argument, checked in declaration order.
        /// </exception>
        public BayesianClassifier(IWordsDataSource wordsDataSource, ITokenizer tokenizer, IStopWordProvider stopWordProvider)
        {
            // Guards first, in the same order as the parameters, so the exception raised
            // for any given combination of null arguments is unchanged.
            if (wordsDataSource == null)
            {
                throw new ArgumentNullException("wordsDataSource");
            }
            if (tokenizer == null)
            {
                throw new ArgumentNullException("tokenizer");
            }
            if (stopWordProvider == null)
            {
                throw new ArgumentNullException("stopWordProvider");
            }

            // Everything is valid; capture the collaborators.
            _wordsData = wordsDataSource;
            _tokenizer = tokenizer;
            _stopWordProvider = stopWordProvider;
        }
Exemple #4
0
        /// <summary>
        /// Builds a dictionary that maps each distinct token of <paramref name="input"/> to the
        /// number of times it occurs, as reported by CountWords.
        /// </summary>
        /// <param name="input">The string to get the word frequency of. A null value is not lower-cased; it is handed to the tokenizer unchanged in both casing modes.</param>
        /// <param name="caseSensitive">True if words should be treated as separate if they have different casing; false lower-cases the input before tokenizing.</param>
        /// <param name="tokenizer">The ITokenizer used to split the input into words. Must not be null.</param>
        /// <param name="stopWordProvider">An IStopWordProvider used to filter tokens, or null. When null, every unique token is counted and the IsWord check is skipped as well.</param>
        /// <returns>A dictionary keyed by word whose values are the word counts.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="tokenizer"/> is null.</exception>
        public static IDictionary<string, int> GetWordFrequency(string input, bool caseSensitive, ITokenizer tokenizer, IStopWordProvider stopWordProvider)
        {
            if (tokenizer == null)
            {
                throw new ArgumentNullException("tokenizer");
            }

            // Only lower-case a non-null input so that a null input behaves the same way
            // regardless of caseSensitive (it is left for the tokenizer to deal with).
            // NOTE(review): ToLower() uses the current culture; confirm whether
            // ToLowerInvariant() is what the stop word list and stored data expect.
            var convertedInput = (caseSensitive || input == null) ? input : input.ToLower();

            var words = tokenizer.Tokenize(convertedInput);
            Array.Sort(words);

            var uniqueWords = GetUniqueWords(words);

            var result = new Dictionary<string, int>();
            for (var i = 0; i < uniqueWords.Length; i++)
            {
                var word = uniqueWords[i];

                // Without a stop word provider nothing is filtered out.
                var keep = stopWordProvider == null || (IsWord(word) && !stopWordProvider.IsStopWord(word));
                if (!keep)
                {
                    continue;
                }

                // TryGetValue leaves 'existing' at 0 when the word is not present yet, so a
                // single accumulate-and-assign covers both the "new" and "seen before" cases.
                int existing;
                result.TryGetValue(word, out existing);
                result[word] = existing + CountWords(word, words);
            }

            return result;
        }
 /// <summary>
 /// Creates a StopWordService that keeps the given provider in <c>_provider</c>.
 /// </summary>
 /// <param name="provider">The stop word provider to store; it is not checked for null here.</param>
 public StopWordService(IStopWordProvider provider)
 {
     this._provider = provider;
 }