Example #1
0
        /// <summary>
        /// Splits <paramref name="text"/> into its distinct character grams, listed in order
        /// of first appearance.
        /// </summary>
        /// <remarks>
        /// When the text is at least <paramref name="gramLength"/> characters long the result
        /// holds, in this order: the leading prefixes shorter than a full gram, every
        /// full-length window, and the trailing suffixes shorter than a full gram.
        /// When the text is shorter than <paramref name="gramLength"/> the result holds every
        /// prefix of the text followed by its final character.
        /// </remarks>
        /// <param name="text">Text to split into grams.</param>
        /// <param name="gramLength">Number of characters in a full gram.</param>
        /// <returns>
        /// The distinct grams as converted by <c>Tokeniser.ArrayListToArray</c>, or
        /// <c>null</c> when <paramref name="text"/> is null or empty.
        /// </returns>
        internal static string[] GenerateNGrams(string text, int gramLength)
        {
            if (string.IsNullOrEmpty(text))
            {
                return null;
            }

            ArrayList collected  = new ArrayList();
            int       textLength = text.Length;

            if (textLength >= gramLength)
            {
                // Leading partial grams: prefixes of 1 .. gramLength - 1 characters.
                for (int size = 1; size < gramLength; size++)
                {
                    AddGramIfAbsent(collected, text.Substring(0, size));
                }

                // Full-length windows, sliding one character at a time.
                int firstTailStart = (textLength - gramLength) + 1;
                for (int start = 0; start < firstTailStart; start++)
                {
                    AddGramIfAbsent(collected, text.Substring(start, gramLength));
                }

                // Trailing partial grams: suffixes shorter than a full gram.
                for (int start = firstTailStart; start < textLength; start++)
                {
                    AddGramIfAbsent(collected, text.Substring(start, textLength - start));
                }
            }
            else
            {
                // Text is shorter than one gram: take every prefix ...
                for (int size = 1; size <= textLength; size++)
                {
                    AddGramIfAbsent(collected, text.Substring(0, size));
                }

                // ... and the last character on its own.
                AddGramIfAbsent(collected, text.Substring(textLength - 1, 1));
            }

            return Tokeniser.ArrayListToArray(collected);
        }

        /// <summary>
        /// Appends <paramref name="gram"/> to <paramref name="grams"/> unless an equal entry
        /// is already present.
        /// </summary>
        /// <param name="grams">List being accumulated.</param>
        /// <param name="gram">Candidate gram.</param>
        private static void AddGramIfAbsent(ArrayList grams, string gram)
        {
            if (!grams.Contains(gram))
            {
                grams.Add(gram);
            }
        }
        /// <summary>
        /// Gathers the distinct strings produced by <c>Tokeniser.Partition</c> over all
        /// documents, in order of first appearance. Also replaces <c>_ngramDoc</c> with a
        /// new jagged array of <c>_numDocs</c> rows; the rows are not filled in here.
        /// </summary>
        /// <param name="docs">Documents to partition.</param>
        /// <returns>An <see cref="ArrayList"/> containing each distinct string once.</returns>
        private ArrayList GenerateTerms(string[] docs)
        {
            ArrayList terms = new ArrayList();
            _ngramDoc = new string[_numDocs][];

            foreach (string doc in docs)
            {
                // A new Tokeniser is created for every document.
                Tokeniser partitioner = new Tokeniser();

                foreach (string term in partitioner.Partition(doc))
                {
                    if (terms.Contains(term))
                    {
                        continue;
                    }

                    terms.Add(term);
                }
            }

            return terms;
        }
        /// <summary>
        /// Removes duplicate entries from <paramref name="input"/>, keeping the first
        /// occurrence of each value and preserving the original relative order.
        /// </summary>
        /// <param name="input">Words to de-duplicate; may be <c>null</c>.</param>
        /// <returns>
        /// An empty array when <paramref name="input"/> is <c>null</c>; otherwise the distinct
        /// entries as converted by <c>Tokeniser.ArrayListToArray</c>.
        /// </returns>
        private string[] GetDistinctWords(String[] input)
        {
            if (input == null)
            {
                return new string[0];
            }

            ArrayList distinct = new ArrayList();

            foreach (string word in input)
            {
                // Exact-equality membership test; the original author left an open
                // question here about using n-gram similarity instead.
                if (distinct.Contains(word))
                {
                    continue;
                }

                distinct.Add(word);
            }

            return Tokeniser.ArrayListToArray(distinct);
        }
        /// <summary>
        /// Lower-cases <paramref name="input"/>, partitions it with a <c>Tokeniser</c>, sorts
        /// the resulting words, and maps every distinct word to the value that
        /// <c>CountWords</c> returns for it against the sorted word array.
        /// </summary>
        /// <param name="input">Text to analyse.</param>
        /// <returns>
        /// A <see cref="Hashtable"/> keyed by distinct word; each value is the result of
        /// <c>CountWords</c> (presumably the occurrence count; confirm against that method).
        /// </returns>
        private IDictionary GetWordFrequency(string input)
        {
            string lowered = input.ToLower();

            String[] sortedWords = new Tokeniser().Partition(lowered);
            Array.Sort(sortedWords);

            String[] uniqueWords = GetDistinctWords(sortedWords);

            IDictionary frequencies = new Hashtable();

            foreach (string word in uniqueWords)
            {
                frequencies[word] = CountWords(word, sortedWords);
            }

            return frequencies;
        }
Example #5
0
		/// <summary>
		/// Gathers the distinct strings produced by <c>Tokeniser.Partition</c> over all
		/// documents, in order of first appearance. Also replaces <c>_ngramDoc</c> with a
		/// new jagged array of <c>_numDocs</c> rows; the rows are not filled in here.
		/// </summary>
		/// <param name="docs">Documents to partition.</param>
		/// <returns>An <see cref="ArrayList"/> containing each distinct string once.</returns>
		private ArrayList GenerateTerms(string[] docs)
		{
			ArrayList terms = new ArrayList();
			_ngramDoc = new string[_numDocs][];

			foreach (string doc in docs)
			{
				// A new Tokeniser is created for every document.
				Tokeniser partitioner = new Tokeniser();

				foreach (string term in partitioner.Partition(doc))
				{
					if (terms.Contains(term))
					{
						continue;
					}

					terms.Add(term);
				}
			}

			return terms;
		}
Example #6
0
		/// <summary>
		/// Lower-cases <paramref name="input"/>, partitions it with a <c>Tokeniser</c>, sorts
		/// the resulting words, and maps every distinct word to the value that
		/// <c>CountWords</c> returns for it against the sorted word array.
		/// </summary>
		/// <param name="input">Text to analyse.</param>
		/// <returns>
		/// A <see cref="Hashtable"/> keyed by distinct word; each value is the result of
		/// <c>CountWords</c> (presumably the occurrence count; confirm against that method).
		/// </returns>
		private IDictionary GetWordFrequency(string input)
		{
			string lowered = input.ToLower();

			String[] sortedWords = new Tokeniser().Partition(lowered);
			Array.Sort(sortedWords);

			String[] uniqueWords = GetDistinctWords(sortedWords);

			IDictionary frequencies = new Hashtable();

			foreach (string word in uniqueWords)
			{
				frequencies[word] = CountWords(word, sortedWords);
			}

			return frequencies;
		}