/// <summary>
/// Generates the set of distinct n-grams for <paramref name="text"/>: the leading
/// prefixes shorter than <paramref name="gramLength"/>, every full-length n-gram,
/// and the trailing suffixes.
/// </summary>
/// <param name="text">The text to split into n-grams.</param>
/// <param name="gramLength">The n-gram length.</param>
/// <returns>The distinct n-grams, or null if the text is null or empty.</returns>
internal static string[] GenerateNGrams(string text, int gramLength)
{
    if (text == null || text.Length == 0)
    {
        return null;
    }

    ArrayList grams = new ArrayList();
    int length = text.Length;
    if (length < gramLength)
    {
        // The text is shorter than a single n-gram: emit every prefix,
        // then the final character.
        string gram;
        for (int i = 1; i <= length; i++)
        {
            gram = text.Substring(0, i);
            if (grams.IndexOf(gram) == -1)
            {
                grams.Add(gram);
            }
        }
        gram = text.Substring(length - 1, 1);
        if (grams.IndexOf(gram) == -1)
        {
            grams.Add(gram);
        }
    }
    else
    {
        // Leading prefixes shorter than gramLength.
        for (int i = 1; i <= gramLength - 1; i++)
        {
            string gram = text.Substring(0, i);
            if (grams.IndexOf(gram) == -1)
            {
                grams.Add(gram);
            }
        }
        // Full-length n-grams.
        for (int i = 0; i < (length - gramLength) + 1; i++)
        {
            string gram = text.Substring(i, gramLength);
            if (grams.IndexOf(gram) == -1)
            {
                grams.Add(gram);
            }
        }
        // Trailing suffixes shorter than gramLength.
        for (int i = (length - gramLength) + 1; i < length; i++)
        {
            string gram = text.Substring(i, length - i);
            if (grams.IndexOf(gram) == -1)
            {
                grams.Add(gram);
            }
        }
    }
    return Tokeniser.ArrayListToArray(grams);
}
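// A minimal usage sketch (illustrative only; this example method is not part of
// the original source). For "abc" with a gram length of 2, GenerateNGrams yields
// the leading prefix "a", the full bigrams "ab" and "bc", and the trailing
// suffix "c".
internal static void GenerateNGramsExample()
{
    string[] grams = GenerateNGrams("abc", 2);
    Console.WriteLine(string.Join(", ", grams)); // prints: a, ab, bc, c
}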
/// <summary>
/// Builds the list of unique terms found across all documents.
/// </summary>
/// <param name="docs">The documents to tokenise.</param>
/// <returns>An ArrayList of the unique words, in first-seen order.</returns>
private ArrayList GenerateTerms(string[] docs)
{
    ArrayList uniques = new ArrayList();
    _ngramDoc = new string[_numDocs][]; // reallocated here; not populated by this method
    for (int i = 0; i < docs.Length; i++)
    {
        Tokeniser tokenizer = new Tokeniser();
        string[] words = tokenizer.Partition(docs[i]);
        for (int j = 0; j < words.Length; j++)
        {
            if (!uniques.Contains(words[j]))
            {
                uniques.Add(words[j]);
            }
        }
    }
    return uniques;
}
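// A hypothetical usage sketch (not in the original source), assuming
// Tokeniser.Partition splits on whitespace:
private void GenerateTermsExample()
{
    ArrayList terms = GenerateTerms(new string[] { "the cat", "the dog" });
    // terms is { "the", "cat", "dog" } under that assumption.
}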
/// <summary>
/// Returns the distinct words from the input array, preserving first-seen order.
/// </summary>
/// <param name="input">The words to deduplicate.</param>
/// <returns>The distinct words, or an empty array if the input is null.</returns>
private string[] GetDistinctWords(string[] input)
{
    if (input == null)
    {
        return new string[0];
    }

    ArrayList list = new ArrayList();
    for (int i = 0; i < input.Length; i++)
    {
        if (!list.Contains(input[i]))
        {
            list.Add(input[i]);
        }
    }
    return Tokeniser.ArrayListToArray(list);
}
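// A minimal sketch (illustrative, not part of the original source):
// GetDistinctWords(new string[] { "cat", "dog", "cat" }) returns { "cat", "dog" }.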
/// <summary>
/// Computes how often each distinct word occurs in the input string.
/// The comparison is case-insensitive: the input is lower-cased first.
/// </summary>
/// <param name="input">The text to analyse.</param>
/// <returns>An IDictionary mapping each distinct word to its occurrence count.</returns>
private IDictionary GetWordFrequency(string input)
{
    string convertedInput = input.ToLower();
    Tokeniser tokenizer = new Tokeniser();
    string[] words = tokenizer.Partition(convertedInput);
    Array.Sort(words);

    string[] distinctWords = GetDistinctWords(words);
    IDictionary result = new Hashtable();
    for (int i = 0; i < distinctWords.Length; i++)
    {
        result[distinctWords[i]] = CountWords(distinctWords[i], words);
    }
    return result;
}
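// A hypothetical usage sketch (not part of the original source), assuming
// Tokeniser.Partition splits on whitespace and CountWords counts occurrences
// of a word in the sorted array:
private void GetWordFrequencyExample()
{
    IDictionary freq = GetWordFrequency("The cat saw the cat");
    Console.WriteLine(freq["the"]); // 2
    Console.WriteLine(freq["cat"]); // 2
    Console.WriteLine(freq["saw"]); // 1
}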