// Initialize the stop word list.
//
// Reads the whole file at sStopListPath, lower-cases it, splits it on the
// delimiter set below (space, tab, braces, parentheses, colon, semicolon,
// period, newline) and stores every non-empty token as a key of goHashTable
// with an empty-string value. Any previously loaded list is replaced.
//
// sStopListPath: path of the stop word text file to load.
// Exceptions thrown by File.OpenText / ReadToEnd propagate to the caller.
public static void InitializeList(string sStopListPath)
{
    // Reset the table before reading, so a failed load leaves an empty list
    // rather than a stale one.
    goHashTable = new System.Collections.Hashtable();

    // 'using' guarantees the reader is closed even when ReadToEnd throws.
    string sText;
    using (TextReader oTR = File.OpenText(sStopListPath))
    {
        sText = oTR.ReadToEnd();
    }

    Regex oRegex = new Regex("([ \\t{}():;. \n])");

    // Lower-cased with ToLower() so the keys are normalized the same way
    // GetWords normalizes the text it tokenizes.
    string[] words = oRegex.Split(sText.ToLower());

    foreach (string rawWord in words)
    {
        // Trim() also strips characters the regex does not split on (e.g. '\r').
        string word = rawWord.Trim();

        // The capture group makes Split return the delimiters themselves;
        // skip those together with empty fragments.
        if (word.Length == 0 || oRegex.IsMatch(word))
        {
            continue;
        }

        // NOTE(review): IsStopWord presumably looks the word up in goHashTable,
        // which makes this a duplicate check - confirm against its definition.
        if (!StopWordsList.IsStopWord(word))
        {
            // Indexer set instead of Add: a word repeated in the file must
            // not abort loading with an ArgumentException.
            goHashTable[word] = "";
        }
    }
}
// Get the words from the string, optionally dropping stop words.
//
// The text is lower-cased and split on the delimiter set below (space, tab,
// braces, parentheses, colon, semicolon, period, newline). Each surviving
// token is stripped of surrounding whitespace and punctuation, plus any
// trailing '$'; tokens that end up blank are discarded.
//
// sText:         text to tokenize.
// bUseStopWords: when true, tokens for which StopWordsList.IsStopWord returns
//                true are left out of the result.
// Returns the cleaned tokens in their original order.
public string[] GetWords(string sText, bool bUseStopWords)
{
    // Punctuation removed from both ends of a token.
    char[] edgePunctuation =
    {
        '\'', '<', '>', '/', ':', ';', '"', '{', '}', '|', '\\', '[', ']',
        '.', ',', '~', '`', '!', '?', '@', '#', '%', '^', '&', '*', '(', ')',
        '_', '-', '+', '='
    };
    // Characters removed from the end of a token only.
    char[] trailingOnly = { '$' };

    Regex delimiters = new Regex("([ \\t{}():;. \n])");
    string[] tokens = delimiters.Split(sText.ToLower());

    ArrayList kept = new ArrayList();
    foreach (string token in tokens)
    {
        // The capture group makes Split hand back the delimiters too;
        // skip them along with whitespace-only fragments.
        if (delimiters.IsMatch(token) || token.Trim().Length == 0)
        {
            continue;
        }

        // NOTE(review): the stop word test sees the raw token, before the
        // punctuation cleanup below, so a token such as "the," is not
        // filtered - confirm whether that is intended.
        if (bUseStopWords && StopWordsList.IsStopWord(token))
        {
            continue;
        }

        string cleaned = token.Trim();
        cleaned = cleaned.Trim(edgePunctuation);
        cleaned = cleaned.TrimEnd(trailingOnly);

        // Blank check only; the stored value is deliberately not re-trimmed.
        if (cleaned.Trim().Length > 0)
        {
            kept.Add(cleaned);
        }
    }

    return (string[])kept.ToArray(typeof(string));
}
// List of initializations.
//
// Loads the stop word list from "\StopList\StopList.txt" under LSIAppPath,
// then assigns a new LSIConfiguration to _lsiConfig.
public void Init()
{
    // Plain concatenation: LSIAppPath is used as-is, no separator handling.
    string stopListFile = this.LSIAppPath + @"\StopList\StopList.txt";
    StopWordsList.InitializeList(stopListFile);

    _lsiConfig = new LSIConfiguration();
}