Esempio n. 1
0
        // Initialize the stop word list.
        // Reads the file at sStopListPath, lower-cases its text, splits it on the
        // delimiter set below and stores every remaining token as a key of
        // goHashTable (the value is always an empty string). goHashTable is
        // replaced with a fresh table before the file is opened.
        public static void InitializeList(string sStopListPath)
        {
            goHashTable = new System.Collections.Hashtable();

            // The using block releases the file handle even if ReadToEnd() throws.
            string sText;
            using (TextReader oTR = File.OpenText(sStopListPath))
            {
                sText = oTR.ReadToEnd();
            }

            // The capturing group makes Split() return the delimiters as tokens too.
            Regex oRegex = new Regex("([ \\t{}():;. \n])");

            String[] words = oRegex.Split(sText.ToLower());
            for (int i = 0; i < words.Length; i++)
            {
                string word = words[i].Trim();

                // Skip empty fragments and the delimiter tokens themselves.
                if (word.Length == 0 || oRegex.IsMatch(word))
                {
                    continue;
                }

                // NOTE(review): IsStopWord presumably consults goHashTable and so
                // filters words that were already loaded - confirm against its definition.
                if (!StopWordsList.IsStopWord(word))
                {
                    // Indexer assignment rather than Add(): a repeated word can
                    // never raise ArgumentException, whatever IsStopWord returns.
                    goHashTable[word] = "";
                }
            }
        }
Esempio n. 2
0
        // Get the words from the string. Optional use of stopwords
        // Lower-cases sText, splits it on the delimiter set below and returns the
        // tokens that are still non-empty after surrounding punctuation is
        // stripped. When bUseStopWords is true, tokens for which
        // StopWordsList.IsStopWord returns true are left out.
        public string[] GetWords(string sText, bool bUseStopWords)
        {
            // The capturing group makes Split() return the delimiters as tokens too.
            Regex delimiterPattern = new Regex("([ \\t{}():;. \n])");

            // Characters stripped from both ends of a token; '$' is stripped from the end only.
            char[] edgePunctuation = { '\'', '<', '>', '/', ':', ';', '"', '{', '}', '|', '\\', '[', ']', '.', ',', '~', '`', '!', '?', '@', '#', '%', '^', '&', '*', '(', ')', '_', '-', '+', '=' };
            char[] trailingOnly    = { '$' };

            ArrayList kept = new ArrayList();

            foreach (string token in delimiterPattern.Split(sText.ToLower()))
            {
                // Drop the delimiter tokens and whitespace-only fragments.
                if (delimiterPattern.IsMatch(token) || token.Trim().Length == 0)
                {
                    continue;
                }

                // The stop word lookup receives the raw token, before any punctuation is removed.
                if (bUseStopWords && StopWordsList.IsStopWord(token))
                {
                    continue;
                }

                string cleaned = token.Trim().Trim(edgePunctuation).TrimEnd(trailingOnly);

                // A token made up only of punctuation is empty by now and is discarded.
                if (cleaned.Trim().Length > 0)
                {
                    kept.Add(cleaned);
                }
            }

            return (string[])kept.ToArray(typeof(string));
        }
Esempio n. 3
0
 // List of initializations.
 // Loads the stop word list from the StopList folder under LSIAppPath,
 // then creates the LSIConfiguration instance held in _lsiConfig.
 public void Init()
 {
     string stopListPath = this.LSIAppPath + @"\StopList\StopList.txt";

     StopWordsList.InitializeList(stopListPath);
     _lsiConfig = new LSIConfiguration();
 }