Пример #1
0
        /// <summary>
        /// Splits <paramref name="text"/> into tokens, drops stop words, strips ASCII digits
        /// and wraps every remaining non-empty token in a <see cref="Word"/>.
        /// </summary>
        /// <param name="text">Raw input text. A null or empty value yields an empty list.</param>
        /// <returns>
        /// One <see cref="Word"/> per surviving token, in input order. <c>sourceWord</c> is the
        /// digit-stripped token and <c>stemmedWord</c> is the result of
        /// <c>Porter.TransformingWord</c> for it (presumably a Porter stemmer; not visible here).
        /// </returns>
        public List<Word> NormalizeText(string text)
        {
            List<Word> wordsList = new List<Word>();

            if (string.IsNullOrEmpty(text))
            {
                return wordsList;
            }

            // TODO: extend the list of delimiters.
            char[] delimiterChars = { ' ', ',', '.', ':', ';', '\t', '(', ')', '{', '}', '"', '–', '\n' };

            // TODO: extend the list of stop words.
            // Copy the stop words into a set once so each token costs a single lookup
            // instead of one full list scan per stop word.
            HashSet<string> stopSet = new HashSet<string>();
            foreach (string stopWord in StopWords)
            {
                stopSet.Add(stopWord);
            }

            // "+" rather than "*": the output is the same, but the regex no longer
            // matches the empty string at every position.
            const string digitsPattern = "[0-9]+";

            foreach (string token in text.Split(delimiterChars))
            {
                // NOTE(review): stop words are matched BEFORE digits are stripped, so a token
                // such as "<stopword>1" survives as "<stopword>" - confirm this is intended.
                if (stopSet.Contains(token))
                {
                    continue;
                }

                string cleaned = Regex.Replace(token, digitsPattern, "");

                // Consecutive delimiters and digit-only tokens end up empty.
                if (cleaned.Length == 0)
                {
                    continue;
                }

                wordsList.Add(new Word
                {
                    sourceWord  = cleaned,
                    stemmedWord = Porter.TransformingWord(cleaned)
                });
            }

            return wordsList;
        }
Пример #2
0
        /// <summary>
        /// Builds a frequency table of <paramref name="words"/> keyed by stemmed form.
        /// </summary>
        /// <remarks>
        /// Side effect: <c>stemmedWord</c> of every element in <paramref name="words"/> is
        /// overwritten with <c>Porter.TransformingWord(sourceWord)</c>.
        /// Table values are new <see cref="Word"/> instances, not the input elements; the first
        /// word seen for a given stem supplies the entry's <c>sourceWord</c>.
        /// </remarks>
        /// <param name="words">Words to count; elements are mutated as described above.</param>
        /// <returns>
        /// A dictionary mapping each distinct stem to a <see cref="Word"/> whose <c>count</c>
        /// is the number of input words sharing that stem.
        /// </returns>
        private Dictionary<string, Word> GetWordsTable(List<Word> words)
        {
            Dictionary<string, Word> table = new Dictionary<string, Word>();

            foreach (Word word in words)
            {
                // Always recompute the stem from the source form, ignoring any previous value.
                word.stemmedWord = Porter.TransformingWord(word.sourceWord);

                // TryGetValue performs a single lookup instead of ContainsKey + indexer.
                Word entry;
                if (table.TryGetValue(word.stemmedWord, out entry))
                {
                    entry.count++;
                }
                else
                {
                    table.Add(word.stemmedWord, new Word
                    {
                        sourceWord  = word.sourceWord,
                        stemmedWord = word.stemmedWord,
                        count       = 1
                    });
                }
            }

            return table;
        }