Пример #1
0
        /// <summary>
        /// Collects the terms of a document into a set. Each non-empty line is lower-cased,
        /// split into sentences and tokenized; every token and its stemmed form are added
        /// unless <c>stopwords</c> reports them.
        /// </summary>
        /// <param name="reader">Source that is read line by line until it reports end of file.</param>
        /// <returns>The set of tokens and stems gathered from all lines.</returns>
        public HashSet <string> Process(IDocumentReader reader)
        {
            var terms = new HashSet<string>();

            while (!reader.EndOfFile())
            {
                var text = reader.ReadLine().ToLower();
                if (text == string.Empty)
                {
                    // Nothing to tokenize on an empty line.
                    continue;
                }

                foreach (var sentence in textProcessor.GetSentences(text))
                {
                    foreach (var token in textProcessor.Tokenize(sentence))
                    {
                        // The word exactly as tokenized.
                        AddUnlessStopword(terms, token);

                        // The stemmed form of the same word.
                        var stem = textProcessor.Stem(token);
                        AddUnlessStopword(terms, stem);
                    }
                }
            }

            return terms;
        }

        /// <summary>
        /// Adds <paramref name="term"/> to <paramref name="target"/> when it is not already
        /// present and <c>stopwords.Exists</c> does not report it.
        /// </summary>
        private void AddUnlessStopword(HashSet<string> target, string term)
        {
            // Membership is tested first so that stopwords.Exists is only consulted for unseen terms.
            if (target.Contains(term))
            {
                return;
            }

            if (stopwords.Exists(term))
            {
                return;
            }

            target.Add(term);
        }
Пример #2
0
        /// <summary>
        /// Reads every line from <paramref name="reader"/> and collects the lower-cased
        /// lines into a set.
        /// </summary>
        /// <param name="reader">Source that is read line by line until it reports end of file.</param>
        /// <returns>The distinct lower-cased lines that were read.</returns>
        public HashSet <string> Process(IDocumentReader reader)
        {
            var res = new HashSet <string>();

            while (!reader.EndOfFile())
            {
                // Normalize before inserting so the set is only ever keyed by the lower-cased
                // form. HashSet.Add ignores a value that is already present, so no separate
                // membership check (which previously tested the un-normalized line) is needed.
                res.Add(reader.ReadLine().ToLower());
            }

            return res;
        }