Example #1
0
        /// <summary>
        /// Runs a sentence through a <c>StopWordsFilter</c> built on the default
        /// provider and checks that "A", "over" and "the" are absent from the
        /// trimmed result while the remaining words keep their order.
        /// </summary>
        public void Removes_Stopswords()
        {
            // Arrange
            const string sentence = "A quick brown fox jumps over the lazy dog";
            var provider = new DefaultStopWordProvider();
            var sut = new StopWordsFilter(provider);

            // Act: the output is trimmed before comparison, so surrounding
            // whitespace in the filter's raw output does not affect the check.
            var filtered = sut.Execute(sentence).Trim();

            // Assert
            Assert.Equal("quick brown fox jumps lazy dog", filtered);
        }
Example #2
0
        /// <summary>
        /// Checks that the standalone words "A" and "The" are dropped by the filter
        /// while both occurrences of the word "Article" are kept.
        /// </summary>
        public void Removes_Articles()
        {
            // Arrange
            const string sentence = "Article A Article The";
            var provider = new DefaultStopWordProvider();
            var sut = new StopWordsFilter(provider);

            // Act: trim so surrounding whitespace does not influence the comparison.
            var filtered = sut.Execute(sentence).Trim();

            // Assert
            Assert.Equal("Article Article", filtered);
        }
Example #3
0
        /// <summary>
        /// Feeds a mixed-case word list to the filter and asserts that "of", "IN"
        /// and "the" are absent from the output and that the surviving words come
        /// back in lower case.
        /// </summary>
        public void StopWordsFilter_FilterAllStopWords_AndMakeLowerCase()
        {
            // Arrange
            var words = new List<string> { "Abc", "of", "cba", "IN", "the", "car" };
            var expected = new List<string> { "abc", "cba", "car" };
            var sut = new StopWordsFilter(new StopWords());

            // Act
            var actual = sut.Filter(words);

            // Assert
            actual.Should().BeEquivalentTo(expected);
        }
Example #4
0
        /// <summary>
        /// Removes one entry from the <c>StopWords</c> instance (using the casing
        /// "iN") and asserts that the input word "IN" then passes through the
        /// filter as "in", while "of" and "the" are still absent from the output.
        /// </summary>
        public void DeleteStopWord_WorksCorrectly()
        {
            // Arrange
            var words = new List<string> { "Abc", "of", "cba", "IN", "the", "car" };
            var expected = new List<string> { "abc", "in", "cba", "car" };

            var stopWordSet = new StopWords();
            var sut = new StopWordsFilter(stopWordSet);

            // The entry is removed only after the filter has been constructed,
            // so the filter has to observe the later change to pass this test.
            stopWordSet.Remove("iN");

            // Act
            var actual = sut.Filter(words);

            // Assert
            actual.Should().BeEquivalentTo(expected);
        }
Example #5
0
 /// <summary>
 /// Assigns a new <c>StopWordsFilter</c> to the <c>stopWordsFilter</c> member,
 /// built from a set copy of <c>stopWords</c> and the <c>normalizer</c>.
 /// </summary>
 public void SetUp()
 {
     // Copy the stop words into a new hash set so the filter receives its own collection.
     var stopWordSet = new HashSet<string>(stopWords);

     stopWordsFilter = new StopWordsFilter(stopWordSet, normalizer);
 }
        /// <summary>
        /// Splits <paramref name="text"/> into lower-cased tokens, discards empty
        /// tokens and tokens reported by <c>StopWordsFilter.Contains</c>, and yields
        /// each remaining token in canonical form.
        /// </summary>
        /// <param name="text">The text to tokenize. Must not be <c>null</c>.</param>
        /// <returns>
        /// A lazily evaluated sequence: tokens for which <c>IsNumbers</c> is true are
        /// yielded unchanged, tokens for which <c>IsRussian</c> is true are passed
        /// through the Russian stemmer, and all other tokens go through the English
        /// stemmer.
        /// </returns>
        /// <remarks>
        /// This is an iterator method, so no work is done (and no exception for a
        /// <c>null</c> argument surfaces) until the result is enumerated.
        /// </remarks>
        public static IEnumerable<string> GetCanonizedTextWords(string text)
        {
            // Token separators: whitespace, any Unicode punctuation, and a few extra symbols.
            const string separatorPattern = @"[\s\p{P}№^\|<>`~$]";

            var russianStemmer = new RussianStemmer();
            var englishStemmer = new EnglishStemmer();

            // Lower-case with the invariant culture so the tokens do not depend on the
            // current thread culture (for example, under tr-TR "I" would become a
            // dotless "ı", which breaks stop-word matching and stemming).
            var normalizedText = text.ToLowerInvariant();

            // The static Regex.Split goes through the regex cache, so the pattern is
            // not re-parsed on every call.
            var tokens = Regex.Split(normalizedText, separatorPattern)
                              .Where(token => token.Length != 0 && !StopWordsFilter.Contains(token));

            foreach (var token in tokens)
            {
                if (IsNumbers(token))
                {
                    // Numeric tokens are not stemmed.
                    yield return token;
                }
                else if (IsRussian(token))
                {
                    yield return russianStemmer.Stem(token);
                }
                else
                {
                    yield return englishStemmer.Stem(token);
                }
            }
        }