// Runs a sample English sentence through StopWordsFilter (backed by
// DefaultStopWordProvider) and checks the trimmed output against the
// expected sentence, which omits "A", "over" and "the".
public void Removes_Stopswords()
{
    // Arrange
    const string sentence = "A quick brown fox jumps over the lazy dog";
    const string expected = "quick brown fox jumps lazy dog";
    var stopWordsFilter = new StopWordsFilter(new DefaultStopWordProvider());

    // Act
    var actual = stopWordsFilter.Execute(sentence).Trim();

    // Assert
    Assert.Equal(expected, actual);
}
// Feeds a string containing the capitalised tokens "A" and "The" through
// StopWordsFilter (backed by DefaultStopWordProvider) and checks that the
// trimmed output is "Article Article".
public void Removes_Articles()
{
    // Arrange
    const string sentence = "Article A Article The";
    const string expected = "Article Article";
    var stopWordsFilter = new StopWordsFilter(new DefaultStopWordProvider());

    // Act
    var actual = stopWordsFilter.Execute(sentence).Trim();

    // Assert
    Assert.Equal(expected, actual);
}
// Filters a mixed-case word list through a StopWordsFilter built on a fresh
// StopWords instance and expects the result to be equivalent to the
// lower-cased words "abc", "cba" and "car".
public void StopWordsFilter_FilterAllStopWords_AndMakeLowerCase()
{
    // Arrange
    var sut = new StopWordsFilter(new StopWords());
    var words = new List<string> { "Abc", "of", "cba", "IN", "the", "car" };
    var expected = new List<string> { "abc", "cba", "car" };

    // Act
    var filtered = sut.Filter(words);

    // Assert
    filtered.Should().BeEquivalentTo(expected);
}
// Removes "iN" from the StopWords instance after the filter has been
// constructed on it, then expects "IN" from the input to appear in the
// filtered result as "in".
public void DeleteStopWord_WorksCorrectly()
{
    // Arrange
    var stopWordSet = new StopWords();
    var sut = new StopWordsFilter(stopWordSet);

    // The removal deliberately happens after the filter was constructed.
    stopWordSet.Remove("iN");

    var words = new List<string> { "Abc", "of", "cba", "IN", "the", "car" };
    var expected = new List<string> { "abc", "in", "cba", "car" };

    // Act
    var filtered = sut.Filter(words);

    // Assert
    filtered.Should().BeEquivalentTo(expected);
}
// Rebuilds the filter under test from a new HashSet copy of the stopWords
// member and the normalizer member.
public void SetUp()
{
    var stopWordSet = new HashSet<string>(stopWords);
    stopWordsFilter = new StopWordsFilter(stopWordSet, normalizer);
}
/// <summary>
/// Splits <paramref name="text"/> into lower-cased tokens, drops empty tokens
/// and tokens for which <c>StopWordsFilter.Contains</c> returns true, and
/// yields a canonical form of each remaining token.
/// </summary>
/// <param name="text">The text to tokenize.</param>
/// <returns>
/// A lazily evaluated sequence: tokens accepted by <c>IsNumbers</c> are yielded
/// unchanged, tokens accepted by <c>IsRussian</c> are passed through the
/// Russian stemmer, and all other tokens are passed through the English stemmer.
/// </returns>
/// <remarks>
/// This is an iterator method, so no work (including dereferencing
/// <paramref name="text"/>) happens until the result is enumerated.
/// </remarks>
public static IEnumerable<string> GetCanonizedTextWords(string text)
{
    var rusStemmer = new RussianStemmer();
    var enStemmer = new EnglishStemmer();

    // Token separators: whitespace, Unicode punctuation and the listed symbols.
    var separators = new Regex(@"[\s\p{P}№^\|<>`~$]");

    // Lower-case with the invariant culture so the result does not depend on
    // the current thread culture (e.g. under tr-TR, "I" would otherwise map to
    // dotless "ı", which changes the tokens seen by the stop-word check and
    // the stemmers).
    var words = separators
        .Split(text.ToLowerInvariant())
        .Where(s => s != string.Empty && !StopWordsFilter.Contains(s));

    foreach (var word in words)
    {
        if (IsNumbers(word))
        {
            yield return word;
        }
        else if (IsRussian(word))
        {
            yield return rusStemmer.Stem(word);
        }
        else
        {
            yield return enStemmer.Stem(word);
        }
    }
}