protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    // Emit 1- to 10-character n-grams, lowercased.
    var source = new NGramTokenizer(appLuceneVersion, reader, 1, 10);
    var filter = new LowerCaseFilter(appLuceneVersion, source);
    return new TokenStreamComponents(source, filter);
}
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    // Unigram tokenizer, wrapped in an NGramTokenFilter (default gram sizes) and a stop-word filter.
    Tokenizer tokenizer = new NGramTokenizer(matchVersion, reader, 1, 1);
    TokenStream tokenStream = new NGramTokenFilter(matchVersion, tokenizer);
    tokenStream = new StopFilter(matchVersion, tokenStream, stopwordSet);
    return new TokenStreamComponents(tokenizer, tokenStream);
}
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
#pragma warning disable IDISP001, CA2000 // Ownership of both disposables passes to TokenStreamComponents.
    var source = new NGramTokenizer(_matchVersion, reader);
    var filter = new LowerCaseFilter(_matchVersion, source);
#pragma warning restore IDISP001, CA2000
    return new TokenStreamComponents(source, filter);
}
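The three overrides above all plug NGramTokenizer into the Lucene.Net 4.x Analyzer API. As a minimal sketch of how such an analyzer is consumed, assuming Lucene.Net 4.8 (the NGramAnalyzer class and the "body" field name are illustrative, not taken from the snippets above):

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.NGram;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;

public sealed class NGramAnalyzer : Analyzer
{
    protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
    {
        var source = new NGramTokenizer(LuceneVersion.LUCENE_48, reader, 1, 10);
        var filter = new LowerCaseFilter(LuceneVersion.LUCENE_48, source);
        return new TokenStreamComponents(source, filter);
    }
}

public static class NGramDemo
{
    public static void Main()
    {
        using var analyzer = new NGramAnalyzer();
        using TokenStream stream = analyzer.GetTokenStream("body", new StringReader("Abc"));
        ICharTermAttribute term = stream.AddAttribute<ICharTermAttribute>();
        stream.Reset();  // required before the first IncrementToken()
        while (stream.IncrementToken())
        {
            Console.WriteLine(term.ToString());  // the lowercased 1- to 3-character grams of "Abc"
        }
        stream.End();
    }
}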
public void TestReset()
{
    NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
    AssertTokenStreamContents(tokenizer,
        new String[] { "a", "b", "c", "d", "e" },
        new int[] { 0, 1, 2, 3, 4 },
        new int[] { 1, 2, 3, 4, 5 },
        5 /* abcde */);
    tokenizer.Reset(new StringReader("abcde"));
    AssertTokenStreamContents(tokenizer,
        new String[] { "a", "b", "c", "d", "e" },
        new int[] { 0, 1, 2, 3, 4 },
        new int[] { 1, 2, 3, 4, 5 },
        5 /* abcde */);
}
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader input)
{
    if (fieldName == GramField)
    {
        var tokenizer = new NGramTokenizer(LuceneVersion.LUCENE_48, input, 1, 2);
        var lowercaseFilter = new LowerCaseFilter(LuceneVersion.LUCENE_48, tokenizer);
        return new TokenStreamComponents(tokenizer, lowercaseFilter);
    }
    // Every other field is indexed as a single keyword token.
    return new TokenStreamComponents(new KeywordTokenizer(input));
}
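As a sketch of an alternative to branching on fieldName inside CreateComponents, the same routing can be expressed with PerFieldAnalyzerWrapper, again assuming Lucene.Net 4.8 (the "grams" field name is illustrative):

using System.Collections.Generic;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Miscellaneous;
using Lucene.Net.Analysis.NGram;
using Lucene.Net.Util;

// 1- to 2-character grams, lowercased, for the "grams" field only.
Analyzer gramAnalyzer = Analyzer.NewAnonymous((fieldName, reader) =>
{
    var tokenizer = new NGramTokenizer(LuceneVersion.LUCENE_48, reader, 1, 2);
    var filter = new LowerCaseFilter(LuceneVersion.LUCENE_48, tokenizer);
    return new TokenStreamComponents(tokenizer, filter);
});

// Everything else falls back to keyword (whole-value) analysis.
Analyzer perField = new PerFieldAnalyzerWrapper(
    new KeywordAnalyzer(),
    new Dictionary<string, Analyzer> { ["grams"] = gramAnalyzer });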
public void TestNgrams()
{
    NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
    AssertTokenStreamContents(tokenizer,
        new String[] { "a", "b", "c", "d", "e", "ab", "bc", "cd", "de", "abc", "bcd", "cde" },
        new int[] { 0, 1, 2, 3, 4, 0, 1, 2, 3, 0, 1, 2 },
        new int[] { 1, 2, 3, 4, 5, 2, 3, 4, 5, 3, 4, 5 },
        5 /* abcde */);
}
public void NGramTokenizer_ShouldCreateAnInstanceOfThisType_WhenProperArgument()
{
    // Arrange
    // Act
    NGramTokenizer actual1 = new NGramTokenizer(new ArrayManager());
    NGramTokenizer actual2 = new NGramTokenizer();

    // Assert
    Assert.IsInstanceOf<NGramTokenizer>(actual1);
    Assert.IsInstanceOf<NGramTokenizer>(actual2);
}
public void TestTokenStream2()
{
    // Input code units: '㌰', '゙' (combining dakuten), '5', '℃', '№', '㈱', '㌘',
    // 'サ', '゙', 'ソ', '゙' (ザ and ゾ are stored in decomposed form).
    String input = "㌰゙5℃№㈱㌘ザゾ";
    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
        Normalizer2.GetInstance(null, "nfkc_cf", Normalizer2Mode.Compose));
    Tokenizer tokenStream = new NGramTokenizer(TEST_VERSION_CURRENT, reader, 1, 1);
    AssertTokenStreamContents(tokenStream,
        new String[] { "ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ" },
        new int[] { 0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9 },
        new int[] { 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11 },
        input.Length);
}
public void Do_ShouldReturnACollectionOfNGrams_WhenProperArguments()
{
    // Arrange
    // Act
    List<INGram> actual1 = new NGramTokenizer().Do(ObjectMother.NGramTokenizer_Text1, new TokenizationStrategy(), new NGramTokenizerRuleSet());
    List<INGram> actual2 = new NGramTokenizer().Do(ObjectMother.NGramTokenizer_Text1, new TokenizationStrategy());
    List<INGram> actual3 = new NGramTokenizer().Do(ObjectMother.NGramTokenizer_Text1, new NGramTokenizerRuleSet());
    List<INGram> actual4 = new NGramTokenizer().Do(ObjectMother.NGramTokenizer_Text1);

    // Assert
    Assert.IsTrue(ObjectMother.AreEqual(ObjectMother.NGramTokenizer_Text1_NGrams, actual1));
    Assert.IsTrue(ObjectMother.AreEqual(ObjectMother.NGramTokenizer_Text1_NGrams, actual2));
    Assert.IsTrue(ObjectMother.AreEqual(ObjectMother.NGramTokenizer_Text1_NGrams, actual3));
    Assert.IsTrue(ObjectMother.AreEqual(ObjectMother.NGramTokenizer_Text1_NGrams, actual4));
}
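For the non-Lucene NGramTokenizer exercised by the two tests above (the NW.NGramTextClassification-style API), a minimal usage sketch built only from the calls those tests show; reading ngram.Value in the loop is an assumption about INGram, not something the tests confirm:

// Tokenize with the default strategy and rule set, then print each n-gram.
List<INGram> ngrams = new NGramTokenizer().Do("We selected a sample text.");
foreach (INGram ngram in ngrams)
{
    Console.WriteLine(ngram.Value);  // assumed member; the tests only compare whole lists
}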
public override TokenStream TokenStream(string str, TextReader r)
{
    // Legacy (pre-4.x) Analyzer API: emit 3- and 4-character n-grams, unfiltered.
    var source = new NGramTokenizer(r, 3, 4);
    return source;
}
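A consumption sketch for this legacy API, assuming Lucene.Net 3.0.3; the LegacyNGramAnalyzer wrapper is a hypothetical class carrying the same override as above:

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.NGram;
using Lucene.Net.Analysis.Tokenattributes;

class LegacyNGramAnalyzer : Analyzer
{
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        return new NGramTokenizer(reader, 3, 4);
    }
}

class LegacyDemo
{
    static void Main()
    {
        var analyzer = new LegacyNGramAnalyzer();
        using (TokenStream ts = analyzer.TokenStream("body", new StringReader("lucene")))
        {
            ITermAttribute term = ts.AddAttribute<ITermAttribute>();
            while (ts.IncrementToken())
            {
                // The pre-4.4 NGramTokenizer emits grams grouped by size:
                // luc, uce, cen, ene, then luce, ucen, cene.
                Console.WriteLine(term.Term);
            }
        }
    }
}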
public void TestBigrams()
{
    NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
    AssertTokenStreamContents(tokenizer,
        new String[] { "ab", "bc", "cd", "de" },
        new int[] { 0, 1, 2, 3 },
        new int[] { 2, 3, 4, 5 },
        5 /* abcde */);
}
public void TestUnigrams()
{
    NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
    AssertTokenStreamContents(tokenizer,
        new String[] { "a", "b", "c", "d", "e" },
        new int[] { 0, 1, 2, 3, 4 },
        new int[] { 1, 2, 3, 4, 5 },
        5 /* abcde */);
}
public void TestOversizedNgrams()
{
    NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
    AssertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
}
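The oversized case is worth spelling out: when minGram exceeds the input length, no gram fits and the stream is simply empty. A quick hedged check outside the test harness, using the same 3.x-style constructor as these tests:

// No n-gram of length 6 or 7 fits in a five-character input.
var tokenizer = new NGramTokenizer(new StringReader("abcde"), 6, 7);
int count = 0;
while (tokenizer.IncrementToken())
{
    count++;
}
// count == 0 here: the tokenizer never produces a token.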