protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            // Build the analysis chain: grams of length 1..10, normalized to lower case.
            NGramTokenizer tokenizer = new NGramTokenizer(appLuceneVersion, reader, 1, 10);
            TokenStream lowercased = new LowerCaseFilter(appLuceneVersion, tokenizer);

            return new TokenStreamComponents(tokenizer, lowercased);
        }
Пример #2
0
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            // Unigram tokenizer feeding an n-gram filter, followed by stop-word removal.
            var source = new NGramTokenizer(matchVersion, reader, 1, 1);
            TokenStream chain = new NGramTokenFilter(matchVersion, source);
            chain = new StopFilter(matchVersion, chain, stopwordSet);

            return new TokenStreamComponents(source, chain);
        }
Пример #3
0
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            // Ownership of the tokenizer/filter pair is transferred to the returned
            // components, hence the suppressed dispose-ownership diagnostics.
#pragma warning disable IDISP001,CA2000
            NGramTokenizer tokenizer = new NGramTokenizer(_matchVersion, reader);
            TokenStream lowercased = new LowerCaseFilter(_matchVersion, tokenizer);
#pragma warning restore IDISP001,CA2000
            return new TokenStreamComponents(tokenizer, lowercased);
        }
Пример #4
0
        public void TestReset()
        {
            // Tokenizing, resetting with a fresh reader, and tokenizing again
            // must yield identical unigram output both times.
            var tokenizer = new NGramTokenizer(input, 1, 1);

            string[] expectedTokens = { "a", "b", "c", "d", "e" };
            int[]    expectedStarts = { 0, 1, 2, 3, 4 };
            int[]    expectedEnds   = { 1, 2, 3, 4, 5 };

            AssertTokenStreamContents(tokenizer, expectedTokens, expectedStarts, expectedEnds, 5 /* abcde */);
            tokenizer.Reset(new StringReader("abcde"));
            AssertTokenStreamContents(tokenizer, expectedTokens, expectedStarts, expectedEnds, 5 /* abcde */);
        }
Пример #5
0
            protected override TokenStreamComponents CreateComponents(string fieldName, TextReader input)
            {
                // Only the configured gram field receives n-gram analysis;
                // every other field is indexed as a single keyword token.
                if (fieldName != GramField)
                {
                    return new TokenStreamComponents(new KeywordTokenizer(input));
                }

                NGramTokenizer source = new NGramTokenizer(LuceneVersion.LUCENE_48, input, 1, 2);
                TokenStream lowercased = new LowerCaseFilter(LuceneVersion.LUCENE_48, source);
                return new TokenStreamComponents(source, lowercased);
            }
Пример #6
0
        public void TestNgrams()
        {
            // Grams of lengths 1 through 3 over "abcde": all unigrams first,
            // then bigrams, then trigrams, with matching start/end offsets.
            var tokenizer = new NGramTokenizer(input, 1, 3);

            string[] expectedTokens = { "a", "b", "c", "d", "e", "ab", "bc", "cd", "de", "abc", "bcd", "cde" };
            int[]    expectedStarts = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 0, 1, 2 };
            int[]    expectedEnds   = { 1, 2, 3, 4, 5, 2, 3, 4, 5, 3, 4, 5 };

            AssertTokenStreamContents(tokenizer, expectedTokens, expectedStarts, expectedEnds, 5 /* abcde */);
        }
        public void NGramTokenizer_ShouldCreateAnInstanceOfThisType_WhenProperArgument()
        {
            // Arrange
            // Act
            // Both the explicit-dependency and the default constructor must succeed.
            var withDependency = new NGramTokenizer(new ArrayManager());
            var withDefaults = new NGramTokenizer();

            // Assert
            Assert.IsInstanceOf<NGramTokenizer>(withDependency);
            Assert.IsInstanceOf<NGramTokenizer>(withDefaults);
        }
Пример #8
0
        public void TestTokenStream2()
        {
            // NFKC case-folding normalization expands the squared/compatibility
            // characters before unigram tokenization, so offsets map back into
            // the original (shorter) input string.
            String input = "㌰゙5℃№㈱㌘ザゾ";

            CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
                                                             Normalizer2.GetInstance(null, "nfkc_cf", Normalizer2Mode.Compose));

            Tokenizer tokenStream = new NGramTokenizer(TEST_VERSION_CURRENT, reader, 1, 1);

            string[] expectedTokens = { "ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ" };
            int[]    expectedStarts = { 0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9 };
            int[]    expectedEnds   = { 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11 };

            AssertTokenStreamContents(tokenStream, expectedTokens, expectedStarts, expectedEnds, input.Length);
        }
        public void Do_ShouldReturnACollectionOfNGrams_WhenProperArguments()
        {
            // Arrange
            // Act
            // Every overload of Do() must produce the same n-gram collection
            // for the same input text, whether strategy/rule set are explicit
            // or defaulted.
            var results = new List<List<INGram>>
            {
                new NGramTokenizer().Do(
                    ObjectMother.NGramTokenizer_Text1,
                    new TokenizationStrategy(),
                    new NGramTokenizerRuleSet()),
                new NGramTokenizer().Do(
                    ObjectMother.NGramTokenizer_Text1,
                    new TokenizationStrategy()),
                new NGramTokenizer().Do(
                    ObjectMother.NGramTokenizer_Text1,
                    new NGramTokenizerRuleSet()),
                new NGramTokenizer().Do(
                    ObjectMother.NGramTokenizer_Text1)
            };

            // Assert
            foreach (List<INGram> actual in results)
            {
                Assert.IsTrue(
                    ObjectMother.AreEqual(
                        ObjectMother.NGramTokenizer_Text1_NGrams,
                        actual));
            }
        }
Пример #10
0
        public override TokenStream TokenStream(string str, TextReader r)
        {
            // Emit 3- and 4-character grams straight from the reader; no further filtering.
            return new NGramTokenizer(r, 3, 4);
        }
Пример #11
0
        public void TestBigrams()
        {
            // Fixed gram length of 2 over "abcde" yields the four overlapping bigrams.
            var tokenizer = new NGramTokenizer(input, 2, 2);

            string[] expectedTokens = { "ab", "bc", "cd", "de" };
            int[]    expectedStarts = { 0, 1, 2, 3 };
            int[]    expectedEnds   = { 2, 3, 4, 5 };

            AssertTokenStreamContents(tokenizer, expectedTokens, expectedStarts, expectedEnds, 5 /* abcde */);
        }
Пример #12
0
        public void TestUnigrams()
        {
            // Fixed gram length of 1 over "abcde" yields one token per character.
            var tokenizer = new NGramTokenizer(input, 1, 1);

            string[] expectedTokens = { "a", "b", "c", "d", "e" };
            int[]    expectedStarts = { 0, 1, 2, 3, 4 };
            int[]    expectedEnds   = { 1, 2, 3, 4, 5 };

            AssertTokenStreamContents(tokenizer, expectedTokens, expectedStarts, expectedEnds, 5 /* abcde */);
        }
Пример #13
0
        public void TestOversizedNgrams()
        {
            // Gram lengths 6..7 exceed the 5-character input, so no tokens are produced,
            // but the final offset must still reflect the full input length.
            var tokenizer = new NGramTokenizer(input, 6, 7);

            AssertTokenStreamContents(tokenizer, new String[0], new int[0], new int[0], 5 /* abcde */);
        }