A LetterTokenizer is a tokenizer that divides text at non-letters. That's to say, it defines tokens as maximal strings of adjacent letters, as defined by java.lang.Character.isLetter() predicate.

Note: this does a decent job for most European languages, but does a terrible job for some Asian languages, where words are not separated by spaces.

You must specify the required LuceneVersion compatibility when creating LetterTokenizer:

Ejemplo n.º 1
0
        public virtual void TestLetterAscii()
        {
            Random   random = Random;
            Analyzer left   = new MockAnalyzer(random, jvmLetter, false);
            Analyzer right  = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader);
                return(new TokenStreamComponents(tokenizer, tokenizer));
            });

            for (int i = 0; i < 1000; i++)
            {
                string s = TestUtil.RandomSimpleString(random);
                assertEquals(s, left.GetTokenStream("foo", newStringReader(s)), right.GetTokenStream("foo", newStringReader(s)));
            }
        }
Ejemplo n.º 2
0
        public virtual void TestLetterAsciiHuge()
        {
            Random       random    = Random;
            int          maxLength = 8192; // CharTokenizer.IO_BUFFER_SIZE*2
            MockAnalyzer left      = new MockAnalyzer(random, jvmLetter, false);

            left.MaxTokenLength = 255; // match CharTokenizer's max token length
            Analyzer right = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader);
                return(new TokenStreamComponents(tokenizer, tokenizer));
            });
            int numIterations = AtLeast(50);

            for (int i = 0; i < numIterations; i++)
            {
                string s = TestUtil.RandomSimpleString(random, maxLength);
                assertEquals(s, left.GetTokenStream("foo", newStringReader(s)), right.GetTokenStream("foo", newStringReader(s)));
            }
        }
            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader);

                return(new TokenStreamComponents(tokenizer, tokenizer));
            }
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader);
     return new TokenStreamComponents(tokenizer, tokenizer);
 }
 public virtual void TestGraphs()
 {
     TokenStream tk = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
     tk = new ShingleFilter(tk);
     tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, 7, 10);
     AssertTokenStreamContents(tk, new string[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" }, new int[] { 6, 11, 11, 14 }, new int[] { 13, 19, 19, 21 }, new int[] { 3, 1, 0, 1 }, new int[] { 2, 2, 2, 2 }, 23);
 }