This tokenizer uses regex pattern matching to construct distinct tokens for the input stream. It takes two arguments: "pattern" and "group".

  • "pattern" is the regular expression.
  • "group" says which group to extract into tokens.

group=-1 (the default) is equivalent to "split". In this case, the tokens will be equivalent to the output from (without empty tokens): String#split(java.lang.String)

Using group >= 0 selects the matching group as the token. For example, if you have:

 pattern = \'([^\']+)\' group = 0 input = aaa 'bbb' 'ccc' 
the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). With the same input but using group=1, the output would be: bbb and ccc (no ' marks)

NOTE: This Tokenizer does not output tokens that are of zero length.

Inheritance: Tokenizer
Esempio n. 1
0
        public virtual void TestRandomStrings()
        {
            Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer tokenizer = new PatternTokenizer(reader, new Regex("a", RegexOptions.Compiled), -1);
                return(new TokenStreamComponents(tokenizer));
            });

            CheckRandomData(Random, a, 1000 * RandomMultiplier);

            Analyzer b = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer tokenizer = new PatternTokenizer(reader, new Regex("a", RegexOptions.Compiled), 0);
                return(new TokenStreamComponents(tokenizer));
            });

            CheckRandomData(Random, b, 1000 * RandomMultiplier);
        }
        public virtual void TestOffsetCorrection()
        {
            const string INPUT = "Günther Günther is here";

            // create MappingCharFilter
            IList<string> mappingRules = new List<string>();
            mappingRules.Add("\"&uuml;\" => \"ü\"");
            NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
            builder.Add("&uuml;", "ü");
            NormalizeCharMap normMap = builder.Build();
            CharFilter charStream = new MappingCharFilter(normMap, new StringReader(INPUT));

            // create PatternTokenizer
            TokenStream stream = new PatternTokenizer(charStream, new Regex("[,;/\\s]+", RegexOptions.Compiled), -1);
            AssertTokenStreamContents(stream, new string[] { "Günther", "Günther", "is", "here" }, new int[] { 0, 13, 26, 29 }, new int[] { 12, 25, 28, 33 }, INPUT.Length);

            charStream = new MappingCharFilter(normMap, new StringReader(INPUT));
            stream = new PatternTokenizer(charStream, new Regex("Günther", RegexOptions.Compiled), 0);
            AssertTokenStreamContents(stream, new string[] { "Günther", "Günther" }, new int[] { 0, 13 }, new int[] { 12, 25 }, INPUT.Length);
        }
Esempio n. 3
0
        public virtual void TestOffsetCorrection()
        {
            const string INPUT = "G&uuml;nther G&uuml;nther is here";

            // create MappingCharFilter
            IList <string> mappingRules = new JCG.List <string>();

            mappingRules.Add("\"&uuml;\" => \"ü\"");
            NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
            builder.Add("&uuml;", "ü");
            NormalizeCharMap normMap    = builder.Build();
            CharFilter       charStream = new MappingCharFilter(normMap, new StringReader(INPUT));

            // create PatternTokenizer
            TokenStream stream = new PatternTokenizer(charStream, new Regex("[,;/\\s]+", RegexOptions.Compiled), -1);

            AssertTokenStreamContents(stream, new string[] { "Günther", "Günther", "is", "here" }, new int[] { 0, 13, 26, 29 }, new int[] { 12, 25, 28, 33 }, INPUT.Length);

            charStream = new MappingCharFilter(normMap, new StringReader(INPUT));
            stream     = new PatternTokenizer(charStream, new Regex("Günther", RegexOptions.Compiled), 0);
            AssertTokenStreamContents(stream, new string[] { "Günther", "Günther" }, new int[] { 0, 13 }, new int[] { 12, 25 }, INPUT.Length);
        }
Esempio n. 4
0
        public virtual void TestSplitting()
        {
            string qpattern = "\\'([^\\']+)\\'"; // get stuff between "'"

            string[][] tests = new string[][]
            {
                new string[] { "-1", "--", "aaa--bbb--ccc", "aaa bbb ccc" },
                new string[] { "-1", ":", "aaa:bbb:ccc", "aaa bbb ccc" },
                //new string[] {"-1", "\\p{Space}", "aaa   bbb \t\tccc  ", "aaa bbb ccc"}, // LUCENENET: Java-specific Regex syntax. See: http://stackoverflow.com/a/4731164/181087
                new string[] { "-1", "\\s", "aaa   bbb \t\tccc  ", "aaa bbb ccc" }, // LUCENENET: This is the .NET equivalent
                new string[] { "-1", ":", "boo:and:foo", "boo and foo" },
                new string[] { "-1", "o", "boo:and:foo", "b :and:f" },
                new string[] { "0", ":", "boo:and:foo", ": :" },
                new string[] { "0", qpattern, "aaa 'bbb' 'ccc'", "'bbb' 'ccc'" },
                new string[] { "1", qpattern, "aaa 'bbb' 'ccc'", "bbb ccc" }
            };

            foreach (string[] test in tests)
            {
                TokenStream stream = new PatternTokenizer(new StringReader(test[2]), new Regex(test[1], RegexOptions.Compiled), int.Parse(test[0], CultureInfo.InvariantCulture));
                string      @out   = tsToString(stream);
                // System.out.println( test[2] + " ==> " + out );

                assertEquals("pattern: " + test[1] + " with input: " + test[2], test[3], @out);

                // Make sure it is the same as if we called 'split'
                // test disabled, as we remove empty tokens

                /*if( "-1".equals( test[0] ) ) {
                 * String[] split = test[2].split( test[1] );
                 * stream = tokenizer.create( new StringReader( test[2] ) );
                 * int i=0;
                 * for( Token t = stream.next(); null != t; t = stream.next() )
                 * {
                 *  assertEquals( "split: "+test[1] + " "+i, split[i++], new String(t.termBuffer(), 0, t.termLength()) );
                 * }
                 * }*/
            }
        }
        public virtual void TestSplitting()
        {
            string qpattern = "\\'([^\\']+)\\'"; // get stuff between "'"
            string[][] tests = new string[][]
            {
            new string[] {"-1", "--", "aaa--bbb--ccc", "aaa bbb ccc"},
            new string[] {"-1", ":", "aaa:bbb:ccc", "aaa bbb ccc"},
            //new string[] {"-1", "\\p{Space}", "aaa   bbb \t\tccc  ", "aaa bbb ccc"}, // LUCENENET: Java-specific Regex syntax. See: http://stackoverflow.com/a/4731164/181087
            new string[] {"-1", "\\s", "aaa   bbb \t\tccc  ", "aaa bbb ccc"}, // LUCENENET: This is the .NET equivalent
            new string[] {"-1", ":", "boo:and:foo", "boo and foo"},
            new string[] {"-1", "o", "boo:and:foo", "b :and:f"},
            new string[] {"0", ":", "boo:and:foo", ": :"},
            new string[] {"0", qpattern, "aaa 'bbb' 'ccc'", "'bbb' 'ccc'"},
            new string[] {"1", qpattern, "aaa 'bbb' 'ccc'", "bbb ccc"}
            };

            foreach (string[] test in tests)
            {
                TokenStream stream = new PatternTokenizer(new StringReader(test[2]), new Regex(test[1], RegexOptions.Compiled), int.Parse(test[0]));
                string @out = tsToString(stream);
                // System.out.println( test[2] + " ==> " + out );

                assertEquals("pattern: " + test[1] + " with input: " + test[2], test[3], @out);

                // Make sure it is the same as if we called 'split'
                // test disabled, as we remove empty tokens
                /*if( "-1".equals( test[0] ) ) {
                  String[] split = test[2].split( test[1] );
                  stream = tokenizer.create( new StringReader( test[2] ) );
                  int i=0;
                  for( Token t = stream.next(); null != t; t = stream.next() ) 
                  {
                    assertEquals( "split: "+test[1] + " "+i, split[i++], new String(t.termBuffer(), 0, t.termLength()) );
                  }
                }*/
            }
        }
Esempio n. 6
0
            protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer tokenizer = new PatternTokenizer(reader, new Regex("a", RegexOptions.Compiled), 0);

                return(new TokenStreamComponents(tokenizer));
            }
Esempio n. 7
0
            public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer tokenizer = new PatternTokenizer(reader, new Regex("a", RegexOptions.Compiled), -1);

                return(new Analyzer.TokenStreamComponents(tokenizer));
            }
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer tokenizer = new PatternTokenizer(reader, new Regex("a", RegexOptions.Compiled), 0);
     return new Analyzer.TokenStreamComponents(tokenizer);
 }