public virtual void TestRandomStrings()
{
    // Blast random data through PatternTokenizer in split mode (group -1):
    // tokens are the spans of text BETWEEN matches of the pattern.
    Analyzer splitAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer tokenizer = new PatternTokenizer(reader, new Regex("a", RegexOptions.Compiled), -1);
        return new TokenStreamComponents(tokenizer);
    });
    CheckRandomData(Random, splitAnalyzer, 1000 * RandomMultiplier);

    // Same blast test in match mode (group 0): each whole regex match becomes a token.
    Analyzer matchAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer tokenizer = new PatternTokenizer(reader, new Regex("a", RegexOptions.Compiled), 0);
        return new TokenStreamComponents(tokenizer);
    });
    CheckRandomData(Random, matchAnalyzer, 1000 * RandomMultiplier);
}
public virtual void TestOffsetCorrection()
{
    const string INPUT = "Günther Günther is here";

    // Build a MappingCharFilter that rewrites the umlaut sequence; the tokenizer
    // sees the mapped text, so the offsets it reports must be corrected back to
    // positions in the ORIGINAL (pre-mapping) input — that is what this test pins.
    // FIX: removed the old 'mappingRules' list, which was populated but never used (dead code).
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.Add("ü", "ü");
    NormalizeCharMap normMap = builder.Build();
    CharFilter charStream = new MappingCharFilter(normMap, new StringReader(INPUT));

    // Split mode (group -1): tokens are the text between matches of the separator pattern.
    TokenStream stream = new PatternTokenizer(charStream, new Regex("[,;/\\s]+", RegexOptions.Compiled), -1);
    AssertTokenStreamContents(stream,
        new string[] { "Günther", "Günther", "is", "here" },
        new int[] { 0, 13, 26, 29 },  // start offsets in the original input
        new int[] { 12, 25, 28, 33 }, // end offsets in the original input
        INPUT.Length);

    // Match mode (group 0): each whole regex match becomes a token.
    charStream = new MappingCharFilter(normMap, new StringReader(INPUT));
    stream = new PatternTokenizer(charStream, new Regex("Günther", RegexOptions.Compiled), 0);
    AssertTokenStreamContents(stream,
        new string[] { "Günther", "Günther" },
        new int[] { 0, 13 },
        new int[] { 12, 25 },
        INPUT.Length);
}
public virtual void TestOffsetCorrection()
{
    const string INPUT = "Günther Günther is here";

    // Build a MappingCharFilter that rewrites the umlaut sequence; the tokenizer
    // sees the mapped text, so the offsets it reports must be corrected back to
    // positions in the ORIGINAL (pre-mapping) input — that is what this test pins.
    // FIX: removed the old 'mappingRules' JCG.List, which was populated but never used (dead code).
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.Add("ü", "ü");
    NormalizeCharMap normMap = builder.Build();
    CharFilter charStream = new MappingCharFilter(normMap, new StringReader(INPUT));

    // Split mode (group -1): tokens are the text between matches of the separator pattern.
    TokenStream stream = new PatternTokenizer(charStream, new Regex("[,;/\\s]+", RegexOptions.Compiled), -1);
    AssertTokenStreamContents(stream,
        new string[] { "Günther", "Günther", "is", "here" },
        new int[] { 0, 13, 26, 29 },  // start offsets in the original input
        new int[] { 12, 25, 28, 33 }, // end offsets in the original input
        INPUT.Length);

    // Match mode (group 0): each whole regex match becomes a token.
    charStream = new MappingCharFilter(normMap, new StringReader(INPUT));
    stream = new PatternTokenizer(charStream, new Regex("Günther", RegexOptions.Compiled), 0);
    AssertTokenStreamContents(stream,
        new string[] { "Günther", "Günther" },
        new int[] { 0, 13 },
        new int[] { 12, 25 },
        INPUT.Length);
}
public virtual void TestSplitting()
{
    // Captures the text between single quotes.
    string quotedPattern = "\\'([^\\']+)\\'";

    // Each case: { group, pattern, input, expected tokens joined by single spaces }.
    string[][] cases = new string[][]
    {
        new string[] { "-1", "--", "aaa--bbb--ccc", "aaa bbb ccc" },
        new string[] { "-1", ":", "aaa:bbb:ccc", "aaa bbb ccc" },
        //new string[] {"-1", "\\p{Space}", "aaa bbb \t\tccc ", "aaa bbb ccc"}, // LUCENENET: Java-specific Regex syntax. See: http://stackoverflow.com/a/4731164/181087
        new string[] { "-1", "\\s", "aaa bbb \t\tccc ", "aaa bbb ccc" }, // LUCENENET: This is the .NET equivalent
        new string[] { "-1", ":", "boo:and:foo", "boo and foo" },
        new string[] { "-1", "o", "boo:and:foo", "b :and:f" },
        new string[] { "0", ":", "boo:and:foo", ": :" },
        new string[] { "0", quotedPattern, "aaa 'bbb' 'ccc'", "'bbb' 'ccc'" },
        new string[] { "1", quotedPattern, "aaa 'bbb' 'ccc'", "bbb ccc" }
    };

    foreach (string[] testCase in cases)
    {
        int group = int.Parse(testCase[0], CultureInfo.InvariantCulture);
        TokenStream stream = new PatternTokenizer(new StringReader(testCase[2]), new Regex(testCase[1], RegexOptions.Compiled), group);
        string actual = tsToString(stream);
        assertEquals("pattern: " + testCase[1] + " with input: " + testCase[2], testCase[3], actual);
        // NOTE: the original Java test also compared split mode against String.split(),
        // but that comparison is disabled because PatternTokenizer removes empty tokens.
    }
}
public virtual void TestSplitting()
{
    // Captures the text between single quotes.
    string qpattern = "\\'([^\\']+)\\'";

    // Each case: { group, pattern, input, expected tokens joined by single spaces }.
    string[][] tests = new string[][]
    {
        new string[] { "-1", "--", "aaa--bbb--ccc", "aaa bbb ccc" },
        new string[] { "-1", ":", "aaa:bbb:ccc", "aaa bbb ccc" },
        //new string[] {"-1", "\\p{Space}", "aaa bbb \t\tccc ", "aaa bbb ccc"}, // LUCENENET: Java-specific Regex syntax. See: http://stackoverflow.com/a/4731164/181087
        new string[] { "-1", "\\s", "aaa bbb \t\tccc ", "aaa bbb ccc" }, // LUCENENET: This is the .NET equivalent
        new string[] { "-1", ":", "boo:and:foo", "boo and foo" },
        new string[] { "-1", "o", "boo:and:foo", "b :and:f" },
        new string[] { "0", ":", "boo:and:foo", ": :" },
        new string[] { "0", qpattern, "aaa 'bbb' 'ccc'", "'bbb' 'ccc'" },
        new string[] { "1", qpattern, "aaa 'bbb' 'ccc'", "bbb ccc" }
    };

    foreach (string[] test in tests)
    {
        // FIX (CA1305): parse the group index with the invariant culture so the test
        // is not affected by the machine's regional settings (e.g. the negative-sign
        // symbol), matching the other TestSplitting variant in this file.
        TokenStream stream = new PatternTokenizer(new StringReader(test[2]), new Regex(test[1], RegexOptions.Compiled), int.Parse(test[0], CultureInfo.InvariantCulture));
        string @out = tsToString(stream);
        assertEquals("pattern: " + test[1] + " with input: " + test[2], test[3], @out);
        // NOTE: the original Java test also compared split mode against String.split(),
        // but that comparison is disabled because PatternTokenizer removes empty tokens.
    }
}
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    // Match mode (group 0): each whole match of "a" becomes a token.
    Tokenizer source = new PatternTokenizer(reader, new Regex("a", RegexOptions.Compiled), 0);
    return new TokenStreamComponents(source);
}
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    // Split mode (group -1): tokens are the spans of text between matches of "a".
    Tokenizer source = new PatternTokenizer(reader, new Regex("a", RegexOptions.Compiled), -1);
    return new Analyzer.TokenStreamComponents(source);
}
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    // Match mode (group 0): each whole match of "a" becomes a token.
    Tokenizer source = new PatternTokenizer(reader, new Regex("a", RegexOptions.Compiled), 0);
    return new Analyzer.TokenStreamComponents(source);
}