public override void Run()
{
    try
    {
        startingGun.Wait();
        long tokenCount = 0;
        string contents = "英 เบียร์ ビール ເບຍ abc";
        for (int i = 0; i < 1000; i++)
        {
            // tokenize the same multi-script content repeatedly on this thread, counting tokens
            Tokenizer tokenizer = new ICUTokenizer(new StringReader(contents));
            tokenizer.Reset();
            while (tokenizer.IncrementToken())
            {
                tokenCount++;
            }
            tokenizer.End();
            if (Verbose)
            {
                SystemConsole.Out.WriteLine(tokenCount);
            }
        }
    }
    catch (Exception e) when (e.IsException())
    {
        throw RuntimeException.Create(e);
    }
}
public override void SetUp()
{
    base.SetUp();

    /**
     * ICUTokenizer+CJKBigramFilter
     */
    analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false, true));
        TokenStream result = new CJKBigramFilter(source);
        return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
    });

    /**
     * ICUTokenizer+ICUNormalizer2Filter+CJKBigramFilter.
     *
     * ICUNormalizer2Filter uses nfkc_casefold by default, so this is a language-independent
     * superset of CJKWidthFilter's foldings.
     */
    analyzer2 = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false, true));
        // we put this before the CJKBigramFilter, because the normalization might combine
        // some halfwidth katakana forms, which will affect the bigramming.
        TokenStream result = new ICUNormalizer2Filter(source);
        result = new CJKBigramFilter(result);
        return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
    });
}
public override void SetUp()
{
    base.SetUp();
    a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer tokenizer = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false, false));
        return new TokenStreamComponents(tokenizer);
    });
}
public void TestHugeDoc()
{
    // 4094 chars of leading whitespace push the two real tokens toward the end of the document
    StringBuilder sb = new StringBuilder();
    char[] whitespace = new char[4094];
    Arrays.Fill(whitespace, ' ');
    sb.Append(whitespace);
    sb.Append("testing 1234");
    string input = sb.ToString();
    ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input), new DefaultICUTokenizerConfig(false, true));
    AssertTokenStreamContents(tokenizer, new string[] { "testing", "1234" });
}
public void TestHugeTerm2()
{
    // a single 40960-char run of 'a' should be emitted as ten 4096-char tokens
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 40960; i++)
    {
        sb.Append('a');
    }
    string input = sb.ToString();
    ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input), new DefaultICUTokenizerConfig(false, true));
    char[] token = new char[4096];
    Arrays.Fill(token, 'a');
    string expectedToken = new string(token);
    string[] expected =
    {
        expectedToken, expectedToken, expectedToken, expectedToken, expectedToken,
        expectedToken, expectedToken, expectedToken, expectedToken, expectedToken
    };
    AssertTokenStreamContents(tokenizer, expected);
}