/// <summary>
/// Runs the base set-up and then assigns the two analyzers under test:
/// <c>analyzer</c> (ICUTokenizer + CJKBigramFilter) and
/// <c>analyzer2</c> (ICUTokenizer + ICUNormalizer2Filter + CJKBigramFilter).
/// </summary>
public override void SetUp()
{
    base.SetUp();

    // ICUTokenizer + CJKBigramFilter
    analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false, true));
        TokenStream result = new CJKBigramFilter(source);
        return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
    });

    // ICUTokenizer + ICUNormalizer2Filter + CJKBigramFilter.
    //
    // ICUNormalizer2Filter uses nfkc_casefold by default, so this is a language-independent
    // superset of CJKWidthFilter's foldings.
    analyzer2 = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false, true));
        // we put this before the CJKBigramFilter, because the normalization might combine
        // some halfwidth katakana forms, which will affect the bigramming.
        TokenStream result = new ICUNormalizer2Filter(source);
        // Wrap the normalized stream (result), not the raw tokenizer; otherwise the
        // normalizer is dropped from the chain and never sees any tokens.
        result = new CJKBigramFilter(result);
        return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
    });
}
/// <summary>
/// Runs the base set-up and then assigns <c>a</c>: an analyzer whose chain is an
/// ICUTokenizer followed by an ICUNormalizer2Filter.
/// </summary>
public override void SetUp()
{
    base.SetUp();

    a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        // The tokenizer is the head of the chain; the normalizer wraps it directly.
        Tokenizer icuSource = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false, true));
        return new TokenStreamComponents(icuSource, new ICUNormalizer2Filter(icuSource));
    });
}