Пример #1
0
        /// <summary>
        /// Runs the base set-up, then initializes the <c>analyzer</c> and <c>analyzer2</c> fields.
        /// <para/>
        /// <c>analyzer</c> is ICUTokenizer + CJKBigramFilter.
        /// <c>analyzer2</c> is ICUTokenizer + ICUNormalizer2Filter + CJKBigramFilter.
        /// Both chains end in a <see cref="StopFilter"/> built with an empty stop set.
        /// </summary>
        public override void SetUp()
        {
            base.SetUp();

            /**
             * ICUTokenizer+CJKBigramFilter
             */
            analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false, true));
                TokenStream result = new CJKBigramFilter(source);
                return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
            });

            /**
             * ICUTokenizer+ICUNormalizer2Filter+CJKBigramFilter.
             *
             * ICUNormalizer2Filter uses nfkc_casefold by default, so this is a language-independent
             * superset of CJKWidthFilter's foldings.
             */
            analyzer2 = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false, true));
                // we put this before the CJKBigramFilter, because the normalization might combine
                // some halfwidth katakana forms, which will affect the bigramming.
                TokenStream result = new ICUNormalizer2Filter(source);
                // Chain onto the normalized stream (result), not the raw tokenizer; wrapping
                // source here would silently drop the ICUNormalizer2Filter from the chain.
                result = new CJKBigramFilter(result);
                return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
            });
        }
Пример #2
0
 /// <summary>
 /// Runs the base set-up, then assigns <c>a</c> an analyzer whose chain is
 /// an ICUTokenizer followed by an ICUNormalizer2Filter.
 /// </summary>
 public override void SetUp()
 {
     base.SetUp();

     a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
     {
         // Build the tokenizer first; it is both the chain's source and the filter's input.
         var icuConfig = new DefaultICUTokenizerConfig(false, true);
         Tokenizer source = new ICUTokenizer(reader, icuConfig);

         return new TokenStreamComponents(source, new ICUNormalizer2Filter(source));
     });
 }