Forms bigrams of CJK terms that are generated from StandardTokenizer or ICUTokenizer.

CJK token types are set by these tokenizers, but you can also use the CJKBigramFilter(TokenStream, int) constructor to explicitly control which of the CJK scripts are turned into bigrams.
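For example, the following sketch restricts bigramming to Han and Hiragana runs only. It assumes the Lucene.NET 4.8 API surface, where the script flags are exposed as a CJKScript flags enum (the Java original uses int constants on CJKBigramFilter); adjust the flag spelling to match the version you are actually using.

    using System.IO;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Cjk;
    using Lucene.Net.Analysis.Standard;
    using Lucene.Net.Util;

    // Sketch: form bigrams only for Han and Hiragana; Katakana and Hangul
    // tokens are left as unigrams. CJKScript.HAN / CJKScript.HIRAGANA are
    // assumed flag names (see the note above).
    Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer source = new StandardTokenizer(LuceneVersion.LUCENE_48, reader);
        TokenStream result = new CJKBigramFilter(source, CJKScript.HAN | CJKScript.HIRAGANA);
        return new TokenStreamComponents(source, result);
    });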

By default, when a CJK character has no adjacent characters to form a bigram, it is output in unigram form. If you want to always output both unigrams and bigrams, set the outputUnigrams flag in the CJKBigramFilter(TokenStream, int, boolean) constructor. This can be used for a combined unigram+bigram approach.
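The sketch below shows that combined unigram+bigram configuration by passing true for the outputUnigrams argument (same caveat about the assumed flag names as above).

    using System.IO;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Cjk;
    using Lucene.Net.Analysis.Standard;
    using Lucene.Net.Util;

    // Sketch: emit every CJK character as a unigram *and* emit the overlapping
    // bigrams. Non-CJK tokens (e.g. Latin words) pass through unmodified.
    Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer source = new StandardTokenizer(LuceneVersion.LUCENE_48, reader);
        TokenStream result = new CJKBigramFilter(source,
            CJKScript.HAN | CJKScript.HIRAGANA | CJKScript.KATAKANA | CJKScript.HANGUL,
            true); // outputUnigrams (assumed parameter name, per the note above)
        return new TokenStreamComponents(source, result);
    });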

In all cases, all non-CJK input is passed through unmodified.

Inheritance: TokenFilter
Example #1
            protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer   tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenFilter filter    = new FakeStandardTokenizer(tokenizer);

                filter = new StopFilter(TEST_VERSION_CURRENT, filter, CharArraySet.EMPTY_SET);
                filter = new CJKBigramFilter(filter);
                return new TokenStreamComponents(tokenizer, filter);
            }
Example #2
        public virtual void TestSingleChar2()
        {
            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenFilter filter  = new FakeStandardTokenizer(tokenizer);
                filter = new StopFilter(TEST_VERSION_CURRENT, filter, CharArraySet.EMPTY_SET);
                filter = new CJKBigramFilter(filter);
                return new TokenStreamComponents(tokenizer, filter);
            });

            AssertAnalyzesTo(analyzer, "一", new string[] { "一" }, new int[] { 0 }, new int[] { 1 }, new string[] { "<SINGLE>" }, new int[] { 1 });
        }
Example #3
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
     {
         Tokenizer source = new StandardTokenizer(matchVersion, reader);
         // run the widthfilter first before bigramming, it sometimes combines characters.
         TokenStream result = new CJKWidthFilter(source);
         result = new LowerCaseFilter(matchVersion, result);
         result = new CJKBigramFilter(result);
         return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
     }
     else
     {
         Tokenizer source = new CJKTokenizer(reader);
         return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
     }
 }
Example #4
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
#pragma warning disable 612, 618
            if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
            {
                Tokenizer source = new StandardTokenizer(m_matchVersion, reader);
                // run the widthfilter first before bigramming, it sometimes combines characters.
                TokenStream result = new CJKWidthFilter(source);
                result = new LowerCaseFilter(m_matchVersion, result);
                result = new CJKBigramFilter(result);
                return new TokenStreamComponents(source, new StopFilter(m_matchVersion, result, m_stopwords));
            }
            else
            {
#pragma warning disable 612, 618
                Tokenizer source = new CJKTokenizer(reader);
#pragma warning restore 612, 618
                return new TokenStreamComponents(source, new StopFilter(m_matchVersion, source, m_stopwords));
            }
        }
Example #5
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
            {
                Tokenizer source = new StandardTokenizer(matchVersion, reader);
                // run the widthfilter first before bigramming, it sometimes combines characters.
                TokenStream result = new CJKWidthFilter(source);
                result = new LowerCaseFilter(matchVersion, result);
                result = new CJKBigramFilter(result);
                return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
            }
            else
            {
#pragma warning disable 612, 618
                Tokenizer source = new CJKTokenizer(reader);
#pragma warning restore 612, 618
                return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
            }
        }