/// <summary>
/// Builds the analysis chain under test: whitespace tokenization wrapped in a
/// fake StandardTokenizer type, an empty stop filter, then CJK bigramming.
/// </summary>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);

    // Chain: fake standard tokenizer -> (empty) stop filter -> CJK bigrams.
    TokenFilter stream = new FakeStandardTokenizer(source);
    stream = new StopFilter(TEST_VERSION_CURRENT, stream, CharArraySet.EMPTY_SET);
    stream = new CJKBigramFilter(stream);

    return new TokenStreamComponents(source, stream);
}
/// <summary>
/// Verifies that a lone CJK character survives the bigram filter as a single
/// token of type &lt;SINGLE&gt; with the expected offsets and position increment.
/// </summary>
public virtual void TestSingleChar2()
{
    Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenFilter chain = new FakeStandardTokenizer(source);
        chain = new StopFilter(TEST_VERSION_CURRENT, chain, CharArraySet.EMPTY_SET);
        chain = new CJKBigramFilter(chain);
        return new TokenStreamComponents(source, chain);
    });

    AssertAnalyzesTo(analyzer, "一",
        new string[] { "一" },        // expected terms
        new int[] { 0 },              // start offsets
        new int[] { 1 },              // end offsets
        new string[] { "<SINGLE>" },  // token types
        new int[] { 1 });             // position increments
}
/// <summary>
/// Creates the token stream pipeline for CJK analysis. For <see cref="LuceneVersion.LUCENE_36"/>
/// and later this is StandardTokenizer -> CJKWidthFilter -> LowerCaseFilter ->
/// CJKBigramFilter -> StopFilter; older versions fall back to the legacy CJKTokenizer.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    // LUCENE_36 and CJKTokenizer are marked obsolete but are referenced here
    // deliberately for back-compat; suppress CS0612/CS0618 as the sibling
    // overloads in this codebase do.
#pragma warning disable 612, 618
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
    {
        Tokenizer source = new StandardTokenizer(matchVersion, reader);
        // run the widthfilter first before bigramming, it sometimes combines characters.
        TokenStream result = new CJKWidthFilter(source);
        result = new LowerCaseFilter(matchVersion, result);
        result = new CJKBigramFilter(result);
        return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
    }
    else
    {
#pragma warning disable 612, 618
        Tokenizer source = new CJKTokenizer(reader);
#pragma warning restore 612, 618
        return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
    }
}
/// <summary>
/// Creates the CJK analysis pipeline. Post-3.6 versions use
/// StandardTokenizer with width normalization, lowercasing and bigramming;
/// earlier versions take the legacy CJKTokenizer path.
/// </summary>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
#pragma warning disable 612, 618
    bool useModernChain = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36);
#pragma warning restore 612, 618

    if (!useModernChain)
    {
        // Legacy path for pre-3.6 indexes.
#pragma warning disable 612, 618
        Tokenizer legacy = new CJKTokenizer(reader);
#pragma warning restore 612, 618
        return new TokenStreamComponents(legacy, new StopFilter(m_matchVersion, legacy, m_stopwords));
    }

    Tokenizer source = new StandardTokenizer(m_matchVersion, reader);
    // run the widthfilter first before bigramming, it sometimes combines characters.
    TokenStream stream = new CJKWidthFilter(source);
    stream = new LowerCaseFilter(m_matchVersion, stream);
    stream = new CJKBigramFilter(stream);
    return new TokenStreamComponents(source, new StopFilter(m_matchVersion, stream, m_stopwords));
}
/// <summary>
/// Creates the CJK analysis pipeline for this analyzer: on 3.6+ a
/// StandardTokenizer followed by width normalization, lowercasing, bigramming
/// and stop filtering; otherwise the legacy CJKTokenizer plus stop filtering.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
#pragma warning disable 612, 618
    bool onOrAfter36 = matchVersion.OnOrAfter(LuceneVersion.LUCENE_36);
#pragma warning restore 612, 618

    if (onOrAfter36)
    {
        Tokenizer source = new StandardTokenizer(matchVersion, reader);
        // run the widthfilter first before bigramming, it sometimes combines characters.
        TokenStream chain = new CJKWidthFilter(source);
        chain = new LowerCaseFilter(matchVersion, chain);
        chain = new CJKBigramFilter(chain);
        TokenStream withStops = new StopFilter(matchVersion, chain, stopwords);
        return new TokenStreamComponents(source, withStops);
    }

    // Legacy pre-3.6 path.
#pragma warning disable 612, 618
    Tokenizer legacySource = new CJKTokenizer(reader);
#pragma warning restore 612, 618
    return new TokenStreamComponents(legacySource, new StopFilter(matchVersion, legacySource, stopwords));
}
/// <summary>
/// Assembles the test analysis chain: whitespace tokenization, a fake
/// StandardTokenizer wrapper, an empty stop filter, and CJK bigramming.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer whitespace = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);

    TokenFilter pipeline = new FakeStandardTokenizer(whitespace);
    pipeline = new StopFilter(TEST_VERSION_CURRENT, pipeline, CharArraySet.EMPTY_SET);
    pipeline = new CJKBigramFilter(pipeline);

    return new TokenStreamComponents(whitespace, pipeline);
}