/// <summary>
/// Builds the <see cref="Analyzer.TokenStreamComponents"/> that tokenize and
/// filter all of the text read from <paramref name="reader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not referenced by this implementation.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// <see cref="Analyzer.TokenStreamComponents"/> whose tokenizer is a
/// <see cref="StandardTokenizer"/> when the match version is on or after
/// <c>LUCENE_36</c> (an <c>IndicTokenizer</c> otherwise), wrapped, in this order, by
/// <see cref="LowerCaseFilter"/>, <see cref="SetKeywordMarkerFilter"/> (only when the
/// stem exclusion set is non-empty), <see cref="IndicNormalizationFilter"/>,
/// <see cref="HindiNormalizationFilter"/>, a <see cref="StopFilter"/> over the
/// configured stop words, and finally <see cref="HindiStemFilter"/>.
/// </returns>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer;

    // 612/618 are the compiler's obsolete-member warnings; they are silenced
    // only for the version-dependent tokenizer selection below.
#pragma warning disable 612, 618
    if (!m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
    {
        tokenizer = new IndicTokenizer(m_matchVersion, reader);
    }
    else
    {
        tokenizer = new StandardTokenizer(m_matchVersion, reader);
    }
#pragma warning restore 612, 618

    TokenStream stream = new LowerCaseFilter(m_matchVersion, tokenizer);

    // The keyword marker is inserted before normalization and stemming,
    // and only when there is at least one excluded term.
    if (stemExclusionSet.Count > 0)
    {
        stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
    }

    // Indic normalization runs first, then Hindi-specific normalization.
    stream = new HindiNormalizationFilter(new IndicNormalizationFilter(stream));

    // Stop words are removed before the stemmer sees the tokens.
    stream = new StopFilter(m_matchVersion, stream, m_stopwords);
    stream = new HindiStemFilter(stream);

    return new TokenStreamComponents(tokenizer, stream);
}
/// <summary>
/// Creates the <see cref="TokenStreamComponents"/> used to tokenize all the
/// text supplied by <paramref name="reader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not referenced by this implementation.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
/// (or an <c>IndicTokenizer</c> when the match version is before <c>LUCENE_36</c>),
/// filtered, in this order, with <see cref="LowerCaseFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> (only when the stem exclusion set is not empty),
/// <see cref="IndicNormalizationFilter"/>, <see cref="HindiNormalizationFilter"/>,
/// a <see cref="StopFilter"/> over the configured stop words, and
/// <see cref="HindiStemFilter"/>.
/// </returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    // NOTE: this local was declared 'final' in the Java original
    // (final org.apache.lucene.analysis.Tokenizer source); it is assigned exactly once below.
    Tokenizer tokenizer;
    if (matchVersion.onOrAfter(Version.LUCENE_36))
    {
        tokenizer = new StandardTokenizer(matchVersion, reader);
    }
    else
    {
        tokenizer = new IndicTokenizer(matchVersion, reader);
    }

    TokenStream chain = new LowerCaseFilter(matchVersion, tokenizer);

    // The keyword marker is only added when there are excluded terms.
    if (!stemExclusionSet.Empty)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    chain = new IndicNormalizationFilter(chain);
    chain = new HindiNormalizationFilter(chain);
    chain = new StopFilter(matchVersion, chain, stopwords);

    // The stemmer is the outermost filter, so it runs after stop-word removal.
    TokenStream stemmed = new HindiStemFilter(chain);
    return new TokenStreamComponents(tokenizer, stemmed);
}
/// <summary>
/// Runs <paramref name="input"/> through a whitespace <see cref="MockTokenizer"/>
/// followed by an <see cref="IndicNormalizationFilter"/> and asserts, via
/// <c>AssertTokenStreamContents</c>, that the resulting stream matches the single
/// expected term <paramref name="output"/>.
/// </summary>
/// <param name="input">Text fed to the tokenizer.</param>
/// <param name="output">The one term expected from the filtered stream.</param>
private void check(string input, string output)
{
    StringReader text = new StringReader(input);

    // Third argument is passed as false, as in the original
    // (presumably the tokenizer's lower-casing switch — confirm against MockTokenizer).
    Tokenizer whitespaceTokens = new MockTokenizer(text, MockTokenizer.WHITESPACE, false);
    TokenFilter normalized = new IndicNormalizationFilter(whitespaceTokens);

    string[] expectedTerms = new[] { output };
    AssertTokenStreamContents(normalized, expectedTerms);
}