A TokenFilter that applies HindiNormalizer to normalize the orthography.

In some cases the normalization may cause unrelated terms to conflate. To prevent specific terms from being normalized, place an instance of SetKeywordMarkerFilter, or a custom TokenFilter that sets the KeywordAttribute, before this TokenStream.

상속: TokenFilter
예제 #1
0
        /// <summary>
        /// Builds the <see cref="Analyzer.TokenStreamComponents"/> that tokenize and
        /// filter the text supplied by <paramref name="reader"/>.
        /// </summary>
        /// <param name="fieldName"> Name of the field being analyzed; this method does not read it. </param>
        /// <param name="reader"> The <see cref="TextReader"/> providing the text to tokenize. </param>
        /// <returns> <see cref="Analyzer.TokenStreamComponents"/> whose tokenizer is a
        ///         <see cref="StandardTokenizer"/> when the match version is on or after
        ///         <c>LUCENE_36</c> (otherwise an <c>IndicTokenizer</c>), wrapped in this order by
        ///         <see cref="LowerCaseFilter"/>, <see cref="SetKeywordMarkerFilter"/> (only when
        ///         the stem exclusion set is non-empty), <see cref="IndicNormalizationFilter"/>,
        ///         <see cref="HindiNormalizationFilter"/>, a <see cref="StopFilter"/> using the
        ///         configured stop words, and finally <see cref="HindiStemFilter"/>. </returns>
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            // CS0612/CS0618 are the "member is obsolete" warnings; they are silenced
            // only around the version-dependent tokenizer selection.
#pragma warning disable 612, 618
            Tokenizer source = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36)
                ? (Tokenizer)new StandardTokenizer(m_matchVersion, reader)
                : new IndicTokenizer(m_matchVersion, reader);
#pragma warning restore 612, 618

            TokenStream chain = new LowerCaseFilter(m_matchVersion, source);

            // The keyword marker is inserted ahead of the normalization and stemming
            // filters, and only when there is at least one excluded term.
            bool hasExclusions = stemExclusionSet.Count > 0;
            if (hasExclusions)
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }

            // Innermost filter runs first: Indic normalization, Hindi normalization,
            // stop-word removal, then stemming.
            chain = new HindiStemFilter(
                new StopFilter(
                    m_matchVersion,
                    new HindiNormalizationFilter(new IndicNormalizationFilter(chain)),
                    m_stopwords));

            return new TokenStreamComponents(source, chain);
        }
예제 #2
0
        /// <summary>
        /// Feeds <paramref name="input"/> through a whitespace-mode <see cref="MockTokenizer"/>
        /// wrapped in a <see cref="HindiNormalizationFilter"/> and asserts the resulting
        /// stream against the single expected term <paramref name="output"/>.
        /// </summary>
        /// <param name="input"> Text handed to the tokenizer. </param>
        /// <param name="output"> The one term the filtered stream is expected to contain. </param>
        private void Check(string input, string output)
        {
            string[] expectedTerms = { output };

            Tokenizer whitespaceSource = new MockTokenizer(
                new StringReader(input),
                MockTokenizer.WHITESPACE,
                false);
            TokenFilter normalized = new HindiNormalizationFilter(whitespaceSource);

            AssertTokenStreamContents(normalized, expectedTerms);
        }
 /// <summary>
 /// Runs <paramref name="input"/> through a <see cref="HindiNormalizationFilter"/> placed
 /// over a whitespace-mode <see cref="MockTokenizer"/> and asserts the stream contents
 /// against a one-element array holding <paramref name="output"/>.
 /// </summary>
 /// <param name="input"> Raw text to tokenize. </param>
 /// <param name="output"> Expected term after normalization. </param>
 private void Check(string input, string output)
 {
     StringReader text = new StringReader(input);
     Tokenizer src = new MockTokenizer(text, MockTokenizer.WHITESPACE, false);

     // Wrap the tokenizer in the filter under test and compare against the expected term.
     TokenFilter filterUnderTest = new HindiNormalizationFilter(src);
     AssertTokenStreamContents(filterUnderTest, new string[] { output });
 }