LowerCaseTokenizer performs the function of LetterTokenizer and LowerCaseFilter together. It divides text at non-letter characters and converts the letters to lower case. While it is functionally equivalent to the combination of LetterTokenizer and LowerCaseFilter, there is a performance advantage to doing the two tasks at once, hence this (redundant) implementation.

Note: this does a decent job for most European languages, but does a terrible job for some Asian languages, where words are not separated by spaces.

You must specify the required LuceneVersion compatibility when creating a LowerCaseTokenizer:

Example #1
0
        public virtual void TestLowerCaseTokenizer()
        {
            // The expected output shows the surrogate-pair character \ud801\udc1c
            // being lowercased to \ud801\udc44 under the current test version.
            var input = new StringReader("Tokenizer \ud801\udc1ctest");
            var lowerCaseTokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, input);
            string[] expectedTokens = { "tokenizer", "\ud801\udc44test" };

            AssertTokenStreamContents(lowerCaseTokenizer, expectedTokens);
        }
Example #2
0
        public virtual void TestLowerCaseTokenizerBWCompat()
        {
            // Under the LuceneVersion.LUCENE_30 compatibility mode the
            // surrogate-pair character \ud801\udc1c is not emitted, so the
            // second token is just "test" (contrast with TestLowerCaseTokenizer).
            var input = new StringReader("Tokenizer \ud801\udc1ctest");
            var legacyTokenizer = new LowerCaseTokenizer(LuceneVersion.LUCENE_30, input);
            string[] expectedTokens = { "tokenizer", "test" };

            AssertTokenStreamContents(legacyTokenizer, expectedTokens);
        }
Example #3
0
        /// <summary>
        /// Builds the <see cref="TokenStreamComponents"/> used to tokenize all
        /// the text in the provided <see cref="TextReader"/>.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed (unused here).</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns> <see cref="TokenStreamComponents"/> whose source is a
        ///         <see cref="LowerCaseTokenizer"/> wrapped in a
        ///         <see cref="StopFilter"/> </returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new LowerCaseTokenizer(m_matchVersion, reader);
            var stopFiltered = new StopFilter(m_matchVersion, tokenizer, m_stopwords);

            return new TokenStreamComponents(tokenizer, stopFiltered);
        }
Example #4
0
        /// <summary>
        /// Builds the <see cref="Analyzer.TokenStreamComponents"/> used to
        /// tokenize all the text in the provided <see cref="TextReader"/>.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed (unused here).</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns> <see cref="Analyzer.TokenStreamComponents"/> whose source is a
        ///         <see cref="LowerCaseTokenizer"/> wrapped in a
        ///         <see cref="StopFilter"/> </returns>
        public override Analyzer.TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new LowerCaseTokenizer(matchVersion, reader);
            var stopFiltered = new StopFilter(matchVersion, tokenizer, stopwords);

            return new Analyzer.TokenStreamComponents(tokenizer, stopFiltered);
        }
Example #5
0
	  /// <summary>
	  /// Builds the <see cref="Analyzer.TokenStreamComponents"/> used to
	  /// tokenize all the text in the provided <see cref="TextReader"/>.
	  /// </summary>
	  /// <param name="fieldName">Name of the field being analyzed (unused here).</param>
	  /// <param name="reader">Source of the text to tokenize.</param>
	  /// <returns> <see cref="Analyzer.TokenStreamComponents"/> whose source is a
	  ///         <see cref="LowerCaseTokenizer"/> wrapped in a
	  ///         <see cref="StopFilter"/> </returns>
	  public override Analyzer.TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
	  {
		Tokenizer tokenizer = new LowerCaseTokenizer(matchVersion, reader);
		var stopFiltered = new StopFilter(matchVersion, tokenizer, stopwords);
		return new Analyzer.TokenStreamComponents(tokenizer, stopFiltered);
	  }
Example #6
0
 public virtual void TestLowerCaseTokenizerBWCompat()
 {
     // With LuceneVersion.LUCENE_30 the surrogate-pair character \ud801\udc1c
     // is not emitted, so the second token is just "test".
     var input = new StringReader("Tokenizer \ud801\udc1ctest");
     var legacyTokenizer = new LowerCaseTokenizer(LuceneVersion.LUCENE_30, input);
     string[] expectedTokens = { "tokenizer", "test" };
     AssertTokenStreamContents(legacyTokenizer, expectedTokens);
 }
Example #7
0
 public virtual void TestLowerCaseTokenizer()
 {
     // The expected output shows the surrogate-pair character \ud801\udc1c
     // being lowercased to \ud801\udc44 under the current test version.
     var input = new StringReader("Tokenizer \ud801\udc1ctest");
     var lowerCaseTokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, input);
     string[] expectedTokens = { "tokenizer", "\ud801\udc44test" };
     AssertTokenStreamContents(lowerCaseTokenizer, expectedTokens);
 }