Inheritance: Lucene.Net.Analysis.Util.CharTokenizer
Example 1
0
        /// <summary>
        /// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the
        /// text in the provided <see cref="TextReader"/>.
        /// </summary>
        /// <returns> <see cref="TokenStreamComponents"/> built from a
        ///         <see cref="StandardTokenizer"/> filtered with
        ///         <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>,
        ///         <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/>
        ///         (only when a stem exclusion set was provided), and
        ///         <see cref="SnowballFilter"/> </returns>
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer source;
            TokenStream chain;
#pragma warning disable 612, 618
            // Pre-3.1 indexes used the deprecated RussianLetterTokenizer and no StandardFilter.
            bool useStandardChain = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
            if (useStandardChain)
            {
                source = new StandardTokenizer(m_matchVersion, reader);
                chain = new StandardFilter(m_matchVersion, source);
            }
            else
            {
#pragma warning disable 612, 618
                source = new RussianLetterTokenizer(m_matchVersion, reader);
#pragma warning restore 612, 618
                chain = source;
            }
            // Common tail: lower-case, stop words, optional keyword protection, stemming.
            chain = new LowerCaseFilter(m_matchVersion, chain);
            chain = new StopFilter(m_matchVersion, chain, m_stopwords);
            if (stemExclusionSet.Count > 0)
            {
                // Marked tokens are left untouched by the stemmer below.
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }
            chain = new SnowballFilter(chain, new Tartarus.Snowball.Ext.RussianStemmer());
            return new TokenStreamComponents(source, chain);
        }
        // Verifies the LUCENE_30 backward-compatible tokenizer output for a fixed
        // input containing digits, Cyrillic text, and a supplementary character.
        public virtual void TestRussianLetterTokenizerBWCompat()
        {
            var tokenizer = new RussianLetterTokenizer(
                LuceneVersion.LUCENE_30,
                new StringReader("1234567890 Вместе \ud801\udc1ctest"));

            AssertTokenStreamContents(tokenizer, new string[] { "1234567890", "Вместе", "test" });
        }
        // Compares the token stream produced by RussianAnalyzer over a UTF-8 input
        // file against the tokens produced by a bare RussianLetterTokenizer over a
        // pre-computed expected-results file, token by token.
        // NOTE(review): inWords and sampleUnicode are assigned without a local
        // declaration — presumably fields of the test class; confirm they exist.
        // NOTE(review): the @"ru\..." paths use Windows separators and are relative
        // to the working directory — verify this runs on the intended platform.
        public void TestUnicode()
        {
            RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);

            using (inWords = new StreamReader(@"ru\testUTF8.txt", Encoding.UTF8))
            using (sampleUnicode = new StreamReader(@"ru\resUTF8.txt", Encoding.UTF8))
            {

                TokenStream _in = ra.TokenStream("all", inWords);

                RussianLetterTokenizer sample =
                    new RussianLetterTokenizer(
                        sampleUnicode);

                ITermAttribute text = _in.GetAttribute<ITermAttribute>();
                ITermAttribute sampleText = sample.GetAttribute<ITermAttribute>();

                // Iterate until the analyzer stream is exhausted; each analyzer token
                // must match the next token of the expected-results stream.
                for (; ; )
                {
                    if (_in.IncrementToken() == false)
                        break;

                    // If the sample stream runs out first, compare against null so the
                    // assertion reports the surplus analyzer token as a mismatch.
                    bool nextSampleToken = sample.IncrementToken();
                    Assert.AreEqual(text.Term, nextSampleToken == false ? null : sampleText.Term, "Unicode");
                }
            }
        }
Example 4
0
        /// <summary>
        /// Creates a TokenStream which tokenizes all the text in the provided TextReader.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed (not used by this chain).</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>
        ///		A TokenStream built from a RussianLetterTokenizer filtered with
        ///     RussianLowerCaseFilter, StopFilter, and RussianStemFilter
        ///  </returns>
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            // Tokenize, lower-case, drop stop words, then stem — all charset-aware.
            TokenStream chain = new RussianLetterTokenizer(reader, charset);
            chain = new RussianLowerCaseFilter(chain, charset);
            chain = new StopFilter(chain, stoptable);
            return new RussianStemFilter(chain, charset);
        }
Example 5
0
        /// <summary>
        /// Creates a TokenStream which tokenizes all the text in the provided reader.
        /// </summary>
        /// <returns>
        /// A TokenStream built from a RussianLetterTokenizer filtered with
        /// LowerCaseFilter, StopFilter, and RussianStemFilter.
        /// </returns>
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            // Lower-case immediately on top of the letter tokenizer.
            TokenStream chain = new LowerCaseFilter(new RussianLetterTokenizer(reader));

            // Position-increment behavior of the stop filter follows matchVersion.
            chain = new StopFilter(
                StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                chain,
                stopSet);

            return new RussianStemFilter(chain);
        }
Example 6
0
 /// <summary>
 /// Creates a TokenStream which tokenizes all the text in the provided reader.
 /// </summary>
 /// <returns>
 /// A TokenStream built from a RussianLetterTokenizer filtered with
 /// LowerCaseFilter, StopFilter, and RussianStemFilter.
 /// </returns>
 public override TokenStream TokenStream(String fieldName, TextReader reader)
 {
     // Letter tokenization followed by lower-casing.
     TokenStream chain = new LowerCaseFilter(new RussianLetterTokenizer(reader));
     // Stop-word removal; position-increment default is driven by matchVersion.
     chain = new StopFilter(
         StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
         chain,
         stopSet);
     // Finally, Russian stemming.
     return new RussianStemFilter(chain);
 }
        /// <summary>
        /// Creates the
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>,
        ///         <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/>
        ///         (only if a stem exclusion set is provided), and
        ///         <seealso cref="SnowballFilter"/> </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
#pragma warning disable 612, 618
            bool useStandardTokenizer = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
            if (useStandardTokenizer)
            {
                var tokenizer = new StandardTokenizer(matchVersion, reader);
                TokenStream chain = new StandardFilter(matchVersion, tokenizer);
                chain = new LowerCaseFilter(matchVersion, chain);
                chain = new StopFilter(matchVersion, chain, stopwords);
                if (stemExclusionSet.Count > 0)
                {
                    // Protect excluded terms from the Snowball stemmer below.
                    chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
                }
                return new TokenStreamComponents(tokenizer,
                    new SnowballFilter(chain, new Tartarus.Snowball.Ext.RussianStemmer()));
            }

            // Legacy (pre-3.1) chain: deprecated tokenizer, no StandardFilter.
#pragma warning disable 612, 618
            Tokenizer legacySource = new RussianLetterTokenizer(matchVersion, reader);
#pragma warning restore 612, 618
            TokenStream legacyChain = new LowerCaseFilter(matchVersion, legacySource);
            legacyChain = new StopFilter(matchVersion, legacyChain, stopwords);
            if (stemExclusionSet.Count > 0)
            {
                legacyChain = new SetKeywordMarkerFilter(legacyChain, stemExclusionSet);
            }
            legacyChain = new SnowballFilter(legacyChain, new Tartarus.Snowball.Ext.RussianStemmer());
            return new TokenStreamComponents(legacySource, legacyChain);
        }
Example 8
0
		/// <summary>
		/// Creates a TokenStream which tokenizes all the text in the provided TextReader.
		/// </summary>
		/// <param name="fieldName">Name of the field being analyzed (not used by this chain).</param>
		/// <param name="reader">Source of the text to tokenize.</param>
		/// <returns>
		///		A TokenStream built from a RussianLetterTokenizer filtered with
		///     RussianLowerCaseFilter, StopFilter, and RussianStemFilter
		///  </returns>
		public override TokenStream TokenStream(String fieldName, TextReader reader)
		{
			// Innermost-to-outermost: tokenize, lower-case, remove stop words, stem.
			return new RussianStemFilter(
				new StopFilter(
					new RussianLowerCaseFilter(
						new RussianLetterTokenizer(reader, charset), charset),
					stoptable),
				charset);
		}
Example 9
0
 /// <summary>
 /// Tokenizes all text in <paramref name="reader"/>: letter tokenization,
 /// lower-casing, stop-word removal, then OpenCorpora-dictionary stemming.
 /// NOTE(review): the words dictionary is re-read from _directoryLocation on
 /// every call — consider caching it if this analyzer is invoked repeatedly.
 /// </summary>
 public override TokenStream TokenStream(string fieldName, TextReader reader)
 {
     // Load the OpenCorpora dictionary into memory for the stem filter.
     var wordsDictionary = new InMemoryWordsDictionary();
     new DictionaryReader().ProcessDictionary(
         _directoryLocation,
         new List<IDictionaryProcessor>() { wordsDictionary });

     TokenStream chain = new LowerCaseFilter(new RussianLetterTokenizer(reader));
     chain = new StopFilter(
         StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
         chain,
         stopSet);
     return new OpenCorporaRussianStemFilter(chain, wordsDictionary);
 }
 // Verifies the LUCENE_30 backward-compatible tokenizer output for a fixed
 // input containing digits, Cyrillic text, and a supplementary character.
 public virtual void TestRussianLetterTokenizerBWCompat()
 {
     var tokenizer = new RussianLetterTokenizer(
         LuceneVersion.LUCENE_30,
         new StringReader("1234567890 Вместе \ud801\udc1ctest"));
     AssertTokenStreamContents(tokenizer, new string[] { "1234567890", "Вместе", "test" });
 }