/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> that tokenize and filter
/// everything read from <paramref name="reader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// When <c>m_matchVersion</c> is on or after <c>LUCENE_31</c>: a
/// <see cref="StandardTokenizer"/> followed by <see cref="StandardFilter"/>.
/// Otherwise: a <c>RussianLetterTokenizer</c> on its own. In both cases the
/// stream then passes through <see cref="LowerCaseFilter"/>,
/// <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> (only when the
/// stem exclusion set is non-empty) and <see cref="SnowballFilter"/>.
/// </returns>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer;
    TokenStream chain;

    // Only the head of the chain depends on the match version.
#pragma warning disable 612, 618
    if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
    {
        tokenizer = new StandardTokenizer(m_matchVersion, reader);
        chain = new StandardFilter(m_matchVersion, tokenizer);
    }
    else
    {
#pragma warning disable 612, 618
        tokenizer = new RussianLetterTokenizer(m_matchVersion, reader);
#pragma warning restore 612, 618
        chain = tokenizer;
    }

    // The remaining filters are identical for both branches.
    chain = new LowerCaseFilter(m_matchVersion, chain);
    chain = new StopFilter(m_matchVersion, chain, m_stopwords);

    if (stemExclusionSet.Count > 0)
    {
        // Terms in the exclusion set are marked as keywords before stemming.
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }

    chain = new SnowballFilter(chain, new Tartarus.Snowball.Ext.RussianStemmer());
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Feeds text containing digits, a Cyrillic word and a surrogate pair directly
/// followed by "test" into a <c>RussianLetterTokenizer</c> created with
/// <c>LuceneVersion.LUCENE_30</c>, and asserts that the emitted tokens are
/// "1234567890", "Вместе" and "test".
/// </summary>
public virtual void TestRussianLetterTokenizerBWCompat()
{
    string[] expectedTokens = { "1234567890", "Вместе", "test" };

    var input = new StringReader("1234567890 Вместе \ud801\udc1ctest");
    var letterTokenizer = new RussianLetterTokenizer(LuceneVersion.LUCENE_30, input);

    AssertTokenStreamContents(letterTokenizer, expectedTokens);
}
/// <summary>
/// Runs <c>RussianAnalyzer</c> over <c>ru\testUTF8.txt</c> and compares each
/// produced term with the next token obtained by running a
/// <c>RussianLetterTokenizer</c> over <c>ru\resUTF8.txt</c>.
/// </summary>
public void TestUnicode()
{
    RussianAnalyzer analyzer = new RussianAnalyzer(Version.LUCENE_CURRENT);

    using (inWords = new StreamReader(@"ru\testUTF8.txt", Encoding.UTF8))
    using (sampleUnicode = new StreamReader(@"ru\resUTF8.txt", Encoding.UTF8))
    {
        TokenStream analyzedTokens = analyzer.TokenStream("all", inWords);
        RussianLetterTokenizer referenceTokens = new RussianLetterTokenizer(sampleUnicode);

        ITermAttribute analyzedTerm = analyzedTokens.GetAttribute<ITermAttribute>();
        ITermAttribute referenceTerm = referenceTokens.GetAttribute<ITermAttribute>();

        // Walk the analyzer output; the reference stream is advanced in lock-step.
        while (analyzedTokens.IncrementToken())
        {
            bool referenceHasToken = referenceTokens.IncrementToken();

            // If the reference stream is exhausted, the comparison is against null.
            Assert.AreEqual(analyzedTerm.Term, referenceHasToken ? referenceTerm.Term : null, "Unicode");
        }
    }
}
/// <summary>
/// Builds the analysis chain for the text supplied by <paramref name="reader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <c>RussianLetterTokenizer</c> wrapped, in order, by
/// <c>RussianLowerCaseFilter</c>, <c>StopFilter</c> and <c>RussianStemFilter</c>.
/// </returns>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    // Each stage wraps the previous one; the charset is handed to every
    // Russian-specific component.
    TokenStream letters = new RussianLetterTokenizer(reader, charset);
    TokenStream lowerCased = new RussianLowerCaseFilter(letters, charset);
    TokenStream withoutStopWords = new StopFilter(lowerCased, stoptable);
    TokenStream stemmed = new RussianStemFilter(withoutStopWords, charset);

    return stemmed;
}
/// <summary>
/// Builds the analysis chain for the text supplied by <paramref name="reader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <c>RussianLetterTokenizer</c> wrapped, in order, by <c>LowerCaseFilter</c>,
/// <c>StopFilter</c> and <c>RussianStemFilter</c>.
/// </returns>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream letters = new RussianLetterTokenizer(reader);
    TokenStream lowerCased = new LowerCaseFilter(letters);

    // The position-increment setting is derived from matchVersion.
    bool enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
    TokenStream withoutStopWords = new StopFilter(enablePositionIncrements, lowerCased, stopSet);

    return new RussianStemFilter(withoutStopWords);
}
/// <summary>
/// Creates the token stream used to analyze the text read from
/// <paramref name="reader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <c>RussianLetterTokenizer</c> filtered with <c>LowerCaseFilter</c>,
/// <c>StopFilter</c> and finally <c>RussianStemFilter</c>.
/// </returns>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    // Stage 1: split into tokens and lower-case them.
    TokenStream tokenized = new RussianLetterTokenizer(reader);
    TokenStream normalized = new LowerCaseFilter(tokenized);

    // Stage 2: drop stop words, using the position-increment default for matchVersion.
    bool keepPositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
    TokenStream pruned = new StopFilter(keepPositionIncrements, normalized, stopSet);

    // Stage 3: stem what remains.
    TokenStream stemmed = new RussianStemFilter(pruned);
    return stemmed;
}
/// <summary>
/// Builds the <see cref="TokenStreamComponents"/> that tokenize and filter
/// everything read from <paramref name="reader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// When <c>matchVersion</c> is on or after <c>LUCENE_31</c>: a
/// <see cref="StandardTokenizer"/> followed by <see cref="StandardFilter"/>.
/// Otherwise: a <c>RussianLetterTokenizer</c> on its own. In both cases the
/// stream then passes through <see cref="LowerCaseFilter"/>,
/// <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> (only when the
/// stem exclusion set is non-empty) and <see cref="SnowballFilter"/>.
/// </returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer head;
    TokenStream pipeline;

    // Select the tokenizer (and the StandardFilter, where applicable) by version.
#pragma warning disable 612, 618
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
    {
        head = new StandardTokenizer(matchVersion, reader);
        pipeline = new StandardFilter(matchVersion, head);
    }
    else
    {
#pragma warning disable 612, 618
        head = new RussianLetterTokenizer(matchVersion, reader);
#pragma warning restore 612, 618
        pipeline = head;
    }

    // Filters common to both version branches.
    pipeline = new LowerCaseFilter(matchVersion, pipeline);
    pipeline = new StopFilter(matchVersion, pipeline, stopwords);

    if (stemExclusionSet.Count > 0)
    {
        // Terms in the exclusion set are marked as keywords before stemming.
        pipeline = new SetKeywordMarkerFilter(pipeline, stemExclusionSet);
    }

    pipeline = new SnowballFilter(pipeline, new Tartarus.Snowball.Ext.RussianStemmer());
    return new TokenStreamComponents(head, pipeline);
}
/// <summary>
/// Creates the token stream used to analyze the text read from
/// <paramref name="reader"/>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <c>RussianLetterTokenizer</c> filtered with <c>RussianLowerCaseFilter</c>,
/// <c>StopFilter</c> and finally <c>RussianStemFilter</c>.
/// </returns>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    // Tokenize, then lower-case; both steps receive the configured charset.
    TokenStream tokenized = new RussianLetterTokenizer(reader, charset);
    TokenStream normalized = new RussianLowerCaseFilter(tokenized, charset);

    // Remove stop words before stemming.
    TokenStream pruned = new StopFilter(normalized, stoptable);

    return new RussianStemFilter(pruned, charset);
}
/// <summary>
/// Builds the analysis chain for the text supplied by <paramref name="reader"/>,
/// stemming with a dictionary that is populated from <c>_directoryLocation</c>.
/// </summary>
/// <param name="fieldName">Name of the field being analyzed; not consulted by this method.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>
/// A <c>RussianLetterTokenizer</c> wrapped, in order, by <c>LowerCaseFilter</c>,
/// <c>StopFilter</c> and <c>OpenCorporaRussianStemFilter</c>.
/// </returns>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // NOTE(review): a new dictionary is created and filled on every call;
    // presumably this is expensive - confirm whether it could be shared.
    var loader = new DictionaryReader();
    var wordsDictionary = new InMemoryWordsDictionary();
    var processors = new List<IDictionaryProcessor> { wordsDictionary };
    loader.ProcessDictionary(_directoryLocation, processors);

    TokenStream letters = new RussianLetterTokenizer(reader);
    TokenStream lowerCased = new LowerCaseFilter(letters);

    // The position-increment setting is derived from matchVersion.
    bool enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
    TokenStream withoutStopWords = new StopFilter(enablePositionIncrements, lowerCased, stopSet);

    return new OpenCorporaRussianStemFilter(withoutStopWords, wordsDictionary);
}