コード例 #1
0
        /// <summary>
        /// Creates a
        /// <see cref="TokenStreamComponents"/>
        /// which tokenizes all the text in the provided <see cref="TextReader"/>.
        /// </summary>
        /// <returns> A
        ///         <see cref="TokenStreamComponents"/>
        ///         built from an <see cref="StandardTokenizer"/> filtered with
        ///         <see cref="StandardFilter"/>, <see cref="ElisionFilter"/>, <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
        ///         <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided and <see cref="ItalianLightStemFilter"/>. </returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer   source = new StandardTokenizer(m_matchVersion, reader);
            TokenStream result = new StandardFilter(m_matchVersion, source);

#pragma warning disable 612, 618
            if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_32))
#pragma warning restore 612, 618
            {
                result = new ElisionFilter(result, DEFAULT_ARTICLES);
            }
            result = new LowerCaseFilter(m_matchVersion, result);
            result = new StopFilter(m_matchVersion, result, m_stopwords);
            if (stemExclusionSet.Count > 0)
            {
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
#pragma warning disable 612, 618
            if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
            {
                result = new ItalianLightStemFilter(result);
            }
            else
            {
                result = new SnowballFilter(result, new ItalianStemmer());
            }
            return(new TokenStreamComponents(source, result));
        }
コード例 #2
0
ファイル: ItalianAnalyzer.cs プロジェクト: zfxsss/lucenenet
        /// <summary>
        /// Creates a
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> A
        ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from an <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
        ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided and <seealso cref="ItalianLightStemFilter"/>. </returns>
        protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
        {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader);
            Tokenizer   source = new StandardTokenizer(matchVersion, reader);
            TokenStream result = new StandardFilter(matchVersion, source);

            if (matchVersion.onOrAfter(Version.LUCENE_32))
            {
                result = new ElisionFilter(result, DEFAULT_ARTICLES);
            }
            result = new LowerCaseFilter(matchVersion, result);
            result = new StopFilter(matchVersion, result, stopwords);
            if (!stemExclusionSet.Empty)
            {
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
            if (matchVersion.onOrAfter(Version.LUCENE_36))
            {
                result = new ItalianLightStemFilter(result);
            }
            else
            {
                result = new SnowballFilter(result, new ItalianStemmer());
            }
            return(new TokenStreamComponents(source, result));
        }
コード例 #3
0
 public virtual void TestElision_()
 {
     string test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
     Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(test));
     CharArraySet articles = new CharArraySet(TEST_VERSION_CURRENT, AsSet("l", "M"), false);
     TokenFilter filter = new ElisionFilter(tokenizer, articles);
     IList<string> tas = Filter(filter);
     assertEquals("embrouille", tas[4]);
     assertEquals("O'brian", tas[6]);
     assertEquals("enfin", tas[7]);
 }
コード例 #4
0
            protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer   source = new StandardTokenizer(Lucene.Net.Util.LuceneVersion.LUCENE_48, reader);
                TokenStream result = new StandardFilter(Lucene.Net.Util.LuceneVersion.LUCENE_48, source);

                result = new ElisionFilter(result, _defaultArticles);
                result = new LowerCaseFilter(Lucene.Net.Util.LuceneVersion.LUCENE_48, result);
                result = new StopFilter(Lucene.Net.Util.LuceneVersion.LUCENE_48, result, _defaultStopwords);
                result = new PermutermTokenFilter(result);
                return(new TokenStreamComponents(source, result));
            }
コード例 #5
0
ファイル: TestElision.cs プロジェクト: zfxsss/lucenenet
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testElision() throws Exception
        public virtual void testElision()
        {
            string         test      = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
            Tokenizer      tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(test));
            CharArraySet   articles  = new CharArraySet(TEST_VERSION_CURRENT, asSet("l", "M"), false);
            TokenFilter    filter    = new ElisionFilter(tokenizer, articles);
            IList <string> tas       = filter(filter);

            assertEquals("embrouille", tas[4]);
            assertEquals("O'brian", tas[6]);
            assertEquals("enfin", tas[7]);
        }
コード例 #6
0
        public void TestElision2()
        {
            String           test      = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
            Tokenizer        tokenizer = new StandardTokenizer(Version.LUCENE_CURRENT, new StringReader(test));
            HashSet <String> articles  = new HashSet <String>();

            articles.Add("l");
            articles.Add("M");
            TokenFilter   filter = new ElisionFilter(tokenizer, articles);
            List <string> tas    = Filtre(filter);

            Assert.AreEqual("embrouille", tas[4]);
            Assert.AreEqual("O'brian", tas[6]);
            Assert.AreEqual("enfin", tas[7]);
        }
コード例 #7
0
        /// <summary>
        /// Creates a
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> A
        ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from an <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>, <seealso cref="LowerCaseFilter"/>,
        ///         <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided and <seealso cref="SnowballFilter"/>. </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer   source = new StandardTokenizer(matchVersion, reader);
            TokenStream result = new StandardFilter(matchVersion, source);

            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
            {
                result = new ElisionFilter(result, DEFAULT_ARTICLES);
            }
            result = new LowerCaseFilter(matchVersion, result);
            result = new StopFilter(matchVersion, result, stopwords);
            if (stemExclusionSet.Count > 0)
            {
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
            result = new SnowballFilter(result, new CatalanStemmer());
            return(new TokenStreamComponents(source, result));
        }
コード例 #8
0
        /// <summary>
        /// Creates
        /// <see cref="TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <see cref="TextReader"/>.
        /// </summary>
        /// <returns> <see cref="TokenStreamComponents"/>
        ///         built from a <see cref="StandardTokenizer"/> filtered with
        ///         <see cref="StandardFilter"/>, <see cref="ElisionFilter"/>,
        ///         <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
        ///         <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided, and <see cref="FrenchLightStemFilter"/> </returns>
        ///
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
#pragma warning disable 612, 618
            if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
            {
                Tokenizer   source = new StandardTokenizer(m_matchVersion, reader);
                TokenStream result = new StandardFilter(m_matchVersion, source);
                result = new ElisionFilter(result, DEFAULT_ARTICLES);
                result = new LowerCaseFilter(m_matchVersion, result);
                result = new StopFilter(m_matchVersion, result, m_stopwords);
                if (excltable.Count > 0)
                {
                    result = new SetKeywordMarkerFilter(result, excltable);
                }
#pragma warning disable 612, 618
                if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
                {
                    result = new FrenchLightStemFilter(result);
                }
                else
                {
                    result = new SnowballFilter(result, new Tartarus.Snowball.Ext.FrenchStemmer());
                }
                return(new TokenStreamComponents(source, result));
            }
            else
            {
                Tokenizer   source = new StandardTokenizer(m_matchVersion, reader);
                TokenStream result = new StandardFilter(m_matchVersion, source);
                result = new StopFilter(m_matchVersion, result, m_stopwords);
                if (excltable.Count > 0)
                {
                    result = new SetKeywordMarkerFilter(result, excltable);
                }
#pragma warning disable 612, 618
                result = new FrenchStemFilter(result);
#pragma warning restore 612, 618
                // Convert to lowercase after stemming!
                return(new TokenStreamComponents(source, new LowerCaseFilter(m_matchVersion, result)));
            }
        }
コード例 #9
0
        /// <summary>
        /// Creates a
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> A
        ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from an <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="IrishLowerCaseFilter"/>, <seealso cref="StopFilter"/>
        ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided and <seealso cref="SnowballFilter"/>. </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer   source = new StandardTokenizer(matchVersion, reader);
            TokenStream result = new StandardFilter(matchVersion, source);
            StopFilter  s      = new StopFilter(matchVersion, result, HYPHENATIONS);

            if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_44))
            {
                s.EnablePositionIncrements = false;
            }
            result = s;
            result = new ElisionFilter(result, DEFAULT_ARTICLES);
            result = new IrishLowerCaseFilter(result);
            result = new StopFilter(matchVersion, result, stopwords);
            if (stemExclusionSet.Count > 0)
            {
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
            result = new SnowballFilter(result, new IrishStemmer());
            return(new TokenStreamComponents(source, result));
        }
コード例 #10
0
	  /// <summary>
	  /// Creates a
	  /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
	  /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
	  /// </summary>
	  /// <returns> A
	  ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
	  ///         built from an <seealso cref="StandardTokenizer"/> filtered with
	  ///         <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>, <seealso cref="LowerCaseFilter"/>, 
	  ///         <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
	  ///         provided and <seealso cref="SnowballFilter"/>. </returns>
	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
	  {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader);
		Tokenizer source = new StandardTokenizer(matchVersion, reader);
		TokenStream result = new StandardFilter(matchVersion, source);
		if (matchVersion.onOrAfter(Version.LUCENE_36))
		{
		  result = new ElisionFilter(result, DEFAULT_ARTICLES);
		}
		result = new LowerCaseFilter(matchVersion, result);
		result = new StopFilter(matchVersion, result, stopwords);
		if (!stemExclusionSet.Empty)
		{
		  result = new SetKeywordMarkerFilter(result, stemExclusionSet);
		}
		result = new SnowballFilter(result, new CatalanStemmer());
		return new TokenStreamComponents(source, result);
	  }
コード例 #11
0
        /// <summary>
        /// Creates
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>,
        ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
        ///         <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided, and <seealso cref="FrenchLightStemFilter"/> </returns>
        ///         
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
            {
                Tokenizer source = new StandardTokenizer(matchVersion, reader);
                TokenStream result = new StandardFilter(matchVersion, source);
                result = new ElisionFilter(result, DEFAULT_ARTICLES);
                result = new LowerCaseFilter(matchVersion, result);
                result = new StopFilter(matchVersion, result, stopwords);
                if (excltable.Count > 0)
                {
                    result = new SetKeywordMarkerFilter(result, excltable);
                }
#pragma warning disable 612, 618
                if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
                {
                    result = new FrenchLightStemFilter(result);
                }
                else
                {
                    result = new SnowballFilter(result, new Tartarus.Snowball.Ext.FrenchStemmer());
                }
                return new TokenStreamComponents(source, result);
            }
            else
            {
                Tokenizer source = new StandardTokenizer(matchVersion, reader);
                TokenStream result = new StandardFilter(matchVersion, source);
                result = new StopFilter(matchVersion, result, stopwords);
                if (excltable.Count > 0)
                {
                    result = new SetKeywordMarkerFilter(result, excltable);
                }
#pragma warning disable 612, 618
                result = new FrenchStemFilter(result);
#pragma warning restore 612, 618
                // Convert to lowercase after stemming!
                return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
            }
        }
コード例 #12
0
ファイル: FrenchAnalyzer.cs プロジェクト: Cefa68000/lucenenet
	  /// <summary>
	  /// Creates
	  /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
	  /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
	  /// </summary>
	  /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
	  ///         built from a <seealso cref="StandardTokenizer"/> filtered with
	  ///         <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>,
	  ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
	  ///         <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
	  ///         provided, and <seealso cref="FrenchLightStemFilter"/> </returns>
	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
	  {
		if (matchVersion.onOrAfter(Version.LUCENE_31))
		{
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader);
		  Tokenizer source = new StandardTokenizer(matchVersion, reader);
		  TokenStream result = new StandardFilter(matchVersion, source);
		  result = new ElisionFilter(result, DEFAULT_ARTICLES);
		  result = new LowerCaseFilter(matchVersion, result);
		  result = new StopFilter(matchVersion, result, stopwords);
		  if (!excltable.Empty)
		  {
			result = new SetKeywordMarkerFilter(result, excltable);
		  }
		  if (matchVersion.onOrAfter(Version.LUCENE_36))
		  {
			result = new FrenchLightStemFilter(result);
		  }
		  else
		  {
			result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer());
		  }
		  return new TokenStreamComponents(source, result);
		}
		else
		{
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, reader);
		  Tokenizer source = new StandardTokenizer(matchVersion, reader);
		  TokenStream result = new StandardFilter(matchVersion, source);
		  result = new StopFilter(matchVersion, result, stopwords);
		  if (!excltable.Empty)
		  {
			result = new SetKeywordMarkerFilter(result, excltable);
		  }
		  result = new FrenchStemFilter(result);
		  // Convert to lowercase after stemming!
		  return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
		}
	  }