//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testHugeDoc() throws java.io.IOException
	  /// <summary>
	  /// Feeds the tokenizer 4094 spaces followed by "testing 1234" and asserts that
	  /// exactly the tokens "testing" and "1234" come out.
	  /// </summary>
	  public virtual void testHugeDoc()
	  {
		StringBuilder sb = new StringBuilder();
		// Java's Arrays.fill has no .NET counterpart; StringBuilder can append a
		// repeated character directly, which yields the same 4094 leading spaces.
		sb.Append(' ', 4094);
		sb.Append("testing 1234");
		string input = sb.ToString();
		StandardTokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
		BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new string[] {"testing", "1234"});
	  }
Example #2
0
 /// <summary>
 /// Runs a <c>StandardTokenizer</c> over a fixed Chinese sample phrase and writes
 /// every token to the console as <c>text/type</c> followed by a space.
 /// </summary>
 /// <param name="args">
 /// Optional. When at least one argument is given, the first one is used as the
 /// dawg file location instead of the built-in default.
 /// </param>
 static void Main(string[] args)
 {
     string dawgFile;
     if (args.Length > 0)
     {
         dawgFile = args[0];
     }
     else
     {
         dawgFile = "https://raw.githubusercontent.com/yamool/CWSharp/master/data/cwsharp.dawg";
     }
     //BuildDawgFile(dawgFile);

     var segmenter = new StandardTokenizer(dawgFile);
     foreach (var item in segmenter.Traverse("研究生命起源"))
     {
         Console.Write(item.Text + "/" + item.Type + " ");
     }

     // Wait for a line of input before returning.
     Console.ReadLine();
 }
Example #3
0
 /// <summary>
 /// Runs a <c>StandardTokenizer</c>, built from a hard-coded dawg file path, over a
 /// fixed Chinese sample phrase and writes every token to the console as
 /// <c>text/type</c> followed by a space.
 /// </summary>
 static void Main()
 {
     const string dawgFile = @"d:\dict.dawg";
     //BuildDawgFile(dawgFile);

     var segmenter = new StandardTokenizer(dawgFile);
     foreach (var item in segmenter.Traverse("研究生命起源"))
     {
         Console.Write(item.Text + "/" + item.Type + " ");
     }

     // Wait for a line of input before returning.
     Console.ReadLine();
 }
Example #4
0
        /// <summary>
        /// Opens the dawg file named by the first command-line argument, runs a
        /// <c>StandardTokenizer</c> over a fixed Chinese sample phrase and writes every
        /// token to the console as <c>text/type</c> followed by a space.
        /// </summary>
        /// <param name="args">Command-line arguments; the first one is the dawg file path.</param>
        static void Main(string[] args)
        {
            // Without this guard a missing argument surfaced as an IndexOutOfRangeException.
            if (args == null || args.Length == 0)
            {
                Console.Error.WriteLine("usage: <program> <dawg-file>");
                return;
            }

            var dawgFile = args[0];
            Console.WriteLine("reading draw file: " + dawgFile);

            // The stream is disposed as soon as the tokens have been printed.
            using (var stream = new FileStream(dawgFile, FileMode.Open, FileAccess.Read))
            {
                var tokenizer = new StandardTokenizer(stream);
                foreach (var token in tokenizer.Traverse("研究生命起源"))
                {
                    Console.Write(token.Text + "/" + token.Type);
                    Console.Write(" ");
                }
            }
        }
Example #5
0
        /// <summary>
        /// Creates the GitHub client, the per-field analyzer, the multi-field query
        /// parser, the index writer and the searcher manager used by this index.
        /// </summary>
        /// <param name="indexDirectory">Directory passed to the <see cref="IndexWriter"/>.</param>
        /// <param name="credentials">Credentials assigned to the GitHub client.</param>
        public GitHubIndex(Directory indexDirectory, Credentials credentials)
        {
            var client = new GitHubClient(new ProductHeaderValue("LuceneNetDemo"));
            client.Credentials = credentials;
            github = client;

            // Pre-built custom analyzer used for every field without its own entry below.
            var fallbackAnalyzer = new HtmlStripAnalyzer(GitHubIndex.MatchVersion);

            // "owner": the whole value is one token, ASCII-folded and lower-cased.
            var ownerAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                var tokenizer = new KeywordTokenizer(reader);
                TokenStream chain = new LowerCaseFilter(
                    GitHubIndex.MatchVersion,
                    new ASCIIFoldingFilter(tokenizer));
                return new TokenStreamComponents(tokenizer, chain);
            });

            // "name": standard tokenization, word-delimiter splitting with every flag
            // except STEM_ENGLISH_POSSESSIVE, then ASCII folding and lower-casing.
            var nameAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
                var tokenizer = new StandardTokenizer(GitHubIndex.MatchVersion, reader);
                TokenStream chain = new WordDelimiterFilter(
                    GitHubIndex.MatchVersion,
                    tokenizer,
                    ~WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE,
                    CharArraySet.EMPTY_SET);
                chain = new LowerCaseFilter(
                    GitHubIndex.MatchVersion,
                    new ASCIIFoldingFilter(chain));
                return new TokenStreamComponents(tokenizer, chain);
            });

            var perField = new Dictionary<string, Analyzer>();
            perField.Add("owner", ownerAnalyzer);
            perField.Add("name", nameAnalyzer);

            analyzer = new PerFieldAnalyzerWrapper(
                defaultAnalyzer: fallbackAnalyzer,
                fieldAnalyzers: perField);

            var searchedFields = new[] { "name", "description", "readme" };
            queryParser = new MultiFieldQueryParser(GitHubIndex.MatchVersion, searchedFields, analyzer);

            var writerConfig = new IndexWriterConfig(GitHubIndex.MatchVersion, analyzer);
            indexWriter     = new IndexWriter(indexDirectory, writerConfig);
            searcherManager = new SearcherManager(indexWriter, true, null);
        }
Example #6
0
        /// <summary>
        /// Builds the tokenizer and filter chain applied to the text read from
        /// <paramref name="reader"/>.
        /// </summary>
        /// <param name="fieldName">Field name; not consulted by this implementation.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>
        /// Components made of a <see cref="StandardTokenizer"/> wrapped, in order, by
        /// <see cref="StandardFilter"/>, <see cref="SoraniNormalizationFilter"/>,
        /// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
        /// <see cref="SetKeywordMarkerFilter"/> (only when the stem exclusion set is
        /// not empty) and <see cref="SoraniStemFilter"/>.
        /// </returns>
        protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);

            TokenStream chain = new SoraniNormalizationFilter(new StandardFilter(matchVersion, tokenizer));
            chain = new StopFilter(matchVersion, new LowerCaseFilter(matchVersion, chain), stopwords);

            // Keyword marking is only inserted when there is something to exclude from stemming.
            bool hasExclusions = !stemExclusionSet.Empty;
            if (hasExclusions)
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }

            return new TokenStreamComponents(tokenizer, new SoraniStemFilter(chain));
        }
        /// <summary>
        /// Builds an n-gram token stream over the text read from <paramref name="reader"/>.
        /// </summary>
        /// <param name="fieldName">Field name; not consulted by this implementation.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>
        /// A <see cref="StandardTokenizer"/> wrapped by <see cref="StandardFilter"/>,
        /// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/> (English stop-word set)
        /// and finally an <see cref="NGramTokenFilter"/> using the configured gram sizes.
        /// </returns>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // Base tokenization, followed by the standard filter.
            TokenStream chain = new StandardFilter(new StandardTokenizer(_version, reader));

            // Lower-case every token, then drop English stop words.
            chain = new StopFilter(true, new LowerCaseFilter(chain), StopAnalyzer.ENGLISH_STOP_WORDS_SET);

            // Emit n-grams between _minGram and _maxGram for each remaining token.
            return new NGramTokenFilter(chain, _minGram, _maxGram);
        }
Example #8
0
        /// <summary>
        /// Builds the tokenizer and filter chain applied to the text read from
        /// <paramref name="reader"/>.
        /// </summary>
        /// <param name="fieldName">Field name; not consulted by this implementation.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>
        /// Components made of a <see cref="StandardTokenizer"/> wrapped by
        /// <see cref="GreekLowerCaseFilter"/>, <see cref="StandardFilter"/> (match version
        /// LUCENE_31 or later only), <see cref="StopFilter"/> and
        /// <see cref="GreekStemFilter"/> (match version LUCENE_31 or later only).
        /// </returns>
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer   tokenizer = new StandardTokenizer(m_matchVersion, reader);
            TokenStream chain     = new GreekLowerCaseFilter(m_matchVersion, tokenizer);

            // CS0612/CS0618 (obsolete member) warnings are suppressed for the version-gated section.
#pragma warning disable 612, 618
            bool atLeast31 = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
            if (atLeast31)
            {
                chain = new StandardFilter(m_matchVersion, chain);
            }
            chain = new StopFilter(m_matchVersion, chain, m_stopwords);
#pragma warning restore 612, 618
            if (atLeast31)
            {
                chain = new GreekStemFilter(chain);
            }

            return new TokenStreamComponents(tokenizer, chain);
        }
Example #9
0
        /// <summary>
        /// Builds the tokenizer and filter chain applied to the text read from
        /// <paramref name="reader"/>.
        /// </summary>
        /// <param name="fieldName">Field name; not consulted by this implementation.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>
        /// Components made of a <see cref="StandardTokenizer"/> wrapped, in order, by
        /// <see cref="StandardFilter"/>, <see cref="ElisionFilter"/> (match version
        /// LUCENE_36 or later only), <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
        /// <see cref="SetKeywordMarkerFilter"/> (only when the stem exclusion set has
        /// entries) and a <see cref="SnowballFilter"/> using <see cref="CatalanStemmer"/>.
        /// </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer   tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream chain     = new StandardFilter(matchVersion, tokenizer);

            // Elision handling is gated on the match version.
            bool useElision = matchVersion.OnOrAfter(LuceneVersion.LUCENE_36);
            if (useElision)
            {
                chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
            }

            chain = new StopFilter(matchVersion, new LowerCaseFilter(matchVersion, chain), stopwords);

            // Keyword marking is only inserted when there is something to exclude from stemming.
            if (stemExclusionSet.Count > 0)
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }

            return new TokenStreamComponents(tokenizer, new SnowballFilter(chain, new CatalanStemmer()));
        }
        /// <summary>
        /// Builds the tokenizer and filter chain applied to the text read from
        /// <paramref name="reader"/>.
        /// </summary>
        /// <param name="fieldName">Field name; not consulted by this implementation.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>
        /// Components made of a <see cref="StandardTokenizer"/> wrapped, in order, by
        /// <see cref="GreekLowerCaseFilter"/>, <see cref="StandardFilter"/> (match version
        /// LUCENE_48 or later only), <see cref="StopFilter"/>, <see cref="GreekStemFilter"/>
        /// (match version LUCENE_48 or later only), <see cref="GreekPhoneticFilter"/> and
        /// <see cref="GreekAccentFilter"/>.
        /// </returns>
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer   tokenizer = new StandardTokenizer(m_matchVersion, reader);
            TokenStream chain     = new GreekLowerCaseFilter(m_matchVersion, tokenizer);

            // The same version gate controls both the standard filter and the stemmer.
            bool atLeast48 = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_48);

            if (atLeast48)
            {
                chain = new StandardFilter(m_matchVersion, chain);
            }
            chain = new StopFilter(m_matchVersion, chain, m_stopwords);
            if (atLeast48)
            {
                chain = new GreekStemFilter(chain);
            }

            // The phonetic and accent filters always run last, in this order.
            chain = new GreekAccentFilter(new GreekPhoneticFilter(chain));
            return new TokenStreamComponents(tokenizer, chain);
        }
Example #11
0
        /// <summary>
        /// Builds the tokenizer and filter chain applied to the text read from
        /// <paramref name="reader"/>.
        /// </summary>
        /// <param name="fieldName">Field name; not consulted by this implementation.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>
        /// Components made of a <see cref="StandardTokenizer"/> wrapped by
        /// <see cref="GreekLowerCaseFilter"/>, <see cref="StandardFilter"/> (match version
        /// LUCENE_31 or later only), <see cref="StopFilter"/> and
        /// <see cref="GreekStemFilter"/> (match version LUCENE_31 or later only).
        /// </returns>
        protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
        {
            Tokenizer   tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream chain     = new GreekLowerCaseFilter(matchVersion, tokenizer);

            // The same version gate controls both the standard filter and the stemmer.
            bool atLeast31 = matchVersion.onOrAfter(Version.LUCENE_31);

            if (atLeast31)
            {
                chain = new StandardFilter(matchVersion, chain);
            }
            chain = new StopFilter(matchVersion, chain, stopwords);
            if (atLeast31)
            {
                chain = new GreekStemFilter(chain);
            }

            return new TokenStreamComponents(tokenizer, chain);
        }
Example #12
0
        /// <summary>
        /// Builds the tokenizer and filter chain applied to the text read from
        /// <paramref name="reader"/>.
        /// </summary>
        /// <param name="fieldName">Field name; not consulted by this implementation.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>
        /// Components made of a <see cref="StandardTokenizer"/> wrapped, in order, by
        /// <see cref="StandardFilter"/>, <see cref="ApostropheFilter"/> (match version
        /// LUCENE_48 or later only), <see cref="TurkishLowerCaseFilter"/>,
        /// <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> (only when the
        /// stem exclusion set has entries) and a <see cref="SnowballFilter"/> using
        /// <see cref="TurkishStemmer"/>.
        /// </returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer   tokenizer = new StandardTokenizer(m_matchVersion, reader);
            TokenStream chain     = new StandardFilter(m_matchVersion, tokenizer);

            // Apostrophe handling is gated on the match version.
            bool useApostrophe = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_48);
            if (useApostrophe)
            {
                chain = new ApostropheFilter(chain);
            }

            chain = new StopFilter(m_matchVersion, new TurkishLowerCaseFilter(chain), m_stopwords);

            // Keyword marking is only inserted when there is something to exclude from stemming.
            if (stemExclusionSet.Any())
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }

            return new TokenStreamComponents(tokenizer, new SnowballFilter(chain, new TurkishStemmer()));
        }
    /// <summary>
    /// Builds a token stream over <paramref name="reader"/> that lower-cases tokens,
    /// removes English stop words and then applies the synonym filter.
    /// </summary>
    /// <param name="fieldName">Field name; not consulted by this implementation.</param>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <returns>
    /// A <see cref="StandardTokenizer"/> wrapped, in order, by <see cref="StandardFilter"/>,
    /// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/> and
    /// <see cref="SynonymFilter"/>.
    /// </returns>
    public override TokenStream TokenStream
        (string fieldName, System.IO.TextReader reader)
    {
        // Tokenize, then apply the standard filter to the tokenizer output.
        TokenStream chain = new StandardFilter(new StandardTokenizer(reader));

        // Lower-case everything, then drop the default stop words supplied by StopAnalyzer.
        chain = new StopFilter(new LowerCaseFilter(chain), StopAnalyzer.ENGLISH_STOP_WORDS);

        // The synonym filter comes last and is driven by SynonymEngine.
        return new SynonymFilter(chain, SynonymEngine);
    }
Example #14
0
 /// <summary>
 /// Creates the recognizer and its two analyzers: an exact analyzer
 /// (<see cref="StandardAnalyzer"/>) and a fuzzy analyzer that runs a
 /// <see cref="StandardTokenizer"/> through a Beider-Morse filter.
 /// </summary>
 /// <param name="callerPath">Caller file path, forwarded to the base constructor.</param>
 /// <param name="callerLine">Caller line number, forwarded to the base constructor.</param>
 public DynamicListRecognizer([CallerFilePath] string callerPath = "", [CallerLineNumber] int callerLine = 0)
     : base(callerPath, callerLine)
 {
     this.exactAnalyzer = new StandardAnalyzer(Utils.LuceneVersion.LUCENE_48);
     this.fuzzyAnalyzer = Analyzer.NewAnonymous((field, textReader) =>
     {
         Tokenizer source = new StandardTokenizer(Utils.LuceneVersion.LUCENE_48, textReader);
         // TokenStream stream = new DoubleMetaphoneFilter(tokenizer, 6, false);

         // Settings handed to the Beider-Morse filter factory.
         var factoryArgs = new Dictionary<string, string>();
         factoryArgs.Add("nameType", NameType.GENERIC.ToString());
         factoryArgs.Add("ruleType", RuleType.APPROX.ToString());
         factoryArgs.Add("languageSet", "auto");

         var phoneticFactory = new BeiderMorseFilterFactory(factoryArgs);
         TokenStream phonetic = phoneticFactory.Create(source);
         return new TokenStreamComponents(source, phonetic);
     });
 }
Example #15
0
        /// <summary>
        /// Builds a shingle-producing chain over the text read from <paramref name="reader"/>.
        /// </summary>
        /// <param name="fieldName">Field name; not consulted by this implementation.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>
        /// Components made of a <see cref="StandardTokenizer"/> wrapped by a
        /// <see cref="ShingleFilter"/>, a <see cref="LowerCaseFilter"/> and a
        /// <see cref="StopFilter"/> using the English stop-word set.
        /// </returns>
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            var source   = new StandardTokenizer(version, reader);
            var shingles = new ShingleFilter(source, minGramSize, maxGramSize);

            // Unigram output follows the ShowUnigrams setting directly.
            shingles.SetOutputUnigrams(this.ShowUnigrams ? true : false);

            var lowered = new LowerCaseFilter(version, shingles);
            var stopped = new StopFilter(version, lowered, StopAnalyzer.ENGLISH_STOP_WORDS_SET);

            return new TokenStreamComponents(source, stopped);
        }
Example #16
0
        /// <summary>
        /// Converts a list of sentences into a document: one word list per sentence.
        /// </summary>
        /// <param name="sentenceList">Sentences to segment.</param>
        /// <returns>
        /// For each sentence, in input order, the words of the segmented terms for which
        /// <c>CoreStopWordDictionary.shouldInclude</c> returns true.
        /// </returns>
        private static List <List <String> > convertSentenceListToDocument(List <String> sentenceList)
        {
            // Pre-size the outer list: exactly one entry is produced per sentence.
            List<List<String>> document = new List<List<String>>(sentenceList.Count);

            foreach (String text in sentenceList)
            {
                List<Term> segmented = StandardTokenizer.segment(text.ToCharArray());

                List<String> keptWords = new List<String>();
                foreach (Term candidate in segmented)
                {
                    if (!CoreStopWordDictionary.shouldInclude(candidate))
                    {
                        continue;
                    }
                    keptWords.Add(candidate.word);
                }

                document.Add(keptWords);
            }

            return document;
        }
Example #17
0
        /// <summary>
        /// Builds a token stream over <paramref name="reader"/>: a
        /// <see cref="StandardTokenizer" /> wrapped by <see cref="StandardFilter" />,
        /// <see cref="LowerCaseFilter" />, an optional <see cref="StopFilter" />,
        /// <see cref="BestBetsWordFormsFilter" /> and <see cref="RemoveDuplicatesTokenFilter" />.
        /// </summary>
        /// <param name="fieldName">Field name; not consulted by this implementation.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>The fully wrapped token stream.</returns>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            StandardTokenizer source = new StandardTokenizer(matchVersion, reader);
            TokenStream       chain  = new LowerCaseFilter(new StandardFilter(source));

            // Stop-word removal is skipped entirely when no stop set is configured.
            if (stopSet != null)
            {
                bool enableIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
                chain = new StopFilter(enableIncrements, chain, stopSet);
            }

            // Now, our Stemming filter goes here
            chain = new BestBetsWordFormsFilter(chain);

            // This will remove duplicate keywords - bad for best bets/term count matching
            return new RemoveDuplicatesTokenFilter(chain);
        }
Example #18
0
        /// <summary>
        /// Builds the tokenizer and filter chain applied to the text read from
        /// <paramref name="reader"/>; the chain depends on the match version.
        /// </summary>
        /// <param name="fieldName">Field name; not consulted by this implementation.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>
        /// For match version LUCENE_31 or later: a <see cref="StandardTokenizer"/> wrapped by
        /// <see cref="StandardFilter"/>, <see cref="ElisionFilter"/>,
        /// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
        /// <see cref="SetKeywordMarkerFilter"/> (only when the exclusion table has entries)
        /// and <see cref="FrenchLightStemFilter"/> (LUCENE_36 or later) or a
        /// <see cref="SnowballFilter"/> with the French stemmer (otherwise).
        /// For earlier versions: <see cref="StandardFilter"/>, <see cref="StopFilter"/>,
        /// the optional <see cref="SetKeywordMarkerFilter"/>, <see cref="FrenchStemFilter"/>
        /// and a final <see cref="LowerCaseFilter"/>.
        /// </returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            // Both version branches start from the same tokenizer and standard filter.
            Tokenizer   tokenizer = new StandardTokenizer(m_matchVersion, reader);
            TokenStream chain     = new StandardFilter(m_matchVersion, tokenizer);

            // CS0612/CS0618 (obsolete member) warnings are suppressed around the version checks.
#pragma warning disable 612, 618
            bool atLeast31 = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618

            if (!atLeast31)
            {
                // Legacy chain: stop words, optional keyword marking, stemming, then lower-casing.
                chain = new StopFilter(m_matchVersion, chain, m_stopwords);
                if (excltable.Count > 0)
                {
                    chain = new SetKeywordMarkerFilter(chain, excltable);
                }
#pragma warning disable 612, 618
                chain = new FrenchStemFilter(chain);
#pragma warning restore 612, 618
                // Convert to lowercase after stemming!
                return new TokenStreamComponents(tokenizer, new LowerCaseFilter(m_matchVersion, chain));
            }

            chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
            chain = new LowerCaseFilter(m_matchVersion, chain);
            chain = new StopFilter(m_matchVersion, chain, m_stopwords);
            if (excltable.Count > 0)
            {
                chain = new SetKeywordMarkerFilter(chain, excltable);
            }

#pragma warning disable 612, 618
            bool atLeast36 = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36);
#pragma warning restore 612, 618
            if (atLeast36)
            {
                chain = new FrenchLightStemFilter(chain);
            }
            else
            {
                chain = new SnowballFilter(chain, new Tartarus.Snowball.Ext.FrenchStemmer());
            }

            return new TokenStreamComponents(tokenizer, chain);
        }
Example #19
0
        /// <summary>
        /// Builds the tokenizer and filter chain applied to the text read from
        /// <paramref name="reader"/>.
        /// </summary>
        /// <param name="fieldName">Field name; not consulted by this implementation.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>
        /// Components made of a <see cref="StandardTokenizer"/> wrapped by
        /// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/> and
        /// <see cref="StopFilter"/>. For match version LUCENE_31 or later a
        /// <see cref="CzechStemFilter"/> is appended, preceded by a
        /// <see cref="SetKeywordMarkerFilter"/> when the stem exclusion table has entries.
        /// </returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer   tokenizer = new StandardTokenizer(m_matchVersion, reader);
            TokenStream chain     = new LowerCaseFilter(
                m_matchVersion,
                new StandardFilter(m_matchVersion, tokenizer));
            chain = new StopFilter(m_matchVersion, chain, m_stopwords);

            // CS0612/CS0618 (obsolete member) warnings are suppressed around the version check.
#pragma warning disable 612, 618
            bool stemmingEnabled = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
            if (!stemmingEnabled)
            {
                return new TokenStreamComponents(tokenizer, chain);
            }

            // Keyword marking must sit in front of the stemmer it protects terms from.
            if (this.stemExclusionTable.Count > 0)
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionTable);
            }
            return new TokenStreamComponents(tokenizer, new CzechStemFilter(chain));
        }
Example #20
0
        /// <summary>
        /// Analyzes "ab" through a char filter that maps "a" to "一二" and "b" to "二三",
        /// then asserts the bigrams and offsets produced by <see cref="CJKBigramFilter"/>.
        /// </summary>
        public virtual void TestChangedOffsets()
        {
            NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
            mapBuilder.Add("a", "一二");
            mapBuilder.Add("b", "二三");
            NormalizeCharMap charMap = mapBuilder.Build();

            Analyzer bigramAnalyzer = Analyzer.NewAnonymous(
                createComponents: (fieldName, reader) =>
                {
                    Tokenizer   source  = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
                    TokenStream bigrams = new CJKBigramFilter(source);
                    return new TokenStreamComponents(source, bigrams);
                },
                initReader: (fieldName, reader) => new MappingCharFilter(charMap, reader));

            // note: offsets are strange since this is how the charfilter maps them...
            // before bigramming, the 4 tokens look like:
            //   { 0, 0, 1, 1 },
            //   { 0, 1, 1, 2 }
            string[] expectedTerms  = new string[] { "一二", "二二", "二三" };
            int[]    expectedStarts = new int[] { 0, 0, 1 };
            int[]    expectedEnds   = new int[] { 1, 1, 2 };

            AssertAnalyzesTo(bigramAnalyzer, "ab", expectedTerms, expectedStarts, expectedEnds);
        }
Example #21
0
        /// <summary>
        /// Builds the tokenizer and filter chain applied to the text read from
        /// <paramref name="reader"/>.
        /// </summary>
        /// <param name="fieldName">Field name; not consulted by this implementation.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>
        /// Components made of a <see cref="StandardTokenizer"/> wrapped, in order, by
        /// <see cref="StandardFilter"/>, <see cref="EnglishPossessiveFilter"/> (match version
        /// LUCENE_31 or later only), <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
        /// <see cref="SetKeywordMarkerFilter"/> (only when the stem exclusion set has
        /// entries) and <see cref="PorterStemFilter"/>.
        /// </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer   tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream chain     = new StandardFilter(matchVersion, tokenizer);

            // prior to this we get the classic behavior, standardfilter does it for us.
            bool stripPossessives = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
            if (stripPossessives)
            {
                chain = new EnglishPossessiveFilter(matchVersion, chain);
            }

            chain = new StopFilter(matchVersion, new LowerCaseFilter(matchVersion, chain), stopwords);

            // Keyword marking is only inserted when there is something to exclude from stemming.
            if (stemExclusionSet.Any())
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }

            return new TokenStreamComponents(tokenizer, new PorterStemFilter(chain));
        }
Example #22
0
        /// <summary>
        /// Builds the tokenizer and filter chain applied to the text read from
        /// <paramref name="reader"/>.
        /// </summary>
        /// <param name="fieldName">Field name; not consulted by this implementation.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>
        /// Components made of a <see cref="StandardTokenizer"/> wrapped, in order, by
        /// <see cref="StandardFilter"/>, <see cref="ApostropheFilter"/> (match version
        /// LUCENE_48 or later only), <see cref="TurkishLowerCaseFilter"/>,
        /// <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> (only when the
        /// stem exclusion set is not empty) and a <see cref="SnowballFilter"/> using
        /// <see cref="TurkishStemmer"/>.
        /// </returns>
        protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
        {
            Tokenizer   tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream chain     = new StandardFilter(matchVersion, tokenizer);

            // Apostrophe handling is gated on the match version.
            bool useApostrophe = matchVersion.onOrAfter(Version.LUCENE_48);
            if (useApostrophe)
            {
                chain = new ApostropheFilter(chain);
            }

            chain = new StopFilter(matchVersion, new TurkishLowerCaseFilter(chain), stopwords);

            // Keyword marking is only inserted when there is something to exclude from stemming.
            bool hasExclusions = !stemExclusionSet.Empty;
            if (hasExclusions)
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }

            return new TokenStreamComponents(tokenizer, new SnowballFilter(chain, new TurkishStemmer()));
        }
        /// <summary>
        /// Creates a TokenStream which tokenizes all the text in the provided reader.
        /// </summary>
        /// <param name="fieldName">Field name; must not be null.</param>
        /// <param name="reader">Source of the text to tokenize; must not be null.</param>
        /// <returns>
        /// A TokenStream built from a StandardTokenizer filtered with
        ///     StandardFilter, StopFilter, FrenchStemFilter and LowerCaseFilter
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="fieldName"/> or <paramref name="reader"/> is null.
        /// ArgumentNullException derives from ArgumentException, so handlers that
        /// catch ArgumentException keep working.
        /// </exception>
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            if (fieldName == null)
            {
                throw new ArgumentNullException("fieldName", "fieldName must not be null");
            }
            if (reader == null)
            {
                // The previous message read "readermust not be null" (missing space).
                throw new ArgumentNullException("reader", "reader must not be null");
            }

            TokenStream result = new StandardTokenizer(reader);

            result = new StandardFilter(result);
            result = new StopFilter(result, stoptable);
            result = new FrenchStemFilter(result, excltable);
            // Convert to lowercase after stemming!
            result = new LowerCaseFilter(result);
            return result;
        }
        /// <summary>
        /// Builds a token stream over <paramref name="reader"/>: a
        /// <see cref="StandardTokenizer"/> wrapped by <see cref="LowerCaseFilter"/>, an
        /// optional <see cref="StopFilter"/>, <see cref="ASCIIFoldingFilter"/> and a
        /// <see cref="SnowballFilter"/> created with the name "Spanish".
        /// </summary>
        /// <param name="fieldName">Field name; not consulted by this implementation.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>The fully wrapped token stream.</returns>
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            // StandardFilter is deliberately left out of this chain (it was commented out).
            TokenStream chain = new LowerCaseFilter(
                new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader));

            // Stop-word removal is skipped when no stop-word set is configured.
            if (STOP_WORDS != null)
            {
                chain = new StopFilter(false, chain, STOP_WORDS);
            }

            // NOTE(review): an earlier comment here referred to a distinct "Spanish2" stemmer
            // expected under Snowball\SF\Snowball\Ext\, with a copy kept in this project,
            // but the name passed below is "Spanish" - confirm which stemmer is intended.
            return new SnowballFilter(new ASCIIFoldingFilter(chain), "Spanish");
        }
Example #25
0
        /// <summary>
        /// Builds the tokenizer and filter chain applied to the text read from
        /// <paramref name="reader"/>; the chain depends on the match version.
        /// </summary>
        /// <param name="fieldName">Field name; not consulted by this implementation.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>
        /// For match version LUCENE_31 or later: a <see cref="StandardTokenizer"/> wrapped by
        /// <see cref="StandardFilter"/>, <see cref="ElisionFilter"/>,
        /// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
        /// <see cref="SetKeywordMarkerFilter"/> (only when the exclusion table is not empty)
        /// and <see cref="FrenchLightStemFilter"/> (LUCENE_36 or later) or a
        /// <see cref="SnowballFilter"/> with the French stemmer (otherwise).
        /// For earlier versions: <see cref="StandardFilter"/>, <see cref="StopFilter"/>,
        /// the optional <see cref="SetKeywordMarkerFilter"/>, <see cref="FrenchStemFilter"/>
        /// and a final <see cref="LowerCaseFilter"/>.
        /// </returns>
        protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
        {
            // Both version branches start from the same tokenizer and standard filter.
            Tokenizer   tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream chain     = new StandardFilter(matchVersion, tokenizer);

            if (!matchVersion.onOrAfter(Version.LUCENE_31))
            {
                // Legacy chain: stop words, optional keyword marking, stemming, then lower-casing.
                chain = new StopFilter(matchVersion, chain, stopwords);
                if (!excltable.Empty)
                {
                    chain = new SetKeywordMarkerFilter(chain, excltable);
                }
                chain = new FrenchStemFilter(chain);
                // Convert to lowercase after stemming!
                return new TokenStreamComponents(tokenizer, new LowerCaseFilter(matchVersion, chain));
            }

            chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
            chain = new LowerCaseFilter(matchVersion, chain);
            chain = new StopFilter(matchVersion, chain, stopwords);
            if (!excltable.Empty)
            {
                chain = new SetKeywordMarkerFilter(chain, excltable);
            }

            // The stemmer choice is gated on a second version check.
            if (matchVersion.onOrAfter(Version.LUCENE_36))
            {
                chain = new FrenchLightStemFilter(chain);
            }
            else
            {
                chain = new SnowballFilter(chain, new org.tartarus.snowball.ext.FrenchStemmer());
            }

            return new TokenStreamComponents(tokenizer, chain);
        }
Example #26
0
        override public TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // Tokenizer over the reader, capped at the default maximum token length.
            StandardTokenizer tokenizer = new StandardTokenizer(VERSION, reader)
            {
                MaxTokenLength = DEFAULT_MAX_TOKEN_LENGTH
            };

            // StandardFilter and lowercasing are always applied.
            TokenStream filtered = new StandardFilter(tokenizer);
            filtered = new LowerCaseFilter(filtered);

            // Stop-word removal and Porter stemming are each switched on by a setting.
            if (SettingsViewModel.Instance.StopWords == true)
            {
                filtered = new StopFilter(enableSPI, filtered, StopSet);
            }

            if (SettingsViewModel.Instance.Stemming == true)
            {
                filtered = new PorterStemFilter(filtered);
            }

            return filtered;
        }
Example #27
0
        /// <summary>
        /// Builds the <see cref="TokenStreamComponents"/> that tokenize all the text
        /// supplied by the given <see cref="TextReader"/>.
        /// </summary>
        /// <returns> A <see cref="TokenStreamComponents"/> whose source is a
        ///         <see cref="StandardTokenizer"/> and whose chain applies
        ///         <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>,
        ///         <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> when the
        ///         stem exclusion set is not empty, and finally
        ///         <see cref="PortugueseLightStemFilter"/> (3.6 and later) or a
        ///         <see cref="SnowballFilter"/> with the Portuguese stemmer (earlier versions). </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer   tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream chain     = new StandardFilter(matchVersion, tokenizer);

            chain = new LowerCaseFilter(matchVersion, chain);
            chain = new StopFilter(matchVersion, chain, stopwords);

            // Keyword marking is only inserted when there is something to exclude.
            if (stemExclusionSet.Count > 0)
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }

            // Versions before 3.6 fall back to the Snowball stemmer.
            if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
            {
                chain = new SnowballFilter(chain, new Tartarus.Snowball.Ext.PortugueseStemmer());
            }
            else
            {
                chain = new PortugueseLightStemFilter(chain);
            }
            return new TokenStreamComponents(tokenizer, chain);
        }
Example #28
0
 /// <summary>
 /// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the text
 /// supplied by the given <see cref="TextReader"/>.
 /// </summary>
 /// <returns> From 4.8 on: a <see cref="ThaiTokenizer"/> filtered with
 ///         <see cref="LowerCaseFilter"/> and <see cref="StopFilter"/>.
 ///         Before 4.8: a <see cref="StandardTokenizer"/> filtered with
 ///         <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/> (3.1 and later only),
 ///         <see cref="ThaiWordFilter"/>, and <see cref="StopFilter"/>. </returns>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
     {
         // Pre-4.8 chain built on the standard tokenizer plus ThaiWordFilter.
         Tokenizer   legacyTokenizer = new StandardTokenizer(matchVersion, reader);
         TokenStream legacyChain     = new StandardFilter(matchVersion, legacyTokenizer);
         if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
         {
             legacyChain = new LowerCaseFilter(matchVersion, legacyChain);
         }
         legacyChain = new ThaiWordFilter(matchVersion, legacyChain);
         legacyChain = new StopFilter(matchVersion, legacyChain, stopwords);
         return new TokenStreamComponents(legacyTokenizer, legacyChain);
     }

     // 4.8+ chain built on the dedicated Thai tokenizer.
     Tokenizer   thaiTokenizer = new ThaiTokenizer(reader);
     TokenStream chain         = new LowerCaseFilter(matchVersion, thaiTokenizer);
     chain = new StopFilter(matchVersion, chain, stopwords);
     return new TokenStreamComponents(thaiTokenizer, chain);
 }
Example #29
0
        /// <summary>
        /// Builds the <see cref="TokenStreamComponents"/> for the text supplied by
        /// <paramref name="reader"/>.
        /// </summary>
        /// <returns> From 3.6 on: a <see cref="StandardTokenizer"/> filtered with
        ///         <see cref="CJKWidthFilter"/>, <see cref="LowerCaseFilter"/>,
        ///         <see cref="CJKBigramFilter"/> and <see cref="StopFilter"/>.
        ///         Before 3.6: a <see cref="CJKTokenizer"/> filtered with
        ///         <see cref="StopFilter"/>. </returns>
        protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
        {
            if (!matchVersion.onOrAfter(Version.LUCENE_36))
            {
                // Older versions: the CJK tokenizer followed only by stop-word removal.
                Tokenizer   legacyTokenizer = new CJKTokenizer(reader);
                TokenStream legacyChain     = new StopFilter(matchVersion, legacyTokenizer, stopwords);
                return new TokenStreamComponents(legacyTokenizer, legacyChain);
            }

            Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
            // The width filter has to run before bigramming because it sometimes combines characters.
            TokenStream chain = new CJKWidthFilter(tokenizer);
            chain = new LowerCaseFilter(matchVersion, chain);
            chain = new CJKBigramFilter(chain);
            chain = new StopFilter(matchVersion, chain, stopwords);
            return new TokenStreamComponents(tokenizer, chain);
        }
Example #30
0
        /// <summary>
        /// Builds the <see cref="TokenStreamComponents"/> that tokenize all the text
        /// supplied by the given <see cref="TextReader"/>.
        /// </summary>
        /// <returns> A <see cref="TokenStreamComponents"/> whose source is a
        ///         <see cref="StandardTokenizer"/> and whose chain applies
        ///         <see cref="StandardFilter"/>, a <see cref="StopFilter"/> over the
        ///         hyphenation entries, <see cref="ElisionFilter"/>,
        ///         <see cref="IrishLowerCaseFilter"/>, a <see cref="StopFilter"/> over the
        ///         stop words, <see cref="SetKeywordMarkerFilter"/> when the stem exclusion
        ///         set is not empty, and <see cref="SnowballFilter"/>. </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer   tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream chain     = new StandardFilter(matchVersion, tokenizer);

            // First stop filter removes the HYPHENATIONS entries; before 4.4 its
            // position increments are switched off.
            StopFilter hyphenationStop = new StopFilter(matchVersion, chain, HYPHENATIONS);
            bool       beforeLucene44  = !matchVersion.OnOrAfter(LuceneVersion.LUCENE_44);
            if (beforeLucene44)
            {
                hyphenationStop.EnablePositionIncrements = false;
            }

            chain = hyphenationStop;
            chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
            chain = new IrishLowerCaseFilter(chain);
            chain = new StopFilter(matchVersion, chain, stopwords);

            // Keyword marking is only inserted when there is something to exclude.
            if (stemExclusionSet.Count > 0)
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }

            chain = new SnowballFilter(chain, new IrishStemmer());
            return new TokenStreamComponents(tokenizer, chain);
        }
Example #31
0
        /// <summary>
        /// Builds the <see cref="TokenStreamComponents"/> that tokenize all the text
        /// supplied by the given <see cref="TextReader"/>.
        /// </summary>
        /// <returns> A <see cref="TokenStreamComponents"/> whose source is a
        ///         <see cref="StandardTokenizer"/> and whose chain applies
        ///         <see cref="StandardFilter"/>, <see cref="EnglishPossessiveFilter"/>
        ///         (3.1 and later only), <see cref="LowerCaseFilter"/>,
        ///         <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> when the
        ///         stem exclusion set is not empty, and <see cref="PorterStemFilter"/>. </returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer   tokenizer = new StandardTokenizer(m_matchVersion, reader);
            TokenStream chain     = new StandardFilter(m_matchVersion, tokenizer);

            // Before 3.1 the classic behavior applies and StandardFilter already handles
            // possessives, so the dedicated filter is only added from 3.1 on.
#pragma warning disable 612, 618
            bool addPossessiveFilter = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
            if (addPossessiveFilter)
            {
                chain = new EnglishPossessiveFilter(m_matchVersion, chain);
            }

            chain = new LowerCaseFilter(m_matchVersion, chain);
            chain = new StopFilter(m_matchVersion, chain, m_stopwords);

            // Keyword marking is only inserted when there is something to exclude.
            if (stemExclusionSet.Count > 0)
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }

            chain = new PorterStemFilter(chain);
            return new TokenStreamComponents(tokenizer, chain);
        }
Example #32
0
        /// <summary>
        /// Builds the <see cref="TokenStreamComponents"/> for the text supplied by
        /// <paramref name="reader"/>.
        /// </summary>
        /// <returns> From 3.6 on: a <see cref="StandardTokenizer"/> filtered with
        ///         <see cref="CJKWidthFilter"/>, <see cref="LowerCaseFilter"/>,
        ///         <see cref="CJKBigramFilter"/> and <see cref="StopFilter"/>.
        ///         Before 3.6: a <see cref="CJKTokenizer"/> filtered with
        ///         <see cref="StopFilter"/>. </returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
#pragma warning disable 612, 618
            bool useBigramChain = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36);
#pragma warning restore 612, 618

            if (!useBigramChain)
            {
                // Older versions: the CJK tokenizer followed only by stop-word removal.
#pragma warning disable 612, 618
                Tokenizer legacyTokenizer = new CJKTokenizer(reader);
#pragma warning restore 612, 618
                TokenStream legacyChain = new StopFilter(m_matchVersion, legacyTokenizer, m_stopwords);
                return new TokenStreamComponents(legacyTokenizer, legacyChain);
            }

            Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);
            // The width filter has to run before bigramming because it sometimes combines characters.
            TokenStream chain = new CJKWidthFilter(tokenizer);
            chain = new LowerCaseFilter(m_matchVersion, chain);
            chain = new CJKBigramFilter(chain);
            chain = new StopFilter(m_matchVersion, chain, m_stopwords);
            return new TokenStreamComponents(tokenizer, chain);
        }
Example #33
0
        /// <summary>
        /// Tokenizes <paramref name="content"/> with a <see cref="StandardTokenizer"/>,
        /// passes the tokens through <see cref="StandardFilter"/>,
        /// <see cref="LowerCaseFilter"/> and a <see cref="StopFilter"/> built from the
        /// stop words in <paramref name="config"/>, and collects the surviving terms.
        /// </summary>
        /// <param name="content">Text to tokenize.</param>
        /// <param name="config">Configuration whose <c>StopWords</c> populate the stop set.</param>
        /// <returns>The terms produced by the filter chain, in stream order.</returns>
        static List <string> TokenizeStandard(string content, TokenizeConfig config)
        {
            StringReader reader = new StringReader(content);
            TokenStream  result = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_24, reader);

            try
            {
                var stophash = StopFilter.MakeStopSet(config.StopWords);

                result = new StandardFilter(result);
                result = new LowerCaseFilter(result);
                result = new StopFilter(true, result, stophash, true);

                // Walk the stream and collect the term text of every token.
                result.Reset();
                TermAttribute termattr = (TermAttribute)result.GetAttribute(typeof(TermAttribute));
                List <string> words    = new List <string>();

                while (result.IncrementToken())
                {
                    words.Add(termattr.Term());
                }
                return words;
            }
            finally
            {
                // Release the stream (and the reader it wraps) even when tokenizing fails;
                // the original never closed it.
                result.Close();
            }
        }
Example #34
0
        /// <summary>
        /// Reads every token from <paramref name="tokenStream"/>, stems its text with a
        /// <see cref="SpanishStemmer"/>, and returns a new <see cref="StandardTokenizer"/>
        /// over the stemmed terms joined by single spaces.
        /// </summary>
        /// <param name="tokenStream">Stream whose tokens are consumed until it returns null.</param>
        /// <returns>A tokenizer over the space-separated stemmed terms.</returns>
        public TokenStream SpanishSteammer(TokenStream tokenStream)
        {
            IStemmer stemmer = new SpanishStemmer();

            // Accumulate in a builder: the original re-concatenated the whole string per token.
            System.Text.StringBuilder stemmedText = new System.Text.StringBuilder();

            Lucene.Net.Analysis.Token current;
            while ((current = tokenStream.Next()) != null)
            {
                // The term text is recovered from the token's string form: skip the first
                // character and keep everything up to the first comma.
                // NOTE(review): this presumably relies on Token.ToString() looking like
                // "(term,start,end...)" - confirm; a term that itself contains ',' would
                // be truncated here.
                string tokenText  = current.ToString();
                int    firstComma = tokenText.IndexOf(',');
                int    headLength = firstComma < 0 ? tokenText.Length : firstComma;
                string termText   = tokenText.Substring(1, headLength - 1);

                // Each stemmed term is preceded by a space, so the text starts with one.
                stemmedText.Append(" ").Append(stemmer.Stem(termText));
            }

            return new StandardTokenizer(new System.IO.StringReader(stemmedText.ToString()));
        }
Example #35
0
	  /// <summary>
	  /// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the text
	  /// supplied by the given <see cref="Reader"/>.
	  /// </summary>
	  /// <returns> A <see cref="TokenStreamComponents"/> whose source is a
	  ///         <see cref="StandardTokenizer"/> and whose chain applies
	  ///         <see cref="LowerCaseFilter"/>, <see cref="StandardFilter"/>,
	  ///         <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> when the
	  ///         exclusion table is set and not empty, and <see cref="BrazilianStemFilter"/>. </returns>
	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
	  {
		Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);

		// Lowercasing runs before StandardFilter in this chain.
		TokenStream chain = new LowerCaseFilter(matchVersion, tokenizer);
		chain = new StandardFilter(matchVersion, chain);
		chain = new StopFilter(matchVersion, chain, stopwords);

		// Keyword marking is skipped when the exclusion table is missing or empty.
		bool hasExclusions = excltable != null && !excltable.Empty;
		if (hasExclusions)
		{
		  chain = new SetKeywordMarkerFilter(chain, excltable);
		}

		chain = new BrazilianStemFilter(chain);
		return new TokenStreamComponents(tokenizer, chain);
	  }
		  /// <summary>
		  /// Builds components consisting only of a <see cref="StandardTokenizer"/>
		  /// created with TEST_VERSION_CURRENT over <paramref name="reader"/>; no
		  /// filters are added.
		  /// </summary>
		  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
		  {
			Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);

			return new TokenStreamComponents(source);
		  }
Example #37
0
	  /// <summary>
	  /// Builds the <see cref="TokenStreamComponents"/> for the text supplied by
	  /// <paramref name="reader"/>.
	  /// </summary>
	  /// <returns> From 3.6 on: a <see cref="StandardTokenizer"/> filtered with
	  ///         <see cref="CJKWidthFilter"/>, <see cref="LowerCaseFilter"/>,
	  ///         <see cref="CJKBigramFilter"/> and <see cref="StopFilter"/>.
	  ///         Before 3.6: a <see cref="CJKTokenizer"/> filtered with
	  ///         <see cref="StopFilter"/>. </returns>
	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
	  {
		if (!matchVersion.onOrAfter(Version.LUCENE_36))
		{
		  // Older versions: the CJK tokenizer followed only by stop-word removal.
		  Tokenizer legacyTokenizer = new CJKTokenizer(reader);
		  TokenStream legacyChain = new StopFilter(matchVersion, legacyTokenizer, stopwords);
		  return new TokenStreamComponents(legacyTokenizer, legacyChain);
		}

		Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
		// The width filter has to run before bigramming because it sometimes combines characters.
		TokenStream chain = new CJKWidthFilter(tokenizer);
		chain = new LowerCaseFilter(matchVersion, chain);
		chain = new CJKBigramFilter(chain);
		chain = new StopFilter(matchVersion, chain, stopwords);
		return new TokenStreamComponents(tokenizer, chain);
	  }
Example #38
0
	  /// <summary>
	  /// Builds the <see cref="TokenStreamComponents"/> that tokenize all the text
	  /// supplied by the given <see cref="Reader"/>.
	  /// </summary>
	  /// <returns> A <see cref="TokenStreamComponents"/> whose source is a
	  ///         <see cref="StandardTokenizer"/> and whose chain applies
	  ///         <see cref="StandardFilter"/>, <see cref="SoraniNormalizationFilter"/>,
	  ///         <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
	  ///         <see cref="SetKeywordMarkerFilter"/> when the stem exclusion set is not
	  ///         empty, and <see cref="SoraniStemFilter"/>. </returns>
	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
	  {
		Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);

		// Normalization runs before lowercasing and stop-word removal.
		TokenStream chain = new StandardFilter(matchVersion, tokenizer);
		chain = new SoraniNormalizationFilter(chain);
		chain = new LowerCaseFilter(matchVersion, chain);
		chain = new StopFilter(matchVersion, chain, stopwords);

		// Keyword marking is only inserted when there is something to exclude.
		bool hasStemExclusions = !stemExclusionSet.Empty;
		if (hasStemExclusions)
		{
		  chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
		}

		chain = new SoraniStemFilter(chain);
		return new TokenStreamComponents(tokenizer, chain);
	  }
Example #39
0
 /// <summary>
 /// Constructs a <see cref="StandardTokenizer"/> filtered by a
 /// <see cref="StandardFilter"/>, a lowercase filter, a <see cref="StopFilter"/>
 /// when a stop set is configured, and a <see cref="SnowballFilter"/> for the
 /// configured stemmer name.
 /// </summary>
 public override TokenStreamComponents createComponents(string fieldName, Reader reader)
 {
     // Both language-specific tweaks below only apply from 3.1 on.
     bool atLeast31 = matchVersion.onOrAfter(Version.LUCENE_31);

     Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream chain = new StandardFilter(matchVersion, source);

     // English stemmers expect the possessive 's to be removed first.
     if (atLeast31 && (name.Equals("English") || name.Equals("Porter") || name.Equals("Lovins")))
     {
       chain = new EnglishPossessiveFilter(chain);
     }

     // The Turkish stemmer expects its own lowercase filter; every other case
     // uses the regular one.
     if (!(atLeast31 && name.Equals("Turkish")))
     {
       chain = new LowerCaseFilter(matchVersion, chain);
     }
     else
     {
       chain = new TurkishLowerCaseFilter(chain);
     }

     // Stop-word removal is optional.
     if (stopSet != null)
     {
       chain = new StopFilter(matchVersion, chain, stopSet);
     }

     chain = new SnowballFilter(chain, name);
     return new TokenStreamComponents(source, chain);
 }
		  /// <summary>
		  /// Builds components from a <see cref="StandardTokenizer"/> created with
		  /// TEST_VERSION_CURRENT and wrapped in a <see cref="MockGraphTokenFilter"/>
		  /// that is given the value returned by <c>random()</c>.
		  /// </summary>
		  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
		  {
			Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
			TokenStream graphStream = new MockGraphTokenFilter(random(), source);

			return new TokenStreamComponents(source, graphStream);
		  }
Example #41
0
	  /// <summary>
	  /// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the text
	  /// supplied by the given <see cref="Reader"/>.
	  /// </summary>
	  /// <returns> A <see cref="TokenStreamComponents"/> whose source is a
	  ///         <see cref="StandardTokenizer"/> and whose chain applies
	  ///         <see cref="GreekLowerCaseFilter"/>, <see cref="StandardFilter"/>
	  ///         (3.1 and later only), <see cref="StopFilter"/>, and
	  ///         <see cref="GreekStemFilter"/> (3.1 and later only). </returns>
	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
	  {
		// StandardFilter and stemming are only part of the chain from 3.1 on.
		bool atLeast31 = matchVersion.onOrAfter(Version.LUCENE_31);

		Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
		TokenStream chain = new GreekLowerCaseFilter(matchVersion, tokenizer);
		if (atLeast31)
		{
		  chain = new StandardFilter(matchVersion, chain);
		}

		chain = new StopFilter(matchVersion, chain, stopwords);
		if (atLeast31)
		{
		  chain = new GreekStemFilter(chain);
		}

		return new TokenStreamComponents(tokenizer, chain);
	  }
Example #42
0
	  /// <summary>
	  /// Builds the <see cref="TokenStreamComponents"/> that tokenize all the text
	  /// supplied by the given <see cref="Reader"/>.
	  /// </summary>
	  /// <returns> A <see cref="TokenStreamComponents"/> whose source is a
	  ///         <see cref="StandardTokenizer"/> and whose chain applies
	  ///         <see cref="StandardFilter"/>, <see cref="ElisionFilter"/> (3.6 and later
	  ///         only), <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
	  ///         <see cref="SetKeywordMarkerFilter"/> when the stem exclusion set is not
	  ///         empty, and <see cref="SnowballFilter"/>. </returns>
	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
	  {
		Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
		TokenStream chain = new StandardFilter(matchVersion, tokenizer);

		// Elision handling is only added from 3.6 on.
		bool applyElision = matchVersion.onOrAfter(Version.LUCENE_36);
		if (applyElision)
		{
		  chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
		}

		chain = new LowerCaseFilter(matchVersion, chain);
		chain = new StopFilter(matchVersion, chain, stopwords);

		// Keyword marking is only inserted when there is something to exclude.
		bool hasStemExclusions = !stemExclusionSet.Empty;
		if (hasStemExclusions)
		{
		  chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
		}

		chain = new SnowballFilter(chain, new CatalanStemmer());
		return new TokenStreamComponents(tokenizer, chain);
	  }
		  /// <summary>
		  /// Builds components consisting only of a <see cref="StandardTokenizer"/>
		  /// created with Version.LUCENE_40 over <paramref name="reader"/>; no filters
		  /// are added.
		  /// </summary>
		  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
		  {
			Tokenizer source = new StandardTokenizer(Version.LUCENE_40, reader);

			return new TokenStreamComponents(source);
		  }
Example #44
0
	  /// <summary>
	  /// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the text
	  /// supplied by the given <see cref="Reader"/>.
	  /// </summary>
	  /// <returns> From 3.1 on: a <see cref="StandardTokenizer"/> filtered with
	  ///         <see cref="StandardFilter"/>, <see cref="ElisionFilter"/>,
	  ///         <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
	  ///         <see cref="SetKeywordMarkerFilter"/> when the exclusion table is not
	  ///         empty, and <see cref="FrenchLightStemFilter"/> (3.6 and later) or a
	  ///         <see cref="SnowballFilter"/> with the French stemmer (3.1 - 3.5).
	  ///         Before 3.1: <see cref="StandardFilter"/>, <see cref="StopFilter"/>,
	  ///         the optional <see cref="SetKeywordMarkerFilter"/>,
	  ///         <see cref="FrenchStemFilter"/>, and <see cref="LowerCaseFilter"/> last. </returns>
	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
	  {
		// Both the 3.1+ chain and the older chain start from the same tokenizer
		// wrapped in a StandardFilter.
		Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
		TokenStream chain = new StandardFilter(matchVersion, tokenizer);

		if (!matchVersion.onOrAfter(Version.LUCENE_31))
		{
		  // Older chain: stop words, optional keyword marking, FrenchStemFilter,
		  // and lowercasing only after stemming.
		  chain = new StopFilter(matchVersion, chain, stopwords);
		  if (!excltable.Empty)
		  {
			chain = new SetKeywordMarkerFilter(chain, excltable);
		  }
		  chain = new FrenchStemFilter(chain);
		  chain = new LowerCaseFilter(matchVersion, chain);
		  return new TokenStreamComponents(tokenizer, chain);
		}

		// 3.1+ chain: elision, lowercase, stop words, optional keyword marking.
		chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
		chain = new LowerCaseFilter(matchVersion, chain);
		chain = new StopFilter(matchVersion, chain, stopwords);
		if (!excltable.Empty)
		{
		  chain = new SetKeywordMarkerFilter(chain, excltable);
		}

		// The light stemmer is used from 3.6 on; 3.1 - 3.5 use the Snowball French stemmer.
		if (!matchVersion.onOrAfter(Version.LUCENE_36))
		{
		  chain = new SnowballFilter(chain, new org.tartarus.snowball.ext.FrenchStemmer());
		}
		else
		{
		  chain = new FrenchLightStemFilter(chain);
		}
		return new TokenStreamComponents(tokenizer, chain);
	  }