/// <summary>
/// Builds the analysis chain for path-valued fields: a <c>PathTokenizer</c> whose output
/// passes through StandardFilter, LowerCaseFilter and a StopFilter using the standard
/// English stop-word set.
/// NOTE(review): the parameter type <c>java.io.Reader</c> is a Java-to-C# conversion
/// artifact and will not compile as C#; presumably it should be
/// <c>System.IO.TextReader</c> to match the base class — confirm against the base type.
/// </summary>
protected override TokenStreamComponents createComponents(string field, java.io.Reader reader)
 {
     var tokenizer = new PathTokenizer(reader);
     // Normalize the raw tokenizer output first.
     TokenStream tokenStream = new StandardFilter(tokenizer);
     tokenStream = new LowerCaseFilter(tokenStream);
     // Drop common English stop words (StandardAnalyzer's default set).
     tokenStream = new StopFilter(tokenStream, StandardAnalyzer.STOP_WORDS_SET);
     // Components carry both the raw tokenizer (source) and the wrapped stream (sink).
     return new TokenStreamComponents(tokenizer, tokenStream);
 }
 public override TokenStream TokenStream(string fieldname, TextReader reader)
 {
     TokenStream result = new StandardTokenizer(_version, reader);
     result = new LowerCaseFilter(result);
     result = new PersianNormalizationFilter(result);
     result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(_version), result, _stoptable);
     result = new PersianStemFilter(result);
     return result;
 }
예제 #3
0
        /// <summary>
        /// Builds the English analysis chain: StandardTokenizer filtered with
        /// StandardFilter, LowerCaseFilter, an English StopFilter, and a Snowball
        /// English (Porter2) stemmer.
        /// </summary>
        /// <param name="fieldName">Field being analyzed (unused; same chain for all fields).</param>
        /// <param name="reader">Source text.</param>
        /// <returns>The tokenizer/stream pair consumed by the analyzer framework.</returns>
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer   tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream stream    = new StandardFilter(matchVersion, tokenizer);

            stream = new LowerCaseFilter(matchVersion, stream);
            stream = new StopFilter(matchVersion, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
            // BUGFIX: the original chained PorterStemFilter *and* SnowballFilter with the
            // English Snowball stemmer. Snowball "English" is Porter2, so terms were being
            // stemmed twice — already-stemmed output was stemmed again, which over-stems
            // and degrades match quality. Keep only the Snowball (Porter2) pass.
            stream = new SnowballFilter(stream, new EnglishStemmer());

            return new TokenStreamComponents(tokenizer, stream);
        }
예제 #4
0
        /// <summary>
        /// Creates a <see cref="TokenStreamComponents"/>
        /// which tokenizes all the text in the provided <see cref="TextReader"/>.
        /// </summary>
        /// <returns>
        /// A <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
        /// filtered with <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
        /// <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided, and <see cref="StempelFilter"/>.
        /// </returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName,
                                                                           TextReader reader)
        {
            Tokenizer   source = new StandardTokenizer(m_matchVersion, reader);
            TokenStream result = new StandardFilter(m_matchVersion, source);

            result = new LowerCaseFilter(m_matchVersion, result);
            result = new StopFilter(m_matchVersion, result, m_stopwords);
            // Count > 0 instead of LINQ Any(): consistent with the other analyzers in
            // this file and avoids allocating an enumerator just to test for emptiness
            // (see CA1860).
            if (stemExclusionSet.Count > 0)
            {
                // Mark excluded terms as keywords so the stemmer leaves them untouched.
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
            result = new StempelFilter(result, new StempelStemmer(stemTable));
            return new TokenStreamComponents(source, result);
        }
예제 #5
0
        /// <summary>
        /// Title-field analysis chain: whitespace tokenization, standard normalization,
        /// English possessive stripping, lower-casing, Porter stemming and symbol
        /// splitting. Stop words are intentionally NOT removed so every title term is
        /// searchable.
        /// </summary>
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            TokenStream stream = new WhitespaceTokenizer(reader);

            stream = new StandardFilter(stream);
            // Strip trailing 's before lower-casing.
            stream = new EnglishPossessiveFilter(stream);
            stream = new LowerCaseFilter(stream);
            // No StopFilter here: the title index must keep every word.
            stream = new PorterStemFilter(stream);
            // Split at symbols so words glued together by punctuation become tokens.
            stream = new SymbolsFilter(stream);
            return stream;
        }
예제 #6
0
        /// <summary>
        /// Creates the <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided reader (Indonesian analysis chain).
        /// </summary>
        /// <returns> components built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
        ///         <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided, and <seealso cref="IndonesianStemFilter"/>. </returns>
        // NOTE(review): Java-to-C# converter output — lowercase createComponents, the Java
        // 'Reader' parameter type and the '.Empty' property suggest this was not fully
        // ported; confirm it compiles against the converted base class.
        protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
        {
            // 'source' was declared 'final' in the original Java.
            Tokenizer   source = new StandardTokenizer(matchVersion, reader);
            TokenStream result = new StandardFilter(matchVersion, source);

            result = new LowerCaseFilter(matchVersion, result);
            result = new StopFilter(matchVersion, result, stopwords);
            if (!stemExclusionSet.Empty)
            {
                // Excluded terms are marked as keywords so the stemmer skips them.
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
            // The stem filter wraps the chain last, after keyword marking.
            return(new TokenStreamComponents(source, new IndonesianStemFilter(result)));
        }
예제 #7
0
        /// <summary>
        /// Creates a <see cref="TokenStreamComponents"/> which tokenizes all the text in
        /// the provided <see cref="TextReader"/>.
        /// </summary>
        /// <returns> A <see cref="TokenStreamComponents"/> built from a
        ///         <see cref="StandardTokenizer"/> filtered with <see cref="StandardFilter"/>,
        ///         <see cref="SoraniNormalizationFilter"/>, <see cref="LowerCaseFilter"/>,
        ///         <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> if a stem
        ///         exclusion set is provided, and <see cref="SoraniStemFilter"/>. </returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);

            TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);
            chain = new SoraniNormalizationFilter(chain);
            chain = new LowerCaseFilter(m_matchVersion, chain);
            chain = new StopFilter(m_matchVersion, chain, m_stopwords);

            // Shield any explicitly excluded terms from the stemmer below.
            if (stemExclusionSet.Count > 0)
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }

            chain = new SoraniStemFilter(chain);
            return new TokenStreamComponents(tokenizer, chain);
        }
예제 #8
0
        /// <summary>
        /// Creates the <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided reader (Arabic analysis chain).
        /// </summary>
        /// <returns> components built from a <seealso cref="StandardTokenizer"/> (or, before
        ///         Lucene 3.1, an <seealso cref="ArabicLetterTokenizer"/>) filtered with
        ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
        ///         <seealso cref="ArabicNormalizationFilter"/>, <seealso cref="SetKeywordMarkerFilter"/>
        ///         if a stem exclusion set is provided, and <seealso cref="ArabicStemFilter"/>. </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            // Pre-3.1 indexes were built with the letter tokenizer; keep that behavior.
            Tokenizer source;
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
            {
                source = new StandardTokenizer(matchVersion, reader);
            }
            else
            {
                source = new ArabicLetterTokenizer(matchVersion, reader);
            }

            TokenStream chain = new LowerCaseFilter(matchVersion, source);
            // Order matters: the stop-word list is NOT normalized, so stop words must
            // be removed before ArabicNormalizationFilter runs.
            chain = new StopFilter(matchVersion, chain, stopwords);
            // TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
            chain = new ArabicNormalizationFilter(chain);
            if (stemExclusionSet.Count > 0)
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }
            return new TokenStreamComponents(source, new ArabicStemFilter(chain));
        }
        /// <summary>
        /// Creates a <see cref="TokenStreamComponents"/> which tokenizes all the text in
        /// the provided <see cref="TextReader"/>.
        /// </summary>
        /// <param name="fieldName">Field being analyzed (ignored; one chain for all fields).</param>
        /// <param name="reader">Source text.</param>
        /// <returns>A <see cref="TokenStreamComponents"/> built from a <see cref="StandardTokenizer"/>
        /// filtered with <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/>
        /// if a stem exclusion set is provided, and <see cref="MorfologikFilter"/> on the Ukrainian dictionary.</returns>
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);

            TokenStream stream = new LowerCaseFilter(m_matchVersion, tokenizer);
            stream = new StopFilter(m_matchVersion, stream, m_stopwords);

            // Terms in the exclusion set are marked as keywords so the dictionary
            // stemmer leaves them unchanged.
            if (stemExclusionSet.Count > 0)
            {
                stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
            }

            stream = new MorfologikFilter(stream, GetDictionary());
            return new TokenStreamComponents(tokenizer, stream);
        }
예제 #10
0
        /// <summary>
        /// English analysis chain: standard tokenization (Lucene 2.9 semantics),
        /// lower-casing, optional stop-word removal, ASCII folding of diacritics, and
        /// Snowball "English" stemming.
        /// </summary>
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            TokenStream stream = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);

            stream = new LowerCaseFilter(stream);

            // Stop-word removal is skipped entirely when no list is configured.
            if (STOP_WORDS != null)
            {
                stream = new StopFilter(false, stream, STOP_WORDS);
            }
            // Fold accented characters to ASCII equivalents before stemming.
            stream = new ASCIIFoldingFilter(stream);
            stream = new SnowballFilter(stream, "English");

            return stream;
        }
        /// <summary>
        /// Russian spell-checking chain: a letter tokenizer over a shared
        /// <c>AttributeSource</c> carrying custom spell/stem attributes, followed by
        /// lower-casing, bad-word and stop-word removal, stem-based suggestion
        /// generation and a similarity filter.
        /// </summary>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // All filters share one attribute source so the custom attributes added
            // here are visible along the whole chain.
            var attributes = new AttributeSource();
            attributes.AddAttributeImpl(new SpellAttribute());
            attributes.AddAttributeImpl(new StemAttribute());

            TokenStream stream = new RussianLetterTokenizer(attributes, reader);
            stream = new LowerCaseFilter(stream);
            stream = new BadWordsFilter(stream);
            stream = new StopFilter(false, stream, StopWords);
            stream = new StemFilter(stream, SpellChecker, NumberOfSuggestions);
            return new SimilarFilter(stream);
        }
예제 #12
0
        /// <summary>
        /// Builds an n-gram analysis chain: standard tokenization (Lucene 3.0),
        /// standard normalization, lower-casing, English stop-word removal (with
        /// position increments enabled), then 1..30-character n-grams of each token.
        /// </summary>
        /// <param name="fieldName">Field being analyzed (ignored; one chain for all fields).</param>
        /// <param name="reader">Source text.</param>
        /// <returns>The fully wrapped token stream.</returns>
        public override TokenStream TokenStream(String fieldName, System.IO.TextReader reader)
        {
            TokenStream stream = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_30, reader);

            stream = new StandardFilter(stream);
            stream = new LowerCaseFilter(stream);
            stream = new StopFilter(true, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
            // Emit every 1- to 30-character n-gram of each surviving token, which
            // enables substring/partial matching at query time.
            stream = new NGramTokenFilter(stream, 1, 30);

            return stream;
        }
예제 #13
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testMultipleSources() throws Exception
        /// <summary>
        /// Verifies that one TeeSinkTokenFilter can feed multiple sinks and that sinks
        /// may additionally be attached to a second tee: the dog/the detectors created
        /// from tee1 are also registered on tee2, so they accumulate matches from both
        /// buffers.
        /// NOTE(review): the lowercase calls (reset, newSinkTokenStream, addAttribute)
        /// are Java-converter output; confirm they exist in the ported API.
        /// </summary>
        public virtual void testMultipleSources()
        {
            // Sinks created from tee1 receive only tokens accepted by their filters.
            TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.ToString()), MockTokenizer.WHITESPACE, false));

            TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
            TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
            tee1.reset();
            // Cache tee1's output so it can be consumed again further below.
            TokenStream source1 = new CachingTokenFilter(tee1);

            tee1.addAttribute(typeof(CheckClearAttributesAttribute));
            dogDetector.addAttribute(typeof(CheckClearAttributesAttribute));
            theDetector.addAttribute(typeof(CheckClearAttributesAttribute));

            // A second tee over buffer2 that feeds the SAME sink streams.
            TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.ToString()), MockTokenizer.WHITESPACE, false));

            tee2.addSinkTokenStream(dogDetector);
            tee2.addSinkTokenStream(theDetector);
            TokenStream source2 = tee2;

            // Consuming the sources pushes tokens through to the attached sinks.
            assertTokenStreamContents(source1, tokens1);
            assertTokenStreamContents(source2, tokens2);

            // Each detector saw the matching tokens from BOTH buffers.
            assertTokenStreamContents(theDetector, new string[] { "The", "the", "The", "the" });
            assertTokenStreamContents(dogDetector, new string[] { "Dogs", "Dogs" });

            // The cached source can be reset and re-consumed through a new filter.
            source1.reset();
            TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, source1);

            string[] lowerCaseTokens = new string[tokens1.Length];
            for (int i = 0; i < tokens1.Length; i++)
            {
                lowerCaseTokens[i] = tokens1[i].ToLower(Locale.ROOT);
            }
            assertTokenStreamContents(lowerCasing, lowerCaseTokens);
        }
예제 #14
0
        /// <summary>
        /// Creates the <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided reader (Arabic analysis chain,
        /// Java-converted source).
        /// </summary>
        /// <returns> components built from a <seealso cref="StandardTokenizer"/> (or an
        ///         <seealso cref="ArabicLetterTokenizer"/> before Lucene 3.1) filtered with
        ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
        ///         <seealso cref="ArabicNormalizationFilter"/>, <seealso cref="SetKeywordMarkerFilter"/>
        ///         if a stem exclusion set is provided, and <seealso cref="ArabicStemFilter"/>. </returns>
        // NOTE(review): lowercase createComponents / onOrAfter and the Java 'Reader'
        // parameter are converter artifacts; confirm against the ported base class.
        protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
        {
            // 'source' was 'final' in the original Java; pre-3.1 used the letter tokenizer.
            Tokenizer   source = matchVersion.onOrAfter(Version.LUCENE_31) ? new StandardTokenizer(matchVersion, reader) : new ArabicLetterTokenizer(matchVersion, reader);
            TokenStream result = new LowerCaseFilter(matchVersion, source);

            // the order here is important: the stopword list is not normalized!
            result = new StopFilter(matchVersion, result, stopwords);
            // TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
            result = new ArabicNormalizationFilter(result);
            if (!stemExclusionSet.Empty)
            {
                // Marked terms bypass the stemmer below.
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
            return(new TokenStreamComponents(source, new ArabicStemFilter(result)));
        }
예제 #15
0
        /// <summary>
        /// Persian analysis chain: <c>ArabicLetterTokenizer</c> output is lower-cased,
        /// Arabic-normalized, then Persian-normalized, and finally filtered against the
        /// Persian stop-word table.
        /// </summary>
        /// <returns>The fully wrapped token stream.</returns>
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream stream = new ArabicLetterTokenizer(reader);

            stream = new LowerCaseFilter(stream);
            stream = new ArabicNormalizationFilter(stream);
            // Persian-specific normalization on top of the Arabic pass.
            stream = new PersianNormalizationFilter(stream);

            // Order matters: the stop-word table is expressed in the normalized form
            // produced by the filters above, so stop removal must come last.
            stream = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                    stream, stoptable);
            return stream;
        }
        /// <summary>
        /// N-gram chain: standard tokenization (splits at punctuation and most hyphens,
        /// keeps e-mail addresses and hostnames whole), standard normalization,
        /// lower-casing, English stop-word removal, then _minGram.._maxGram n-grams.
        /// </summary>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            var tokenizer = new StandardTokenizer(_version, reader);

            TokenStream chain = new StandardFilter(tokenizer);
            chain = new LowerCaseFilter(chain);
            chain = new StopFilter(true, chain, StopAnalyzer.ENGLISH_STOP_WORDS_SET);

            // Expand each surviving token into its n-grams for partial matching.
            return new NGramTokenFilter(chain, _minGram, _maxGram);
        }
예제 #17
0
 /// <summary>
 /// Test that LowercaseFilter handles the lowercasing correctly if the term
 /// buffer has a trailing surrogate character leftover and the current term in
 /// the buffer ends with a corresponding leading surrogate.
 /// NOTE(review): the lowercase reset/hasAttribute/getAttribute calls are
 /// Java-converter output; confirm they exist in the ported API.
 /// </summary>
 public virtual void testLowerCaseFilterLowSurrogateLeftover()
 {
     // test if the limit of the termbuffer is correctly used with supplementary
     // chars
     WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("BogustermBogusterm\udc16"));
     LowerCaseFilter filter = new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer);
     // An unpaired trailing LOW surrogate must survive lower-casing untouched.
     assertTokenStreamContents(filter, new string[] {"bogustermbogusterm\udc16"});
     filter.reset();
     // Now a term ending with an unpaired HIGH (leading) surrogate.
     string highSurEndingUpper = "BogustermBoguster\ud801";
     string highSurEndingLower = "bogustermboguster\ud801";
     tokenizer.Reader = new StringReader(highSurEndingUpper);
     assertTokenStreamContents(filter, new string[] {highSurEndingLower});
     assertTrue(filter.hasAttribute(typeof(CharTermAttribute)));
     // The last buffered char of the term must still be the high surrogate —
     // lower-casing must not have read past the term length to pair it.
     char[] termBuffer = filter.getAttribute(typeof(CharTermAttribute)).buffer();
     int length = highSurEndingLower.Length;
     assertEquals('\ud801', termBuffer[length - 1]);
 }
예제 #18
0
 /// <summary>
 /// CJK analysis chain. From Lucene 3.6: standard tokenization, width folding,
 /// lower-casing, CJK bigramming, then stop-word removal. Before 3.6 the legacy
 /// <c>CJKTokenizer</c> is used with stop-word removal only.
 /// </summary>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
     {
         // Legacy pre-3.6 behavior.
         Tokenizer legacy = new CJKTokenizer(reader);
         return new TokenStreamComponents(legacy, new StopFilter(matchVersion, legacy, stopwords));
     }

     Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
     // Width folding runs BEFORE bigramming because it can merge characters.
     TokenStream chain = new CJKWidthFilter(tokenizer);
     chain = new LowerCaseFilter(matchVersion, chain);
     chain = new CJKBigramFilter(chain);
     return new TokenStreamComponents(tokenizer, new StopFilter(matchVersion, chain, stopwords));
 }
예제 #19
0
        /// <summary>
        /// Creates the <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided reader (Czech analysis chain).
        /// </summary>
        /// <returns> components built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>,
        ///         <seealso cref="StopFilter"/>, and — only for version >= LUCENE_31 — an
        ///         optional <seealso cref="SetKeywordMarkerFilter"/> (when a stem exclusion
        ///         set was supplied) followed by <seealso cref="CzechStemFilter"/>. </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);

            TokenStream chain = new StandardFilter(matchVersion, tokenizer);
            chain = new LowerCaseFilter(matchVersion, chain);
            chain = new StopFilter(matchVersion, chain, stopwords);

            // Stemming (and the keyword shield for excluded terms) applies to 3.1+ only.
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
            {
                if (this.stemExclusionTable.Any())
                {
                    chain = new SetKeywordMarkerFilter(chain, stemExclusionTable);
                }
                chain = new CzechStemFilter(chain);
            }
            return new TokenStreamComponents(tokenizer, chain);
        }
예제 #20
0
        /// <summary>
        /// Builds the GitHub search index facade: a GitHub API client, a per-field
        /// analyzer (HTML-stripping default plus keyword-style "owner" and
        /// word-delimited "name" analyzers), a multi-field query parser, the index
        /// writer and a searcher manager over it.
        /// </summary>
        /// <param name="indexDirectory">Lucene directory that stores the index.</param>
        /// <param name="githubApiKey">Credential for the GitHub API client.</param>
        public GitHubIndex(Directory indexDirectory, string githubApiKey)
        {
            github = new GitHubClient(new ProductHeaderValue("LuceneNetDemo"))
            {
                Credentials = new Credentials(githubApiKey)
            };

            analyzer = new PerFieldAnalyzerWrapper(
                // Example of a pre-built custom analyzer
                defaultAnalyzer: new HtmlStripAnalyzer(GitHubIndex.MatchVersion),

                // Example of inline anonymous analyzers
                fieldAnalyzers: new Dictionary <string, Analyzer>
            {
                // "owner": whole value as one token, ASCII-folded and lower-cased,
                // i.e. exact-but-case/diacritic-insensitive matching.
                {
                    "owner",
                    Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                    {
                        var source         = new KeywordTokenizer(reader);
                        TokenStream result = new ASCIIFoldingFilter(source);
                        result             = new LowerCaseFilter(GitHubIndex.MatchVersion, result);
                        return(new TokenStreamComponents(source, result));
                    })
                },
                // "name": split on word-part boundaries with every WordDelimiter option
                // except English possessive stemming (the ~flag mask), then fold and
                // lower-case.
                {
                    "name",
                    Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                    {
                        var source         = new StandardTokenizer(GitHubIndex.MatchVersion, reader);
                        TokenStream result = new WordDelimiterFilter(GitHubIndex.MatchVersion, source, ~WordDelimiterFlags.STEM_ENGLISH_POSSESSIVE, CharArraySet.EMPTY_SET);
                        result             = new ASCIIFoldingFilter(result);
                        result             = new LowerCaseFilter(GitHubIndex.MatchVersion, result);
                        return(new TokenStreamComponents(source, result));
                    })
                }
            });

            queryParser = new MultiFieldQueryParser(GitHubIndex.MatchVersion,
                                                    new[] { "name", "description", "readme" }, analyzer);


            // Searcher reopens come from the writer; the 'true' flag presumably maps to
            // SearcherManager's applyAllDeletes — TODO confirm against the ctor docs.
            indexWriter     = new IndexWriter(indexDirectory, new IndexWriterConfig(GitHubIndex.MatchVersion, analyzer));
            searcherManager = new SearcherManager(indexWriter, true, null);
        }
예제 #21
0
        /// <summary>
        /// Chooses the analysis chain per field: numeric / not-analyzed fields get a
        /// keyword (whole-value) tokenizer plus lower-casing; everything else is
        /// tokenized with <c>MtgTokenizer</c>, lower-cased and run through the
        /// alphabet replacement filter.
        /// </summary>
        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            bool verbatim = _adapter.IsNumericField(fieldName) ||
                            _adapter.IsNotAnalyzed(fieldName);

            if (verbatim)
            {
                // One token for the whole value, lower-cased for case-insensitive match.
                var keyword = new KeywordTokenizer(reader);
                return new TokenStreamComponents(keyword, new LowerCaseFilter(LuceneVersion.LUCENE_48, keyword));
            }

            var tokenizer = new MtgTokenizer(reader);
            TokenStream chain = new LowerCaseFilter(LuceneVersion.LUCENE_48, tokenizer);
            chain = new ReplaceFilter(chain, MtgAplhabet.Replacements);
            return new TokenStreamComponents(tokenizer, chain);
        }
예제 #22
0
    /// <summary>
    /// Builds the synonym-expanding chain: standard tokenization and normalization,
    /// lower-casing, default English stop-word removal, then synonym injection via
    /// the configured <c>SynonymEngine</c>.
    /// </summary>
    public override TokenStream TokenStream
        (string fieldName, System.IO.TextReader reader)
    {
        TokenStream stream = new StandardTokenizer(reader);

        stream = new StandardFilter(stream);
        stream = new LowerCaseFilter(stream);
        // StopAnalyzer supplies the default English stop-word list.
        stream = new StopFilter(stream, StopAnalyzer.ENGLISH_STOP_WORDS);
        // Inject synonyms as additional tokens.
        stream = new SynonymFilter(stream, SynonymEngine);
        return stream;
    }
예제 #23
0
        /// <summary>
        /// Creates a
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> A
        ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from an <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>, <seealso cref="LowerCaseFilter"/>,
        ///         <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided and <seealso cref="SnowballFilter"/>. </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer   source = new StandardTokenizer(matchVersion, reader);
            TokenStream result = new StandardFilter(matchVersion, source);

            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
            {
                result = new ElisionFilter(result, DEFAULT_ARTICLES);
            }
            result = new LowerCaseFilter(matchVersion, result);
            result = new StopFilter(matchVersion, result, stopwords);
            if (stemExclusionSet.Count > 0)
            {
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
            result = new SnowballFilter(result, new CatalanStemmer());
            return(new TokenStreamComponents(source, result));
        }
예제 #24
0
        /// <summary>Constructs a <c>StandardTokenizer</c> (capped at the configured max
        /// token length) filtered by a <c>StandardFilter</c>, a <c>LowerCaseFilter</c>
        /// and a <c>StopFilter</c> whose position-increment behavior follows the
        /// configured flags.
        /// </summary>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            var tokenizer = new StandardTokenizer(reader, replaceInvalidAcronym);
            tokenizer.SetMaxTokenLength(maxTokenLength);

            TokenStream chain = new StandardFilter(tokenizer);
            chain = new LowerCaseFilter(chain);

            // Pick the StopFilter overload based on how position increments are configured.
            chain = useDefaultStopPositionIncrements
                ? new StopFilter(chain, stopSet)
                : new StopFilter(enableStopPositionIncrements, chain, stopSet);
            return chain;
        }
예제 #25
0
        /// <summary>Constructs a tokenizer (a <c>TagsTokenizer</c> in tags mode, otherwise
        /// a <c>StandardTokenizer</c>) filtered by a <c>StandardFilter</c>, a
        /// <c>LowerCaseFilter</c> and a <c>StopFilter</c>.
        /// </summary>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            // Tag fields get a dedicated tokenizer; everything else uses the standard one.
            TokenStream chain = tagsMode
                ? (TokenStream)new TagsTokenizer(reader)
                : new StandardTokenizer(reader);

            chain = new StandardFilter(chain);
            chain = new LowerCaseFilter(chain);
            chain = new StopFilter(chain, stopSet);
            return chain;
        }
예제 #26
0
        /// <summary>
        /// Creates the <see cref="TokenStreamComponents"/> used to tokenize all the text
        /// in the provided <see cref="TextReader"/> (Czech analysis chain).
        /// </summary>
        /// <returns> <see cref="TokenStreamComponents"/>
        ///         built from a <see cref="StandardTokenizer"/> filtered with
        ///         <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
        ///         and <see cref="CzechStemFilter"/> (only if version is >= LUCENE_31). If
        ///         a version is >= LUCENE_31 and a stem exclusion set is provided via
        ///         <see cref="CzechAnalyzer(LuceneVersion, CharArraySet, CharArraySet)"/> a
        ///         <see cref="SetKeywordMarkerFilter"/> is added before
        ///         <see cref="CzechStemFilter"/>. </returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);

            TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);
            chain = new LowerCaseFilter(m_matchVersion, chain);
            chain = new StopFilter(m_matchVersion, chain, m_stopwords);

            // LUCENE_31 carries an obsolete marker; silence the deprecation warning
            // just for the version check.
#pragma warning disable 612, 618
            bool applyStemming = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
            if (applyStemming)
            {
                if (this.stemExclusionTable.Count > 0)
                {
                    chain = new SetKeywordMarkerFilter(chain, stemExclusionTable);
                }
                chain = new CzechStemFilter(chain);
            }
            return new TokenStreamComponents(tokenizer, chain);
        }
예제 #27
0
        // NOTE(review): this snippet appears truncated — the constructor's closing
        // brace is not visible in this excerpt.
        public MySearch(string indexPath)
        {
            //_analyzer = new EnhEnglishAnalyzer(MATCH_LUCENE_VERSION);

            // Wrap the default analyzer with per-field overrides: "genre" and "year"
            // are indexed as single keyword tokens (ASCII-folded, then lowercased)
            // instead of going through the full English analysis chain.
            _analyzer = new MultiFieldAnalyzerWrapper(
                defaultAnalyzer: new EnhEnglishAnalyzer(MATCH_LUCENE_VERSION, true),
                new[]
            {
                (
                    new[] { "genre", "year" },
                    Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
                {
                    // The whole field value becomes one token.
                    var source = new KeywordTokenizer(reader);
                    TokenStream result = new ASCIIFoldingFilter(source);
                    result = new LowerCaseFilter(MATCH_LUCENE_VERSION, result);
                    return(new TokenStreamComponents(source, result));
                })
                )
            });
예제 #28
0
        /// <summary>
        /// Builds the
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// that tokenizes all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> Components built from a <seealso cref="StandardTokenizer"/> filtered
        ///         with <seealso cref="StandardFilter"/>,
        ///         <seealso cref="EnglishPossessiveFilter"/> (3.1+),
        ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
        ///         <seealso cref="SetKeywordMarkerFilter"/> (when a stem exclusion set
        ///         is provided) and <seealso cref="PorterStemFilter"/>. </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream stream = new StandardFilter(matchVersion, tokenizer);

            // Pre-3.1 ("classic") behavior: StandardFilter already handles possessives.
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
            {
                stream = new EnglishPossessiveFilter(matchVersion, stream);
            }
            stream = new LowerCaseFilter(matchVersion, stream);
            stream = new StopFilter(matchVersion, stream, stopwords);
            if (stemExclusionSet.Any())
            {
                // Keep excluded terms out of the Porter stemmer.
                stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
            }
            stream = new PorterStemFilter(stream);
            return new TokenStreamComponents(tokenizer, stream);
        }
예제 #29
0
        /// <summary>
        /// Builds a <see cref="StandardTokenizer"/> wrapped by a <see cref="StandardFilter"/>,
        /// a <see cref="LowerCaseFilter"/>, an optional <see cref="StopFilter"/>, the
        /// best-bets word-forms filter and a <see cref="RemoveDuplicatesTokenFilter"/>.
        /// </summary>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            TokenStream stream = new StandardFilter(new StandardTokenizer(matchVersion, reader));
            stream = new LowerCaseFilter(stream);
            if (stopSet != null)
            {
                stream = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), stream, stopSet);
            }

            // Stemming / word-form expansion for best-bets matching.
            stream = new BestBetsWordFormsFilter(stream);

            // Duplicate keywords would skew best-bets term-count matching, so drop them.
            return new RemoveDuplicatesTokenFilter(stream);
        }
예제 #30
0
        /// <summary>
        /// Builds the <see cref="TokenStreamComponents"/> used to tokenize all the
        /// text in the provided <see cref="TextReader"/>.
        /// </summary>
        /// <returns> <see cref="TokenStreamComponents"/> built from a
        ///         <see cref="StandardTokenizer"/> filtered with
        ///         <see cref="StandardFilter"/>, <see cref="ElisionFilter"/>,
        ///         <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
        ///         <see cref="SetKeywordMarkerFilter"/> (when a stem exclusion set is
        ///         provided), and <see cref="FrenchLightStemFilter"/>. </returns>
        ///
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            // Both version branches start from the identical tokenizer + StandardFilter,
            // so that prefix is built once here.
            Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);
            TokenStream stream = new StandardFilter(m_matchVersion, tokenizer);

#pragma warning disable 612, 618
            bool modernChain = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
            if (modernChain)
            {
                stream = new ElisionFilter(stream, DEFAULT_ARTICLES);
                stream = new LowerCaseFilter(m_matchVersion, stream);
                stream = new StopFilter(m_matchVersion, stream, m_stopwords);
                if (excltable.Count > 0)
                {
                    stream = new SetKeywordMarkerFilter(stream, excltable);
                }
#pragma warning disable 612, 618
                if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
                {
                    stream = new FrenchLightStemFilter(stream);
                }
                else
                {
                    stream = new SnowballFilter(stream, new Tartarus.Snowball.Ext.FrenchStemmer());
                }
                return new TokenStreamComponents(tokenizer, stream);
            }

            // Legacy (< 3.1) chain: no elision, and lowercasing happens AFTER stemming.
            stream = new StopFilter(m_matchVersion, stream, m_stopwords);
            if (excltable.Count > 0)
            {
                stream = new SetKeywordMarkerFilter(stream, excltable);
            }
#pragma warning disable 612, 618
            stream = new FrenchStemFilter(stream);
#pragma warning restore 612, 618
            return new TokenStreamComponents(tokenizer, new LowerCaseFilter(m_matchVersion, stream));
        }
예제 #31
0
        // the ordering of these filters is important!
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            if (!string.Equals("MilitaryIDNumber", fieldName))
            {
                // Generic fields: alphanumeric tokens, lowercased and ASCII-folded.
                TokenStream generic = new AlphaNumericTokenizer(reader);
                generic = new LowerCaseFilter(generic);
                return new ASCIIFoldingFilter(generic);
            }

            TokenStream stream = new WhitespaceTokenizer(reader);
            stream = new LowerCaseFilter(stream);
            stream = new ASCIIFoldingFilter(stream);
            stream = new AlphaNumericFilter(stream);  // behaves weirdly when used on Name field

            // During indexing we encounter extraneous text around ID numbers we don't care about.
            string[] noiseWords = new string[] { "", "formerly", "or", "former", "pir", "tbc", "id", "pnc" };
            return new StopFilter(false, stream, new CharArraySet(noiseWords, true), true);
        }
예제 #32
0
        /// <summary>
        /// Tokenizes a multi-value list field and lowercases each token.
        /// StandardFilter, stop-word and synonym stages are intentionally disabled.
        /// </summary>
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            TokenStream stream = new ListMultiValueCharTokenizer(reader);

            // Lowercasing is the only normalization applied to the raw tokens.
            stream = new LowerCaseFilter(stream);

            return stream;
        }
예제 #33
0
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            // Tokenize, lowercase, optionally remove stop words, fold accents,
            // then stem with the Snowball Spanish stemmer.
            TokenStream stream = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_29, reader);
            stream = new LowerCaseFilter(stream);
            if (STOP_WORDS != null)
            {
                stream = new StopFilter(false, stream, STOP_WORDS);
            }
            stream = new ASCIIFoldingFilter(stream);

            // "Spanish" selects a customized stemmer (Spanish2). Check for this class under
            // Snowball\SF\Snowball\Ext\; a copy is also kept in this project just in case.
            stream = new SnowballFilter(stream, "Spanish");

            return stream;
        }
예제 #34
0
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            // Hebrew-aware tokenization, then niqqud (vowel-mark) normalization.
            TokenStream stream = new NiqqudFilter(new HebrewTokenizer(reader, PrefixTree));

            // TODO: should we ignoreCase in StopFilter?
            stream = new StopFilter(enableStopPositionIncrements, stream, STOP_WORDS_SET);

            // TODO: Apply LowerCaseFilter to NonHebrew tokens only
            stream = new LowerCaseFilter(stream);

            if (suffixByTokenType != null && suffixByTokenType.Count > 0)
            {
                stream = new AddSuffixFilter(stream, suffixByTokenType);
            }

            return stream;
        }
예제 #35
0
	  /// <summary>
	  /// Builds the
	  /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
	  /// that tokenizes all the text in the provided <seealso cref="Reader"/>.
	  /// </summary>
	  /// <returns> Components built from a <seealso cref="StandardTokenizer"/> filtered
	  ///         with <seealso cref="StandardFilter"/>,
	  ///         <seealso cref="SoraniNormalizationFilter"/>,
	  ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
	  ///         <seealso cref="SetKeywordMarkerFilter"/> (when a stem exclusion set
	  ///         is provided) and <seealso cref="SoraniStemFilter"/>. </returns>
	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
	  {
		Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
		TokenStream stream = new StandardFilter(matchVersion, tokenizer);
		stream = new SoraniNormalizationFilter(stream);
		stream = new LowerCaseFilter(matchVersion, stream);
		stream = new StopFilter(matchVersion, stream, stopwords);
		if (!stemExclusionSet.Empty)
		{
		  // Shield excluded terms from the stemmer.
		  stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
		}
		stream = new SoraniStemFilter(stream);
		return new TokenStreamComponents(tokenizer, stream);
	  }
예제 #36
0
	  /// <summary>
	  /// Builds the analysis chain: for 3.6+ a <seealso cref="StandardTokenizer"/>
	  /// with width normalization, lowercasing, CJK bigramming and stop words;
	  /// otherwise the legacy <seealso cref="CJKTokenizer"/> with stop words only.
	  /// </summary>
	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
	  {
		if (matchVersion.onOrAfter(Version.LUCENE_36))
		{
		  Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
		  // Width normalization must run before bigramming: it sometimes combines characters.
		  TokenStream stream = new CJKWidthFilter(tokenizer);
		  stream = new LowerCaseFilter(matchVersion, stream);
		  stream = new CJKBigramFilter(stream);
		  return new TokenStreamComponents(tokenizer, new StopFilter(matchVersion, stream, stopwords));
		}

		// Legacy (< 3.6): dedicated CJK tokenizer plus stop-word removal.
		Tokenizer legacy = new CJKTokenizer(reader);
		return new TokenStreamComponents(legacy, new StopFilter(matchVersion, legacy, stopwords));
	  }
 public override TokenStream TokenStream(String fieldName, TextReader reader) {
     // Tokenize, lowercase, then stem via the Hunspell dictionary.
     TokenStream result = new StandardTokenizer(LuceneVersion.LUCENE_29, reader);
     result = new LowerCaseFilter(result);
     return new HunspellStemFilter(result, _dictionary);
 }
 public override TokenStream TokenStream(string fieldName, TextReader reader)
 {
     // PanGu (Chinese) segmentation followed by lowercasing.
     TokenStream stream = new PanGuTokenizer(reader, _OriginalResult, _options, _parameters);
     return new LowerCaseFilter(stream);
 }
예제 #39
0
	  /// <summary>
	  /// Builds the
	  /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
	  /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
	  /// </summary>
	  /// <returns> Components built from a <seealso cref="StandardTokenizer"/> filtered
	  ///         with <seealso cref="LowerCaseFilter"/>, <seealso cref="StandardFilter"/>,
	  ///         <seealso cref="StopFilter"/>, and <seealso cref="BrazilianStemFilter"/>. </returns>
	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
	  {
		Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
		// Note: in this (Brazilian) chain lowercasing precedes StandardFilter.
		TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer);
		stream = new StandardFilter(matchVersion, stream);
		stream = new StopFilter(matchVersion, stream, stopwords);
		if (excltable != null && !excltable.Empty)
		{
		  // Shield excluded terms from the stemmer.
		  stream = new SetKeywordMarkerFilter(stream, excltable);
		}
		return new TokenStreamComponents(tokenizer, new BrazilianStemFilter(stream));
	  }
예제 #40
0
        /// <summary>
        /// Builds the
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> Components built from a <seealso cref="StandardTokenizer"/> filtered
        ///         with <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>,
        ///         <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/>
        ///         and a version-dependent German stemming stage
        ///         (<seealso cref="GermanNormalizationFilter"/> + <seealso cref="GermanLightStemFilter"/>
        ///         for 3.6+). </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream stream = new StandardFilter(matchVersion, tokenizer);
            stream = new LowerCaseFilter(matchVersion, stream);
            stream = new StopFilter(matchVersion, stream, stopwords);
            stream = new SetKeywordMarkerFilter(stream, exclusionSet);
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
            {
                // 3.6+: dedicated normalization plus the light stemmer.
                stream = new GermanNormalizationFilter(stream);
                stream = new GermanLightStemFilter(stream);
            }
            else if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
            {
                stream = new SnowballFilter(stream, new German2Stemmer());
            }
            else
            {
                stream = new GermanStemFilter(stream);
            }
            return new TokenStreamComponents(tokenizer, stream);
        }
예제 #41
0
 /// <summary>
 /// Creates a <see cref="TokenStream"/> which tokenizes all the text in the
 /// provided reader: an ArabicLetterTokenizer filtered with LowerCaseFilter,
 /// ArabicNormalizationFilter, PersianNormalizationFilter and Persian stop words.
 /// </summary>
 public override TokenStream TokenStream(String fieldName, TextReader reader)
 {
     TokenStream stream = new ArabicLetterTokenizer(reader);
     stream = new LowerCaseFilter(stream);
     stream = new ArabicNormalizationFilter(stream);
     /* additional persian-specific normalization */
     stream = new PersianNormalizationFilter(stream);
     /*
      * the order here is important: the stopword list is normalized with the
      * above!
      */
     stream = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                             stream, stoptable);
     return stream;
 }
예제 #42
0
	  /// <summary>
	  /// Builds the
	  /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
	  /// that tokenizes all the text in the provided <seealso cref="Reader"/>.
	  /// </summary>
	  /// <returns> Components built from a <seealso cref="StandardTokenizer"/> filtered
	  ///         with <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>
	  ///         (3.6+), <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
	  ///         <seealso cref="SetKeywordMarkerFilter"/> (when a stem exclusion set is
	  ///         provided) and a Catalan <seealso cref="SnowballFilter"/>. </returns>
	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
	  {
		Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
		TokenStream stream = new StandardFilter(matchVersion, tokenizer);
		if (matchVersion.onOrAfter(Version.LUCENE_36))
		{
		  stream = new ElisionFilter(stream, DEFAULT_ARTICLES);
		}
		stream = new LowerCaseFilter(matchVersion, stream);
		stream = new StopFilter(matchVersion, stream, stopwords);
		if (!stemExclusionSet.Empty)
		{
		  // Shield excluded terms from the stemmer.
		  stream = new SetKeywordMarkerFilter(stream, stemExclusionSet);
		}
		stream = new SnowballFilter(stream, new CatalanStemmer());
		return new TokenStreamComponents(tokenizer, stream);
	  }
예제 #43
0
	  /// <summary>
	  /// Builds the
	  /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
	  /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
	  /// </summary>
	  /// <returns> Components built from a <seealso cref="StandardTokenizer"/> filtered
	  ///         with <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>,
	  ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
	  ///         <seealso cref="SetKeywordMarkerFilter"/> (when a stem exclusion set is
	  ///         provided), and <seealso cref="FrenchLightStemFilter"/>. </returns>
	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
	  {
		// Both version branches start from the identical tokenizer + StandardFilter,
		// so that prefix is built once here.
		Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
		TokenStream stream = new StandardFilter(matchVersion, tokenizer);

		if (matchVersion.onOrAfter(Version.LUCENE_31))
		{
		  stream = new ElisionFilter(stream, DEFAULT_ARTICLES);
		  stream = new LowerCaseFilter(matchVersion, stream);
		  stream = new StopFilter(matchVersion, stream, stopwords);
		  if (!excltable.Empty)
		  {
			stream = new SetKeywordMarkerFilter(stream, excltable);
		  }
		  if (matchVersion.onOrAfter(Version.LUCENE_36))
		  {
			stream = new FrenchLightStemFilter(stream);
		  }
		  else
		  {
			stream = new SnowballFilter(stream, new org.tartarus.snowball.ext.FrenchStemmer());
		  }
		  return new TokenStreamComponents(tokenizer, stream);
		}

		// Legacy (< 3.1) chain: no elision, and lowercasing happens AFTER stemming.
		stream = new StopFilter(matchVersion, stream, stopwords);
		if (!excltable.Empty)
		{
		  stream = new SetKeywordMarkerFilter(stream, excltable);
		}
		stream = new FrenchStemFilter(stream);
		return new TokenStreamComponents(tokenizer, new LowerCaseFilter(matchVersion, stream));
	  }
예제 #44
0
		/// <summary>Constructs a <see cref="StandardTokenizer" /> (capped at
		/// maxTokenLength) filtered by a <see cref="StandardFilter" />, a
		/// <see cref="LowerCaseFilter" /> and a <see cref="StopFilter" />.
		/// </summary>
		public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
		{
			StandardTokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
			tokenizer.MaxTokenLength = maxTokenLength;
			TokenStream stream = new StandardFilter(tokenizer);
			stream = new LowerCaseFilter(stream);
			return new StopFilter(enableStopPositionIncrements, stream, stopSet);
		}
예제 #45
0
	  /// <summary>
	  /// Builds the
	  /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
	  /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
	  /// </summary>
	  /// <returns> For 4.8+, components built from a <seealso cref="ThaiTokenizer"/>
	  ///         with <seealso cref="LowerCaseFilter"/> and <seealso cref="StopFilter"/>;
	  ///         otherwise a <seealso cref="StandardTokenizer"/> with
	  ///         <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>
	  ///         (3.1+), <seealso cref="ThaiWordFilter"/> and
	  ///         <seealso cref="StopFilter"/>. </returns>
	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
	  {
		if (matchVersion.onOrAfter(Version.LUCENE_48))
		{
		  Tokenizer thaiTokenizer = new ThaiTokenizer(reader);
		  TokenStream stream = new LowerCaseFilter(matchVersion, thaiTokenizer);
		  stream = new StopFilter(matchVersion, stream, stopwords);
		  return new TokenStreamComponents(thaiTokenizer, stream);
		}

		// Pre-4.8: Thai word segmentation is done by ThaiWordFilter on top of the
		// standard chain.
		Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
		TokenStream result = new StandardFilter(matchVersion, tokenizer);
		if (matchVersion.onOrAfter(Version.LUCENE_31))
		{
		  result = new LowerCaseFilter(matchVersion, result);
		}
		result = new ThaiWordFilter(matchVersion, result);
		return new TokenStreamComponents(tokenizer, new StopFilter(matchVersion, result, stopwords));
	  }