Example #1
0
        /*
         * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
         * text in the provided {@link Reader}.
         *
         * @return A {@link TokenStream} built from a {@link StandardTokenizer}
         *   filtered with {@link StandardFilter}, {@link StopFilter},
         *   and {@link DutchStemFilter}
         */
        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
        {
            if (overridesTokenStreamMethod)
            {
                // LUCENE-1678: force fallback to tokenStream() if we
                // have been subclassed and that subclass overrides
                // tokenStream but not reusableTokenStream
                return(TokenStream(fieldName, reader));
            }

            // Reuse the pipeline cached via PreviousTokenStream when one exists.
            SavedStreams streams = (SavedStreams)PreviousTokenStream;

            if (streams == null)
            {
                // First use: build tokenizer -> standard filter -> stop removal
                // -> Dutch stemming, then cache the whole chain.
                streams        = new SavedStreams();
                streams.source = new StandardTokenizer(matchVersion, reader);
                streams.result = new StandardFilter(streams.source);
                streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                                streams.result, stoptable);
                streams.result      = new DutchStemFilter(streams.result, excltable, stemdict);
                PreviousTokenStream = streams;
            }
            else
            {
                // Reuse: re-point the cached tokenizer at the new input.
                streams.source.Reset(reader);
            }
            return(streams.result);
        }
Example #2
0
        /* Returns a (possibly reused) {@link StandardTokenizer} filtered by a
         * {@link StandardFilter}, a {@link LowerCaseFilter}, an optional
         * {@link StopFilter}, a BestBetsWordFormsFilter and a
         * RemoveDuplicatesTokenFilter. */

        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
        {
            if (overridesTokenStreamMethod)
            {
                // LUCENE-1678: force fallback to tokenStream() if we
                // have been subclassed and that subclass overrides
                // tokenStream but not reusableTokenStream
                return(TokenStream(fieldName, reader));
            }

            // Reuse the pipeline cached via PreviousTokenStream when one exists.
            SavedStreams streams = (SavedStreams)PreviousTokenStream;

            if (streams == null)
            {
                // First use: build and cache the full pipeline.
                streams        = new SavedStreams();
                streams.source = new StandardTokenizer(matchVersion, reader);
                streams.result = new StandardFilter(streams.source);
                streams.result = new LowerCaseFilter(streams.result);
                // Stop-word filtering is optional; skipped when no stop set is configured.
                if (stopSet != null)
                {
                    streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                                    streams.result, stopSet);
                }
                streams.result = new BestBetsWordFormsFilter(streams.result);
                //This will remove duplicate keywords - bad for best bets/term count matching
                streams.result = new RemoveDuplicatesTokenFilter(streams.result);

                PreviousTokenStream = streams;
            }
            else
            {
                // Reuse: re-point the cached tokenizer at the new input.
                streams.source.Reset(reader);
            }
            return(streams.result);
        }
Example #3
0
        // Returns a (possibly reused) TokenStream for the given reader: a
        // StandardTokenizer filtered with StandardFilter, LowerCaseFilter,
        // StopFilter and DanishStemFilter.
        public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
        {
            if (overridesTokenStreamMethod)
            {
                // Fall back to TokenStream() when a subclass overrides it but not
                // this method (LUCENE-1678 back-compat behavior).
                return(TokenStream(fieldName, reader));
            }

            var savedStreams = (SavedStreams)PreviousTokenStream;

            if (savedStreams == null)
            {
                // First use: build and cache the full pipeline.
                savedStreams = new SavedStreams {
                    Source = new StandardTokenizer(MatchVersion, reader)
                };
                savedStreams.Result = new StandardFilter(savedStreams.Source);
                savedStreams.Result = new LowerCaseFilter(savedStreams.Result);
                // TODO: Lucene.Net.Analysis.Compound.HyphenationCompoundWordTokenFilter
                savedStreams.Result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(MatchVersion), savedStreams.Result, StopTable);
                savedStreams.Result = new DanishStemFilter(savedStreams.Result, ExclusionTable);

                PreviousTokenStream = savedStreams;
            }
            else
            {
                // Reuse: re-point the cached tokenizer at the new input.
                savedStreams.Source.Reset(reader);
            }

            return(savedStreams.Result);
        }
Example #4
0
 /// <summary>Builds the analyzer with the given Lucene match version and stop words.</summary>
 /// <param name="matchVersion">Lucene version compatibility setting.</param>
 /// <param name="sws">Set of stop words to remove during analysis.</param>
 public HjsStandardAnalyzer(Lucene.Net.Util.Version matchVersion, ISet <string> sws)
     : base(matchVersion, sws)
 {
     VERSION   = matchVersion;
     StopSet   = sws;
     // The position-increment default for StopFilter depends on the requested version.
     enableSPI = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
 }
Example #5
0
        /*
         * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
         * in the provided {@link Reader}.
         *
         * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
         *         filtered with {@link LowerCaseFilter},
         *         {@link ArabicNormalizationFilter},
         *         {@link PersianNormalizationFilter} and Persian Stop words
         */
        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
        {
            // NOTE(review): unlike the sibling analyzers there is no
            // overridesTokenStreamMethod fallback here — confirm subclasses cannot
            // override TokenStream() in a way that would be bypassed on reuse.
            SavedStreams streams = (SavedStreams)PreviousTokenStream;

            if (streams == null)
            {
                // First use: build and cache the full pipeline.
                streams        = new SavedStreams();
                streams.source = new ArabicLetterTokenizer(reader);
                streams.result = new LowerCaseFilter(streams.source);
                streams.result = new ArabicNormalizationFilter(streams.result);
                /* additional persian-specific normalization */
                streams.result = new PersianNormalizationFilter(streams.result);

                /*
                 * the order here is important: the stopword list is normalized with the
                 * above!
                 */
                streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                                streams.result, stoptable);
                PreviousTokenStream = streams;
            }
            else
            {
                // Reuse: re-point the cached tokenizer at the new input.
                streams.source.Reset(reader);
            }
            return(streams.result);
        }
Example #6
0
        // Returns a (possibly reused) TokenStream: a StandardTokenizer filtered with
        // StandardFilter, ThaiWordFilter and an English StopFilter.
        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
        {
            if (overridesTokenStreamMethod)
            {
                // LUCENE-1678: force fallback to tokenStream() if we
                // have been subclassed and that subclass overrides
                // tokenStream but not reusableTokenStream
                return(TokenStream(fieldName, reader));
            }

            // Reuse the pipeline cached via PreviousTokenStream when one exists.
            SavedStreams streams = (SavedStreams)PreviousTokenStream;

            if (streams == null)
            {
                // First use: build and cache the full pipeline.
                streams        = new SavedStreams();
                streams.source = new StandardTokenizer(matchVersion, reader);
                streams.result = new StandardFilter(streams.source);
                streams.result = new ThaiWordFilter(streams.result);
                streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                                streams.result, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
                PreviousTokenStream = streams;
            }
            else
            {
                streams.source.Reset(reader);
                streams.result.Reset(); // reset the ThaiWordFilter's state
            }
            return(streams.result);
        }
Example #7
0
        /*
         * Creates a token stream that tokenizes the given string into token terms
         * (aka words).
         *
         * @param fieldName
         *            the name of the field to tokenize (currently ignored).
         * @param text
         *            the string to tokenize
         * @return a new token stream
         * @throws ArgumentNullException if text is null
         */
        public TokenStream TokenStream(String fieldName, String text)
        {
            // Ideally the Analyzer superclass should have a method with the same signature,
            // with a default impl that simply delegates to the StringReader flavour.
            if (text == null)
            {
                // ArgumentNullException is the precise type for a null argument; it
                // derives from ArgumentException, so existing catch handlers still work.
                throw new ArgumentNullException("text", "text must not be null");
            }

            TokenStream stream;

            // NOTE: '==' is reference equality here — the fast paths trigger only for
            // the shared pattern instances, which is intentional.
            if (Regex == NON_WORD_PATTERN)
            { // fast path
                stream = new FastStringTokenizer(text, true, toLowerCase, stopWords);
            }
            else if (Regex == WHITESPACE_PATTERN)
            { // fast path
                stream = new FastStringTokenizer(text, false, toLowerCase, stopWords);
            }
            else
            {
                // General case: regex-driven tokenization, with optional stop-word removal.
                stream = new RegexTokenizer(text, Regex, toLowerCase);
                if (stopWords != null)
                {
                    stream = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), stream, stopWords);
                }
            }

            return(stream);
        }
 /// <summary>Builds an analyzer with the given stop words.</summary>
 /// <param name="matchVersion">Lucene version to match; see <see cref="Version"/> above.
 /// </param>
 /// <param name="stopWords">stop words
 /// </param>
 public StandardAnalyzer(Version matchVersion, ISet <string> stopWords)
 {
     stopSet = stopWords;
     // Detect subclasses that override TokenStream but not ReusableTokenStream
     // (back-compat fallback, see LUCENE-1678).
     SetOverridesTokenStreamMethod <StandardAnalyzer>();
     enableStopPositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
     // Invalid-acronym replacement only applies from Lucene 2.4 onwards.
     replaceInvalidAcronym        = matchVersion.OnOrAfter(Version.LUCENE_24);
     this.matchVersion            = matchVersion;
 }
Example #9
0
 /// <summary>
 /// Creates the analyzer bus.
 /// </summary>
 /// <param name="language">Language the underlying analyzer is selected for.</param>
 /// <param name="useIndexSynonyms">true to store synonyms, near-synonyms and related
 /// words into the index at indexing time; false to disable this.</param>
 public AnalyzerBus(string language, bool useIndexSynonyms = false)
 {
     this._EnableStopPositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(global::Lucene.Net.Util.Version.LUCENE_30);
     this._Language         = language;
     this._UseIndexSynonyms = useIndexSynonyms;
     this._SymbolAnalyzer   = new SymbolAnalyzer();
     // NOTE(review): a null language would throw NullReferenceException on ToUpper()
     // here — confirm callers never pass null.
     this._Analyzer         = AnalyzerDict.GetAnalyzer(language.ToUpper());
     this._StopCharArraySet = StopWord.StopWordList;
 }
Example #10
0
        /*
         * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
         *
         * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
         *                  {@link GreekLowerCaseFilter} and {@link StopFilter}
         */
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            // Assemble the pipeline stage by stage: tokenize, Greek lowercasing,
            // then stop-word removal.
            TokenStream tokens     = new StandardTokenizer(matchVersion, reader);
            TokenStream lowercased = new GreekLowerCaseFilter(tokens);
            bool enablePosIncr     = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);

            return new StopFilter(enablePosIncr, lowercased, stopSet);
        }
Example #11
0
        /*
         * Creates a {@link TokenStream} which tokenizes all the text in the
         * provided {@link Reader}.
         *
         * @return A {@link TokenStream} built from a {@link StandardTokenizer}
         *   filtered with {@link StandardFilter}, {@link StopFilter},
         *   and {@link DutchStemFilter}
         */
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            // tokenizer -> standard filter -> stop-word removal -> Dutch stemming
            TokenStream tokenized = new StandardTokenizer(matchVersion, reader);
            TokenStream filtered  = new StandardFilter(tokenized);
            TokenStream noStops   = new StopFilter(
                StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                filtered, stoptable);

            return new DutchStemFilter(noStops, excltable, stemdict);
        }
Example #12
0
        // Tokenizes Persian text, lowercases it, applies Persian normalization,
        // then strips Persian stop words.
        public override TokenStream TokenStream(string fieldname, TextReader reader)
        {
            TokenStream stream = new PersianTokenizer(reader);

            stream = new LowerCaseFilter(stream);
            stream = new PersianNormalizationFilter(stream);
            //stream = new PersianLemmatizationFilter(stream);  // lemmatization intentionally disabled
            return new StopFilter(
                StopFilter.GetEnablePositionIncrementsVersionDefault(_version), stream, _stoptable);
        }
Example #13
0
        // Builds: StandardTokenizer -> StandardFilter -> ThaiWordFilter -> English StopFilter.
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream stream = new StandardTokenizer(matchVersion, reader);

            stream = new StandardFilter(stream);
            stream = new ThaiWordFilter(stream);

            return new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                  stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        }
Example #14
0
        // Starts from the base analyzer's stream, then: stop words, Slovak noun
        // filtering, lowercasing, and a second stop-word pass.
        // NOTE(review): the stop filter runs both before and after lowercasing —
        // presumably so the set matches tokens in either case; confirm this is intended.
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            bool posIncr = StopFilter.GetEnablePositionIncrementsVersionDefault(Version.LUCENE_30);

            TokenStream result = base.TokenStream(fieldName, reader);

            result = new StopFilter(posIncr, result, STOP_WORDS_SET);
            result = new SlovakNounFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(posIncr, result, STOP_WORDS_SET);

            return result;
        }
Example #15
0
        // Builds: StandardTokenizer -> LowerCaseFilter -> StandardFilter -> StopFilter
        // -> BrazilianStemFilterCustom.
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream stream = new StandardTokenizer(this.matchVersion, reader);

            stream = new LowerCaseFilter(stream);
            stream = new StandardFilter(stream);
            stream = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(this.matchVersion), stream, this.stoptable);

            return new BrazilianStemFilterCustom(stream, this.excltable);
        }
Example #16
0
 // Builds: StandardTokenizer -> StandardFilter -> LowerCaseFilter -> StopFilter
 // -> DanishStemFilter.
 public override TokenStream TokenStream(string fieldName, TextReader reader)
 {
     TokenStream stream = new StandardTokenizer(MatchVersion, reader);

     stream = new StandardFilter(stream);
     stream = new LowerCaseFilter(stream);
     stream = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(MatchVersion),
                             stream, StopTable);

     return new DanishStemFilter(stream, ExclusionTable);
 }
Example #17
0
        //DIGY
        ///**
        // * Builds an analyzer with the given stop words.  Lines can be commented out using {@link #STOPWORDS_COMMENT}
        // *
        // * @deprecated Use {@link #ArabicAnalyzer(Version, File)} instead
        // */
        //public ArabicAnalyzer(File stopwords)
        //{
        //    this(Version.LUCENE_24, stopwords);
        //}

        ///**
        // * Builds an analyzer with the given stop words.  Lines can be commented out using {@link #STOPWORDS_COMMENT}
        // */
        //public ArabicAnalyzer(Version matchVersion, File stopwords)
        //{
        //    stoptable = WordlistLoader.getWordSet(stopwords, STOPWORDS_COMMENT);
        //    this.matchVersion = matchVersion;
        //}


        /**
         * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
         *
         * @return  A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
         *          {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
         *            and {@link ArabicStemFilter}.
         */
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            // Lowercase and remove stop words before normalization and stemming.
            TokenStream stream = new ArabicLetterTokenizer(reader);

            stream = new LowerCaseFilter(stream);
            stream = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), stream, stoptable);
            stream = new ArabicNormalizationFilter(stream);

            return new ArabicStemFilter(stream);
        }
Example #18
0
        /*
         * Creates a {@link TokenStream} which tokenizes all the text in the provided
         * {@link Reader}.
         *
         * @return A {@link TokenStream} built from a {@link StandardTokenizer}
         *         filtered with {@link StandardFilter}, {@link StopFilter},
         *         {@link FrenchStemFilter} and {@link LowerCaseFilter}
         */
        public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream stream = new StandardTokenizer(matchVersion, reader);

            stream = new StandardFilter(stream);
            stream = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                    stream, stoptable);
            stream = new FrenchStemFilter(stream, excltable);

            // Lowercasing runs last on purpose: conversion happens after stemming.
            return new LowerCaseFilter(stream);
        }
Example #19
0
        /// <summary>Builds the analyzer for the given Lucene version with an optional
        /// index configuration and an optional stop-word set.</summary>
        /// <param name="matchVersion">Lucene version compatibility setting.</param>
        /// <param name="configuration">Optional index configuration; may be null.</param>
        /// <param name="stopwords">Optional stop words; defaults to the English set when null.</param>
        public DotJemAnalyzer(Version matchVersion, IIndexConfiguration configuration = null, ISet <string> stopwords = null)
        {
            MaxTokenLength = byte.MaxValue;

            enableStopPositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
            // Invalid-acronym replacement only applies from Lucene 2.4 onwards.
            replaceInvalidAcronym        = matchVersion.OnOrAfter(Version.LUCENE_24);

            // Fall back to the standard English stop-word set when none is supplied.
            this.stopSet = stopwords ?? StopAnalyzer.ENGLISH_STOP_WORDS_SET;

            this.matchVersion  = matchVersion;
            this.configuration = configuration;
        }
Example #20
0
        /// <summary>Constructs a <see cref="StandardTokenizer"/> filtered by a
        /// <see cref="StandardFilter"/>, a <see cref="LowerCaseFilter"/>, an optional
        /// <see cref="StopFilter"/> and a <see cref="SnowballFilter"/>.
        /// </summary>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            TokenStream stream = new StandardTokenizer(matchVersion, reader);

            stream = new StandardFilter(stream);
            stream = new LowerCaseFilter(stream);

            // Stop-word filtering only applies when a stop set was configured.
            if (stopSet != null)
            {
                stream = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                        stream, stopSet);
            }

            return new SnowballFilter(stream, name);
        }
Example #21
0
        /*
         * Creates a {@link TokenStream} which tokenizes all the text in the provided
         * {@link Reader}.
         *
         * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
         *         filtered with {@link LowerCaseFilter},
         *         {@link ArabicNormalizationFilter},
         *         {@link PersianNormalizationFilter} and Persian Stop words
         */
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream stream = new ArabicLetterTokenizer(reader);

            stream = new LowerCaseFilter(stream);
            stream = new ArabicNormalizationFilter(stream);
            /* additional persian-specific normalization */
            stream = new PersianNormalizationFilter(stream);

            /*
             * the order here is important: the stopword list is normalized with the
             * above!
             */
            return new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                  stream, stoptable);
        }
        // Delegating analyzer: obtains a stream from the wrapped analyzer and adds a
        // per-field StopFilter when stop words are configured for the field.
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream result;

            // Prefer the delegate's reusable stream; fall back to a fresh one on I/O failure.
            try {
                result = _delegate.ReusableTokenStream(fieldName, reader);
            } catch (IOException) {
                result = _delegate.TokenStream(fieldName, reader);
            }
            // NOTE(review): if stopWordsPerField is a plain Dictionary, the indexer throws
            // KeyNotFoundException for unknown fields instead of returning null — confirm
            // the map type returns null for missing keys as the null check assumes.
            var stopWords = stopWordsPerField[fieldName];

            if (stopWords != null)
            {
                result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                        result, stopWords);
            }
            return(result);
        }
Example #23
0
        /// <summary>Constructs a <see cref="StandardTokenizer" /> filtered by a <see cref="StandardFilter" />
        ///, a <see cref="LowerCaseFilter" />, an optional <see cref="StopFilter" />, a
        /// word-forms filter and a duplicate-removal filter.
        /// </summary>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            TokenStream stream = new StandardFilter(new StandardTokenizer(matchVersion, reader));

            stream = new LowerCaseFilter(stream);
            if (stopSet != null)
            {
                stream = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), stream, stopSet);
            }

            //Now, our Stemming filter goes here
            stream = new BestBetsWordFormsFilter(stream);

            //This will remove duplicate keywords - bad for best bets/term count matching
            return new RemoveDuplicatesTokenFilter(stream);
        }
Example #24
0
        /*
         * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
         * in the provided {@link Reader}.
         *
         * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
         *                  {@link GreekLowerCaseFilter} and {@link StopFilter}
         */
        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
        {
            // Reuse the pipeline cached via PreviousTokenStream when one exists.
            SavedStreams streams = (SavedStreams)PreviousTokenStream;

            if (streams == null)
            {
                // First use: build tokenizer -> Greek lowercasing -> stop removal, then cache.
                streams        = new SavedStreams();
                streams.source = new StandardTokenizer(matchVersion, reader);
                streams.result = new GreekLowerCaseFilter(streams.source);
                streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                                streams.result, stopSet);
                PreviousTokenStream = streams;
            }
            else
            {
                // Reuse: re-point the cached tokenizer at the new input.
                streams.source.Reset(reader);
            }
            return(streams.result);
        }
Example #25
0
        /*
         * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
         * in the provided {@link Reader}.
         *
         * @param fieldName lucene field name
         * @param reader    Input {@link Reader}
         * @return A {@link TokenStream} built from {@link CJKTokenizer}, filtered with
         *    {@link StopFilter}
         */
        public override sealed TokenStream ReusableTokenStream(String fieldName, TextReader reader)
        {
            /* tokenStream() is final, no back compat issue */
            SavedStreams streams = (SavedStreams)PreviousTokenStream;

            if (streams == null)
            {
                // First use: build CJK tokenizer -> stop removal, then cache.
                streams        = new SavedStreams();
                streams.source = new CJKTokenizer(reader);
                streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                                streams.source, stopTable);
                PreviousTokenStream = streams;
            }
            else
            {
                // Reuse: re-point the cached tokenizer at the new input.
                streams.source.Reset(reader);
            }
            return(streams.result);
        }
Example #26
0
        /**
         * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
         * in the provided {@link Reader}.
         *
         * @return  A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
         *            {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
         *            and {@link ArabicStemFilter}.
         */
        public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
        {
            // Reuse the cached pipeline when one was saved previously.
            SavedStreams streams = (SavedStreams)GetPreviousTokenStream();

            if (streams == null)
            {
                // First use: build tokenizer -> lowercase -> stop removal -> Arabic
                // normalization -> Arabic stemming, then cache.
                streams        = new SavedStreams();
                streams.Source = new ArabicLetterTokenizer(reader);
                streams.Result = new LowerCaseFilter(streams.Source);
                streams.Result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                                streams.Result, stoptable);
                streams.Result = new ArabicNormalizationFilter(streams.Result);
                streams.Result = new ArabicStemFilter(streams.Result);
                SetPreviousTokenStream(streams);
            }
            else
            {
                // Reuse: re-point the cached tokenizer at the new input.
                streams.Source.Reset(reader);
            }
            return(streams.Result);
        }
Example #27
0
        /*
         * Returns a (possibly reused) {@link TokenStream} which tokenizes all the
         * text in the provided {@link Reader}.
         *
         * @return A {@link TokenStream} built from a {@link StandardTokenizer}
         *         filtered with {@link StandardFilter}, {@link StopFilter},
         *         {@link FrenchStemFilter} and {@link LowerCaseFilter}
         */
        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
        {
            // Reuse the pipeline cached via PreviousTokenStream when one exists.
            SavedStreams streams = (SavedStreams)PreviousTokenStream;

            if (streams == null)
            {
                // First use: build and cache the full pipeline.
                streams        = new SavedStreams();
                streams.source = new StandardTokenizer(matchVersion, reader);
                streams.result = new StandardFilter(streams.source);
                streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                                streams.result, stoptable);
                streams.result = new FrenchStemFilter(streams.result, excltable);
                // Convert to lowercase after stemming!
                streams.result      = new LowerCaseFilter(streams.result);
                PreviousTokenStream = streams;
            }
            else
            {
                // Reuse: re-point the cached tokenizer at the new input.
                streams.source.Reset(reader);
            }
            return(streams.result);
        }
Example #28
0
        /*
         * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
         * in the provided {@link Reader}.
         *
         * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
         *          {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
         *          {@link BrazilianStemFilter}.
         */

        public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
        {
            // Reuse the pipeline cached via PreviousTokenStream when one exists.
            var streams = (SavedStreams)this.PreviousTokenStream;

            if (streams == null)
            {
                // First use: build and cache the full pipeline.
                streams        = new SavedStreams();
                streams.Source = new StandardTokenizer(this.matchVersion, reader);

                streams.Result           = new LowerCaseFilter(streams.Source);
                streams.Result           = new StandardFilter(streams.Result);
                streams.Result           = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(this.matchVersion), streams.Result, this.stoptable);
                streams.Result           = new BrazilianStemFilterCustom(streams.Result, this.excltable);
                this.PreviousTokenStream = streams;
            }
            else
            {
                // Reuse: re-point the cached tokenizer at the new input.
                streams.Source.Reset(reader);
            }

            return(streams.Result);
        }
Example #29
0
        //~ Methods ----------------------------------------------------------------

        /// <summary>
        /// get token stream from input
        /// </summary>
        /// <param name="fieldName">lucene field name</param>
        /// <param name="reader">input reader</param>
        /// <returns>Token Stream</returns>
        public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
        {
            // CJK tokenization followed by stop-word removal.
            TokenStream tokenizer = new CJKTokenizer(reader);
            bool posIncr          = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);

            return new StopFilter(posIncr, tokenizer, stopTable);
        }
 /// <summary>Builds a Bulgarian analyzer with the given stop words.</summary>
 /// <param name="matchVersion">Lucene version compatibility setting.</param>
 /// <param name="stopwords">Stop words; copied defensively into a new set.</param>
 public BulgarianAnalyzer(Version matchVersion, HashSet <string> stopwords)
 {
     // Copy so later mutation of the caller's set cannot affect this analyzer.
     this.stoptable    = new HashSet <string>(CharArraySet.Copy(stopwords));
     this.matchVersion = matchVersion;
     this.enableStopPositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
 }