/// <summary>
/// Returns a (possibly reused) <c>TokenStream</c> which tokenizes all the
/// text in the provided <c>TextReader</c>.
/// </summary>
/// <returns>A <c>StandardTokenizer</c> filtered with <c>StandardFilter</c>,
/// <c>StopFilter</c>, and <c>DutchStemFilter</c>.</returns>
public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
{
    if (overridesTokenStreamMethod)
    {
        // LUCENE-1678: a subclass overrides TokenStream but not
        // ReusableTokenStream, so force fallback to the non-reusable path.
        return TokenStream(fieldName, reader);
    }

    var saved = (SavedStreams)PreviousTokenStream;
    if (saved != null)
    {
        // Reuse the cached chain; only the tokenizer needs the new reader.
        saved.source.Reset(reader);
        return saved.result;
    }

    saved = new SavedStreams();
    saved.source = new StandardTokenizer(matchVersion, reader);
    saved.result = new StandardFilter(saved.source);
    saved.result = new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
        saved.result, stoptable);
    saved.result = new DutchStemFilter(saved.result, excltable, stemdict);
    PreviousTokenStream = saved;
    return saved.result;
}
/// <summary>
/// Returns a (possibly reused) <c>StandardTokenizer</c> filtered by a
/// <c>StandardFilter</c>, a <c>LowerCaseFilter</c>, an optional
/// <c>StopFilter</c>, a <c>BestBetsWordFormsFilter</c> and a
/// <c>RemoveDuplicatesTokenFilter</c>.
/// </summary>
public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
{
    if (overridesTokenStreamMethod)
    {
        // LUCENE-1678: a subclass overrides TokenStream but not
        // ReusableTokenStream, so force fallback to the non-reusable path.
        return TokenStream(fieldName, reader);
    }

    var saved = (SavedStreams)PreviousTokenStream;
    if (saved != null)
    {
        saved.source.Reset(reader);
        return saved.result;
    }

    saved = new SavedStreams();
    saved.source = new StandardTokenizer(matchVersion, reader);
    saved.result = new StandardFilter(saved.source);
    saved.result = new LowerCaseFilter(saved.result);
    if (stopSet != null)
    {
        saved.result = new StopFilter(
            StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
            saved.result, stopSet);
    }
    saved.result = new BestBetsWordFormsFilter(saved.result);
    // Remove duplicate keywords - bad for best bets/term count matching.
    saved.result = new RemoveDuplicatesTokenFilter(saved.result);
    PreviousTokenStream = saved;
    return saved.result;
}
/// <summary>
/// Returns a (possibly reused) token stream: a <c>StandardTokenizer</c>
/// filtered by <c>StandardFilter</c>, <c>LowerCaseFilter</c>,
/// <c>StopFilter</c> and <c>DanishStemFilter</c>.
/// </summary>
public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
{
    if (overridesTokenStreamMethod)
    {
        // Subclass overrides TokenStream only; fall back to it.
        return TokenStream(fieldName, reader);
    }

    var cached = (SavedStreams)PreviousTokenStream;
    if (cached != null)
    {
        cached.Source.Reset(reader);
        return cached.Result;
    }

    cached = new SavedStreams { Source = new StandardTokenizer(MatchVersion, reader) };
    cached.Result = new StandardFilter(cached.Source);
    cached.Result = new LowerCaseFilter(cached.Result);
    // TODO: Lucene.Net.Analysis.Compound.HyphenationCompoundWordTokenFilter
    cached.Result = new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(MatchVersion),
        cached.Result, StopTable);
    cached.Result = new DanishStemFilter(cached.Result, ExclusionTable);
    PreviousTokenStream = cached;
    return cached.Result;
}
/// <summary>
/// Builds the analyzer for the given compatibility version and stop-word set.
/// </summary>
/// <param name="matchVersion">Lucene version to match.</param>
/// <param name="sws">Stop words used by this analyzer.</param>
public HjsStandardAnalyzer(Lucene.Net.Util.Version matchVersion, ISet<string> sws)
    : base(matchVersion, sws)
{
    VERSION = matchVersion;
    StopSet = sws;
    // Cache the version-dependent default for stop-word position increments.
    enableSPI = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
}
/// <summary>
/// Returns a (possibly reused) <c>TokenStream</c> which tokenizes all the text
/// in the provided <c>TextReader</c>.
/// </summary>
/// <returns>An <c>ArabicLetterTokenizer</c> filtered with
/// <c>LowerCaseFilter</c>, <c>ArabicNormalizationFilter</c>,
/// <c>PersianNormalizationFilter</c> and Persian stop words.</returns>
public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
{
    var saved = (SavedStreams)PreviousTokenStream;
    if (saved != null)
    {
        saved.source.Reset(reader);
        return saved.result;
    }

    saved = new SavedStreams();
    saved.source = new ArabicLetterTokenizer(reader);
    saved.result = new LowerCaseFilter(saved.source);
    saved.result = new ArabicNormalizationFilter(saved.result);
    // Additional Persian-specific normalization.
    saved.result = new PersianNormalizationFilter(saved.result);
    // The order here is important: the stop-word list is normalized with the
    // filters above, so stop filtering must come last.
    saved.result = new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
        saved.result, stoptable);
    PreviousTokenStream = saved;
    return saved.result;
}
/// <summary>
/// Returns a (possibly reused) token stream: a <c>StandardTokenizer</c>
/// filtered by <c>StandardFilter</c>, <c>ThaiWordFilter</c> and an English
/// <c>StopFilter</c>.
/// </summary>
public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
{
    if (overridesTokenStreamMethod)
    {
        // LUCENE-1678: a subclass overrides TokenStream but not
        // ReusableTokenStream, so force fallback to the non-reusable path.
        return TokenStream(fieldName, reader);
    }

    var saved = (SavedStreams)PreviousTokenStream;
    if (saved != null)
    {
        saved.source.Reset(reader);
        // The ThaiWordFilter holds state across documents; clear it too.
        saved.result.Reset();
        return saved.result;
    }

    saved = new SavedStreams();
    saved.source = new StandardTokenizer(matchVersion, reader);
    saved.result = new StandardFilter(saved.source);
    saved.result = new ThaiWordFilter(saved.result);
    saved.result = new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
        saved.result, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    PreviousTokenStream = saved;
    return saved.result;
}
/// <summary>
/// Creates a token stream that tokenizes the given string into token terms
/// (aka words).
/// </summary>
/// <param name="fieldName">The name of the field to tokenize (currently ignored).</param>
/// <param name="text">The string to tokenize; must not be null.</param>
/// <returns>A new token stream.</returns>
/// <exception cref="ArgumentNullException">If <paramref name="text"/> is null.</exception>
public TokenStream TokenStream(String fieldName, String text)
{
    // Ideally the Analyzer superclass should have a method with the same signature,
    // with a default impl that simply delegates to the StringReader flavour.
    if (text == null)
    {
        // FIX: a null argument should raise ArgumentNullException, not the
        // base ArgumentException. ArgumentNullException derives from
        // ArgumentException, so existing catch blocks still match.
        throw new ArgumentNullException("text", "text must not be null");
    }
    TokenStream stream;
    if (Regex == NON_WORD_PATTERN)
    {
        // fast path
        stream = new FastStringTokenizer(text, true, toLowerCase, stopWords);
    }
    else if (Regex == WHITESPACE_PATTERN)
    {
        // fast path
        stream = new FastStringTokenizer(text, false, toLowerCase, stopWords);
    }
    else
    {
        stream = new RegexTokenizer(text, Regex, toLowerCase);
        if (stopWords != null)
        {
            stream = new StopFilter(
                StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                stream, stopWords);
        }
    }
    return stream;
}
/// <summary>Builds an analyzer with the given stop words.</summary>
/// <param name="matchVersion">Lucene version to match; see <see cref="Version"/>.</param>
/// <param name="stopWords">stop words</param>
public StandardAnalyzer(Version matchVersion, ISet<string> stopWords)
{
    stopSet = stopWords;
    SetOverridesTokenStreamMethod<StandardAnalyzer>();
    // Both flags are derived from the requested compatibility version.
    enableStopPositionIncrements =
        StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
    replaceInvalidAcronym = matchVersion.OnOrAfter(Version.LUCENE_24);
    this.matchVersion = matchVersion;
}
/// <summary>
/// Creates the analyzer bus.
/// </summary>
/// <param name="language">Language identifier used to look up the concrete analyzer.</param>
/// <param name="useIndexSynonyms">
/// true to store synonyms, near-synonyms and related terms into the index at
/// indexing time; false to disable synonym expansion.
/// </param>
public AnalyzerBus(string language, bool useIndexSynonyms = false)
{
    // Position-increment default is pinned to the LUCENE_30 behavior.
    this._EnableStopPositionIncrements =
        StopFilter.GetEnablePositionIncrementsVersionDefault(
            global::Lucene.Net.Util.Version.LUCENE_30);
    this._Language = language;
    this._UseIndexSynonyms = useIndexSynonyms;
    this._SymbolAnalyzer = new SymbolAnalyzer();
    this._Analyzer = AnalyzerDict.GetAnalyzer(language.ToUpper());
    this._StopCharArraySet = StopWord.StopWordList;
}
/// <summary>
/// Creates a <c>TokenStream</c> which tokenizes all the text in the provided
/// <c>TextReader</c>.
/// </summary>
/// <returns>A <c>StandardTokenizer</c> filtered with
/// <c>GreekLowerCaseFilter</c> and <c>StopFilter</c>.</returns>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    return new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
        new GreekLowerCaseFilter(new StandardTokenizer(matchVersion, reader)),
        stopSet);
}
/// <summary>
/// Creates a <c>TokenStream</c> which tokenizes all the text in the provided
/// <c>TextReader</c>.
/// </summary>
/// <returns>A <c>StandardTokenizer</c> filtered with <c>StandardFilter</c>,
/// <c>StopFilter</c>, and <c>DutchStemFilter</c>.</returns>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream chain = new StandardTokenizer(matchVersion, reader);
    chain = new StandardFilter(chain);
    chain = new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
        chain, stoptable);
    return new DutchStemFilter(chain, excltable, stemdict);
}
/// <summary>
/// Builds the Persian analysis chain: tokenize, lower-case, normalize, then
/// remove stop words.
/// </summary>
public override TokenStream TokenStream(string fieldname, TextReader reader)
{
    TokenStream chain = new PersianTokenizer(reader);
    chain = new LowerCaseFilter(chain);
    chain = new PersianNormalizationFilter(chain);
    // NOTE: a PersianLemmatizationFilter stage existed but was left disabled
    // in the original chain.
    return new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(_version),
        chain, _stoptable);
}
/// <summary>
/// Builds the Thai analysis chain: <c>StandardTokenizer</c>,
/// <c>StandardFilter</c>, <c>ThaiWordFilter</c>, then an English stop filter.
/// </summary>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    return new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
        new ThaiWordFilter(
            new StandardFilter(
                new StandardTokenizer(matchVersion, reader))),
        StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
/// <summary>
/// Builds the Slovak chain on top of the base analyzer's stream: stop filter,
/// Slovak noun filter, lower-casing, then a second stop-filter pass.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    // Same version constant both times, so compute the flag once.
    bool posIncr = StopFilter.GetEnablePositionIncrementsVersionDefault(Version.LUCENE_30);
    TokenStream chain = base.TokenStream(fieldName, reader);
    // NOTE(review): STOP_WORDS_SET is applied both before and after
    // SlovakNounFilter/LowerCaseFilter — presumably the noun filter can emit
    // stop words; confirm the first pass is intentional.
    chain = new StopFilter(posIncr, chain, STOP_WORDS_SET);
    chain = new SlovakNounFilter(chain);
    chain = new LowerCaseFilter(chain);
    return new StopFilter(posIncr, chain, STOP_WORDS_SET);
}
/// <summary>
/// Builds the Brazilian chain: <c>StandardTokenizer</c>,
/// <c>LowerCaseFilter</c>, <c>StandardFilter</c>, <c>StopFilter</c>, then the
/// custom Brazilian stemmer.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream chain = new StandardTokenizer(this.matchVersion, reader);
    chain = new LowerCaseFilter(chain);
    chain = new StandardFilter(chain);
    chain = new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(this.matchVersion),
        chain, this.stoptable);
    return new BrazilianStemFilterCustom(chain, this.excltable);
}
/// <summary>
/// Builds the Danish chain: <c>StandardTokenizer</c>, <c>StandardFilter</c>,
/// <c>LowerCaseFilter</c>, <c>StopFilter</c>, then <c>DanishStemFilter</c>.
/// </summary>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream chain = new StandardTokenizer(MatchVersion, reader);
    chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(MatchVersion),
        chain, StopTable);
    return new DanishStemFilter(chain, ExclusionTable);
}
/// <summary>
/// Creates a <c>TokenStream</c> which tokenizes all the text in the provided
/// <c>TextReader</c>.
/// </summary>
/// <returns>An <c>ArabicLetterTokenizer</c> filtered with
/// <c>LowerCaseFilter</c>, <c>StopFilter</c>,
/// <c>ArabicNormalizationFilter</c> and <c>ArabicStemFilter</c>.</returns>
public override TokenStream TokenStream(string fieldName, TextReader reader)
{
    TokenStream chain = new ArabicLetterTokenizer(reader);
    chain = new LowerCaseFilter(chain);
    // Stop words are removed before normalization and stemming.
    chain = new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
        chain, stoptable);
    chain = new ArabicNormalizationFilter(chain);
    return new ArabicStemFilter(chain);
}
/// <summary>
/// Creates a <c>TokenStream</c> which tokenizes all the text in the provided
/// <c>TextReader</c>.
/// </summary>
/// <returns>A <c>StandardTokenizer</c> filtered with <c>StandardFilter</c>,
/// <c>StopFilter</c>, <c>FrenchStemFilter</c> and <c>LowerCaseFilter</c>.</returns>
public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream chain = new StandardTokenizer(matchVersion, reader);
    chain = new StandardFilter(chain);
    chain = new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
        chain, stoptable);
    chain = new FrenchStemFilter(chain, excltable);
    // Convert to lowercase after stemming!
    return new LowerCaseFilter(chain);
}
/// <summary>
/// Builds the analyzer for the given compatibility version, with an optional
/// index configuration and stop-word set (defaults to the English set).
/// </summary>
public DotJemAnalyzer(Version matchVersion, IIndexConfiguration configuration = null, ISet<string> stopwords = null)
{
    this.matchVersion = matchVersion;
    this.configuration = configuration;
    this.stopSet = stopwords ?? StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    MaxTokenLength = byte.MaxValue;
    // Version-dependent behavior flags.
    enableStopPositionIncrements =
        StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
    replaceInvalidAcronym = matchVersion.OnOrAfter(Version.LUCENE_24);
}
/// <summary>Constructs a <see cref="StandardTokenizer"/> filtered by a
/// <see cref="StandardFilter"/>, a <see cref="LowerCaseFilter"/>, an optional
/// <see cref="StopFilter"/> and a <c>SnowballFilter</c> for this analyzer's
/// language.
/// </summary>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
    TokenStream chain = new StandardTokenizer(matchVersion, reader);
    chain = new StandardFilter(chain);
    chain = new LowerCaseFilter(chain);
    if (stopSet != null)
    {
        chain = new StopFilter(
            StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
            chain, stopSet);
    }
    return new SnowballFilter(chain, name);
}
/// <summary>
/// Creates a <c>TokenStream</c> which tokenizes all the text in the provided
/// <c>TextReader</c>.
/// </summary>
/// <returns>An <c>ArabicLetterTokenizer</c> filtered with
/// <c>LowerCaseFilter</c>, <c>ArabicNormalizationFilter</c>,
/// <c>PersianNormalizationFilter</c> and Persian stop words.</returns>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream chain = new ArabicLetterTokenizer(reader);
    chain = new LowerCaseFilter(chain);
    chain = new ArabicNormalizationFilter(chain);
    // Additional Persian-specific normalization.
    chain = new PersianNormalizationFilter(chain);
    // The order here is important: the stop-word list is normalized with the
    // filters above, so stop filtering must come last.
    return new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
        chain, stoptable);
}
/// <summary>
/// Wraps the delegate analyzer's stream and, when stop words are registered
/// for this field, appends a <c>StopFilter</c> with them.
/// </summary>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream result;
    try
    {
        // Prefer the delegate's reusable stream; fall back to the plain
        // stream on I/O failure.
        result = _delegate.ReusableTokenStream(fieldName, reader);
    }
    catch (IOException)
    {
        result = _delegate.TokenStream(fieldName, reader);
    }
    // NOTE(review): the value is indexed then null-checked; if
    // stopWordsPerField is a plain Dictionary<,>, a missing fieldName throws
    // KeyNotFoundException instead of yielding null — confirm the map type
    // has Java-HashMap-style null-return semantics (or switch to TryGetValue).
    var stopWords = stopWordsPerField[fieldName];
    if (stopWords != null)
    {
        result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), result, stopWords);
    }
    return (result);
}
/// <summary>Constructs a <see cref="StandardTokenizer" /> filtered by a
/// <see cref="StandardFilter" />, a <see cref="LowerCaseFilter" />, an
/// optional <see cref="StopFilter" />, the best-bets word-forms filter and a
/// duplicate-removal filter.
/// </summary>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
    var tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(tokenizer);
    chain = new LowerCaseFilter(chain);
    if (stopSet != null)
    {
        chain = new StopFilter(
            StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
            chain, stopSet);
    }
    // Now, our stemming filter goes here.
    chain = new BestBetsWordFormsFilter(chain);
    // Remove duplicate keywords - bad for best bets/term count matching.
    return new RemoveDuplicatesTokenFilter(chain);
}
/// <summary>
/// Returns a (possibly reused) <c>TokenStream</c> which tokenizes all the text
/// in the provided <c>TextReader</c>.
/// </summary>
/// <returns>A <c>StandardTokenizer</c> filtered with
/// <c>GreekLowerCaseFilter</c> and <c>StopFilter</c>.</returns>
public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
{
    var saved = (SavedStreams)PreviousTokenStream;
    if (saved != null)
    {
        saved.source.Reset(reader);
        return saved.result;
    }

    saved = new SavedStreams();
    saved.source = new StandardTokenizer(matchVersion, reader);
    saved.result = new GreekLowerCaseFilter(saved.source);
    saved.result = new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
        saved.result, stopSet);
    PreviousTokenStream = saved;
    return saved.result;
}
/// <summary>
/// Returns a (possibly reused) <c>TokenStream</c> which tokenizes all the text
/// in the provided <c>TextReader</c>.
/// </summary>
/// <param name="fieldName">Lucene field name.</param>
/// <param name="reader">Input reader.</param>
/// <returns>A <c>CJKTokenizer</c> filtered with <c>StopFilter</c>.</returns>
public override sealed TokenStream ReusableTokenStream(String fieldName, TextReader reader)
{
    // TokenStream() is final, so there is no back-compat issue here.
    var saved = (SavedStreams)PreviousTokenStream;
    if (saved != null)
    {
        saved.source.Reset(reader);
        return saved.result;
    }

    saved = new SavedStreams();
    saved.source = new CJKTokenizer(reader);
    saved.result = new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
        saved.source, stopTable);
    PreviousTokenStream = saved;
    return saved.result;
}
/// <summary>
/// Returns a (possibly reused) <c>TokenStream</c> which tokenizes all the text
/// in the provided <c>TextReader</c>.
/// </summary>
/// <returns>An <c>ArabicLetterTokenizer</c> filtered with
/// <c>LowerCaseFilter</c>, <c>StopFilter</c>,
/// <c>ArabicNormalizationFilter</c> and <c>ArabicStemFilter</c>.</returns>
public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
{
    var saved = (SavedStreams)GetPreviousTokenStream();
    if (saved != null)
    {
        saved.Source.Reset(reader);
        return saved.Result;
    }

    saved = new SavedStreams();
    saved.Source = new ArabicLetterTokenizer(reader);
    saved.Result = new LowerCaseFilter(saved.Source);
    saved.Result = new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
        saved.Result, stoptable);
    saved.Result = new ArabicNormalizationFilter(saved.Result);
    saved.Result = new ArabicStemFilter(saved.Result);
    SetPreviousTokenStream(saved);
    return saved.Result;
}
/// <summary>
/// Returns a (possibly reused) <c>TokenStream</c> which tokenizes all the
/// text in the provided <c>TextReader</c>.
/// </summary>
/// <returns>A <c>StandardTokenizer</c> filtered with <c>StandardFilter</c>,
/// <c>StopFilter</c>, <c>FrenchStemFilter</c> and <c>LowerCaseFilter</c>.</returns>
public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
{
    var saved = (SavedStreams)PreviousTokenStream;
    if (saved != null)
    {
        saved.source.Reset(reader);
        return saved.result;
    }

    saved = new SavedStreams();
    saved.source = new StandardTokenizer(matchVersion, reader);
    saved.result = new StandardFilter(saved.source);
    saved.result = new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
        saved.result, stoptable);
    saved.result = new FrenchStemFilter(saved.result, excltable);
    // Convert to lowercase after stemming!
    saved.result = new LowerCaseFilter(saved.result);
    PreviousTokenStream = saved;
    return saved.result;
}
/// <summary>
/// Returns a (possibly reused) <c>TokenStream</c> which tokenizes all the text
/// in the provided <c>TextReader</c>.
/// </summary>
/// <returns>A <c>StandardTokenizer</c> filtered with <c>LowerCaseFilter</c>,
/// <c>StandardFilter</c>, <c>StopFilter</c>, and the custom Brazilian
/// stemmer.</returns>
public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
{
    var saved = (SavedStreams)this.PreviousTokenStream;
    if (saved != null)
    {
        saved.Source.Reset(reader);
        return saved.Result;
    }

    saved = new SavedStreams();
    saved.Source = new StandardTokenizer(this.matchVersion, reader);
    saved.Result = new LowerCaseFilter(saved.Source);
    saved.Result = new StandardFilter(saved.Result);
    saved.Result = new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(this.matchVersion),
        saved.Result, this.stoptable);
    saved.Result = new BrazilianStemFilterCustom(saved.Result, this.excltable);
    this.PreviousTokenStream = saved;
    return saved.Result;
}
//~ Methods ----------------------------------------------------------------

/// <summary>
/// Get a token stream from the input.
/// </summary>
/// <param name="fieldName">Lucene field name.</param>
/// <param name="reader">Input reader.</param>
/// <returns>A <c>CJKTokenizer</c> wrapped in a <c>StopFilter</c>.</returns>
public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream tokenizer = new CJKTokenizer(reader);
    return new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
        tokenizer, stopTable);
}
/// <summary>
/// Builds the analyzer with the given stop words for the requested
/// compatibility version.
/// </summary>
public BulgarianAnalyzer(Version matchVersion, HashSet<string> stopwords)
{
    // Defensive copy so later mutation of the caller's set has no effect.
    this.stoptable = new HashSet<string>(CharArraySet.Copy(stopwords));
    this.matchVersion = matchVersion;
    this.enableStopPositionIncrements =
        StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
}