// other StopFilter functionality is already tested by TestStopAnalyzer
/// <summary>
/// Verifies case-sensitive stop-word matching (ignoreCase: false): "is" and
/// "Time" match entries exactly and are removed, while "The" survives because
/// the set only contains lower-case "the".
/// </summary>
public virtual void testExactCase()
{
    var reader = new StringReader("Now is The Time");
    var stopWords = new CharArraySet(TEST_VERSION_CURRENT, asSet("is", "the", "Time"), false);
    TokenStream stream = new StopFilter(TEST_VERSION_CURRENT,
        new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords);
    assertTokenStreamContents(stream, new string[] { "Now", "The" });
}
/// <summary>
/// Builds the analysis chain for a field: a <c>PathTokenizer</c> followed by
/// standard token normalization, lower-casing, and standard English stop-word
/// removal.
/// </summary>
protected override TokenStreamComponents createComponents(string field, java.io.Reader reader)
{
    var tokenizer = new PathTokenizer(reader);
    TokenStream chain = new StandardFilter(tokenizer);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(chain, StandardAnalyzer.STOP_WORDS_SET);
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Basic StopFilter smoke test: a stop set built via
/// <c>StopFilter.MakeStopSet</c> (case-sensitive by default) removes "is" and
/// "Time", leaving "Now" and "The".
/// </summary>
public virtual void TestStopFilt()
{
    var reader = new StringReader("Now is The Time");
    CharArraySet stopSet = StopFilter.MakeStopSet(TEST_VERSION_CURRENT,
        new string[] { "is", "the", "Time" });
    TokenStream stream = new StopFilter(TEST_VERSION_CURRENT,
        new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
    AssertTokenStreamContents(stream, new string[] { "Now", "The" });
}
/// <summary>
/// Persian analysis chain: StandardTokenizer, lower-casing, Persian
/// normalization, stop-word removal, and Persian stemming.
/// </summary>
public override TokenStream TokenStream(string fieldname, TextReader reader)
{
    TokenStream chain = new StandardTokenizer(_version, reader);
    chain = new LowerCaseFilter(chain);
    chain = new PersianNormalizationFilter(chain);
    chain = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(_version), chain, _stoptable);
    chain = new PersianStemFilter(chain);
    return chain;
}
/// <summary>
/// Thai analysis chain: StandardTokenizer, StandardFilter, ThaiWordFilter,
/// then English stop-word removal.
/// </summary>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream chain = new StandardTokenizer(matchVersion, reader);
    chain = new StandardFilter(chain);
    chain = new ThaiWordFilter(chain);
    chain = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
        chain, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    return chain;
}
/// <summary>
/// Creates the <c>TokenStreamComponents</c> used to tokenize all the text in
/// the provided reader.
/// </summary>
/// <returns>components built from a <c>StandardTokenizer</c> (or the legacy
/// <c>IndicTokenizer</c> before Lucene 3.6) filtered with lower-casing,
/// optional keyword marking for the stem exclusion set, Indic and Hindi
/// normalization, Hindi stop words, and Hindi stemming</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    // Lucene 3.6 switched Hindi over to StandardTokenizer.
    Tokenizer source = matchVersion.OnOrAfter(LuceneVersion.LUCENE_36)
        ? (Tokenizer)new StandardTokenizer(matchVersion, reader)
        : new IndicTokenizer(matchVersion, reader);

    TokenStream chain = new LowerCaseFilter(matchVersion, source);
    if (stemExclusionSet.Count > 0)
    {
        // Protect excluded terms from the stemmer further down the chain.
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    chain = new IndicNormalizationFilter(chain);
    chain = new HindiNormalizationFilter(chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    chain = new HindiStemFilter(chain);
    return new TokenStreamComponents(source, chain);
}
/// <summary>
/// Tokenizes <paramref name="result"/> and returns the normalized terms
/// (punctuation stripped, lower-cased, stop words removed).
/// </summary>
/// <param name="result">the text to analyze</param>
/// <param name="tipoAnalizador">analyzer language; "Español" selects the Spanish
/// stop list, anything else falls back to the English one</param>
/// <returns>an <see cref="ArrayList"/> of term strings</returns>
public ArrayList getKeywords(string result, string tipoAnalizador)
{
    ArrayList ListStemsList = new ArrayList();
    TokenStream tokenStream = new StandardTokenizer(new System.IO.StringReader(result));
    tokenStream = new StandardFilter(tokenStream); // removes punctuation marks
    tokenStream = new LowerCaseFilter(tokenStream); // converts the content to lower case
    if (tipoAnalizador == "Español")
    {
        // filter the content against the Spanish stop-word list
        tokenStream = new StopFilter(tokenStream, StopFilter.MakeStopSet(SpanishAnalyzer.SPANISH_STOP_WORDS), true);
        // word lemmatization step
        //SpanishAnalyzer ansp = new SpanishAnalyzer();
        //tokenStream = ansp.SpanishSteammer(tokenStream);
    }
    else
    {
        // filter the content against the English stop-word list
        tokenStream = new StopFilter(tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS, true);
        // word lemmatization step
        //tokenStream = new PorterStemFilter(tokenStream);
    }
    string cadena = "";
    string[] token;
    Lucene.Net.Analysis.Token current;
    // Token.ToString() renders roughly as "(term,start,end,...)"; the code
    // strips the leading '(' and keeps only the term portion.
    // NOTE(review): assumes the term itself contains no comma — confirm for
    // inputs with embedded punctuation.
    while ((current = tokenStream.Next()) != null)
    {
        cadena = current.ToString();
        token = cadena.Split(',');
        cadena = cadena.Substring(1, token[0].Length - 1);
        ListStemsList.Add(cadena);
    }
    return (ListStemsList);
}
/// <summary>
/// Returns a (possibly reused) <see cref="TokenStream"/> which tokenizes all
/// the text in the provided <see cref="TextReader"/>.
/// </summary>
/// <returns>A <see cref="TokenStream"/> built from a <see cref="StandardTokenizer"/>
/// filtered with <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>
/// (3.1+ only), <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/>
/// if a stem exclusion set is provided, <see cref="StemmerOverrideFilter"/>
/// (3.1+ only), and a Dutch stemmer.</returns>
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader aReader)
{
    Tokenizer source = new StandardTokenizer(matchVersion, aReader);
    TokenStream chain = new StandardFilter(matchVersion, source);
#pragma warning disable 612, 618
    bool post31 = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
    if (post31)
    {
        chain = new LowerCaseFilter(matchVersion, chain);
        chain = new StopFilter(matchVersion, chain, stoptable);
        if (excltable.Count > 0)
        {
            chain = new SetKeywordMarkerFilter(chain, excltable);
        }
        if (stemdict != null)
        {
            chain = new StemmerOverrideFilter(chain, stemdict);
        }
        chain = new SnowballFilter(chain, new Tartarus.Snowball.Ext.DutchStemmer());
    }
    else
    {
        // Legacy pre-3.1 chain: no lower-casing, dedicated Dutch stem filter.
        chain = new StopFilter(matchVersion, chain, stoptable);
        if (excltable.Count > 0)
        {
            chain = new SetKeywordMarkerFilter(chain, excltable);
        }
#pragma warning disable 612, 618
        chain = new DutchStemFilter(chain, origStemdict);
#pragma warning restore 612, 618
    }
    return new TokenStreamComponents(source, chain);
}
/// <summary>
/// Creates a token stream that tokenizes the given string into token terms
/// (aka words).
/// </summary>
/// <param name="fieldName">the name of the field to tokenize (currently ignored)</param>
/// <param name="text">the string to tokenize</param>
/// <returns>a new token stream</returns>
public TokenStream TokenStream(String fieldName, String text)
{
    // Ideally the Analyzer superclass should have a method with the same
    // signature, with a default impl that simply delegates to the
    // StringReader flavour.
    if (text == null)
        throw new ArgumentException("text must not be null");

    // The two well-known patterns are compared by identity on purpose: they
    // select dedicated fast paths that skip the regex machinery entirely.
    if (Regex == NON_WORD_PATTERN)
    {
        return new FastStringTokenizer(text, true, toLowerCase, stopWords);
    }
    if (Regex == WHITESPACE_PATTERN)
    {
        return new FastStringTokenizer(text, false, toLowerCase, stopWords);
    }

    TokenStream stream = new RegexTokenizer(text, Regex, toLowerCase);
    if (stopWords != null)
        stream = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), stream, stopWords);
    return stream;
}
/// <summary>
/// Builds a <c>StandardAnalyzer</c> with the given stop words, pinned to
/// <c>Version.LUCENE_24</c> compatibility behavior.
/// </summary>
/// <param name="stopwords">stop words to remove</param>
/// <param name="replaceInvalidAcronym">stored on the instance;
/// presumably controls the tokenizer's invalid-acronym handling — confirm
/// against the field's consumers</param>
public StandardAnalyzer(System.String[] stopwords, bool replaceInvalidAcronym)
    : this(Version.LUCENE_24, StopFilter.MakeStopSet(stopwords))
{
    this.replaceInvalidAcronym = replaceInvalidAcronym;
}
/// <summary>
/// Builds an analyzer which removes the words given in
/// <paramref name="stopWords"/> from the token stream.
/// </summary>
/// <param name="stopWords">stop word array</param>
public CJKAnalyzer(String[] stopWords)
{
    // Materialize the raw array into the set representation StopFilter uses.
    var stopSet = StopFilter.MakeStopSet(stopWords);
    stopTable = stopSet;
}
/// <summary>
/// Creates the <c>TokenStreamComponents</c> which tokenize all the text in
/// the provided reader: a <see cref="StandardTokenizer"/> filtered with
/// <see cref="StandardFilter"/>, <see cref="EnglishPossessiveFilter"/> (3.1+),
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided,
/// and <see cref="PorterStemFilter"/>.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, source);
    // Prior to 3.1 we get the classic behavior: StandardFilter handles the
    // possessive 's for us, so the dedicated filter is only added afterwards.
#pragma warning disable 612, 618
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
    {
        chain = new EnglishPossessiveFilter(matchVersion, chain);
    }
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    if (stemExclusionSet.Any())
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    chain = new PorterStemFilter(chain);
    return new TokenStreamComponents(source, chain);
}
/// <summary>
/// One-time, thread-safe initialization of the Lucene indexing subsystem from
/// the "lucene.indexing" configuration section: builds per-field analyzers,
/// registers the included content types and their computed fields, then either
/// wires up an index-sharding strategy or opens the index directory
/// (filesystem or Azure blob storage).
/// </summary>
private static void Initialize()
{
    if (_initialized)
    {
        return;
    }
    lock (_lock)
    {
        // Double-checked locking: re-test now that we hold the lock.
        if (_initialized)
        {
            return;
        }
        try
        {
            LuceneSection section = (LuceneSection)ConfigurationManager.GetSection("lucene.indexing");
            _indexAllTypes = section.IndexAllTypes;
            _isActive = section.Active;
            if (_isActive == null || !_isActive.Value)
            {
                // Indexing disabled: mark initialized so we never retry.
                _initialized = true;
                return;
            }
            _fieldPrefix = section.Prefix;
            _luceneVersion = section.LuceneVersion;
            _includedTypes = new Dictionary<string, ContentTypeDocument>();
            string[] emptyStopWords = new string[0];
            var fieldAnalyzerWrapper = new PerFieldAnalyzerWrapper(
                (Analyzer)new StandardAnalyzer(LuceneVersion, StopFilter.MakeStopSet(emptyStopWords)));
            if (!IndexAllTypes)
            {
                foreach (IncludedTypeElement typeSetting in section.IncludedTypes)
                {
                    // throwOnError: true means GetType throws instead of
                    // returning null; the null check is purely defensive.
                    Type contentType = Type.GetType(typeSetting.Type, true, true);
                    if (contentType == null)
                    {
                        continue;
                    }
                    if (_includedTypes.TryGetValue(typeSetting.Name, out var tmp))
                    {
                        continue; // already registered under this name
                    }
                    var documentIndexModel = new ContentTypeDocument();
                    documentIndexModel.ContentType = contentType;
                    documentIndexModel.IndexAllFields = typeSetting.IndexAllFields;
                    _includedTypes.Add(typeSetting.Name, documentIndexModel);
                    foreach (IncludedFieldElement fieldSetting in typeSetting.IncludedFields)
                    {
                        Type fieldType = string.IsNullOrEmpty(fieldSetting.Type)
                            ? typeof(DefaultComputedField)
                            : Type.GetType(fieldSetting.Type, true, true);
                        if (!typeof(IComputedField).IsAssignableFrom(fieldType))
                        {
                            continue;
                        }
                        var instance = (IComputedField)Activator.CreateInstance(fieldType);
                        Type analyzerType = Type.GetType(fieldSetting.Analyzer, true, true);
                        if (!typeof(Analyzer).IsAssignableFrom(analyzerType))
                        {
                            continue;
                        }
                        if (analyzerType == typeof(StandardAnalyzer))
                        {
                            // StandardAnalyzer needs the version/stop-word arguments.
                            instance.Analyzer = new StandardAnalyzer(LuceneVersion, StopFilter.MakeStopSet(emptyStopWords));
                        }
                        else
                        {
                            instance.Analyzer = (Analyzer)Activator.CreateInstance(analyzerType);
                        }
                        instance.Index = fieldSetting.Index;
                        instance.Store = fieldSetting.Store;
                        instance.Vector = fieldSetting.Vector;
                        instance.DataType = fieldSetting.DataType;
                        if (!documentIndexModel.IndexedFields.TryGetValue(fieldSetting.Name, out var tmp2))
                        {
                            documentIndexModel.IndexedFields.Add(fieldSetting.Name, instance);
                            fieldAnalyzerWrapper.AddAnalyzer(
                                ContentIndexHelpers.GetIndexFieldName(fieldSetting.Name),
                                instance.Analyzer);
                        }
                    }
                }
            }
            _analyzer = fieldAnalyzerWrapper;
            AddDefaultFieldAnalyzer();

            var shardingStrategy = section.Sharding?.Strategy;
            if (!string.IsNullOrEmpty(shardingStrategy))
            {
                var shardingType = Type.GetType(shardingStrategy, true, true);
                LuceneContext.IndexShardingStrategy = (IIndexShardingStrategy)Activator.CreateInstance(shardingType);
                LuceneContext.IndexShardingStrategy.WarmupShards();
                // NOTE(review): this early return skips "_initialized = true",
                // so sharded setups re-run Initialize on every call — confirm
                // whether that is intentional.
                return;
            }

            var directoryConnectionString = ConfigurationManager.AppSettings["lucene:BlobConnectionString"] ?? "App_Data/My_Index";
            var directoryContainerName = ConfigurationManager.AppSettings["lucene:ContainerName"] ?? "lucene";
            var directoryType = (ConfigurationManager.AppSettings["lucene:DirectoryType"] ?? "Filesystem").ToLower();
            Directory directory;
            switch (directoryType)
            {
                case Constants.ContainerType.Azure:
                    var storageAccount = CloudStorageAccount.Parse(directoryConnectionString);
                    directory = new FastAzureDirectory(storageAccount, directoryContainerName, new RAMDirectory());
                    break;
                default:
                    var folderPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, directoryConnectionString);
                    directory = FSDirectory.Open(folderPath);
                    break;
            }
            _directory = directory;
            InitDirectory(_directory);
        }
        catch (Exception)
        {
            // Was "throw ex;", which resets the stack trace; "throw;"
            // preserves the original failure point.
            throw;
        }
        _initialized = true;
    }
}
/// <summary>
/// Builds the Hebrew analysis chain: a <c>StreamLemmasFilter</c> over the
/// reader, followed by stop-word removal.
/// </summary>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream chain = new StreamLemmasFilter(reader, hebMorphLemmatizer, lemmaFilter, alwaysSaveMarkedOriginal);
    // This stop filter is here temporarily, until HebMorph is smart enough
    // to clear stop words all by itself.
    chain = new StopFilter(enableStopPositionIncrements, chain, STOP_WORDS_SET);
    return chain;
}
/// <summary>
/// Creates the <c>TokenStreamComponents</c> for Danish text: a
/// <see cref="StandardTokenizer"/> filtered with <see cref="StandardFilter"/>,
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided,
/// and a <see cref="SnowballFilter"/> using the Danish stemmer.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, source);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    if (stemExclusionSet.Count > 0)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    chain = new SnowballFilter(chain, new DanishStemmer());
    return new TokenStreamComponents(source, chain);
}
/// <summary>
/// Builds an analyzer using the default Spanish stop-word list.
/// </summary>
public SpanishAnalyzer()
{
    // Build the default Spanish stop set once at construction time.
    var defaultStops = StopFilter.MakeStopSet(SPANISH_STOP_WORDS);
    stopTable = defaultStops;
}
/// <summary>Builds an analyzer with the given stop words.</summary>
/// <param name="stopWords">stop words to remove</param>
public StandardAnalyzer(System.String[] stopWords)
{
    // Convert the raw array into the set representation used by StopFilter.
    var stops = StopFilter.MakeStopSet(stopWords);
    stopSet = stops;
}
/// <summary>
/// Creates the <c>TokenStreamComponents</c> used to tokenize all the text in
/// the provided reader.
/// </summary>
/// <returns>components built from a <see cref="StandardTokenizer"/> filtered
/// with <see cref="StandardFilter"/>, <see cref="ElisionFilter"/>,
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided,
/// and a French stemmer (light stemmer for 3.6+, Snowball before that; the
/// pre-3.1 chain stems first and lower-cases last)</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, source);
#pragma warning disable 612, 618
    bool post31 = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
    if (post31)
    {
        chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
        chain = new LowerCaseFilter(matchVersion, chain);
        chain = new StopFilter(matchVersion, chain, stopwords);
        if (excltable.Count > 0)
        {
            chain = new SetKeywordMarkerFilter(chain, excltable);
        }
#pragma warning disable 612, 618
        if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
        {
            chain = new FrenchLightStemFilter(chain);
        }
        else
        {
            chain = new SnowballFilter(chain, new Tartarus.Snowball.Ext.FrenchStemmer());
        }
        return new TokenStreamComponents(source, chain);
    }

    // Legacy pre-3.1 chain.
    chain = new StopFilter(matchVersion, chain, stopwords);
    if (excltable.Count > 0)
    {
        chain = new SetKeywordMarkerFilter(chain, excltable);
    }
#pragma warning disable 612, 618
    chain = new FrenchStemFilter(chain);
#pragma warning restore 612, 618
    // Convert to lowercase after stemming!
    return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, chain));
}
/// <summary>
/// Creates a <see cref="TokenStream"/> which tokenizes all the text in the
/// provided reader: a <see cref="StandardTokenizer"/> filtered with
/// <see cref="GreekLowerCaseFilter"/> and <see cref="StopFilter"/>.
/// </summary>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream chain = new StandardTokenizer(matchVersion, reader);
    chain = new GreekLowerCaseFilter(chain);
    return new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), chain, stopSet);
}
/// <summary>
/// Builds an analyzer with the given stop words.
/// </summary>
/// <param name="matchVersion">Lucene compatibility version</param>
/// <param name="stopwords">stop words to remove</param>
public ArabicAnalyzer(Version matchVersion, string[] stopwords)
{
    this.matchVersion = matchVersion;
    var stops = StopFilter.MakeStopSet(stopwords);
    stoptable = stops;
}
/// <summary>
/// Constructor that allows you to specify your stop words.
/// </summary>
/// <param name="stopWords">Stopwords to use (lucene will not index these words) - should be all lowercase</param>
public EnglishAnalyzer(IEnumerable<string> stopWords)
{
    // Materialize the sequence before handing it to MakeStopSet.
    var stopArray = stopWords.ToArray();
    _words = StopFilter.MakeStopSet(LeoLuceneVersion.Version, stopArray);
}
/// <summary>
/// Constructs a <see cref="StandardTokenizer"/> filtered by a
/// <see cref="StandardFilter"/>, a lower-case filter, an optional
/// <see cref="StopFilter"/>, and a <see cref="SnowballFilter"/> for the
/// configured stemmer name.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    bool post31 = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
    // Remove the possessive 's for the English-family stemmers.
    if (post31 && (name.Equals("English") || name.Equals("Porter") || name.Equals("Lovins")))
    {
        chain = new EnglishPossessiveFilter(chain);
    }
    // Use the special lowercase filter for Turkish; the stemmer expects it.
    if (post31 && name.Equals("Turkish"))
    {
        chain = new TurkishLowerCaseFilter(chain);
    }
    else
    {
        chain = new LowerCaseFilter(matchVersion, chain);
    }
    if (stopSet != null)
    {
        chain = new StopFilter(matchVersion, chain, stopSet);
    }
    chain = new SnowballFilter(chain, name);
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Builds an analyzer with the given stop words by converting the array into
/// a stop set and delegating to the set-based constructor.
/// </summary>
/// <param name="matchVersion">Lucene compatibility version</param>
/// <param name="stopwords">stop words to remove</param>
public GermanAnalyzer(Version matchVersion, params string[] stopwords)
    : this(matchVersion, StopFilter.MakeStopSet(stopwords))
{
}
/// <summary>
/// Creates a <see cref="TokenStream"/> which tokenizes all the text in the
/// provided reader: an <see cref="ArabicLetterTokenizer"/> filtered with
/// <see cref="LowerCaseFilter"/>, <see cref="ArabicNormalizationFilter"/>,
/// <see cref="PersianNormalizationFilter"/> and Persian stop words.
/// </summary>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream chain = new ArabicLetterTokenizer(reader);
    chain = new LowerCaseFilter(chain);
    chain = new ArabicNormalizationFilter(chain);
    // Additional Persian-specific normalization.
    chain = new PersianNormalizationFilter(chain);
    // The order here is important: the stopword list is normalized with the above!
    chain = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), chain, stoptable);
    return chain;
}
//~ Methods ----------------------------------------------------------------
/// <summary>
/// Gets the token stream for the input: a <c>CJKTokenizer</c> wrapped in a
/// <see cref="StopFilter"/> using this analyzer's stop table.
/// </summary>
/// <param name="fieldName">lucene field name</param>
/// <param name="reader">input reader</param>
/// <returns>Token Stream</returns>
public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
{
    var tokenizer = new CJKTokenizer(reader);
    bool enableIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
    return new StopFilter(enableIncrements, tokenizer, stopTable);
}
/// <summary>
/// Test helper: whitespace-tokenizes the input, removes standard English stop
/// words, then applies a <see cref="WordDelimiterFilter"/> with the configured
/// flags and protected words.
/// </summary>
public override TokenStreamComponents CreateComponents(string field, TextReader reader)
{
    var tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    var stopped = new StopFilter(TEST_VERSION_CURRENT, tokenizer, StandardAnalyzer.STOP_WORDS_SET);
    var delimited = new WordDelimiterFilter(TEST_VERSION_CURRENT, stopped, flags, protWords);
    return new TokenStreamComponents(tokenizer, delimited);
}
/// <summary>
/// Builds a Bulgarian analyzer with the given stop words.
/// </summary>
/// <param name="matchVersion">Lucene compatibility version</param>
/// <param name="stopwords">stop words to remove</param>
public BulgarianAnalyzer(Version matchVersion, HashSet<string> stopwords)
{
    this.matchVersion = matchVersion;
    // Defensive copy so later changes to the caller's set don't leak in.
    this.stoptable = new HashSet<string>(CharArraySet.Copy(stopwords));
    this.enableStopPositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
}
/// <summary>
/// Builds the email/URL-aware chain: a <see cref="UAX29URLEmailTokenizer"/>
/// (with the configured max token length) filtered with
/// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/> and
/// <see cref="StopFilter"/>.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, Reader reader)
{
    var src = new UAX29URLEmailTokenizer(matchVersion, reader);
    src.MaxTokenLength = maxTokenLength;
    TokenStream chain = new StandardFilter(matchVersion, src);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    return new TokenStreamComponentsAnonymousInnerClassHelper(this, src, chain, reader);
}
/// <summary>
/// Builds an analyzer with the given stop words.
/// </summary>
/// <param name="stopwords">stop word array</param>
public LithuanianAnalyzer(string[] stopwords)
{
    // Convert the raw array into the set representation used by StopFilter.
    var stops = StopFilter.MakeStopSet(stopwords);
    stoptable = stops;
}
/// <summary>
/// Test Position increments applied by StopFilter with and without enabling this option.
/// </summary>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testStopPositons() throws java.io.IOException
public virtual void testStopPositons()
{
    // Build a 20-word sentence ("zero one two ...") and stop every word
    // whose index is NOT a multiple of 3.
    StringBuilder sb = new StringBuilder();
    List<string> a = new List<string>();
    for (int i = 0; i < 20; i++)
    {
        string w = English.intToEnglish(i).trim();
        sb.Append(w).Append(" ");
        if (i % 3 != 0)
        {
            a.Add(w);
        }
    }
    log(sb.ToString());
    string[] stopWords = a.ToArray();
    for (int i = 0; i < a.Count; i++)
    {
        log("Stop: " + stopWords[i]);
    }
    CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords);
    // with increments
    StringReader reader = new StringReader(sb.ToString());
    StopFilter stpf = new StopFilter(Version.LUCENE_40, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
    doTestStopPositons(stpf, true);
    // without increments
    reader = new StringReader(sb.ToString());
    stpf = new StopFilter(Version.LUCENE_43, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
    doTestStopPositons(stpf, false);
    // with increments, concatenating two stop filters
    // Split the stop words into two disjoint halves (even/odd positions) and
    // apply them with two stacked filters; the combined result must behave
    // like a single filter over the full set.
    List<string> a0 = new List<string>();
    List<string> a1 = new List<string>();
    for (int i = 0; i < a.Count; i++)
    {
        if (i % 2 == 0)
        {
            a0.Add(a[i]);
        }
        else
        {
            a1.Add(a[i]);
        }
    }
    string[] stopWords0 = a0.ToArray();
    for (int i = 0; i < a0.Count; i++)
    {
        log("Stop0: " + stopWords0[i]);
    }
    string[] stopWords1 = a1.ToArray();
    for (int i = 0; i < a1.Count; i++)
    {
        log("Stop1: " + stopWords1[i]);
    }
    CharArraySet stopSet0 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords0);
    CharArraySet stopSet1 = StopFilter.makeStopSet(TEST_VERSION_CURRENT, stopWords1);
    reader = new StringReader(sb.ToString());
    StopFilter stpf0 = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet0); // first part of the set
    stpf0.EnablePositionIncrements = true;
    StopFilter stpf01 = new StopFilter(TEST_VERSION_CURRENT, stpf0, stopSet1); // two stop filters concatenated!
    doTestStopPositons(stpf01, true);
}
/// <summary>
/// Creates the <c>TokenStreamComponents</c> for Irish text: a
/// <see cref="StandardTokenizer"/> filtered with <see cref="StandardFilter"/>,
/// a hyphenation stop filter, <see cref="ElisionFilter"/>,
/// <see cref="IrishLowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided,
/// and a <see cref="SnowballFilter"/> with the Irish stemmer.
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, source);
    var hyphenStop = new StopFilter(matchVersion, chain, HYPHENATIONS);
#pragma warning disable 612, 618
    if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
    {
        // Pre-4.4 behavior: hyphenation removal leaves no position gaps.
        hyphenStop.EnablePositionIncrements = false;
    }
    chain = hyphenStop;
    chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
    chain = new IrishLowerCaseFilter(chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    if (stemExclusionSet.Count > 0)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    chain = new SnowballFilter(chain, new IrishStemmer());
    return new TokenStreamComponents(source, chain);
}
/// <summary>
/// Creates the <c>TokenStreamComponents</c> used to tokenize all the text in
/// the provided reader: a <see cref="StandardTokenizer"/> filtered with
/// <see cref="GreekLowerCaseFilter"/>, <see cref="StandardFilter"/> (3.1+),
/// <see cref="StopFilter"/>, and <see cref="GreekStemFilter"/> (3.1+).
/// </summary>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new GreekLowerCaseFilter(matchVersion, source);
    bool post31 = matchVersion.onOrAfter(Version.LUCENE_31);
    if (post31)
    {
        chain = new StandardFilter(matchVersion, chain);
    }
    chain = new StopFilter(matchVersion, chain, stopwords);
    if (post31)
    {
        chain = new GreekStemFilter(chain);
    }
    return new TokenStreamComponents(source, chain);
}
/// <summary>
/// Creates the <c>TokenStreamComponents</c> for Catalan text: a
/// <see cref="StandardTokenizer"/> filtered with <see cref="StandardFilter"/>,
/// <see cref="ElisionFilter"/> (3.6+), <see cref="LowerCaseFilter"/>,
/// <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> if a stem
/// exclusion set is provided, and a Catalan <see cref="SnowballFilter"/>.
/// </summary>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, source);
    if (matchVersion.onOrAfter(Version.LUCENE_36))
    {
        chain = new ElisionFilter(chain, DEFAULT_ARTICLES);
    }
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    if (!stemExclusionSet.Empty)
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    chain = new SnowballFilter(chain, new CatalanStemmer());
    return new TokenStreamComponents(source, chain);
}
/// <summary>Builds the named analyzer with the given stop words.</summary>
/// <param name="name">Snowball stemmer name</param>
/// <param name="stopWords">stop words to remove</param>
public SnowballAnalyzer(System.String name, System.String[] stopWords)
    : this(name)
{
    var stops = StopFilter.MakeStopSet(stopWords);
    stopSet = stops;
}
/// <summary>
/// Builds the Bexis analyzer: installs the German stop-word set and a
/// <c>BexisSynonymEngine</c>.
/// </summary>
public BexisAnalyzer()
{
    var germanStops = StopFilter.MakeStopSet(GERMAN_STOP_WORDS);
    stoptable = germanStops;
    SynonymEngine = new BexisSynonymEngine();
}
/// <summary>
/// Creates a TokenStream which tokenizes all the text in the provided TextReader.
/// </summary>
/// <param name="fieldName">field name (not used by this chain)</param>
/// <param name="reader">input text</param>
/// <returns>A TokenStream build from a StandardTokenizer filtered with
/// StandardFilter, StopFilter, and DutchStemFilter (the previous summary said
/// GermanStemFilter, which did not match the code)</returns>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream result = new StandardTokenizer( reader );
    result = new StandardFilter( result );
    result = new StopFilter( result, stoptable );
    result = new DutchStemFilter( result, excltable, _stemdict);
    return result;
}
/// <summary>
/// Builds an exclusion list from an array of Strings.
/// </summary>
/// <remarks>Deprecated: use the <c>BrazilianAnalyzer(Version, Set, Set)</c>
/// constructor instead.</remarks>
public void SetStemExclusionTable(params string[] exclusionlist)
{
    var exclusions = StopFilter.MakeStopSet(exclusionlist);
    excltable = exclusions;
    PreviousTokenStream = null; // force a new stemmer to be created
}
/// <summary>
/// Builds an analyzer with the given stop words.
/// </summary>
/// <param name="stopwords">stop words to remove</param>
public DutchAnalyzer(String[] stopwords)
{
    // Convert the raw array into the set representation used by StopFilter.
    var stops = StopFilter.MakeStopSet(stopwords);
    stoptable = stops;
}
/// <summary>
/// Wraps the inner analyzer's components with a <see cref="StopFilter"/> for
/// this field's recorded stop words, or returns the components untouched when
/// the field has none.
/// </summary>
protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
{
    // Use TryGetValue: the raw indexer throws KeyNotFoundException for fields
    // with no recorded stop words (a Java Map.get-returns-null conversion
    // slip), whereas a missing entry must behave like a null entry.
    if (!stopWordsPerField.TryGetValue(fieldName, out var stopWords) || stopWords == null)
    {
        return components;
    }
    var stopFilter = new StopFilter(matchVersion, components.TokenStream,
        new CharArraySet(matchVersion, stopWords, false));
    return new TokenStreamComponents(components.Tokenizer, stopFilter);
}
//~ Constructors -----------------------------------------------------------
/// <summary>
/// Builds an analyzer which removes words in <c>STOP_WORDS</c>.
/// </summary>
public PanGuAnalyzer()
{
    var defaults = StopFilter.MakeStopSet(STOP_WORDS);
    stopTable = defaults;
}
/// <summary>
/// Builds a <c>StandardAnalyzer</c> with the given stop words, pinned to
/// <c>Version.LUCENE_24</c> compatibility behavior.
/// </summary>
/// <param name="stopWords">stop words to remove</param>
public StandardAnalyzer(System.String[] stopWords)
    : this(Version.LUCENE_24, StopFilter.MakeStopSet(stopWords))
{
}
/// <summary>
/// Builds an analyzer which removes words in the provided array.
/// </summary>
/// <param name="stopWords">stop word array</param>
public PanGuAnalyzer(string[] stopWords)
{
    var stops = StopFilter.MakeStopSet(stopWords);
    stopTable = stops;
}
/// <summary>
/// Creates the <see cref="TokenStreamComponents"/> which tokenize all the text in the
/// provided <see cref="Reader"/>.
/// </summary>
/// <returns>Components built from a <see cref="StandardTokenizer"/> filtered with
/// <see cref="StandardFilter"/>, <see cref="SoraniNormalizationFilter"/>,
/// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> (only when a stem exclusion set was provided)
/// and <see cref="SoraniStemFilter"/>.</returns>
protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    chain = new SoraniNormalizationFilter(chain);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    if (!stemExclusionSet.Empty)
    {
        // Mark exclusion-set entries as keywords before stemming.
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    chain = new SoraniStemFilter(chain);
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Builds an analyzer with the given stop words.
/// </summary>
/// <param name="matchVersion">Lucene compatibility version.</param>
/// <param name="stopwords">Words that will be filtered out of token streams.</param>
/// <remarks>Deprecated in upstream Lucene: use the BrazilianAnalyzer(Version, Set)
/// constructor instead.</remarks>
public BrazilianAnalyzerCustom(Lucene.Net.Util.Version matchVersion, params string[] stopwords)
    : this(matchVersion, StopFilter.MakeStopSet(stopwords))
{
}
/// <summary>
/// Builds the Hebrew analysis chain for the given reader: tokenize, normalize niqqud,
/// remove stop words, lower-case, and optionally append type-dependent suffixes.
/// </summary>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream chain = new HebrewTokenizer(reader, PrefixTree);
    // Niqqud normalization
    chain = new NiqqudFilter(chain);
    // TODO: should we ignoreCase in StopFilter?
    chain = new StopFilter(enableStopPositionIncrements, chain, STOP_WORDS_SET);
    // TODO: Apply LowerCaseFilter to NonHebrew tokens only
    chain = new LowerCaseFilter(chain);
    if (suffixByTokenType != null && suffixByTokenType.Count > 0)
    {
        chain = new AddSuffixFilter(chain, suffixByTokenType);
    }
    return chain;
}
/// <summary>
/// Creates the <see cref="TokenStreamComponents"/> used to tokenize all the text in the
/// provided <see cref="TextReader"/>.
/// </summary>
/// <returns>Components built from a <see cref="StandardTokenizer"/> filtered with
/// <see cref="GreekLowerCaseFilter"/>, <see cref="StandardFilter"/> (3.1+),
/// <see cref="StopFilter"/>, and <see cref="GreekStemFilter"/> (3.1+).</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
#pragma warning disable 612, 618
    // LUCENE_31 is marked obsolete; evaluate the version gate once up front.
    bool atLeast31 = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
    TokenStream chain = new GreekLowerCaseFilter(matchVersion, tokenizer);
    if (atLeast31)
    {
        chain = new StandardFilter(matchVersion, chain);
    }
    chain = new StopFilter(matchVersion, chain, stopwords);
    if (atLeast31)
    {
        chain = new GreekStemFilter(chain);
    }
    return new TokenStreamComponents(tokenizer, chain);
}
/// <summary>
/// Creates the <see cref="TokenStreamComponents"/> which tokenize all the text in the
/// provided <see cref="TextReader"/>.
/// </summary>
/// <returns>Components built from a <see cref="StandardTokenizer"/> filtered with
/// <see cref="StandardFilter"/>, <see cref="ApostropheFilter"/> (4.8+),
/// <see cref="TurkishLowerCaseFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> (only when a stem exclusion set was provided)
/// and a Turkish <see cref="SnowballFilter"/>.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
    {
        // ApostropheFilter is only applied from Lucene 4.8 on.
        chain = new ApostropheFilter(chain);
    }
    chain = new TurkishLowerCaseFilter(chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    if (stemExclusionSet.Any())
    {
        chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    return new TokenStreamComponents(tokenizer, new SnowballFilter(chain, new TurkishStemmer()));
}
/// <summary>
/// Creates the <see cref="TokenStreamComponents"/> used to tokenize all the text in the
/// provided <see cref="TextReader"/>.
/// </summary>
/// <returns>For Lucene 3.1+, components built from a <see cref="StandardTokenizer"/>
/// filtered with <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>,
/// <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> (only when a stem
/// exclusion set was provided) and a Russian <see cref="SnowballFilter"/>; for older
/// versions the chain starts from the obsolete RussianLetterTokenizer and omits
/// <see cref="StandardFilter"/>.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
#pragma warning disable 612, 618
    // LUCENE_31 is marked obsolete; evaluate the version gate once up front.
    bool atLeast31 = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
    if (atLeast31)
    {
        Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
        TokenStream chain = new StandardFilter(matchVersion, tokenizer);
        chain = new LowerCaseFilter(matchVersion, chain);
        chain = new StopFilter(matchVersion, chain, stopwords);
        if (stemExclusionSet.Count > 0)
        {
            chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
        }
        chain = new SnowballFilter(chain, new Tartarus.Snowball.Ext.RussianStemmer());
        return new TokenStreamComponents(tokenizer, chain);
    }
    else
    {
#pragma warning disable 612, 618
        Tokenizer tokenizer = new RussianLetterTokenizer(matchVersion, reader);
#pragma warning restore 612, 618
        TokenStream chain = new LowerCaseFilter(matchVersion, tokenizer);
        chain = new StopFilter(matchVersion, chain, stopwords);
        if (stemExclusionSet.Count > 0)
        {
            chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
        }
        chain = new SnowballFilter(chain, new Tartarus.Snowball.Ext.RussianStemmer());
        return new TokenStreamComponents(tokenizer, chain);
    }
}
/// <summary>
/// Builds a stem-exclusion set from an array of Strings and discards any cached
/// token stream so the new set takes effect.
/// </summary>
/// <param name="exclusionlist">Words to exclude from stemming.</param>
public void SetStemExclusionTable(String[] exclusionlist)
{
    exclusionSet = StopFilter.MakeStopSet(exclusionlist);
    PreviousTokenStream = null; // force the analysis chain to be rebuilt with the new set
}
/// <summary>
/// Wraps the given stream in a <see cref="StopFilter"/> configured with this factory's
/// stop word set and position-increment setting.
/// </summary>
/// <param name="input">The upstream token stream to filter.</param>
/// <returns>The configured <see cref="StopFilter"/>.</returns>
public override TokenStream Create(TokenStream input)
{
    return new StopFilter(luceneMatchVersion, input, stopWords)
    {
        EnablePositionIncrements = enablePositionIncrements
    };
}
/// <summary>
/// Builds an analyzer which removes words in the provided array.
/// </summary>
/// <param name="matchVersion">Lucene compatibility version.</param>
/// <param name="stopWords">stop word array</param>
public CJKAnalyzer(Version matchVersion, params string[] stopWords)
{
    this.matchVersion = matchVersion;
    stopTable = StopFilter.MakeStopSet(stopWords);
}
/// <summary>
/// Builds an analyzer with the default stop words (<see cref="BRAZILIAN_STOP_WORDS"/>).
/// </summary>
public BrazilianAnalyzer()
{
    stoptable = StopFilter.MakeStopSet(BRAZILIAN_STOP_WORDS);
}
/// <summary>
/// Builds a whitespace-tokenized chain (via <see cref="MockTokenizer"/>) that drops the
/// single stop word "into".
/// </summary>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    var source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    var stopped = new StopFilter(TEST_VERSION_CURRENT, source,
        StopFilter.MakeStopSet(TEST_VERSION_CURRENT, "into"));
    return new TokenStreamComponents(source, stopped);
}
/// <summary>
/// Builds an analyzer with the given stop words.
/// </summary>
/// <param name="stopwords">Words that will be filtered out of token streams.</param>
public BrazilianAnalyzer(string[] stopwords)
{
    stoptable = StopFilter.MakeStopSet(stopwords);
}
/// <summary>
/// Obtains a token stream from the wrapped analyzer (preferring the reusable variant)
/// and, when stop words are registered for the field, appends a <see cref="StopFilter"/>.
/// </summary>
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    TokenStream stream;
    try
    {
        stream = _delegate.ReusableTokenStream(fieldName, reader);
    }
    catch (IOException)
    {
        // Reusable streams are best-effort; fall back to a fresh stream.
        stream = _delegate.TokenStream(fieldName, reader);
    }
    var fieldStopWords = stopWordsPerField[fieldName];
    if (fieldStopWords == null)
    {
        return stream;
    }
    return new StopFilter(
        StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
        stream,
        fieldStopWords);
}
/// <summary>
/// Builds a SnowballAnalyzer for the given stemmer name with the provided stop words.
/// </summary>
/// <param name="matchVersion">Lucene compatibility version.</param>
/// <param name="name">Name identifying the Snowball stemmer to use.</param>
/// <param name="stopWords">Words that will be filtered out of token streams.</param>
public SnowballAnalyzer(Version matchVersion, System.String name, System.String[] stopWords)
    : this(matchVersion, name)
{
    stopSet = StopFilter.MakeStopSet(stopWords);
}
/// <summary>
/// Builds an analyzer with the default stop words (<see cref="STOP_WORDS"/>).
/// </summary>
public LithuanianAnalyzer()
{
    stoptable = StopFilter.MakeStopSet(STOP_WORDS);
}
/// <summary>
/// Builds an analyzer whose stop table is initialized from GERMAN_STOP_WORDS.
/// </summary>
public NGramAnalyzer()
{
    stoptable = StopFilter.MakeStopSet(GERMAN_STOP_WORDS);
}
/// <summary>
/// Builds a stem-exclusion set from an array of Strings.
/// </summary>
/// <param name="exclusionlist">Words to exclude from stemming.</param>
/// <remarks>
/// NOTE(review): unlike other SetStemExclusionTable overloads in this codebase, this one
/// does not reset PreviousTokenStream, so a previously cached stream may keep using the
/// old exclusion set — confirm whether that is intentional.
/// </remarks>
public void SetStemExclusionTable(string[] exclusionlist)
{
    excltable = StopFilter.MakeStopSet(exclusionlist);
}
/// <summary>
/// Creates the <see cref="TokenStreamComponents"/> used to tokenize all the text in the
/// provided <see cref="TextReader"/>.
/// </summary>
/// <returns>Components built from a <see cref="StandardTokenizer"/> filtered with
/// <see cref="LowerCaseFilter"/>, <see cref="StandardFilter"/>, <see cref="StopFilter"/>,
/// <see cref="SetKeywordMarkerFilter"/> (only when an exclusion table was provided)
/// and <see cref="BrazilianStemFilter"/>.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new LowerCaseFilter(matchVersion, tokenizer);
    chain = new StandardFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    if (excltable != null && excltable.Count > 0)
    {
        // Mark excluded terms as keywords before stemming.
        chain = new SetKeywordMarkerFilter(chain, excltable);
    }
    return new TokenStreamComponents(tokenizer, new BrazilianStemFilter(chain));
}