Example #1
 // other StopFilter functionality is already tested by TestStopAnalyzer
 public virtual void TestExactCase()
 {
     StringReader reader = new StringReader("Now is The Time");
     // ignoreCase is false, so only exact-case matches are removed
     CharArraySet stopWords = new CharArraySet(TEST_VERSION_CURRENT, AsSet("is", "the", "Time"), false);
     TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopWords);
     AssertTokenStreamContents(stream, new string[] { "Now", "The" });
 }
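The trailing false in the CharArraySet constructor above is its ignoreCase flag: "is" and "Time" match exactly and are dropped, while "The" survives because only lowercase "the" is in the set. For contrast, a small sketch of the case-insensitive variant, reusing the same test fixtures (TEST_VERSION_CURRENT, AsSet, MockTokenizer, AssertTokenStreamContents) and assuming the usual Lucene.Net test usings:

 // Sketch: same input, but with ignoreCase: true the stop set also catches "The".
 StringReader reader2 = new StringReader("Now is The Time");
 CharArraySet ciStopWords = new CharArraySet(TEST_VERSION_CURRENT, AsSet("is", "the", "Time"), true);
 TokenStream stream2 = new StopFilter(TEST_VERSION_CURRENT,
     new MockTokenizer(reader2, MockTokenizer.WHITESPACE, false), ciStopWords);
 AssertTokenStreamContents(stream2, new string[] { "Now" }); // only "Now" survives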
Example #2
 protected override TokenStreamComponents CreateComponents(string field, TextReader reader)
 {
     var tokenizer = new PathTokenizer(reader);
     TokenStream tokenStream = new StandardFilter(tokenizer);
     tokenStream = new LowerCaseFilter(tokenStream);
     tokenStream = new StopFilter(tokenStream, StandardAnalyzer.STOP_WORDS_SET);
     return new TokenStreamComponents(tokenizer, tokenStream);
 }
Example #3
 public virtual void TestStopFilt()
 {
     StringReader reader = new StringReader("Now is The Time");
     string[] stopWords = new string[] { "is", "the", "Time" };
     CharArraySet stopSet = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, stopWords);
     TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
     AssertTokenStreamContents(stream, new string[] { "Now", "The" });
 }
Example #4
 public override TokenStream TokenStream(string fieldname, TextReader reader)
 {
     TokenStream result = new StandardTokenizer(_version, reader);
     result = new LowerCaseFilter(result);
     result = new PersianNormalizationFilter(result);
     result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(_version), result, _stoptable);
     result = new PersianStemFilter(result);
     return result;
 }
Example #5
 public override TokenStream TokenStream(String fieldName, TextReader reader)
 {
     TokenStream ts = new StandardTokenizer(matchVersion, reader);
     ts = new StandardFilter(ts);
     ts = new ThaiWordFilter(ts);
     ts = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                         ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
     return ts;
 }
Example #6
        /// <summary>
        /// Creates
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="IndicNormalizationFilter"/>,
        ///         <seealso cref="HindiNormalizationFilter"/>, <seealso cref="SetKeywordMarkerFilter"/>
        ///         if a stem exclusion set is provided, <seealso cref="HindiStemFilter"/>, and
        ///         Hindi Stop words </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer source;

            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
            {
                source = new StandardTokenizer(matchVersion, reader);
            }
            else
            {
                source = new IndicTokenizer(matchVersion, reader);
            }
            TokenStream result = new LowerCaseFilter(matchVersion, source);

            if (stemExclusionSet.Count > 0)
            {
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
            result = new IndicNormalizationFilter(result);
            result = new HindiNormalizationFilter(result);
            result = new StopFilter(matchVersion, result, stopwords);
            result = new HindiStemFilter(result);
            return(new TokenStreamComponents(source, result));
        }
Example #7
        public ArrayList getKeywords(string result, string tipoAnalizador)
        {
            ArrayList   ListStemsList = new ArrayList();
            TokenStream tokenStream   = new StandardTokenizer(new System.IO.StringReader(result));

            tokenStream = new StandardFilter(tokenStream);  // strips punctuation from tokens
            tokenStream = new LowerCaseFilter(tokenStream); // lowercases the content
            if (tipoAnalizador == "Español")
            {
                // filter the content against the Spanish stop-word list
                tokenStream = new StopFilter(tokenStream, StopFilter.MakeStopSet(SpanishAnalyzer.SPANISH_STOP_WORDS), true);
                // stemming step (disabled)
                //SpanishAnalyzer ansp = new SpanishAnalyzer();
                //tokenStream = ansp.SpanishSteammer(tokenStream);
            }
            else
            {
                // filter the content against the English stop-word list
                tokenStream = new StopFilter(tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS, true);
                // stemming step (disabled)
                //tokenStream = new PorterStemFilter(tokenStream);
            }

            string cadena = "";

            string[] token;
            Lucene.Net.Analysis.Token current;
            while ((current = tokenStream.Next()) != null)
            {
                // Token.ToString() renders "(text,startOffset,endOffset)";
                // strip the leading parenthesis and the offsets to recover the term text
                cadena = current.ToString();
                token  = cadena.Split(',');
                cadena = cadena.Substring(1, token[0].Length - 1);
                ListStemsList.Add(cadena);
            }
            return ListStemsList;
        }
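The Token.Next()/ToString() parsing above leans on the obsolete Lucene.Net 2.x token API and recovers the term text by string-splitting a debug representation. On the attribute-based API (Lucene.Net 2.9 and later) the term text is available directly. A minimal sketch, assuming Lucene.Net 4.x usings (Lucene.Net.Analysis, Lucene.Net.Analysis.TokenAttributes, System.Collections.Generic) and an illustrative helper name:

 // Sketch: collect term texts from any TokenStream via the attribute API.
 // ExtractTerms is a hypothetical helper, not part of Lucene.Net.
 public static List<string> ExtractTerms(TokenStream stream)
 {
     var terms = new List<string>();
     var termAtt = stream.AddAttribute<ICharTermAttribute>();
     stream.Reset();                    // required before the first IncrementToken()
     while (stream.IncrementToken())
     {
         terms.Add(termAtt.ToString()); // the bare term text; no offsets to strip
     }
     stream.End();
     stream.Dispose();
     return terms;
 }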
Example #8
        /// <summary>
        /// Returns a (possibly reused) <see cref="TokenStream"/> which tokenizes all the
        /// text in the provided <see cref="TextReader"/>.
        /// </summary>
        /// <returns> A <see cref="TokenStream"/> built from a <see cref="StandardTokenizer"/>
        ///   filtered with <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>,
        ///   <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided,
        ///   <see cref="StemmerOverrideFilter"/>, and <see cref="SnowballFilter"/> </returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader aReader)
        {
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
            {
                Tokenizer   source = new StandardTokenizer(matchVersion, aReader);
                TokenStream result = new StandardFilter(matchVersion, source);
                result = new LowerCaseFilter(matchVersion, result);
                result = new StopFilter(matchVersion, result, stoptable);
                if (excltable.Count > 0)
                {
                    result = new SetKeywordMarkerFilter(result, excltable);
                }
                if (stemdict != null)
                {
                    result = new StemmerOverrideFilter(result, stemdict);
                }
                result = new SnowballFilter(result, new Tartarus.Snowball.Ext.DutchStemmer());
                return(new TokenStreamComponents(source, result));
            }
            else
            {
                Tokenizer   source = new StandardTokenizer(matchVersion, aReader);
                TokenStream result = new StandardFilter(matchVersion, source);
                result = new StopFilter(matchVersion, result, stoptable);
                if (excltable.Count > 0)
                {
                    result = new SetKeywordMarkerFilter(result, excltable);
                }
#pragma warning disable 612, 618
                result = new DutchStemFilter(result, origStemdict);
#pragma warning restore 612, 618
                return(new TokenStreamComponents(source, result));
            }
        }
Example #9
        /**
         * Creates a token stream that tokenizes the given string into token terms
         * (aka words).
         * 
         * @param fieldName
         *            the name of the field to tokenize (currently ignored).
         * @param text
         *            the string to tokenize
         * @return a new token stream
         */
        public TokenStream TokenStream(String fieldName, String text)
        {
            // Ideally the Analyzer superclass should have a method with the same signature, 
            // with a default impl that simply delegates to the StringReader flavour. 
            if (text == null)
                throw new ArgumentException("text must not be null");

            TokenStream stream;
            if (Regex == NON_WORD_PATTERN)
            { // fast path
                stream = new FastStringTokenizer(text, true, toLowerCase, stopWords);
            }
            else if (Regex == WHITESPACE_PATTERN)
            { // fast path
                stream = new FastStringTokenizer(text, false, toLowerCase, stopWords);
            }
            else
            {
                stream = new RegexTokenizer(text, Regex, toLowerCase);
                if (stopWords != null) stream = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), stream, stopWords);
            }

            return stream;
        }
Example #10
 public StandardAnalyzer(System.String[] stopwords, bool replaceInvalidAcronym) : this(Version.LUCENE_24, StopFilter.MakeStopSet(stopwords))
 {
     this.replaceInvalidAcronym = replaceInvalidAcronym;
 }
Example #11
 /// <summary>
 /// Builds an analyzer which removes words in the provided array.
 /// </summary>
 /// <param name="stopWords">stop word array</param>
 public CJKAnalyzer(String[] stopWords)
 {
     stopTable = StopFilter.MakeStopSet(stopWords);
 }
Example #12
        /// <summary>
        /// Creates a
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> A
        ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from an <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="EnglishPossessiveFilter"/>, 
        ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
        ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided and <seealso cref="PorterStemFilter"/>. </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer source = new StandardTokenizer(matchVersion, reader);
            TokenStream result = new StandardFilter(matchVersion, source);
            // prior to this we get the classic behavior, standardfilter does it for us.
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
            {
                result = new EnglishPossessiveFilter(matchVersion, result);
            }
            result = new LowerCaseFilter(matchVersion, result);
            result = new StopFilter(matchVersion, result, stopwords);
            if (stemExclusionSet.Any())
            {
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
            result = new PorterStemFilter(result);
            return new TokenStreamComponents(source, result);
        }
Example #13
        private static void Initialize()
        {
            if (_initialized)
            {
                return;
            }
            lock (_lock)
            {
                if (_initialized)
                {
                    return;
                }
                try
                {
                    LuceneSection section = (LuceneSection)ConfigurationManager.GetSection("lucene.indexing");
                    _indexAllTypes = section.IndexAllTypes;
                    _isActive      = section.Active;
                    if (_isActive == null || !_isActive.Value)
                    {
                        _initialized = true;
                        return;
                    }
                    _fieldPrefix   = section.Prefix;
                    _luceneVersion = section.LuceneVersion;
                    _includedTypes = new Dictionary <string, ContentTypeDocument>();
                    string[] strArray             = new string[0];
                    var      fieldAnalyzerWrapper = new PerFieldAnalyzerWrapper((Analyzer) new StandardAnalyzer(LuceneVersion, StopFilter.MakeStopSet(strArray)));
                    if (!IndexAllTypes)
                    {
                        foreach (IncludedTypeElement typeSetting in section.IncludedTypes)
                        {
                            Type contentType = Type.GetType(typeSetting.Type, true, true);
                            if (contentType == null)
                            {
                                continue;
                            }
                            if (!_includedTypes.TryGetValue(typeSetting.Name, out var tmp))
                            {
                                var documentIndexModel = new ContentTypeDocument();
                                documentIndexModel.ContentType    = contentType;
                                documentIndexModel.IndexAllFields = typeSetting.IndexAllFields;
                                _includedTypes.Add(typeSetting.Name, documentIndexModel);
                                foreach (IncludedFieldElement fieldSetting in typeSetting.IncludedFields)
                                {
                                    Type fieldType;
                                    if (string.IsNullOrEmpty(fieldSetting.Type))
                                    {
                                        fieldType = typeof(DefaultComputedField);
                                    }
                                    else
                                    {
                                        fieldType = Type.GetType(fieldSetting.Type, true, true);
                                    }
                                    if (!typeof(IComputedField).IsAssignableFrom(fieldType))
                                    {
                                        continue;
                                    }
                                    var  instance     = (IComputedField)Activator.CreateInstance(fieldType);
                                    Type analyzerType = Type.GetType(fieldSetting.Analyzer, true, true);
                                    if (!typeof(Analyzer).IsAssignableFrom(analyzerType))
                                    {
                                        continue;
                                    }
                                    if (analyzerType == typeof(StandardAnalyzer))
                                    {
                                        instance.Analyzer = new StandardAnalyzer(LuceneVersion, StopFilter.MakeStopSet(strArray));
                                    }
                                    else
                                    {
                                        instance.Analyzer = (Analyzer)Activator.CreateInstance(analyzerType);
                                    }
                                    instance.Index    = fieldSetting.Index;
                                    instance.Store    = fieldSetting.Store;
                                    instance.Vector   = fieldSetting.Vector;
                                    instance.DataType = fieldSetting.DataType;
                                    if (!documentIndexModel.IndexedFields.TryGetValue(fieldSetting.Name, out var tmp2))
                                    {
                                        documentIndexModel.IndexedFields.Add(fieldSetting.Name, instance);
                                        fieldAnalyzerWrapper.AddAnalyzer(ContentIndexHelpers.GetIndexFieldName(fieldSetting.Name), instance.Analyzer);
                                    }
                                }
                            }
                        }
                    }
                    _analyzer = fieldAnalyzerWrapper;
                    AddDefaultFieldAnalyzer();
                    var shardingStrategy = section.Sharding?.Strategy;
                    if (!string.IsNullOrEmpty(shardingStrategy))
                    {
                        var shardingType = Type.GetType(shardingStrategy, true, true);
                        LuceneContext.IndexShardingStrategy = (IIndexShardingStrategy)Activator.CreateInstance(shardingType);
                        LuceneContext.IndexShardingStrategy.WarmupShards();
                        return;
                    }
                    Directory directory;
                    var       directoryConnectionString = ConfigurationManager.AppSettings["lucene:BlobConnectionString"] ?? "App_Data/My_Index";
                    var       directoryContainerName    = ConfigurationManager.AppSettings["lucene:ContainerName"] ?? "lucene";
                    var       directoryType             = (ConfigurationManager.AppSettings["lucene:DirectoryType"] ?? "Filesystem").ToLower();
                    switch (directoryType)
                    {
                    case Constants.ContainerType.Azure:
                        var connectionString = directoryConnectionString;
                        var containerName    = directoryContainerName;
                        var storageAccount   = CloudStorageAccount.Parse(connectionString);
                        var azureDir         = new FastAzureDirectory(storageAccount, containerName, new RAMDirectory());
                        directory = azureDir;
                        break;

                    default:
                        var folderPath  = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, directoryConnectionString);
                        var fsDirectory = FSDirectory.Open(folderPath);
                        directory = fsDirectory;
                        break;
                    }
                    _directory = directory;
                    InitDirectory(_directory);
                }
                catch (Exception)
                {
                    throw; // rethrow without resetting the stack trace
                }
                _initialized = true;
            }
        }
Example #14
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new StreamLemmasFilter(reader, hebMorphLemmatizer,
                lemmaFilter, alwaysSaveMarkedOriginal);

            // This stop filter is here temporarily, until HebMorph is smart enough to clear stop words
            // all by itself
            result = new StopFilter(enableStopPositionIncrements, result, STOP_WORDS_SET);

            return result;
        }
Example #15
 /// <summary>
 /// Creates a
 /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
 /// </summary>
 /// <returns> A
 ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 ///         built from an <seealso cref="StandardTokenizer"/> filtered with
 ///         <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
 ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
 ///         provided and <seealso cref="SnowballFilter"/>. </returns>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, source);
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if (stemExclusionSet.Count > 0)
     {
         result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     }
     result = new SnowballFilter(result, new DanishStemmer());
     return new TokenStreamComponents(source, result);
 }
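Analyzers like the one above are rarely driven through CreateComponents directly; callers ask the analyzer for a token stream and drain it. A minimal consumption sketch, assuming Lucene.Net 4.8 (where the accessor is Analyzer.GetTokenStream) and a hypothetical danishAnalyzer instance:

 // Sketch: run an analyzer over a string and print the tokens that survive the chain.
 using (TokenStream ts = danishAnalyzer.GetTokenStream("body", new StringReader("hunden og katten")))
 {
     var termAtt = ts.AddAttribute<ICharTermAttribute>();
     ts.Reset();
     while (ts.IncrementToken())
     {
         Console.WriteLine(termAtt.ToString()); // stop words such as "og" have been removed
     }
     ts.End();
 }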
Example #16
 public SpanishAnalyzer()
 {
     stopTable = StopFilter.MakeStopSet(SPANISH_STOP_WORDS);
 }
Example #17
 /// <summary>Builds an analyzer with the given stop words. </summary>
 public StandardAnalyzer(System.String[] stopWords)
 {
     stopSet = StopFilter.MakeStopSet(stopWords);
 }
Example #18
        /// <summary>
        /// Creates
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>,
        ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
        ///         <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided, and <seealso cref="FrenchLightStemFilter"/> </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
            {
                Tokenizer source = new StandardTokenizer(matchVersion, reader);
                TokenStream result = new StandardFilter(matchVersion, source);
                result = new ElisionFilter(result, DEFAULT_ARTICLES);
                result = new LowerCaseFilter(matchVersion, result);
                result = new StopFilter(matchVersion, result, stopwords);
                if (excltable.Count > 0)
                {
                    result = new SetKeywordMarkerFilter(result, excltable);
                }
#pragma warning disable 612, 618
                if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
#pragma warning restore 612, 618
                {
                    result = new FrenchLightStemFilter(result);
                }
                else
                {
                    result = new SnowballFilter(result, new Tartarus.Snowball.Ext.FrenchStemmer());
                }
                return new TokenStreamComponents(source, result);
            }
            else
            {
                Tokenizer source = new StandardTokenizer(matchVersion, reader);
                TokenStream result = new StandardFilter(matchVersion, source);
                result = new StopFilter(matchVersion, result, stopwords);
                if (excltable.Count > 0)
                {
                    result = new SetKeywordMarkerFilter(result, excltable);
                }
#pragma warning disable 612, 618
                result = new FrenchStemFilter(result);
#pragma warning restore 612, 618
                // Convert to lowercase after stemming!
                return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
            }
        }
Example #19
 /*
  * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
  *
  * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
  *                  {@link GreekLowerCaseFilter} and {@link StopFilter}
  */
 public override TokenStream TokenStream(String fieldName, TextReader reader)
 {
     TokenStream result = new StandardTokenizer(matchVersion, reader);
     result = new GreekLowerCaseFilter(result);
     result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                             result, stopSet);
     return result;
 }
Example #20
 /**
  * Builds an analyzer with the given stop words.
  */
 public ArabicAnalyzer(Version matchVersion, string[] stopwords)
 {
     stoptable         = StopFilter.MakeStopSet(stopwords);
     this.matchVersion = matchVersion;
 }
Example #21
 /// <summary>
 /// Constructor that allows you to specify your stop words
 /// </summary>
 /// <param name="stopWords">Stopwords to use (lucene will not index these words) - should be all lowercase</param>
 public EnglishAnalyzer(IEnumerable <string> stopWords)
 {
     _words = StopFilter.MakeStopSet(LeoLuceneVersion.Version, stopWords.ToArray());
 }
Example #22
 /// <summary>
 /// Constructs a <seealso cref="StandardTokenizer"/> filtered by a
 /// <seealso cref="StandardFilter"/>, a <seealso cref="LowerCaseFilter"/>, a <seealso cref="StopFilter"/>,
 /// and a <seealso cref="SnowballFilter"/>
 /// </summary>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, tokenizer);
     // remove the possessive 's for english stemmers
     if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) && (name.Equals("English") || name.Equals("Porter") || name.Equals("Lovins")))
     {
         result = new EnglishPossessiveFilter(result);
     }
     // Use a special lowercase filter for turkish, the stemmer expects it.
     if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31) && name.Equals("Turkish"))
     {
         result = new TurkishLowerCaseFilter(result);
     }
     else
     {
         result = new LowerCaseFilter(matchVersion, result);
     }
     if (stopSet != null)
     {
         result = new StopFilter(matchVersion, result, stopSet);
     }
     result = new SnowballFilter(result, name);
     return new TokenStreamComponents(tokenizer, result);
 }
Example #23
 public GermanAnalyzer(Version matchVersion, params string[] stopwords)
     : this(matchVersion, StopFilter.MakeStopSet(stopwords))
 {
 }
Example #24
 /**
  * Creates a {@link TokenStream} which tokenizes all the text in the provided
  * {@link Reader}.
  * 
  * @return A {@link TokenStream} built from a {@link ArabicLetterTokenizer}
  *         filtered with {@link LowerCaseFilter}, 
  *         {@link ArabicNormalizationFilter},
  *         {@link PersianNormalizationFilter} and Persian Stop words
  */
 public override TokenStream TokenStream(String fieldName, TextReader reader)
 {
     TokenStream result = new ArabicLetterTokenizer(reader);
     result = new LowerCaseFilter(result);
     result = new ArabicNormalizationFilter(result);
     /* additional persian-specific normalization */
     result = new PersianNormalizationFilter(result);
     /*
      * the order here is important: the stopword list is normalized with the
      * above!
      */
     result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                             result, stoptable);
     return result;
 }
Example #25
        //~ Methods ----------------------------------------------------------------

        /// <summary>
        /// get token stream from input
        /// </summary>
        /// <param name="fieldName">lucene field name</param>
        /// <param name="reader">input reader</param>
        /// <returns>Token Stream</returns>
        public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
        {
            return(new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                  new CJKTokenizer(reader), stopTable));
        }
Example #26
 public override TokenStreamComponents CreateComponents(string field, TextReader reader)
 {
     Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
     StopFilter filter = new StopFilter(TEST_VERSION_CURRENT, tokenizer, StandardAnalyzer.STOP_WORDS_SET);
     return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(TEST_VERSION_CURRENT, filter, flags, protWords));
 }
Example #27
 public BulgarianAnalyzer(Version matchVersion, HashSet<string> stopwords)
 {
     this.stoptable    = new HashSet<string>(CharArraySet.Copy(stopwords));
     this.matchVersion = matchVersion;
     this.enableStopPositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
 }
Example #28
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     UAX29URLEmailTokenizer src = new UAX29URLEmailTokenizer(matchVersion, reader);
     src.MaxTokenLength = maxTokenLength;
     TokenStream tok = new StandardFilter(matchVersion, src);
     tok = new LowerCaseFilter(matchVersion, tok);
     tok = new StopFilter(matchVersion, tok, stopwords);
     return new TokenStreamComponentsAnonymousInnerClassHelper(this, src, tok, reader);
 }
Example #29
 /**
  * Builds an analyzer with the given stop words.
  */
 public LithuanianAnalyzer(string[] stopwords)
 {
     stoptable = StopFilter.MakeStopSet(stopwords);
 }
Example #30
 /// <summary>
 /// Test position increments applied by StopFilter with and without enabling this option.
 /// </summary>
 public virtual void TestStopPositons()
 {
     StringBuilder sb = new StringBuilder();
     List<string> a = new List<string>();
     for (int i = 0; i < 20; i++)
     {
         string w = English.IntToEnglish(i).Trim();
         sb.Append(w).Append(" ");
         if (i % 3 != 0)
         {
             a.Add(w);
         }
     }
     Log(sb.ToString());
     string[] stopWords = a.ToArray();
     for (int i = 0; i < a.Count; i++)
     {
         Log("Stop: " + stopWords[i]);
     }
     CharArraySet stopSet = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, stopWords);
     // with increments
     StringReader reader = new StringReader(sb.ToString());
     StopFilter stpf = new StopFilter(Version.LUCENE_40, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
     DoTestStopPositons(stpf, true);
     // without increments
     reader = new StringReader(sb.ToString());
     stpf = new StopFilter(Version.LUCENE_43, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
     DoTestStopPositons(stpf, false);
     // with increments, concatenating two stop filters
     List<string> a0 = new List<string>();
     List<string> a1 = new List<string>();
     for (int i = 0; i < a.Count; i++)
     {
         if (i % 2 == 0)
         {
             a0.Add(a[i]);
         }
         else
         {
             a1.Add(a[i]);
         }
     }
     string[] stopWords0 = a0.ToArray();
     for (int i = 0; i < a0.Count; i++)
     {
         Log("Stop0: " + stopWords0[i]);
     }
     string[] stopWords1 = a1.ToArray();
     for (int i = 0; i < a1.Count; i++)
     {
         Log("Stop1: " + stopWords1[i]);
     }
     CharArraySet stopSet0 = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, stopWords0);
     CharArraySet stopSet1 = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, stopWords1);
     reader = new StringReader(sb.ToString());
     StopFilter stpf0 = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet0); // first part of the set
     stpf0.EnablePositionIncrements = true;
     StopFilter stpf01 = new StopFilter(TEST_VERSION_CURRENT, stpf0, stopSet1); // two stop filters concatenated!
     DoTestStopPositons(stpf01, true);
 }
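What DoTestStopPositons verifies, in essence: with position increments enabled, each removed stop word is accounted for in the next surviving token's position increment, so phrase and span queries still see the original gaps; with increments disabled, every surviving token reports an increment of 1. A minimal sketch of observing this directly, assuming the same Lucene.Net 4.x test fixtures as above:

 // Sketch: inspect position increments around removed stop words.
 StringReader r = new StringReader("now is the time");
 TokenStream stop = new StopFilter(TEST_VERSION_CURRENT,
     new MockTokenizer(r, MockTokenizer.WHITESPACE, false),
     StopFilter.MakeStopSet(TEST_VERSION_CURRENT, "is", "the"));
 var termAtt = stop.AddAttribute<ICharTermAttribute>();
 var posIncAtt = stop.AddAttribute<IPositionIncrementAttribute>();
 stop.Reset();
 while (stop.IncrementToken())
 {
     // prints "now(+1)" then "time(+3)": the two removed stop words add 2 to the increment
     Console.WriteLine(termAtt.ToString() + "(+" + posIncAtt.PositionIncrement + ")");
 }
 stop.End();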
Example #31
 /// <summary>
 /// Creates a
 /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
 /// </summary>
 /// <returns> A
 ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 ///         built from an <seealso cref="StandardTokenizer"/> filtered with
 ///         <seealso cref="StandardFilter"/>, <seealso cref="IrishLowerCaseFilter"/>, <seealso cref="StopFilter"/>
 ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
 ///         provided and <seealso cref="SnowballFilter"/>. </returns>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, source);
     StopFilter s = new StopFilter(matchVersion, result, HYPHENATIONS);
     #pragma warning disable 612, 618
     if (!matchVersion.OnOrAfter(LuceneVersion.LUCENE_44))
     #pragma warning restore 612, 618
     {
         s.EnablePositionIncrements = false;
     }
     result = s;
     result = new ElisionFilter(result, DEFAULT_ARTICLES);
     result = new IrishLowerCaseFilter(result);
     result = new StopFilter(matchVersion, result, stopwords);
     if (stemExclusionSet.Count > 0)
     {
         result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     }
     result = new SnowballFilter(result, new IrishStemmer());
     return new TokenStreamComponents(source, result);
 }
Example #32
        /// <summary>
        /// Creates
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="GreekLowerCaseFilter"/>, <seealso cref="StandardFilter"/>,
        ///         <seealso cref="StopFilter"/>, and <seealso cref="GreekStemFilter"/> </returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer source = new StandardTokenizer(matchVersion, reader);
            TokenStream result = new GreekLowerCaseFilter(matchVersion, source);
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
            {
                result = new StandardFilter(matchVersion, result);
            }
            result = new StopFilter(matchVersion, result, stopwords);
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
            {
                result = new GreekStemFilter(result);
            }
            return new TokenStreamComponents(source, result);
        }
Example #33
        /// <summary>
        /// Creates a
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> A
        ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from an <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>, <seealso cref="LowerCaseFilter"/>,
        ///         <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided and <seealso cref="SnowballFilter"/>. </returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer source = new StandardTokenizer(matchVersion, reader);
            TokenStream result = new StandardFilter(matchVersion, source);
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
            {
                result = new ElisionFilter(result, DEFAULT_ARTICLES);
            }
            result = new LowerCaseFilter(matchVersion, result);
            result = new StopFilter(matchVersion, result, stopwords);
            if (stemExclusionSet.Count > 0)
            {
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
            result = new SnowballFilter(result, new CatalanStemmer());
            return new TokenStreamComponents(source, result);
        }
Example #34
 /// <summary>Builds the named analyzer with the given stop words. </summary>
 public SnowballAnalyzer(System.String name, System.String[] stopWords) : this(name)
 {
     stopSet = StopFilter.MakeStopSet(stopWords);
 }
Example #35
 public BexisAnalyzer()
 {
     stoptable     = StopFilter.MakeStopSet(GERMAN_STOP_WORDS);
     SynonymEngine = new BexisSynonymEngine();
 }
Example #36
        /// <summary>
        /// Creates a TokenStream which tokenizes all the text in the provided TextReader.
        /// </summary>
        /// <param name="fieldName"></param>
        /// <param name="reader"></param>
        /// <returns>A TokenStream built from a StandardTokenizer filtered with StandardFilter, StopFilter, and DutchStemFilter</returns>
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream result = new StandardTokenizer(reader);
            result = new StandardFilter(result);
            result = new StopFilter(result, stoptable);
            result = new DutchStemFilter(result, excltable, _stemdict);
            return result;
        }
Example #37
        /*
         * Builds an exclusion list from an array of Strings.
         * @deprecated use {@link #BrazilianAnalyzer(Version, Set, Set)} instead
         */

        public void SetStemExclusionTable(params string[] exclusionlist)
        {
            excltable           = StopFilter.MakeStopSet(exclusionlist);
            PreviousTokenStream = null;    // force a new stemmer to be created
        }
Example #38
 /// <summary>
 /// Builds an analyzer with the given stop words.
 /// </summary>
 /// <param name="stopwords"></param>
 public DutchAnalyzer(String[] stopwords)
 {
     stoptable = StopFilter.MakeStopSet(stopwords);
 }
Example #39
 protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
 {
     var stopWords = stopWordsPerField[fieldName];
     if (stopWords == null)
     {
         return components;
     }
     var stopFilter = new StopFilter(matchVersion, components.TokenStream, new CharArraySet(matchVersion, stopWords, false));
     return new TokenStreamComponents(components.Tokenizer, stopFilter);
 }
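The WrapComponents hook above is the AnalyzerWrapper pattern: the wrapped analyzer builds its normal chain, and the wrapper appends a per-field StopFilter on top. A minimal subclass sketch, assuming Lucene.Net 4.8's AnalyzerWrapper extension points (GetWrappedAnalyzer and WrapComponents) and illustrative names:

 // Sketch: wrap an existing analyzer and append a StopFilter to every field.
 public sealed class StopWordWrappingAnalyzer : AnalyzerWrapper
 {
     private readonly Analyzer inner;
     private readonly CharArraySet stopWords;

     public StopWordWrappingAnalyzer(Analyzer inner, CharArraySet stopWords)
     {
         this.inner = inner;
         this.stopWords = stopWords;
     }

     protected override Analyzer GetWrappedAnalyzer(string fieldName)
     {
         return inner; // same delegate analyzer for every field
     }

     protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
     {
         var stop = new StopFilter(LuceneVersion.LUCENE_48, components.TokenStream, stopWords);
         return new TokenStreamComponents(components.Tokenizer, stop);
     }
 }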
Example #40
        //~ Constructors -----------------------------------------------------------

        /**
         * Builds an analyzer which removes words in {@link #STOP_WORDS}.
         */
        public PanGuAnalyzer()
        {
            stopTable = StopFilter.MakeStopSet(STOP_WORDS);
        }
Example #41
 public StandardAnalyzer(System.String[] stopWords) : this(Version.LUCENE_24, StopFilter.MakeStopSet(stopWords))
 {
 }
Example #42
        /**
         * Builds an analyzer which removes words in the provided array.
         *
         * @param stopWords stop word array
         */
        public PanGuAnalyzer(string[] stopWords)
        {
            stopTable = StopFilter.MakeStopSet(stopWords);
        }
Example #43
        /// <summary>
        /// Creates a
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> A
        ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from an <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="SoraniNormalizationFilter"/>,
        ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>,
        ///         <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided and <seealso cref="SoraniStemFilter"/>. </returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer source = new StandardTokenizer(matchVersion, reader);
            TokenStream result = new StandardFilter(matchVersion, source);
            result = new SoraniNormalizationFilter(result);
            result = new LowerCaseFilter(matchVersion, result);
            result = new StopFilter(matchVersion, result, stopwords);
            if (stemExclusionSet.Count > 0)
            {
                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
            }
            result = new SoraniStemFilter(result);
            return new TokenStreamComponents(source, result);
        }
Example #44
        /*
         * Builds an analyzer with the given stop words.
         * @deprecated use {@link #BrazilianAnalyzer(Version, Set)} instead
         */

        public BrazilianAnalyzerCustom(Lucene.Net.Util.Version matchVersion, params string[] stopwords)
            : this(matchVersion, StopFilter.MakeStopSet(stopwords))
        {
        }
Example #45
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new HebrewTokenizer(reader, PrefixTree);

            // Niqqud normalization
            result = new NiqqudFilter(result);

            // TODO: should we ignoreCase in StopFilter?
            result = new StopFilter(enableStopPositionIncrements, result, STOP_WORDS_SET);

            // TODO: Apply LowerCaseFilter to NonHebrew tokens only
            result = new LowerCaseFilter(result);

            if (suffixByTokenType != null && suffixByTokenType.Count > 0)
                result = new AddSuffixFilter(result, suffixByTokenType);

            return result;
        }
Example #46
        /// <summary>
        /// Creates
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="GreekLowerCaseFilter"/>, <seealso cref="StandardFilter"/>,
        ///         <seealso cref="StopFilter"/>, and <seealso cref="GreekStemFilter"/> </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer source = new StandardTokenizer(matchVersion, reader);
            TokenStream result = new GreekLowerCaseFilter(matchVersion, source);
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
            {
                result = new StandardFilter(matchVersion, result);
            }
            result = new StopFilter(matchVersion, result, stopwords);
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
            {
                result = new GreekStemFilter(result);
            }
            return new TokenStreamComponents(source, result);
        }
Example #47
 /// <summary>
 /// Creates a
 /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
 /// </summary>
 /// <returns> A
 ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 ///         built from an <seealso cref="StandardTokenizer"/> filtered with
 ///         <seealso cref="StandardFilter"/>, <seealso cref="TurkishLowerCaseFilter"/>,
 ///         <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem
 ///         exclusion set is provided and <seealso cref="SnowballFilter"/>. </returns>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, source);
     if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_48))
     {
         result = new ApostropheFilter(result);
     }
     result = new TurkishLowerCaseFilter(result);
     result = new StopFilter(matchVersion, result, stopwords);
     if (stemExclusionSet.Any())
     {
         result = new SetKeywordMarkerFilter(result, stemExclusionSet);
     }
     result = new SnowballFilter(result, new TurkishStemmer());
     return new TokenStreamComponents(source, result);
 }
Example #48
        /// <summary>
        /// Creates
        /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
        /// </summary>
        /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
        ///         built from a <seealso cref="StandardTokenizer"/> filtered with
        ///         <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
        ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
        ///         provided, and <seealso cref="SnowballFilter"/> </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
#pragma warning disable 612, 618
            if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
            {
                Tokenizer source = new StandardTokenizer(matchVersion, reader);
                TokenStream result = new StandardFilter(matchVersion, source);
                result = new LowerCaseFilter(matchVersion, result);
                result = new StopFilter(matchVersion, result, stopwords);
                if (stemExclusionSet.Count > 0)
                {
                    result = new SetKeywordMarkerFilter(result, stemExclusionSet);
                }
                result = new SnowballFilter(result, new Tartarus.Snowball.Ext.RussianStemmer());
                return new TokenStreamComponents(source, result);
            }
            else
            {
#pragma warning disable 612, 618
                Tokenizer source = new RussianLetterTokenizer(matchVersion, reader);
#pragma warning restore 612, 618
                TokenStream result = new LowerCaseFilter(matchVersion, source);
                result = new StopFilter(matchVersion, result, stopwords);
                if (stemExclusionSet.Count > 0)
                {
                    result = new SetKeywordMarkerFilter(result, stemExclusionSet);
                }
                result = new SnowballFilter(result, new Tartarus.Snowball.Ext.RussianStemmer());
                return new TokenStreamComponents(source, result);
            }
        }
Example #49
 public void SetStemExclusionTable(String[] exclusionlist)
 {
     exclusionSet        = StopFilter.MakeStopSet(exclusionlist);
     PreviousTokenStream = null;
 }
Example #50
 public override TokenStream Create(TokenStream input)
 {
     StopFilter stopFilter = new StopFilter(luceneMatchVersion, input, stopWords);
     stopFilter.EnablePositionIncrements = enablePositionIncrements;
     return stopFilter;
 }
Example #51
 /// <summary>
 /// Builds an analyzer which removes words in the provided array.
 /// </summary>
 /// <param name="stopWords">stop word array</param>
 public CJKAnalyzer(Version matchVersion, params string[] stopWords)
 {
     stopTable         = StopFilter.MakeStopSet(stopWords);
     this.matchVersion = matchVersion;
 }
Example #52
 /**
  * Builds an analyzer with the default stop words ({@link #BRAZILIAN_STOP_WORDS}).
  */
 public BrazilianAnalyzer()
 {
     stoptable = StopFilter.MakeStopSet(BRAZILIAN_STOP_WORDS);
 }
Example #53
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     CharArraySet stopSet = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, "into");
     Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
     TokenFilter filter = new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet);
     return new TokenStreamComponents(tokenizer, filter);
 }
Example #54
 /**
  * Builds an analyzer with the given stop words.
  */
 public BrazilianAnalyzer(string[] stopwords)
 {
     stoptable = StopFilter.MakeStopSet(stopwords);
 }
Example #55
 public override TokenStream TokenStream(String fieldName, TextReader reader)
 {
     TokenStream result;
     try {
       result = _delegate.ReusableTokenStream(fieldName, reader);
     } catch (IOException) {
       result = _delegate.TokenStream(fieldName, reader);
     }
     var stopWords = stopWordsPerField[fieldName];
     if (stopWords != null) {
       result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                       result, stopWords);
     }
     return result;
 }
Example #56
 public SnowballAnalyzer(Version matchVersion, System.String name, System.String[] stopWords)
     : this(matchVersion, name)
 {
     stopSet = StopFilter.MakeStopSet(stopWords);
 }
Example #57
 /**
  * Builds an analyzer with the default stop words ({@link #STOP_WORDS}).
  */
 public LithuanianAnalyzer()
 {
     stoptable = StopFilter.MakeStopSet(STOP_WORDS);
 }
Example #58
 public NGramAnalyzer()
 {
     stoptable = StopFilter.MakeStopSet(GERMAN_STOP_WORDS);
 }
Example #59
 /**
  * Builds an exclusion list from an array of Strings.
  */
 public void SetStemExclusionTable(string[] exclusionlist)
 {
     excltable = StopFilter.MakeStopSet(exclusionlist);
 }
Example #60
 /// <summary>
 /// Creates
 /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 /// used to tokenize all the text in the provided <seealso cref="Reader"/>.
 /// </summary>
 /// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
 ///         built from a <seealso cref="StandardTokenizer"/> filtered with
 ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StandardFilter"/>, <seealso cref="StopFilter"/>
 ///         , and <seealso cref="BrazilianStemFilter"/>. </returns>
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new LowerCaseFilter(matchVersion, source);
     result = new StandardFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if (excltable != null && excltable.Count > 0)
     {
         result = new SetKeywordMarkerFilter(result, excltable);
     }
     return new TokenStreamComponents(source, new BrazilianStemFilter(result));
 }