/// <summary>
/// Runs the single token "book" through a <see cref="StemmerOverrideFilter"/>
/// built from a builder with no entries, followed by a
/// <see cref="PorterStemFilter"/>, and expects "book" to come out unchanged.
/// </summary>
public virtual void TestNoOverrides()
{
    // No Add() calls: the override map is built empty.
    StemmerOverrideFilter.Builder emptyOverrides = new StemmerOverrideFilter.Builder(true);

    Tokenizer source = new KeywordTokenizer(new StringReader("book"));
    TokenStream overridden = new StemmerOverrideFilter(source, emptyOverrides.Build());
    TokenStream stemmed = new PorterStemFilter(overridden);

    string[] expected = new string[] { "book" };
    AssertTokenStreamContents(stemmed, expected);
}
 /// <summary>
 /// Tokenizes "yourselves yours" with "yourselves" registered in the keyword
 /// set of a <see cref="SetKeywordMarkerFilter"/>. After the
 /// <see cref="PorterStemFilter"/> the expected tokens are "yourselves"
 /// (left as is) and "your" (stemmed from "yours").
 /// </summary>
 public virtual void TestWithKeywordAttribute()
 {
     CharArraySet keywords = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
     keywords.add("yourselves");

     StringReader text = new StringReader("yourselves yours");
     Tokenizer source = new MockTokenizer(text, MockTokenizer.WHITESPACE, false);
     TokenStream marked = new SetKeywordMarkerFilter(source, keywords);
     TokenStream stemmed = new PorterStemFilter(marked);

     string[] expected = new string[] { "yourselves", "your" };
     AssertTokenStreamContents(stemmed, expected);
 }
Example #3
0
        /// <summary>
        /// Checks the interaction of <see cref="SetKeywordMarkerFilter"/> and
        /// <see cref="PorterStemFilter"/>: for the input "yourselves yours", with
        /// "yourselves" in the keyword set, the expected output is
        /// "yourselves" followed by "your".
        /// </summary>
        public virtual void TestWithKeywordAttribute()
        {
            // Keyword set containing the one term expected to pass through unstemmed.
            CharArraySet protectedTerms = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
            protectedTerms.add("yourselves");

            Tokenizer whitespaceSplit = new MockTokenizer(
                new StringReader("yourselves yours"),
                MockTokenizer.WHITESPACE,
                false);
            TokenStream keywordMarked = new SetKeywordMarkerFilter(whitespaceSplit, protectedTerms);
            TokenStream porter = new PorterStemFilter(keywordMarked);

            AssertTokenStreamContents(porter, new string[] { "yourselves", "your" });
        }
 /// <summary>
 /// Registers the override "boOkEd" -> "books" and feeds the differently
 /// cased token "BooKeD" through the override and Porter filters; the
 /// expected output is "books".
 /// </summary>
 public virtual void TestIgnoreCase()
 {
     // Let's make "booked" stem to "books": the override filter converts
     // "booked" to "books" and also marks it with KeywordAttribute, so the
     // Porter stemmer will not change it afterwards.
     StemmerOverrideFilter.Builder overrides = new StemmerOverrideFilter.Builder(true);
     overrides.Add("boOkEd", "books");

     Tokenizer source = new KeywordTokenizer(new StringReader("BooKeD"));
     TokenStream overridden = new StemmerOverrideFilter(source, overrides.Build());
     TokenStream stemmed = new PorterStemFilter(overridden);

     AssertTokenStreamContents(stemmed, new string[] { "books" });
 }
Example #5
0
        /// <summary>
        /// Creates the <see cref="TokenStreamComponents"/> that tokenize all the
        /// text in the provided <see cref="TextReader"/>.
        /// </summary>
        /// <param name="fieldName"> Name of the field; not read by this method. </param>
        /// <param name="reader"> Reader supplying the text to tokenize. </param>
        /// <returns>
        /// <see cref="TokenStreamComponents"/> built from a
        /// <see cref="StandardTokenizer"/> filtered with
        /// <see cref="StandardFilter"/>, <see cref="EnglishPossessiveFilter"/>
        /// (only when the match version is on or after LUCENE_31),
        /// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
        /// <see cref="SetKeywordMarkerFilter"/> (only when the stem exclusion set
        /// is non-empty) and <see cref="PorterStemFilter"/>.
        /// </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream chain = new StandardFilter(matchVersion, tokenizer);

            // Prior to LUCENE_31 we get the classic behavior: StandardFilter
            // does the possessive handling for us.
            bool addPossessiveFilter = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
            if (addPossessiveFilter)
            {
                chain = new EnglishPossessiveFilter(matchVersion, chain);
            }

            chain = new LowerCaseFilter(matchVersion, chain);
            chain = new StopFilter(matchVersion, chain, stopwords);

            // Only mark keywords when there is something to exclude from stemming.
            bool hasStemExclusions = stemExclusionSet.Any();
            if (hasStemExclusions)
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }

            chain = new PorterStemFilter(chain);
            return new TokenStreamComponents(tokenizer, chain);
        }
Example #6
0
        /// <summary>
        /// Builds the <see cref="TokenStreamComponents"/> used to tokenize all
        /// the text read from the given <see cref="TextReader"/>.
        /// </summary>
        /// <param name="fieldName"> Name of the field; not read by this method. </param>
        /// <param name="reader"> Reader supplying the text to tokenize. </param>
        /// <returns>
        /// <see cref="TokenStreamComponents"/> built from a
        /// <see cref="StandardTokenizer"/> filtered with
        /// <see cref="StandardFilter"/>, <see cref="EnglishPossessiveFilter"/>
        /// (only when the match version is on or after LUCENE_31),
        /// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
        /// <see cref="SetKeywordMarkerFilter"/> (only when the stem exclusion set
        /// is non-empty) and <see cref="PorterStemFilter"/>.
        /// </returns>
        protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(m_matchVersion, reader);
            TokenStream chain = new StandardFilter(m_matchVersion, tokenizer);

            // Prior to LUCENE_31 we get the classic behavior: StandardFilter
            // does the possessive handling for us.
#pragma warning disable 612, 618
            bool addPossessiveFilter = m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
            if (addPossessiveFilter)
            {
                chain = new EnglishPossessiveFilter(m_matchVersion, chain);
            }

            chain = new LowerCaseFilter(m_matchVersion, chain);
            chain = new StopFilter(m_matchVersion, chain, m_stopwords);

            // Only mark keywords when there is something to exclude from stemming.
            bool hasStemExclusions = stemExclusionSet.Count > 0;
            if (hasStemExclusions)
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }

            chain = new PorterStemFilter(chain);
            return new TokenStreamComponents(tokenizer, chain);
        }
        /// <summary>
        /// Randomized test for <see cref="StemmerOverrideFilter"/> behind a
        /// <see cref="WhitespaceTokenizer"/>. Override keys are random "realistic"
        /// Unicode strings with whitespace characters removed; each key is mapped
        /// to a random simple string ("a" when that string is empty). A random,
        /// non-empty subset of the keys is joined with spaces and analyzed, and the
        /// resulting tokens are expected to be the corresponding override values,
        /// in order.
        /// </summary>
        public virtual void TestRandomRealisticWhiteSpace()
        {
            IDictionary<string, string> map = new Dictionary<string, string>();
            int numTerms = AtLeast(50);
            for (int i = 0; i < numTerms; i++)
            {
                string randomRealisticUnicodeString = TestUtil.RandomRealisticUnicodeString(Random());
                char[] charArray = randomRealisticUnicodeString.ToCharArray();
                StringBuilder sb = new StringBuilder();
                for (int j = 0; j < charArray.Length;)
                {
                    int cp = Character.CodePointAt(charArray, j, charArray.Length);
                    int charCount = Character.CharCount(cp);
                    // NOTE(review): the (char) cast truncates supplementary code points;
                    // kept unchanged so the filtering rule itself is not altered here —
                    // confirm it matches the whitespace test used by WhitespaceTokenizer.
                    if (!char.IsWhiteSpace((char)cp))
                    {
                        // Copy the original UTF-16 unit(s) of this code point.
                        // StringBuilder.Append(int) would append the decimal digits of
                        // the code point value instead of the character itself.
                        sb.Append(charArray, j, charCount);
                    }
                    j += charCount;
                }
                if (sb.Length > 0)
                {
                    string value = TestUtil.RandomSimpleString(Random());
                    map[sb.ToString()] = value.Length == 0 ? "a" : value;
                }
            }
            if (map.Count == 0)
            {
                // Guarantee at least one override so the stream below is never empty.
                map["booked"] = "books";
            }
            StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(Random().nextBoolean());
            StringBuilder input = new StringBuilder();
            IList<string> output = new List<string>();
            foreach (KeyValuePair<string, string> entry in map)
            {
                builder.Add(entry.Key, entry.Value);
                // Always include the first entry so the input has at least one token.
                if (Random().nextBoolean() || output.Count == 0)
                {
                    input.Append(entry.Key).Append(" ");
                    output.Add(entry.Value);
                }
            }
            Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input.ToString()));
            TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));
            AssertTokenStreamContents(stream, output.ToArray());
        }
 /// <summary>
 /// Randomized test for <see cref="StemmerOverrideFilter"/> behind a
 /// <see cref="KeywordTokenizer"/>. Non-empty random "realistic" Unicode
 /// strings are mapped to random simple strings ("a" when that string is
 /// empty); a random subset of the keys is then analyzed one at a time and
 /// each is expected to produce exactly its override value.
 /// </summary>
 public virtual void TestRandomRealisticKeyword()
 {
     IDictionary<string, string> overrides = new Dictionary<string, string>();
     int termCount = AtLeast(50);
     for (int n = 0; n < termCount; n++)
     {
         string key = TestUtil.RandomRealisticUnicodeString(Random());
         if (key.Length == 0)
         {
             continue;
         }

         string replacement = TestUtil.RandomSimpleString(Random());
         if (replacement.Length == 0)
         {
             replacement = "a";
         }
         overrides[key] = replacement;
     }

     // Fall back to a fixed entry when no usable random key was produced.
     if (overrides.Count == 0)
     {
         overrides["booked"] = "books";
     }

     bool builderFlag = Random().nextBoolean();
     StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(builderFlag);
     foreach (KeyValuePair<string, string> pair in overrides)
     {
         builder.Add(pair.Key, pair.Value);
     }
     StemmerOverrideFilter.StemmerOverrideMap overrideMap = builder.Build();

     // The built map is shared across all the per-key token streams below.
     foreach (KeyValuePair<string, string> pair in overrides)
     {
         if (!Random().nextBoolean())
         {
             continue;
         }

         Tokenizer source = new KeywordTokenizer(new StringReader(pair.Key));
         TokenStream overridden = new StemmerOverrideFilter(source, overrideMap);
         TokenStream stemmed = new PorterStemFilter(overridden);
         AssertTokenStreamContents(stemmed, new string[] { pair.Value });
     }
 }
        /// <summary>
        /// Assembles the <see cref="TokenStreamComponents"/> that tokenize all the
        /// text in the provided <see cref="TextReader"/>.
        /// </summary>
        /// <param name="fieldName"> Name of the field; not read by this method. </param>
        /// <param name="reader"> Reader supplying the text to tokenize. </param>
        /// <returns>
        /// <see cref="TokenStreamComponents"/> built from a
        /// <see cref="StandardTokenizer"/> filtered with
        /// <see cref="StandardFilter"/>, <see cref="EnglishPossessiveFilter"/>
        /// (only when the match version is on or after LUCENE_31),
        /// <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
        /// <see cref="SetKeywordMarkerFilter"/> (only when the stem exclusion set
        /// is non-empty) and <see cref="PorterStemFilter"/>.
        /// </returns>
        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
            TokenStream chain = new StandardFilter(matchVersion, tokenizer);

            // Prior to LUCENE_31 we get the classic behavior: StandardFilter
            // does the possessive handling for us.
#pragma warning disable 612, 618
            bool addPossessiveFilter = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
            if (addPossessiveFilter)
            {
                chain = new EnglishPossessiveFilter(matchVersion, chain);
            }

            chain = new LowerCaseFilter(matchVersion, chain);
            chain = new StopFilter(matchVersion, chain, stopwords);

            // Only mark keywords when there is something to exclude from stemming.
            bool hasStemExclusions = stemExclusionSet.Any();
            if (hasStemExclusions)
            {
                chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
            }

            chain = new PorterStemFilter(chain);
            return new TokenStreamComponents(tokenizer, chain);
        }