This builder builds an FST for the StemmerOverrideFilter
 public virtual void TestNoOverrides()
 {
     // Build an override map without registering any entries, then run the
     // single keyword token "book" through the override filter followed by
     // the Porter stemmer. The token is expected to come out as "book".
     var emptyOverrides = new StemmerOverrideFilter.Builder(true).Build();
     Tokenizer source = new KeywordTokenizer(new StringReader("book"));
     TokenStream overridden = new StemmerOverrideFilter(source, emptyOverrides);
     TokenStream stemmed = new PorterStemFilter(overridden);

     string[] expected = { "book" };
     AssertTokenStreamContents(stemmed, expected);
 }
        public virtual void TestNoOverrides()
        {
            // No entries are added to the builder, so the override map is empty.
            var overrideBuilder = new StemmerOverrideFilter.Builder(true);
            var keywordSource = new KeywordTokenizer(new StringReader("book"));

            // Chain: keyword tokenizer -> override filter -> Porter stemmer.
            var overrideFilter = new StemmerOverrideFilter(keywordSource, overrideBuilder.Build());
            TokenStream pipeline = new PorterStemFilter(overrideFilter);

            // The single token must still read "book" at the end of the chain.
            AssertTokenStreamContents(pipeline, new[] { "book" });
        }
 public virtual void TestIgnoreCase()
 {
     // The override is registered as "boOkEd" -> "books" while the input is
     // spelled "BooKeD"; the builder is created with 'true' so the differing
     // letter case must still match. The override filter rewrites the token
     // to "books" and marks it with KeywordAttribute so that the Porter
     // stemmer leaves it alone.
     var overrides = new StemmerOverrideFilter.Builder(true);
     overrides.Add("boOkEd", "books");

     Tokenizer source = new KeywordTokenizer(new StringReader("BooKeD"));
     TokenStream overridden = new StemmerOverrideFilter(source, overrides.Build());
     TokenStream stemmed = new PorterStemFilter(overridden);

     string[] expected = { "books" };
     AssertTokenStreamContents(stemmed, expected);
 }
        public virtual void TestIgnoreCase()
        {
            // Register a mixed-case override ("boOkEd" -> "books") on a builder
            // constructed with 'true'.
            var overrideBuilder = new StemmerOverrideFilter.Builder(true);
            overrideBuilder.Add("boOkEd", "books");

            // Feed a differently-cased spelling of the same word. The override
            // filter turns it into "books" and flags it with KeywordAttribute,
            // which keeps the Porter stemmer from changing it afterwards.
            var keywordSource = new KeywordTokenizer(new StringReader("BooKeD"));
            var overrideFilter = new StemmerOverrideFilter(keywordSource, overrideBuilder.Build());
            TokenStream pipeline = new PorterStemFilter(overrideFilter);

            AssertTokenStreamContents(pipeline, new[] { "books" });
        }
        public virtual void TestRandomRealisticWhiteSpace()
        {
            // Builds a random dictionary of whitespace-free keys mapped to simple
            // replacement strings, then checks that a whitespace-tokenized input
            // made of some of those keys is rewritten to the mapped values.
            var overrides = new Dictionary<string, string>();
            int termCount = AtLeast(50);

            for (int n = 0; n < termCount; n++)
            {
                char[] chars = TestUtil.RandomRealisticUnicodeString(Random).ToCharArray();
                var key = new StringBuilder();

                // Walk the string one code point at a time and keep everything
                // that is not whitespace, so the key survives WhitespaceTokenizer
                // as a single token.
                int pos = 0;
                while (pos < chars.Length)
                {
                    int codePoint = Character.CodePointAt(chars, pos, chars.Length);
                    pos += Character.CharCount(codePoint);
                    if (char.IsWhiteSpace((char)codePoint))
                    {
                        continue;
                    }
                    key.AppendCodePoint(codePoint);
                }

                if (key.Length == 0)
                {
                    continue;
                }

                // An empty replacement is swapped for "a".
                string replacement = TestUtil.RandomSimpleString(Random);
                if (replacement.Length == 0)
                {
                    replacement = "a";
                }
                overrides[key.ToString()] = replacement;
            }

            // Guarantee at least one mapping.
            if (overrides.Count == 0)
            {
                overrides["booked"] = "books";
            }

            var overrideBuilder = new StemmerOverrideFilter.Builder(Random.nextBoolean());
            var text = new StringBuilder();
            var expected = new List<string>();

            foreach (var pair in overrides)
            {
                overrideBuilder.Add(pair.Key, pair.Value);

                // Randomly pick which keys go into the input; the first key is
                // always taken once nothing has been selected yet.
                bool include = Random.nextBoolean() || expected.Count == 0;
                if (!include)
                {
                    continue;
                }
                text.Append(pair.Key);
                text.Append(" ");
                expected.Add(pair.Value);
            }

            var source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(text.ToString()));
            var overrideFilter = new StemmerOverrideFilter(source, overrideBuilder.Build());
            TokenStream pipeline = new PorterStemFilter(overrideFilter);

            AssertTokenStreamContents(pipeline, expected.ToArray());
        }
 public virtual void Inform(IResourceLoader loader)
 {
     // Loads the stemmer override dictionary from the configured files.
     // Each line is split at its first tab into a key (left part) and the
     // replacement (right part). When no files are configured, or the file
     // list is empty, 'dictionary' is left untouched.
     if (dictionaryFiles == null)
     {
         return;
     }

     AssureMatchVersion();
     IList <string> files = SplitFileNames(dictionaryFiles);
     if (files.Count == 0)
     {
         return;
     }

     StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(ignoreCase);

     // Allocated once instead of building a new Regex for every line.
     char[] tabSeparator = { '\t' };
     foreach (string file in files)
     {
         IList <string> lines = GetLines(loader, file.Trim());
         foreach (string line in lines)
         {
             // At most two parts: everything after the first tab stays intact.
             // NOTE(review): a line without a tab produces a single-element
             // array, so mapping[1] throws IndexOutOfRangeException, same as before.
             string[] mapping = line.Split(tabSeparator, 2);
             builder.Add(mapping[0], mapping[1]);
         }
     }
     dictionary = builder.Build();
 }
 public virtual void Inform(IResourceLoader loader)
 {
     // Loads the stemmer override dictionary from the configured files.
     // Each line is split at its first tab into a key (left part) and the
     // replacement (right part). When no files are configured, or the file
     // sequence is empty, 'dictionary' is left untouched.
     if (dictionaryFiles == null)
     {
         return;
     }

     AssureMatchVersion();
     IEnumerable<string> files = SplitFileNames(dictionaryFiles);

     // Any() stops at the first element; Count() would walk the whole sequence.
     if (!files.Any())
     {
         return;
     }

     StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(ignoreCase);

     // Allocated once instead of building a new Regex for every line.
     char[] tabSeparator = { '\t' };
     foreach (string file in files)
     {
         IEnumerable<string> lines = GetLines(loader, file.Trim());
         foreach (string line in lines)
         {
             // At most two parts: everything after the first tab stays intact.
             // NOTE(review): a line without a tab produces a single-element
             // array, so mapping[1] throws IndexOutOfRangeException, same as before.
             string[] mapping = line.Split(tabSeparator, 2);
             builder.Add(mapping[0], mapping[1]);
         }
     }
     dictionary = builder.Build();
 }
        public virtual void TestRandomRealisticKeyword()
        {
            // Builds a random dictionary of non-empty Unicode keys mapped to
            // simple replacement strings, then feeds a random subset of the keys
            // through a KeywordTokenizer and expects the mapped value back.
            var overrides = new Dictionary<string, string>();
            int termCount = AtLeast(50);

            for (int n = 0; n < termCount; n++)
            {
                string key = TestUtil.RandomRealisticUnicodeString(Random);
                if (key.Length == 0)
                {
                    continue;
                }

                // An empty replacement is swapped for "a".
                string replacement = TestUtil.RandomSimpleString(Random);
                if (replacement.Length == 0)
                {
                    replacement = "a";
                }
                overrides[key] = replacement;
            }

            // Guarantee at least one mapping.
            if (overrides.Count == 0)
            {
                overrides["booked"] = "books";
            }

            var overrideBuilder = new StemmerOverrideFilter.Builder(Random.nextBoolean());
            foreach (var pair in overrides)
            {
                overrideBuilder.Add(pair.Key, pair.Value);
            }

            // The map is built once and shared by every stream created below.
            StemmerOverrideFilter.StemmerOverrideMap overrideMap = overrideBuilder.Build();
            foreach (var pair in overrides)
            {
                if (!Random.nextBoolean())
                {
                    continue;
                }

                var source = new KeywordTokenizer(new StringReader(pair.Key));
                TokenStream pipeline = new PorterStemFilter(new StemmerOverrideFilter(source, overrideMap));
                AssertTokenStreamContents(pipeline, new[] { pair.Value });
            }
        }
        public virtual void TestRandomRealisticWhiteSpace()
        {
            // Builds a random dictionary of whitespace-free keys mapped to simple
            // replacement strings, then checks that a whitespace-tokenized input
            // made of some of those keys is rewritten to the mapped values.
            IDictionary<string, string> map = new Dictionary<string, string>();
            int numTerms = AtLeast(50);
            for (int i = 0; i < numTerms; i++)
            {
                string randomRealisticUnicodeString = TestUtil.RandomRealisticUnicodeString(Random());
                char[] charArray = randomRealisticUnicodeString.ToCharArray();
                StringBuilder sb = new StringBuilder();
                for (int j = 0; j < charArray.Length;)
                {
                    int cp = Character.CodePointAt(charArray, j, charArray.Length);
                    if (!char.IsWhiteSpace((char)cp))
                    {
                        // Append the character(s) of the code point itself.
                        // StringBuilder.Append(int) would append the decimal
                        // digits of the number (e.g. "98" instead of "b").
                        if (cp <= char.MaxValue)
                        {
                            sb.Append((char)cp);
                        }
                        else
                        {
                            // Supplementary code point: expands to a surrogate pair.
                            sb.Append(char.ConvertFromUtf32(cp));
                        }
                    }
                    j += Character.CharCount(cp);
                }
                if (sb.Length > 0)
                {
                    // An empty replacement is swapped for "a".
                    string value = TestUtil.RandomSimpleString(Random());
                    map[sb.ToString()] = value.Length == 0 ? "a" : value;
                }
            }
            // Guarantee at least one mapping.
            if (map.Count == 0)
            {
                map["booked"] = "books";
            }
            StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(Random().nextBoolean());
            StringBuilder input = new StringBuilder();
            IList<string> output = new List<string>();
            foreach (KeyValuePair<string, string> entry in map)
            {
                builder.Add(entry.Key, entry.Value);
                // Randomly pick which keys go into the input; the first key is
                // always taken once nothing has been selected yet.
                if (Random().nextBoolean() || output.Count == 0)
                {
                    input.Append(entry.Key).Append(" ");
                    output.Add(entry.Value);
                }
            }
            Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input.ToString()));
            TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));
            AssertTokenStreamContents(stream, output.ToArray());
        }
 public virtual void TestRandomRealisticKeyword()
 {
     // Builds a random dictionary of non-empty Unicode keys mapped to
     // simple replacement strings, then feeds a random subset of the keys
     // through a KeywordTokenizer and expects the mapped value back.
     var overrides = new Dictionary<string, string>();
     int termCount = AtLeast(50);

     for (int n = 0; n < termCount; n++)
     {
         string key = TestUtil.RandomRealisticUnicodeString(Random());
         if (key.Length == 0)
         {
             continue;
         }

         // An empty replacement is swapped for "a".
         string replacement = TestUtil.RandomSimpleString(Random());
         if (replacement.Length == 0)
         {
             replacement = "a";
         }
         overrides[key] = replacement;
     }

     // Guarantee at least one mapping.
     if (overrides.Count == 0)
     {
         overrides["booked"] = "books";
     }

     var overrideBuilder = new StemmerOverrideFilter.Builder(Random().nextBoolean());
     foreach (var pair in overrides)
     {
         overrideBuilder.Add(pair.Key, pair.Value);
     }

     // The map is built once and shared by every stream created below.
     StemmerOverrideFilter.StemmerOverrideMap overrideMap = overrideBuilder.Build();
     foreach (var pair in overrides)
     {
         if (!Random().nextBoolean())
         {
             continue;
         }

         var source = new KeywordTokenizer(new StringReader(pair.Key));
         TokenStream pipeline = new PorterStemFilter(new StemmerOverrideFilter(source, overrideMap));
         AssertTokenStreamContents(pipeline, new[] { pair.Value });
     }
 }
Example #11
0
 public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<string> stemOverrideDict)
 {
     // Stores unmodifiable copies of the stop word and stem exclusion sets and
     // prepares the stem override dictionary in one of two forms:
     //  - empty dictionary, or a match version before LUCENE_31: keep an
     //    unmodifiable copy of the original map in origStemdict, stemdict = null;
     //  - otherwise: build a StemmerOverrideFilter map into stemdict,
     //    origStemdict = null.
     this.matchVersion = matchVersion;
     this.stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stopwords));
     this.excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionTable));
     #pragma warning disable 612, 618
     if (stemOverrideDict.Count == 0 || !matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
     #pragma warning restore 612, 618
     {
         this.stemdict = null;
         this.origStemdict = CharArrayMap.UnmodifiableMap(CharArrayMap.Copy(matchVersion, stemOverrideDict));
     }
     else
     {
         this.origStemdict = null;
         // we don't need to ignore case here since we lowercase in this analyzer anyway
         StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
         CharArrayMap<string>.EntryIterator iter = (CharArrayMap<string>.EntryIterator)stemOverrideDict.EntrySet().GetEnumerator();
         while (iter.HasNext)
         {
             char[] nextKey = iter.NextKey();
             // Build the key from exactly this entry's characters. The key used
             // to be routed through a reused CharsRef whose whole backing array
             // was converted to a string, ignoring its offset/length, so a key
             // could pick up trailing characters if that buffer was larger than
             // the key just copied into it.
             builder.Add(new string(nextKey), iter.CurrentValue);
         }
         try
         {
             this.stemdict = builder.Build();
         }
         catch (IOException ex)
         {
             throw new Exception("can not build stem dict", ex);
         }
     }
 }