/// <summary>
/// Runs the single token "book" through a <see cref="StemmerOverrideFilter"/> whose
/// override map was built without any entries, followed by a
/// <see cref="PorterStemFilter"/>, and expects the token "book" to come out.
/// </summary>
public virtual void TestNoOverrides()
{
    // Builder is created with ignoreCase = true, but nothing is ever added to it.
    StemmerOverrideFilter.Builder emptyBuilder = new StemmerOverrideFilter.Builder(true);

    StringReader source = new StringReader("book");
    Tokenizer keywordTokenizer = new KeywordTokenizer(source);

    TokenStream overridden = new StemmerOverrideFilter(keywordTokenizer, emptyBuilder.Build());
    TokenStream stemmed = new PorterStemFilter(overridden);

    string[] expected = { "book" };
    AssertTokenStreamContents(stemmed, expected);
}
/// <summary>
/// Checks case-insensitive override lookup: the mapping is registered under the key
/// "boOkEd" and the input token is "BooKeD"; the expected output token is "books".
/// </summary>
public virtual void TestIgnoreCase()
{
    // Make "booked" stem to "books".
    // The override filter converts "booked" to "books" and also marks the token with
    // KeywordAttribute so that the Porter stemmer will not change it afterwards.
    StemmerOverrideFilter.Builder overrides = new StemmerOverrideFilter.Builder(true);
    overrides.Add("boOkEd", "books");

    StringReader source = new StringReader("BooKeD");
    Tokenizer keywordTokenizer = new KeywordTokenizer(source);

    TokenStream overridden = new StemmerOverrideFilter(keywordTokenizer, overrides.Build());
    TokenStream stemmed = new PorterStemFilter(overridden);

    string[] expected = { "books" };
    AssertTokenStreamContents(stemmed, expected);
}
/// <summary>
/// Randomized test: builds a map of whitespace-free random Unicode keys to random
/// simple-string values, registers all of them as stemmer overrides, feeds a random
/// subset of the keys (space separated) through a <see cref="WhitespaceTokenizer"/>,
/// <see cref="StemmerOverrideFilter"/> and <see cref="PorterStemFilter"/>, and expects
/// the corresponding override values in the same order.
/// </summary>
public virtual void TestRandomRealisticWhiteSpace()
{
    IDictionary<string, string> map = new Dictionary<string, string>();
    int numTerms = AtLeast(50);
    for (int i = 0; i < numTerms; i++)
    {
        string randomRealisticUnicodeString = TestUtil.RandomRealisticUnicodeString(Random);
        char[] charArray = randomRealisticUnicodeString.ToCharArray();
        StringBuilder sb = new StringBuilder();
        for (int j = 0; j < charArray.Length;)
        {
            int cp = Character.CodePointAt(charArray, j, charArray.Length);
            // Only BMP code points may be narrowed to char. Casting a supplementary
            // code point would keep just its low 16 bits, which can collide with a
            // whitespace char (e.g. U+12000 -> U+2000) and wrongly drop it. All Unicode
            // whitespace characters are in the BMP, so anything above char.MaxValue
            // is kept as-is.
            if (cp > char.MaxValue || !char.IsWhiteSpace((char)cp))
            {
                sb.AppendCodePoint(cp);
            }
            j += Character.CharCount(cp);
        }
        if (sb.Length > 0)
        {
            string value = TestUtil.RandomSimpleString(Random);
            // An empty replacement is substituted with "a" so every key maps to a non-empty token.
            map[sb.ToString()] = value.Length == 0 ? "a" : value;
        }
    }
    if (map.Count == 0)
    {
        // Guarantee at least one mapping so the stream below is never empty.
        map["booked"] = "books";
    }

    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(Random.nextBoolean());
    StringBuilder input = new StringBuilder();
    IList<string> output = new List<string>();
    foreach (KeyValuePair<string, string> entry in map)
    {
        builder.Add(entry.Key, entry.Value);
        // Randomly pick entries for the input; the first entry is always taken
        // (output is still empty) so at least one token is produced.
        if (Random.nextBoolean() || output.Count == 0)
        {
            input.Append(entry.Key).Append(" ");
            output.Add(entry.Value);
        }
    }

    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input.ToString()));
    TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));
    AssertTokenStreamContents(stream, output.ToArray());
}
/// <summary>
/// Loads the stemmer override dictionary from the files named in
/// <c>dictionaryFiles</c>, if any were configured. Each line of each file is split
/// at the first tab character into a source term and its replacement, and the
/// resulting map is stored in <c>dictionary</c>.
/// </summary>
/// <param name="loader">Resource loader used to read the lines of each dictionary file.</param>
public virtual void Inform(IResourceLoader loader)
{
    if (dictionaryFiles == null)
    {
        return;
    }

    AssureMatchVersion();
    IList<string> files = SplitFileNames(dictionaryFiles);
    // Count property instead of the LINQ Count() extension: no enumerator needed for an IList.
    if (files.Count == 0)
    {
        return;
    }

    // Hoisted out of the loops: the original allocated a new Regex("\t") for every line.
    char[] tabSeparator = { '\t' };

    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(ignoreCase);
    foreach (string file in files)
    {
        IList<string> lines = GetLines(loader, file.Trim());
        foreach (string line in lines)
        {
            // At most two parts: everything after the first tab is the replacement.
            // NOTE(review): a line without a tab yields a single part, so mapping[1]
            // throws IndexOutOfRangeException — unchanged from the previous behavior.
            string[] mapping = line.Split(tabSeparator, 2);
            builder.Add(mapping[0], mapping[1]);
        }
    }
    dictionary = builder.Build();
}
/// <summary>
/// Loads the stemmer override dictionary from the files named in
/// <c>dictionaryFiles</c>, if any were configured. Each line of each file is split
/// at the first tab character into a source term and its replacement, and the
/// resulting map is stored in <c>dictionary</c>.
/// </summary>
/// <param name="loader">Resource loader used to read the lines of each dictionary file.</param>
public virtual void Inform(IResourceLoader loader)
{
    if (dictionaryFiles == null)
    {
        return;
    }

    AssureMatchVersion();
    IEnumerable<string> files = SplitFileNames(dictionaryFiles);
    // Any() stops at the first element; Count() > 0 would walk the whole sequence
    // just to find out whether it is empty.
    if (!files.Any())
    {
        return;
    }

    // Hoisted out of the loops: the original allocated a new Regex("\t") for every line.
    char[] tabSeparator = { '\t' };

    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(ignoreCase);
    foreach (string file in files)
    {
        IEnumerable<string> lines = GetLines(loader, file.Trim());
        foreach (string line in lines)
        {
            // At most two parts: everything after the first tab is the replacement.
            // NOTE(review): a line without a tab yields a single part, so mapping[1]
            // throws IndexOutOfRangeException — unchanged from the previous behavior.
            string[] mapping = line.Split(tabSeparator, 2);
            builder.Add(mapping[0], mapping[1]);
        }
    }
    dictionary = builder.Build();
}
/// <summary>
/// Randomized test: maps random non-empty Unicode strings to random simple-string
/// replacements, builds one override map from all of them, then for a random subset
/// of the entries runs the key as a single keyword token through
/// <see cref="StemmerOverrideFilter"/> and <see cref="PorterStemFilter"/> and expects
/// the mapped replacement as the only output token.
/// </summary>
public virtual void TestRandomRealisticKeyword()
{
    IDictionary<string, string> overrides = new Dictionary<string, string>();
    int termCount = AtLeast(50);
    for (int n = 0; n < termCount; n++)
    {
        string key = TestUtil.RandomRealisticUnicodeString(Random);
        if (key.Length == 0)
        {
            // Empty keys are skipped; no replacement is drawn for them.
            continue;
        }

        string replacement = TestUtil.RandomSimpleString(Random);
        if (replacement.Length == 0)
        {
            replacement = "a";
        }
        overrides[key] = replacement;
    }

    // Fall back to a fixed mapping when every random key turned out empty.
    if (overrides.Count == 0)
    {
        overrides["booked"] = "books";
    }

    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(Random.nextBoolean());
    foreach (KeyValuePair<string, string> pair in overrides)
    {
        builder.Add(pair.Key, pair.Value);
    }
    StemmerOverrideFilter.StemmerOverrideMap overrideMap = builder.Build();

    foreach (KeyValuePair<string, string> pair in overrides)
    {
        // Only a random subset of the entries is actually checked.
        if (!Random.nextBoolean())
        {
            continue;
        }

        Tokenizer keywordTokenizer = new KeywordTokenizer(new StringReader(pair.Key));
        TokenStream overridden = new StemmerOverrideFilter(keywordTokenizer, overrideMap);
        TokenStream stemmed = new PorterStemFilter(overridden);
        AssertTokenStreamContents(stemmed, new string[] { pair.Value });
    }
}
/// <summary>
/// Randomized test: builds a map of whitespace-free random Unicode keys to random
/// simple-string values, registers all of them as stemmer overrides, feeds a random
/// subset of the keys (space separated) through a <see cref="WhitespaceTokenizer"/>,
/// <see cref="StemmerOverrideFilter"/> and <see cref="PorterStemFilter"/>, and expects
/// the corresponding override values in the same order.
/// </summary>
public virtual void TestRandomRealisticWhiteSpace()
{
    IDictionary<string, string> map = new Dictionary<string, string>();
    int numTerms = AtLeast(50);
    for (int i = 0; i < numTerms; i++)
    {
        string randomRealisticUnicodeString = TestUtil.RandomRealisticUnicodeString(Random());
        char[] charArray = randomRealisticUnicodeString.ToCharArray();
        StringBuilder sb = new StringBuilder();
        for (int j = 0; j < charArray.Length;)
        {
            int cp = Character.CodePointAt(charArray, j, charArray.Length);
            int cpLength = Character.CharCount(cp);
            // Only BMP code points may be narrowed to char. Casting a supplementary
            // code point would keep just its low 16 bits, which can collide with a
            // whitespace char. All Unicode whitespace characters are in the BMP, so
            // anything above char.MaxValue is kept.
            if (cp > char.MaxValue || !char.IsWhiteSpace((char)cp))
            {
                // Copy the original UTF-16 unit(s) of this code point. Append(int)
                // would append the decimal digits of the number (e.g. "97"), not
                // the character it encodes.
                sb.Append(charArray, j, cpLength);
            }
            j += cpLength;
        }
        if (sb.Length > 0)
        {
            string value = TestUtil.RandomSimpleString(Random());
            // An empty replacement is substituted with "a" so every key maps to a non-empty token.
            map[sb.ToString()] = value.Length == 0 ? "a" : value;
        }
    }
    if (map.Count == 0)
    {
        // Guarantee at least one mapping so the stream below is never empty.
        map["booked"] = "books";
    }

    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(Random().nextBoolean());
    StringBuilder input = new StringBuilder();
    IList<string> output = new List<string>();
    foreach (KeyValuePair<string, string> entry in map)
    {
        builder.Add(entry.Key, entry.Value);
        // Randomly pick entries for the input; the first entry is always taken
        // (output is still empty) so at least one token is produced.
        if (Random().nextBoolean() || output.Count == 0)
        {
            input.Append(entry.Key).Append(" ");
            output.Add(entry.Value);
        }
    }

    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input.ToString()));
    TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));
    AssertTokenStreamContents(stream, output.ToArray());
}
/// <summary>
/// Randomized test: maps random non-empty Unicode strings to random simple-string
/// replacements, builds one override map from all of them, then for a random subset
/// of the entries runs the key as a single keyword token through
/// <see cref="StemmerOverrideFilter"/> and <see cref="PorterStemFilter"/> and expects
/// the mapped replacement as the only output token.
/// </summary>
public virtual void TestRandomRealisticKeyword()
{
    IDictionary<string, string> overrides = new Dictionary<string, string>();
    int termCount = AtLeast(50);
    for (int n = 0; n < termCount; n++)
    {
        string key = TestUtil.RandomRealisticUnicodeString(Random());
        if (key.Length == 0)
        {
            // Empty keys are skipped; no replacement is drawn for them.
            continue;
        }

        string replacement = TestUtil.RandomSimpleString(Random());
        if (replacement.Length == 0)
        {
            replacement = "a";
        }
        overrides[key] = replacement;
    }

    // Fall back to a fixed mapping when every random key turned out empty.
    if (overrides.Count == 0)
    {
        overrides["booked"] = "books";
    }

    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(Random().nextBoolean());
    foreach (KeyValuePair<string, string> pair in overrides)
    {
        builder.Add(pair.Key, pair.Value);
    }
    StemmerOverrideFilter.StemmerOverrideMap overrideMap = builder.Build();

    foreach (KeyValuePair<string, string> pair in overrides)
    {
        // Only a random subset of the entries is actually checked.
        if (!Random().nextBoolean())
        {
            continue;
        }

        Tokenizer keywordTokenizer = new KeywordTokenizer(new StringReader(pair.Key));
        TokenStream overridden = new StemmerOverrideFilter(keywordTokenizer, overrideMap);
        TokenStream stemmed = new PorterStemFilter(overridden);
        AssertTokenStreamContents(stemmed, new string[] { pair.Value });
    }
}
/// <summary>
/// Creates a <see cref="DutchAnalyzer"/> with the given stop words, stem exclusion
/// set and stem override dictionary.
/// </summary>
/// <param name="matchVersion">Lucene compatibility version; also selects how the override dictionary is stored.</param>
/// <param name="stopwords">Stop words; copied into an unmodifiable set.</param>
/// <param name="stemExclusionTable">Terms excluded from stemming; copied into an unmodifiable set.</param>
/// <param name="stemOverrideDict">
/// Term-to-stem overrides. When it is empty, or <paramref name="matchVersion"/> is
/// before LUCENE_31, an unmodifiable copy is kept in <c>origStemdict</c> and
/// <c>stemdict</c> is null; otherwise a <see cref="StemmerOverrideFilter"/> map is
/// built into <c>stemdict</c> and <c>origStemdict</c> is null.
/// </param>
/// <exception cref="Exception">Wraps an <see cref="IOException"/> raised while building the override map.</exception>
public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<string> stemOverrideDict)
{
    this.matchVersion = matchVersion;
    this.stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stopwords));
    this.excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionTable));
#pragma warning disable 612, 618
    if (stemOverrideDict.Count == 0 || !matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
#pragma warning restore 612, 618
    {
        this.stemdict = null;
        this.origStemdict = CharArrayMap.UnmodifiableMap(CharArrayMap.Copy(matchVersion, stemOverrideDict));
    }
    else
    {
        this.origStemdict = null;
        // we don't need to ignore case here since we lowercase in this analyzer anyway
        StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
        CharArrayMap<string>.EntryIterator iter = (CharArrayMap<string>.EntryIterator)stemOverrideDict.EntrySet().GetEnumerator();
        while (iter.HasNext)
        {
            char[] nextKey = iter.NextKey();
            // Build the key from exactly the key's own characters. The previous code
            // copied the key into a reused CharsRef and then created the string from
            // that CharsRef's whole backing array; the backing array is not guaranteed
            // to be exactly key-length (a shorter key after a longer one can leave
            // stale trailing characters), which would register a wrong key.
            builder.Add(new string(nextKey), iter.CurrentValue);
        }
        try
        {
            this.stemdict = builder.Build();
        }
        catch (IOException ex)
        {
            throw new Exception("can not build stem dict", ex);
        }
    }
}