/// <summary>
/// Checks that a <c>StemmerOverrideFilter</c> built from a builder with no
/// added entries, followed by a <c>PorterStemFilter</c>, yields "book" for
/// the input "book".
/// </summary>
public virtual void TestNoOverrides()
{
    // No entries are added, so the override map is built empty.
    StemmerOverrideFilter.StemmerOverrideMap emptyMap = new StemmerOverrideFilter.Builder(true).Build();

    Tokenizer source = new KeywordTokenizer(new StringReader("book"));
    TokenStream overridden = new StemmerOverrideFilter(source, emptyMap);
    TokenStream stemmed = new PorterStemFilter(overridden);

    AssertTokenStreamContents(stemmed, new string[] { "book" });
}
/// <summary>
/// Registers the override "boOkEd" -> "books" on a builder constructed with
/// <c>true</c> and checks that the differently-cased input "BooKeD" comes
/// out of the override + Porter chain as "books".
/// </summary>
public virtual void TestIgnoreCase()
{
    // Make "booked" stem to "books": the override filter converts "booked"
    // to "books" and also marks it with KeywordAttribute, so the Porter
    // stemmer will not change it afterwards.
    StemmerOverrideFilter.Builder overrides = new StemmerOverrideFilter.Builder(true);
    overrides.Add("boOkEd", "books");

    Tokenizer source = new KeywordTokenizer(new StringReader("BooKeD"));
    TokenStream overridden = new StemmerOverrideFilter(source, overrides.Build());
    TokenStream stemmed = new PorterStemFilter(overridden);

    AssertTokenStreamContents(stemmed, new string[] { "books" });
}
/// <summary>
/// Runs 10000 iterations that push a random Unicode string through a
/// <c>CodepointCountFilter</c> with random bounds, and asserts that a token
/// is produced exactly when the string's code point count lies within
/// the inclusive range [min, max].
/// </summary>
public virtual void TestRandomStrings()
{
    for (int i = 0; i < 10000; i++)
    {
        string text = TestUtil.RandomUnicodeString(Random(), 100);
        int min = TestUtil.NextInt(Random(), 0, 100);
        int max = TestUtil.NextInt(Random(), 0, 100);
        int count = Character.CodePointCount(text, 0, text.Length);

        // The two bounds are drawn independently; order them so min <= max.
        if (min > max)
        {
            int temp = min;
            min = max;
            max = temp;
        }

        bool expected = count >= min && count <= max;

        TokenStream stream = new KeywordTokenizer(new StringReader(text));
        stream = new CodepointCountFilter(TEST_VERSION_CURRENT, stream, min, max);
        try
        {
            stream.Reset();
            assertEquals(expected, stream.IncrementToken());
            stream.End();
        }
        finally
        {
            // Release the stream even when the assertion (or the stream itself)
            // throws, so a failing iteration does not leave it undisposed.
            stream.Dispose();
        }
    }
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by a <c>TrimFilter</c>.
/// The <c>updateOffsets</c> flag is chosen at random; when it is set the filter
/// is created with <c>LuceneVersion.LUCENE_43</c>, otherwise with
/// <c>TEST_VERSION_CURRENT</c>.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the trim filter wrapping it.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    bool updateOffsets = Random().nextBoolean();

    LuceneVersion version;
    if (updateOffsets)
    {
        version = LuceneVersion.LUCENE_43;
    }
    else
    {
        version = TEST_VERSION_CURRENT;
    }

    TokenStream trimmed = new TrimFilter(version, source, updateOffsets);
    return new TokenStreamComponents(source, trimmed);
}
/// <summary>
/// Builds a random override map (non-empty random keys mapped to random simple
/// strings, with "a" substituted for empty values), then for a random subset
/// of its entries asserts that feeding the key through
/// <c>StemmerOverrideFilter</c> + <c>PorterStemFilter</c> yields the mapped value.
/// </summary>
public virtual void TestRandomRealisticKeyword()
{
    IDictionary<string, string> overrides = new Dictionary<string, string>();
    int termCount = AtLeast(50);
    for (int n = 0; n < termCount; n++)
    {
        string key = TestUtil.RandomRealisticUnicodeString(Random());
        if (key.Length == 0)
        {
            // Empty keys are skipped; no value is drawn for them.
            continue;
        }

        string replacement = TestUtil.RandomSimpleString(Random());
        if (replacement.Length == 0)
        {
            replacement = "a";
        }
        overrides[key] = replacement;
    }

    // Fall back to a fixed entry if every generated key was empty.
    if (overrides.Count == 0)
    {
        overrides["booked"] = "books";
    }

    bool builderFlag = Random().nextBoolean();
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(builderFlag);
    foreach (KeyValuePair<string, string> pair in overrides)
    {
        builder.Add(pair.Key, pair.Value);
    }
    StemmerOverrideFilter.StemmerOverrideMap overrideMap = builder.Build();

    foreach (KeyValuePair<string, string> pair in overrides)
    {
        // Only a random subset of the entries is verified.
        if (!Random().nextBoolean())
        {
            continue;
        }

        Tokenizer source = new KeywordTokenizer(new StringReader(pair.Key));
        TokenStream overridden = new StemmerOverrideFilter(source, overrideMap);
        TokenStream stemmed = new PorterStemFilter(overridden);
        AssertTokenStreamContents(stemmed, new string[] { pair.Value });
    }
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by a <c>LengthFilter</c>
/// created with <c>TEST_VERSION_CURRENT</c> and the bounds 0 and 5.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the length filter wrapping it.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    TokenStream lengthLimited = new LengthFilter(TEST_VERSION_CURRENT, source, 0, 5);
    return new TokenStreamComponents(source, lengthLimited);
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by a
/// <c>PortugueseLightStemFilter</c>.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the stem filter wrapping it.</returns>
public override Analyzer.TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    TokenStream stemmed = new PortugueseLightStemFilter(source);
    return new Analyzer.TokenStreamComponents(source, stemmed);
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c>, a <c>LowerCaseFilter</c>
/// (created with <c>TEST_VERSION_CURRENT</c>) and a <c>GermanStemFilter</c>,
/// in that order.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the outermost (German stem) filter.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    TokenStream lowerCased = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
    TokenStream stemmed = new GermanStemFilter(lowerCased);
    return new TokenStreamComponents(source, stemmed);
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by an
/// <c>EnglishMinimalStemFilter</c>.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the stem filter wrapping it.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    TokenStream stemmed = new EnglishMinimalStemFilter(source);
    return new TokenStreamComponents(source, stemmed);
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by a <c>SnowballFilter</c>
/// configured with <c>lang</c>, which is taken from the enclosing scope.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the Snowball filter wrapping it.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    TokenStream stemmed = new SnowballFilter(source, lang);
    return new Analyzer.TokenStreamComponents(source, stemmed);
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by a
/// <c>NorwegianLightStemFilter</c>.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the stem filter wrapping it.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    TokenStream stemmed = new NorwegianLightStemFilter(source);
    return new TokenStreamComponents(source, stemmed);
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by a
/// <c>TurkishLowerCaseFilter</c>.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the lower-case filter wrapping it.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    TokenStream lowerCased = new TurkishLowerCaseFilter(source);
    return new TokenStreamComponents(source, lowerCased);
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by a
/// <c>ScandinavianFoldingFilter</c>.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the folding filter wrapping it.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    TokenStream folded = new ScandinavianFoldingFilter(source);
    return new TokenStreamComponents(source, folded);
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by a
/// <c>PatternReplaceFilter</c> constructed with a compiled regex for "a",
/// the replacement "b", and a final argument of <c>true</c>.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the replace filter wrapping it.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    Regex pattern = new Regex("a", RegexOptions.Compiled);
    TokenStream replaced = new PatternReplaceFilter(source, pattern, "b", true);
    return new TokenStreamComponents(source, replaced);
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by a
/// <c>HyphenationCompoundWordTokenFilter</c> created with
/// <c>TEST_VERSION_CURRENT</c> and <c>hyphenator</c>, which is taken from the
/// enclosing scope.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the compound-word filter wrapping it.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    return new TokenStreamComponents(
        source,
        new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, source, hyphenator));
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by a
/// <c>DictionaryCompoundWordTokenFilter</c> created with
/// <c>TEST_VERSION_CURRENT</c> and <c>dict</c>, which is taken from the
/// enclosing scope.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the compound-word filter wrapping it.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    TokenStream decompounded = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, source, dict);
    return new TokenStreamComponents(source, decompounded);
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by an
/// <c>EdgeNGramTokenFilter</c> created with <c>LuceneVersion.LUCENE_43</c>,
/// <c>Side.BACK</c>, and gram sizes 2 and 15.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the edge n-gram filter wrapping it.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    // Warnings 612/618 (use of obsolete members) are suppressed for the filter construction below.
#pragma warning disable 612, 618
    TokenStream grams = new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, source, EdgeNGramTokenFilter.Side.BACK, 2, 15);
    return new TokenStreamComponents(source, grams);
#pragma warning restore 612, 618
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by an
/// <c>EdgeNGramTokenFilter</c> created with <c>TEST_VERSION_CURRENT</c>,
/// <c>Side.FRONT</c>, and gram sizes 2 and 15.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the edge n-gram filter wrapping it.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    // Warnings 612/618 (use of obsolete members) are suppressed for the filter construction below.
#pragma warning disable 612, 618
    TokenStream grams = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, source, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
    return new TokenStreamComponents(source, grams);
#pragma warning restore 612, 618
}
/// <summary>
/// Feeds a random Unicode string through an <c>EdgeNGramTokenFilter</c> with
/// random gram sizes and asserts that each emitted token is the prefix of the
/// input containing <c>i</c> code points (for <c>i</c> from <c>minGram</c> up to
/// the smaller of the input's code point count and <c>maxGram</c>), that every
/// token reports offsets 0 and <c>s.Length</c>, and that no further token follows.
/// </summary>
public virtual void TestSupplementaryCharacters()
{
    string s = TestUtil.RandomUnicodeString(Random(), 10);
    int codePointCount = Character.CodePointCount(s, 0, s.Length);
    int minGram = TestUtil.NextInt(Random(), 1, 3);
    int maxGram = TestUtil.NextInt(Random(), minGram, 10);

    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
    try
    {
        ICharTermAttribute termAtt = tk.AddAttribute<ICharTermAttribute>();
        IOffsetAttribute offsetAtt = tk.AddAttribute<IOffsetAttribute>();
        tk.Reset();

        // The upper bound does not change during the loop, so compute it once.
        int largestGram = Math.Min(codePointCount, maxGram);
        for (int i = minGram; i <= largestGram; ++i)
        {
            assertTrue(tk.IncrementToken());
            assertEquals(0, offsetAtt.StartOffset());
            assertEquals(s.Length, offsetAtt.EndOffset());

            // Convert the code point count i into a UTF-16 index so that
            // surrogate pairs are not split when taking the prefix.
            int end = Character.OffsetByCodePoints(s, 0, i);
            assertEquals(s.Substring(0, end), termAtt.ToString());
        }
        assertFalse(tk.IncrementToken());

        // Finish the consumer workflow once the stream is exhausted.
        tk.End();
    }
    finally
    {
        // Always release the stream, including when an assertion fails.
        tk.Dispose();
    }
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by a <c>SynonymFilter</c>
/// created with <c>map</c> and <c>ignoreCase</c>, both taken from the enclosing scope.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the synonym filter wrapping it.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    TokenStream withSynonyms = new SynonymFilter(source, map, ignoreCase);
    return new TokenStreamComponents(source, withSynonyms);
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by an
/// <c>ASCIIFoldingFilter</c> whose boolean constructor argument is chosen at random.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the folding filter wrapping it.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    bool randomFlag = Random().nextBoolean();
    TokenStream folded = new ASCIIFoldingFilter(source, randomFlag);
    return new TokenStreamComponents(source, folded);
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by a
/// <c>WordDelimiterFilter</c> created with <c>TEST_VERSION_CURRENT</c>,
/// <c>flags</c> and <c>protectedWords</c>; the latter two are taken from the
/// enclosing scope.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the word-delimiter filter wrapping it.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    TokenStream delimited = new WordDelimiterFilter(TEST_VERSION_CURRENT, source, flags, protectedWords);
    return new TokenStreamComponents(source, delimited);
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by a
/// <c>HyphenatedWordsFilter</c>.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the hyphenated-words filter wrapping it.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    TokenStream joined = new HyphenatedWordsFilter(source);
    return new TokenStreamComponents(source, joined);
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by a
/// <c>HunspellStemFilter</c> created with <c>dictionary</c>, which is taken
/// from the enclosing scope.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the Hunspell stem filter wrapping it.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    TokenStream stemmed = new HunspellStemFilter(source, dictionary);
    return new TokenStreamComponents(source, stemmed);
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by a <c>SnowballFilter</c>
/// configured with <c>snowballLanguage</c>, which is taken from the enclosing scope.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the Snowball filter wrapping it.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    TokenStream stemmed = new SnowballFilter(source, snowballLanguage);
    return new TokenStreamComponents(source, stemmed);
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by a
/// <c>CapitalizationFilter</c>.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the capitalization filter wrapping it.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    TokenStream capitalized = new CapitalizationFilter(source);
    return new TokenStreamComponents(source, capitalized);
}
/// <summary>
/// Builds a chain of a <c>KeywordTokenizer</c> followed by an <c>ElisionFilter</c>
/// created with <c>FrenchAnalyzer.DEFAULT_ARTICLES</c>.
/// </summary>
/// <param name="fieldName">Not used by this implementation.</param>
/// <param name="reader">Input handed to the tokenizer.</param>
/// <returns>The tokenizer together with the elision filter wrapping it.</returns>
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new KeywordTokenizer(reader);
    TokenStream elided = new ElisionFilter(source, FrenchAnalyzer.DEFAULT_ARTICLES);
    return new TokenStreamComponents(source, elided);
}