public virtual void TestPayloadCopy()
{
    // Feed the same text through the payload-setting chain twice; payloads
    // must survive an identical second pass (exercises token/payload copying).
    const string text = "how now brown cow";
    for (int pass = 0; pass < 2; pass++)
    {
        TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(text));
        stream = new PayloadSetter(stream);
        VerifyPayload(stream);
    }
}
public virtual void TestQueryReset()
{
    const string input = "How the s a brown s cow d like A B thing?";

    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
    CommonGramsFilter commonGrams = new CommonGramsFilter(TEST_VERSION_CURRENT, tokenizer, commonWords);
    CommonGramsQueryFilter queryFilter = new CommonGramsQueryFilter(commonGrams);
    ICharTermAttribute term = tokenizer.AddAttribute<ICharTermAttribute>();

    // First pass: consume the first two common-gram tokens.
    queryFilter.Reset();
    assertTrue(queryFilter.IncrementToken());
    assertEquals("How_the", term.ToString());
    assertTrue(queryFilter.IncrementToken());
    assertEquals("the_s", term.ToString());
    queryFilter.Dispose();

    // Point the tokenizer at fresh input; after Reset() the filter must
    // start over from the beginning of the stream.
    tokenizer.Reader = new StringReader(input);
    queryFilter.Reset();
    assertTrue(queryFilter.IncrementToken());
    assertEquals("How_the", term.ToString());
}
public virtual void TestLowerCaseFilterLowSurrogateLeftover()
{
    // Verify the term buffer limit is handled correctly in the presence of
    // supplementary characters: an unpaired low surrogate passes through.
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("BogustermBogusterm\udc16"));
    LowerCaseFilter filter = new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer);
    AssertTokenStreamContents(filter, new string[] { "bogustermbogusterm\udc16" });
    filter.Reset();

    // A term ending in an unpaired HIGH surrogate must also be preserved
    // verbatim at the end of the lowercased buffer.
    string upperWithTrailingHighSur = "BogustermBoguster\ud801";
    string lowerWithTrailingHighSur = "bogustermboguster\ud801";
    tokenizer.SetReader(new StringReader(upperWithTrailingHighSur));
    AssertTokenStreamContents(filter, new string[] { lowerWithTrailingHighSur });
    assertTrue(filter.HasAttribute<ICharTermAttribute>());

    char[] buffer = filter.GetAttribute<ICharTermAttribute>().Buffer;
    assertEquals('\ud801', buffer[lowerWithTrailingHighSur.Length - 1]);
}
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    // Analysis chain: whitespace tokenization, then upper-casing.
    Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
    TokenStream result = new UpperCaseFilter(TEST_VERSION_CURRENT, source);
    return new TokenStreamComponents(source, result);
}
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    // Analysis chain: whitespace tokenization, then Chinese token filtering.
    Tokenizer source = new WhitespaceTokenizer(LuceneVersion.LUCENE_CURRENT, reader);
    TokenStream result = new ChineseFilter(source);
    return new TokenStreamComponents(source, result);
}
public virtual void TestRandomRealisticWhiteSpace()
{
    // Build a random dictionary of (whitespace-free term -> replacement)
    // entries, then verify StemmerOverrideFilter applies the overrides and
    // PorterStemFilter does not re-stem them.
    IDictionary<string, string> map = new Dictionary<string, string>();
    int numTerms = AtLeast(50);
    for (int i = 0; i < numTerms; i++)
    {
        string randomRealisticUnicodeString = TestUtil.RandomRealisticUnicodeString(Random());
        char[] charArray = randomRealisticUnicodeString.ToCharArray();
        StringBuilder sb = new StringBuilder();
        for (int j = 0; j < charArray.Length;)
        {
            int cp = Character.CodePointAt(charArray, j, charArray.Length);
            // BUGFIX: the previous code did sb.Append(cp), which appends the
            // code point's DECIMAL digits (StringBuilder.Append(int)), and
            // tested char.IsWhiteSpace((char)cp), which truncates
            // supplementary code points (e.g. (char)0x10020 == ' ') and
            // could spuriously drop them. Convert the code point back to its
            // UTF-16 text form before both the whitespace test and the append.
            // Unpaired surrogates are appended as-is (ConvertFromUtf32 would
            // throw on them); they are never whitespace.
            bool isLoneSurrogate = cp >= 0xD800 && cp <= 0xDFFF;
            string cpText = isLoneSurrogate ? char.ToString((char)cp) : char.ConvertFromUtf32(cp);
            if (!(cpText.Length == 1 && char.IsWhiteSpace(cpText[0])))
            {
                sb.Append(cpText);
            }
            j += Character.CharCount(cp);
        }
        if (sb.Length > 0)
        {
            // Guarantee a non-empty replacement value.
            string value = TestUtil.RandomSimpleString(Random());
            map[sb.ToString()] = value.Length == 0 ? "a" : value;
        }
    }
    if (map.Count == 0)
    {
        // Ensure at least one override exists.
        map["booked"] = "books";
    }

    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(Random().nextBoolean());
    StringBuilder input = new StringBuilder();
    IList<string> output = new List<string>();
    foreach (KeyValuePair<string, string> entry in map)
    {
        builder.Add(entry.Key, entry.Value);
        // Randomly include entries in the input text, but always at least one.
        if (Random().nextBoolean() || output.Count == 0)
        {
            input.Append(entry.Key).Append(" ");
            output.Add(entry.Value);
        }
    }

    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input.ToString()));
    TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));
    AssertTokenStreamContents(stream, output.ToArray());
}
public virtual void TestPayloadCopy()
{
    // Two identical passes over the same text: the payload set by
    // PayloadSetter must be observed on both (exercises payload copying).
    const string text = "how now brown cow";

    TokenStream firstPass = new PayloadSetter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(text)));
    VerifyPayload(firstPass);

    TokenStream secondPass = new PayloadSetter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(text)));
    VerifyPayload(secondPass);
}
public virtual void TestWhitespaceTokenizer()
{
    // The supplementary character (surrogate pair \ud801\udc1c) must stay
    // intact inside the second token.
    StringReader input = new StringReader("Tokenizer \ud801\udc1ctest");
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, input);
    AssertTokenStreamContents(tokenizer, new string[] { "Tokenizer", "\ud801\udc1ctest" });
}
public virtual void TestWhitespaceTokenizerBWCompat()
{
    // Back-compat check: under LUCENE_30 the tokenizer must still keep the
    // surrogate pair \ud801\udc1c intact inside the second token.
    StringReader input = new StringReader("Tokenizer \ud801\udc1ctest");
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(LuceneVersion.LUCENE_30, input);
    AssertTokenStreamContents(tokenizer, new string[] { "Tokenizer", "\ud801\udc1ctest" });
}
public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    // Analysis chain: whitespace tokenization, then upper-casing.
    Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
    TokenStream result = new UpperCaseFilter(TEST_VERSION_CURRENT, source);
    return new TokenStreamComponents(source, result);
}
public virtual void TestLowerCaseFilterLowSurrogateLeftover()
{
    // Verify the term buffer limit is handled correctly in the presence of
    // supplementary characters: an unpaired low surrogate passes through.
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("BogustermBogusterm\udc16"));
    LowerCaseFilter filter = new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer);
    AssertTokenStreamContents(filter, new string[] { "bogustermbogusterm\udc16" });
    filter.Reset();

    // A term ending in an unpaired HIGH surrogate must also be preserved
    // verbatim at the end of the lowercased buffer.
    string upperWithTrailingHighSur = "BogustermBoguster\ud801";
    string lowerWithTrailingHighSur = "bogustermboguster\ud801";
    tokenizer.Reader = new StringReader(upperWithTrailingHighSur);
    AssertTokenStreamContents(filter, new string[] { lowerWithTrailingHighSur });
    assertTrue(filter.HasAttribute<ICharTermAttribute>());

    char[] buffer = filter.GetAttribute<ICharTermAttribute>().Buffer();
    assertEquals('\ud801', buffer[lowerWithTrailingHighSur.Length - 1]);
}
public virtual void TestReset()
{
    const string text = "please divide this sentence";
    Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(text));
    TokenStream shingles = new ShingleFilter(source, 2);

    // Expected contents: alternating unigrams and bigrams with their
    // offsets, types, and position increments.
    string[] expectedTerms = { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" };
    int[] expectedStartOffsets = { 0, 0, 7, 7, 14, 14, 19 };
    int[] expectedEndOffsets = { 6, 13, 13, 18, 18, 27, 27 };
    string[] expectedTypes =
    {
        TypeAttribute_Fields.DEFAULT_TYPE, "shingle",
        TypeAttribute_Fields.DEFAULT_TYPE, "shingle",
        TypeAttribute_Fields.DEFAULT_TYPE, "shingle",
        TypeAttribute_Fields.DEFAULT_TYPE
    };
    int[] expectedIncrements = { 1, 0, 1, 0, 1, 0, 1 };

    // First pass through the stream.
    AssertTokenStreamContents(shingles, expectedTerms, expectedStartOffsets, expectedEndOffsets, expectedTypes, expectedIncrements);

    // Re-point the tokenizer at fresh input: the shingle filter must
    // produce exactly the same stream after a reset.
    source.Reader = new StringReader(text);
    AssertTokenStreamContents(shingles, expectedTerms, expectedStartOffsets, expectedEndOffsets, expectedTypes, expectedIncrements);
}
public virtual void TestRetainMockAttribute()
{
    // A custom attribute set upstream must survive decomposition by the
    // compound-word filter on every emitted token.
    CharArraySet dict = makeDictionary("abc", "d", "efg");
    Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcdefg"));
    TokenStream stream = new MockRetainAttributeFilter(source);
    stream = new DictionaryCompoundWordTokenFilter(
        TEST_VERSION_CURRENT, stream, dict,
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
        false);

    IMockRetainAttribute retAtt = stream.AddAttribute<IMockRetainAttribute>();
    stream.Reset();
    while (stream.IncrementToken())
    {
        assertTrue("Custom attribute value was lost", retAtt.Retain);
    }
}
public virtual void TestReset()
{
    const string compound = "Rindfleischüberwachungsgesetz";
    CharArraySet dict = makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz", "Aufgabe", "Überwachung");
    Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(compound));
    DictionaryCompoundWordTokenFilter filter = new DictionaryCompoundWordTokenFilter(
        TEST_VERSION_CURRENT, source, dict,
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
        false);
    ICharTermAttribute termAtt = filter.GetAttribute<ICharTermAttribute>();

    // First pass: the whole compound, then its first subword.
    filter.Reset();
    assertTrue(filter.IncrementToken());
    assertEquals(compound, termAtt.ToString());
    assertTrue(filter.IncrementToken());
    assertEquals("Rind", termAtt.ToString());
    filter.End();
    filter.Dispose();

    // Re-point the tokenizer: after Reset() the filter must restart from
    // the full compound token.
    source.Reader = new StringReader(compound);
    filter.Reset();
    assertTrue(filter.IncrementToken());
    assertEquals(compound, termAtt.ToString());
}
public virtual void TestReset()
{
    const string text = "abcde";
    WhitespaceTokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(text));
#pragma warning disable 612, 618
    EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, source, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
#pragma warning restore 612, 618

    // Expected front edge n-grams of sizes 1..3, all spanning the source token.
    string[] expectedGrams = { "a", "ab", "abc" };
    int[] expectedStartOffsets = { 0, 0, 0 };
    int[] expectedEndOffsets = { 5, 5, 5 };

    AssertTokenStreamContents(filter, expectedGrams, expectedStartOffsets, expectedEndOffsets);

    // Re-point the tokenizer and verify the filter resets to the same output.
    source.Reader = new StringReader(text);
    AssertTokenStreamContents(filter, expectedGrams, expectedStartOffsets, expectedEndOffsets);
}
public virtual void TestReset()
{
    const string text = "abcde";
    WhitespaceTokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(text));
    NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, source, 1, 1);

    // Expected unigrams, all carrying the source token's offsets; only the
    // first advances the position.
    string[] expectedGrams = { "a", "b", "c", "d", "e" };
    int[] expectedStartOffsets = { 0, 0, 0, 0, 0 };
    int[] expectedEndOffsets = { 5, 5, 5, 5, 5 };
    int[] expectedIncrements = { 1, 0, 0, 0, 0 };

    AssertTokenStreamContents(filter, expectedGrams, expectedStartOffsets, expectedEndOffsets, expectedIncrements);

    // Re-point the tokenizer and verify the filter resets to the same output.
    source.Reader = new StringReader(text);
    AssertTokenStreamContents(filter, expectedGrams, expectedStartOffsets, expectedEndOffsets, expectedIncrements);
}