Example #1
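A payload round-trip test from the Lucene.NET analysis test suite: a WhitespaceTokenizer is wrapped in a PayloadSetter and verified twice, each time over a fresh tokenizer, to confirm that payloads are copied intact through the chain. PayloadSetter and VerifyPayload are helpers defined elsewhere in the test class (not shown).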
        public virtual void TestPayloadCopy()
        {
            string      s = "how now brown cow";
            TokenStream ts;

            ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(s));
            ts = new PayloadSetter(ts);
            VerifyPayload(ts);

            ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(s));
            ts = new PayloadSetter(ts);
            VerifyPayload(ts);
        }
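TestQueryReset checks that a CommonGramsQueryFilter chain can be reset and consumed again: after the first pass is disposed, the tokenizer gets a fresh reader, the filter is Reset, and the first common gram ("How_the") comes back. commonWords is a fixture defined elsewhere in the test class.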
        public virtual void TestQueryReset()
        {
            const string input = "How the s a brown s cow d like A B thing?";
            WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
            CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
            CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);

            ICharTermAttribute term = wt.AddAttribute<ICharTermAttribute>();
            nsf.Reset();
            assertTrue(nsf.IncrementToken());
            assertEquals("How_the", term.ToString());
            assertTrue(nsf.IncrementToken());
            assertEquals("the_s", term.ToString());
            nsf.Dispose();

            wt.Reader = new StringReader(input);
            nsf.Reset();
            assertTrue(nsf.IncrementToken());
            assertEquals("How_the", term.ToString());
        }
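For reference, a minimal sketch of the TokenStream consumer contract these tests rely on (Reset before the first IncrementToken, then End and Dispose). The names follow the Lucene.NET 4.x API used in the snippets above; TEST_VERSION_CURRENT is the test framework's version constant:

        // Hedged sketch, not taken from the test suite itself.
        TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("how now brown cow"));
        ICharTermAttribute term = ts.AddAttribute<ICharTermAttribute>();
        ts.Reset();                    // required before the first IncrementToken()
        while (ts.IncrementToken())
        {
            Console.WriteLine(term.ToString());
        }
        ts.End();                      // finalizes offset state
        ts.Dispose();                  // releases the underlying reader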
Example #3
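This test verifies that LowerCaseFilter handles unpaired surrogates at the end of the term buffer: a lone low surrogate (\udc16) passes through the lowercasing untouched, and a term ending in a high surrogate (\ud801) keeps that surrogate as its final char.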
        public virtual void TestLowerCaseFilterLowSurrogateLeftover()
        {
            // test if the limit of the termbuffer is correctly used with supplementary
            // chars
            WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("BogustermBogusterm\udc16"));
            LowerCaseFilter     filter    = new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer);

            AssertTokenStreamContents(filter, new string[] { "bogustermbogusterm\udc16" });
            filter.Reset();
            string highSurEndingUpper = "BogustermBoguster\ud801";
            string highSurEndingLower = "bogustermboguster\ud801";

            tokenizer.SetReader(new StringReader(highSurEndingUpper));
            AssertTokenStreamContents(filter, new string[] { highSurEndingLower });
            assertTrue(filter.HasAttribute<ICharTermAttribute>());
            char[] termBuffer = filter.GetAttribute<ICharTermAttribute>().Buffer;
            int    length     = highSurEndingLower.Length;

            assertEquals('\ud801', termBuffer[length - 1]);
        }
Example #4
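A collection of shorter snippets from various Lucene.NET analysis tests, all built around WhitespaceTokenizer. The two CreateComponents overrides below wire the tokenizer into UpperCaseFilter and ChineseFilter chains.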
            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);

                return new TokenStreamComponents(tokenizer, new UpperCaseFilter(TEST_VERSION_CURRENT, tokenizer));
            }
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer tokenizer = new WhitespaceTokenizer(LuceneVersion.LUCENE_CURRENT, reader);
     return new TokenStreamComponents(tokenizer, new ChineseFilter(tokenizer));
 }
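TestRandomRealisticWhiteSpace builds a random stemming-override dictionary: it walks random realistic Unicode strings code point by code point to strip whitespace, maps each surviving key to a random replacement, registers every entry with StemmerOverrideFilter.Builder, and then asserts that a random subset of the keys, run through StemmerOverrideFilter ahead of PorterStemFilter, produces the overridden values.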
        public virtual void TestRandomRealisticWhiteSpace()
        {
            IDictionary<string, string> map = new Dictionary<string, string>();
            int numTerms = AtLeast(50);
            for (int i = 0; i < numTerms; i++)
            {
                string randomRealisticUnicodeString = TestUtil.RandomRealisticUnicodeString(Random());
                char[] charArray = randomRealisticUnicodeString.ToCharArray();
                StringBuilder sb = new StringBuilder();
                for (int j = 0; j < charArray.Length;)
                {
                    int cp = Character.CodePointAt(charArray, j, charArray.Length);
                    if (!char.IsWhiteSpace((char)cp))
                    {
                        sb.AppendCodePoint(cp); // append the code point's character(s), not its numeric value
                    }
                    j += Character.CharCount(cp);
                }
                if (sb.Length > 0)
                {
                    string value = TestUtil.RandomSimpleString(Random());
                    map[sb.ToString()] = value.Length == 0 ? "a" : value;

                }
            }
            if (map.Count == 0)
            {
                map["booked"] = "books";
            }
            StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(Random().nextBoolean());
            IDictionary<string, string> entrySet = map;
            StringBuilder input = new StringBuilder();
            IList<string> output = new List<string>();
            foreach (KeyValuePair<string, string> entry in entrySet)
            {
                builder.Add(entry.Key, entry.Value);
                if (Random().nextBoolean() || output.Count == 0)
                {
                    input.Append(entry.Key).Append(" ");
                    output.Add(entry.Value);
                }
            }
            Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input.ToString()));
            TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build()));
            AssertTokenStreamContents(stream, output.ToArray());
        }
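The next two tests tokenize text containing the surrogate pair \ud801\udc1c (the supplementary code point U+1041C) and expect it to stay attached to the following ASCII characters, both with the current version constant and with the legacy LuceneVersion.LUCENE_30 setting.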
 public virtual void TestWhitespaceTokenizer()
 {
     StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
     WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
     AssertTokenStreamContents(tokenizer, new string[] { "Tokenizer", "\ud801\udc1ctest" });
 }
 public virtual void TestWhitespaceTokenizerBWCompat()
 {
     StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
     WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(LuceneVersion.LUCENE_30, reader);
     AssertTokenStreamContents(tokenizer, new string[] { "Tokenizer", "\ud801\udc1ctest" });
 }
 public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
 {
     Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
     return new TokenStreamComponents(tokenizer, new UpperCaseFilter(TEST_VERSION_CURRENT, tokenizer));
 }
 public virtual void TestLowerCaseFilterLowSurrogateLeftover()
 {
     // test if the limit of the termbuffer is correctly used with supplementary
     // chars
     WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("BogustermBogusterm\udc16"));
     LowerCaseFilter filter = new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer);
     AssertTokenStreamContents(filter, new string[] { "bogustermbogusterm\udc16" });
     filter.Reset();
     string highSurEndingUpper = "BogustermBoguster\ud801";
     string highSurEndingLower = "bogustermboguster\ud801";
     tokenizer.Reader = new StringReader(highSurEndingUpper);
     AssertTokenStreamContents(filter, new string[] { highSurEndingLower });
     assertTrue(filter.HasAttribute<ICharTermAttribute>());
     char[] termBuffer = filter.GetAttribute<ICharTermAttribute>().Buffer();
     int length = highSurEndingLower.Length;
     assertEquals('\ud801', termBuffer[length - 1]);
 }
 public virtual void TestReset()
 {
     Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("please divide this sentence"));
     TokenStream filter = new ShingleFilter(wsTokenizer, 2);
     AssertTokenStreamContents(filter, new string[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" }, new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new string[] { TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE }, new int[] { 1, 0, 1, 0, 1, 0, 1 });
     wsTokenizer.Reader = new StringReader("please divide this sentence");
     AssertTokenStreamContents(filter, new string[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" }, new int[] { 0, 0, 7, 7, 14, 14, 19 }, new int[] { 6, 13, 13, 18, 18, 27, 27 }, new string[] { TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE, "shingle", TypeAttribute_Fields.DEFAULT_TYPE }, new int[] { 1, 0, 1, 0, 1, 0, 1 });
 }
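TestRetainMockAttribute guards against custom attribute values being lost: IMockRetainAttribute is set by the MockRetainAttributeFilter helper, and the assertion checks that it survives DictionaryCompoundWordTokenFilter, which captures and restores per-token state while emitting subword tokens.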
        public virtual void TestRetainMockAttribute()
        {
            CharArraySet dict = makeDictionary("abc", "d", "efg");
            Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcdefg"));
            TokenStream stream = new MockRetainAttributeFilter(tokenizer);
            stream = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, stream, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
            IMockRetainAttribute retAtt = stream.AddAttribute<IMockRetainAttribute>();
            stream.Reset();
            while (stream.IncrementToken())
            {
                assertTrue("Custom attribute value was lost", retAtt.Retain);
            }

        }
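This TestReset confirms that DictionaryCompoundWordTokenFilter starts over cleanly once its tokenizer gets a fresh reader: after End and Dispose, Reset brings back the original compound ("Rindfleischüberwachungsgesetz") as the first token, followed by its dictionary subwords.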
        public virtual void TestReset()
        {
            CharArraySet dict = makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz", "Aufgabe", "Überwachung");

            Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("Rindfleischüberwachungsgesetz"));
            DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, wsTokenizer, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

            ICharTermAttribute termAtt = tf.GetAttribute<ICharTermAttribute>();
            tf.Reset();
            assertTrue(tf.IncrementToken());
            assertEquals("Rindfleischüberwachungsgesetz", termAtt.ToString());
            assertTrue(tf.IncrementToken());
            assertEquals("Rind", termAtt.ToString());
            tf.End();
            tf.Dispose();
            wsTokenizer.Reader = new StringReader("Rindfleischüberwachungsgesetz");
            tf.Reset();
            assertTrue(tf.IncrementToken());
            assertEquals("Rindfleischüberwachungsgesetz", termAtt.ToString());
        }
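The final two tests exercise the same reset behavior for the n-gram filters. The #pragma pair around the EdgeNGramTokenFilter constructor suppresses obsolete-API warnings (CS0612/CS0618), since the Side-based overload is marked obsolete in this version.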
 public virtual void TestReset()
 {
     WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
     #pragma warning disable 612, 618
     EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
     #pragma warning restore 612, 618
     AssertTokenStreamContents(filter, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 });
     tokenizer.Reader = new StringReader("abcde");
     AssertTokenStreamContents(filter, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 });
 }
 public virtual void TestReset()
 {
     WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
     NGramTokenFilter filter = new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 1, 1);
     AssertTokenStreamContents(filter, new string[] { "a", "b", "c", "d", "e" }, new int[] { 0, 0, 0, 0, 0 }, new int[] { 5, 5, 5, 5, 5 }, new int[] { 1, 0, 0, 0, 0 });
     tokenizer.Reader = new StringReader("abcde");
     AssertTokenStreamContents(filter, new string[] { "a", "b", "c", "d", "e" }, new int[] { 0, 0, 0, 0, 0 }, new int[] { 5, 5, 5, 5, 5 }, new int[] { 1, 0, 0, 0, 0 });
 }