NewSinkTokenStream() public method

Returns a new SinkTokenStream that receives all tokens consumed by this stream. An overload that takes a SinkFilter returns a sink that receives only the tokens the filter accepts; several of the examples below use it.
public SinkTokenStream NewSinkTokenStream()
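A minimal sketch of the intended call pattern, assuming an already-constructed
TokenStream named upstream (an illustrative name, not part of the API):

        TeeSinkTokenFilter tee = new TeeSinkTokenFilter(upstream);
        TeeSinkTokenFilter.SinkTokenStream sink = tee.NewSinkTokenStream();

        tee.Reset();
        tee.ConsumeAllTokens();       // drain the tee; every token it consumes is buffered in the sink

        sink.Reset();
        while (sink.IncrementToken()) // replay the buffered tokens
        {
            // read the sink's attributes here
        }

Each call to NewSinkTokenStream creates an independent sink. A sink only buffers
tokens while the tee itself is being consumed, so the tee (or whatever consumes it)
must be drained before the sink is read, as every example below does.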
Example #1
        public virtual void TestMultipleSources()
        {
            TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.ToString()), MockTokenizer.WHITESPACE, false));

            TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.NewSinkTokenStream(dogFilter);
            TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.NewSinkTokenStream(theFilter);
            tee1.Reset();
            TokenStream source1 = new CachingTokenFilter(tee1);

            tee1.AddAttribute<ICheckClearAttributesAttribute>();
            dogDetector.AddAttribute<ICheckClearAttributesAttribute>();
            theDetector.AddAttribute<ICheckClearAttributesAttribute>();

            TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.ToString()), MockTokenizer.WHITESPACE, false));

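            // Reuse the sinks created on tee1 with a second tee: each detector
            // now collects matching tokens from both token sources.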
            tee2.AddSinkTokenStream(dogDetector);
            tee2.AddSinkTokenStream(theDetector);
            TokenStream source2 = tee2;

            AssertTokenStreamContents(source1, tokens1);
            AssertTokenStreamContents(source2, tokens2);

            AssertTokenStreamContents(theDetector, new string[] { "The", "the", "The", "the" });
            AssertTokenStreamContents(dogDetector, new string[] { "Dogs", "Dogs" });

            source1.Reset();
            TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, source1);

            string[] lowerCaseTokens = new string[tokens1.Length];
            for (int i = 0; i < tokens1.Length; i++)
            {
                lowerCaseTokens[i] = CultureInfo.InvariantCulture.TextInfo.ToLower(tokens1[i]);
            }
            AssertTokenStreamContents(lowerCasing, lowerCaseTokens);
        }
Example #2
        public virtual void TestGeneral()
        {
            TeeSinkTokenFilter source = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.ToString()), MockTokenizer.WHITESPACE, false));
            TokenStream        sink1  = source.NewSinkTokenStream();
            TokenStream        sink2  = source.NewSinkTokenStream(theFilter);

            source.AddAttribute<ICheckClearAttributesAttribute>();
            sink1.AddAttribute<ICheckClearAttributesAttribute>();
            sink2.AddAttribute<ICheckClearAttributesAttribute>();

            AssertTokenStreamContents(source, tokens1);
            AssertTokenStreamContents(sink1, tokens1);
            AssertTokenStreamContents(sink2, new string[] { "The", "the" });
        }
Example #3
        public virtual void Test()
        {
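            // TokenRangeSinkFilter(2, 4) routes only the tokens at positions [2, 4)
            // of the stream into the sink ("red" and "fox" in the text below).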
            TokenRangeSinkFilter sinkFilter = new TokenRangeSinkFilter(2, 4);
            string             test         = "The quick red fox jumped over the lazy brown dogs";
            TeeSinkTokenFilter tee          = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false));

            TeeSinkTokenFilter.SinkTokenStream rangeToks = tee.NewSinkTokenStream(sinkFilter);

            int count = 0;

            tee.Reset();
            while (tee.IncrementToken())
            {
                count++;
            }

            int sinkCount = 0;

            rangeToks.Reset();
            while (rangeToks.IncrementToken())
            {
                sinkCount++;
            }

            assertTrue(count + " does not equal: " + 10, count == 10);
            assertTrue("rangeToks Size: " + sinkCount + " is not: " + 2, sinkCount == 2);
        }
Example #4
        public virtual void TestLooseDateFormat()
        {
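            // The sink receives only the tokens that DateRecognizerSinkFilter can
            // parse as dates under the given culture (the two dates in the text below).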
            DateRecognizerSinkFilter sinkFilter = new DateRecognizerSinkFilter(CultureInfo.InvariantCulture);
            string             test             = "The quick red fox jumped over the lazy brown dogs on 7/11/2006  The dogs finally reacted on 7/2/2006";
            TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false));

            TeeSinkTokenFilter.SinkTokenStream sink = tee.NewSinkTokenStream(sinkFilter);
            int count = 0;

            tee.Reset();
            while (tee.IncrementToken())
            {
                count++;
            }
            assertTrue(count + " does not equal: " + 18, count == 18);

            int sinkCount = 0;

            sink.Reset();
            while (sink.IncrementToken())
            {
                sinkCount++;
            }
            assertTrue("sink Size: " + sinkCount + " is not: " + 2, sinkCount == 2);
        }
Example #5
        public virtual void TestEndOffsetPositionWithTeeSinkTokenFilter()
        {
            Store.Directory    dir         = NewDirectory();
            Analyzer           analyzer    = new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false);
            IndexWriter        w           = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
            Document           doc         = new Document();
            TokenStream        tokenStream = analyzer.GetTokenStream("field", "abcd   ");
            TeeSinkTokenFilter tee         = new TeeSinkTokenFilter(tokenStream);
            TokenStream        sink        = tee.NewSinkTokenStream();
            FieldType          ft          = new FieldType(TextField.TYPE_NOT_STORED);

            ft.StoreTermVectors         = true;
            ft.StoreTermVectorOffsets   = true;
            ft.StoreTermVectorPositions = true;
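            // Index the same token stream twice, once through the tee and once through
            // its sink; the single term therefore appears with frequency 2.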
            Field f1 = new Field("field", tee, ft);
            Field f2 = new Field("field", sink, ft);

            doc.Add(f1);
            doc.Add(f2);
            w.AddDocument(doc);
            w.Dispose();

            IndexReader r      = DirectoryReader.Open(dir);
            Terms       vector = r.GetTermVectors(0).GetTerms("field");

            assertEquals(1, vector.Count);
            TermsEnum termsEnum = vector.GetIterator(null);

            termsEnum.Next();
            assertEquals(2, termsEnum.TotalTermFreq);
            DocsAndPositionsEnum positions = termsEnum.DocsAndPositions(null, null);

            assertTrue(positions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            assertEquals(2, positions.Freq);
            positions.NextPosition();
            assertEquals(0, positions.StartOffset);
            assertEquals(4, positions.EndOffset);
            positions.NextPosition();
            assertEquals(8, positions.StartOffset);
            assertEquals(12, positions.EndOffset);
            assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.NextDoc());
            r.Dispose();
            dir.Dispose();
        }
Example #6
        public virtual void Test()
        {
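            // Route only tokens typed "D" into the sink; in this test WordTokenFilter
            // assigns type "D" to "dogs" and "word" to everything else.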
            TokenTypeSinkFilter sinkFilter = new TokenTypeSinkFilter("D");
            string test = "The quick red fox jumped over the lazy brown dogs";

            TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false)));

            TeeSinkTokenFilter.SinkTokenStream sink = ttf.NewSinkTokenStream(sinkFilter);

            bool seenDogs = false;

            ICharTermAttribute termAtt = ttf.AddAttribute<ICharTermAttribute>();
            ITypeAttribute     typeAtt = ttf.AddAttribute<ITypeAttribute>();

            ttf.Reset();
            while (ttf.IncrementToken())
            {
                if (termAtt.ToString().Equals("dogs", StringComparison.Ordinal))
                {
                    seenDogs = true;
                    assertTrue(typeAtt.Type + " is not equal to " + "D", typeAtt.Type.Equals("D", StringComparison.Ordinal) == true);
                }
                else
                {
                    assertTrue(typeAtt.Type + " is not equal to \"word\"", typeAtt.Type.Equals("word", StringComparison.Ordinal));
                }
            }
            assertTrue(seenDogs + " does not equal: " + true, seenDogs == true);

            int sinkCount = 0;

            sink.Reset();
            while (sink.IncrementToken())
            {
                sinkCount++;
            }

            assertTrue("sink Size: " + sinkCount + " is not: " + 1, sinkCount == 1);
        }
Example #7
        public virtual void Test()
        {
            TokenTypeSinkFilter sinkFilter = new TokenTypeSinkFilter("D");
            string test = "The quick red fox jumped over the lazy brown dogs";

            TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(this, new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false)));
            TeeSinkTokenFilter.SinkTokenStream sink = ttf.NewSinkTokenStream(sinkFilter);

            bool seenDogs = false;

            ICharTermAttribute termAtt = ttf.AddAttribute<ICharTermAttribute>();
            ITypeAttribute typeAtt = ttf.AddAttribute<ITypeAttribute>();
            ttf.Reset();
            while (ttf.IncrementToken())
            {
                if (termAtt.ToString().Equals("dogs"))
                {
                    seenDogs = true;
                    assertTrue(typeAtt.Type + " is not equal to " + "D", typeAtt.Type.Equals("D") == true);
                }
                else
                {
                    assertTrue(typeAtt.Type + " is not equal to \"word\"", typeAtt.Type.Equals("word"));
                }
            }
            assertTrue(seenDogs + " does not equal: " + true, seenDogs == true);

            int sinkCount = 0;
            sink.Reset();
            while (sink.IncrementToken())
            {
                sinkCount++;
            }

            assertTrue("sink Size: " + sinkCount + " is not: " + 1, sinkCount == 1);
        }
Example #10
        /// <summary>
        /// Not an explicit test, just useful to print out some info on performance
        /// </summary>
        public virtual void Performance()
        {
            int[] tokCount  = new int[] { 100, 500, 1000, 2000, 5000, 10000 };
            int[] modCounts = new int[] { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
            for (int k = 0; k < tokCount.Length; k++)
            {
                StringBuilder buffer = new StringBuilder();
                Console.WriteLine("-----Tokens: " + tokCount[k] + "-----");
                for (int i = 0; i < tokCount[k]; i++)
                {
                    buffer.Append(i.ToString(CultureInfo.InvariantCulture)).Append(' ');
                }
                //make sure we produce the same tokens
                TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))));
                TokenStream        sink      = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, 100));
                teeStream.ConsumeAllTokens();
                TokenStream        stream  = new ModuloTokenFilter(this, new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))), 100);
                ICharTermAttribute tfTok   = stream.AddAttribute<ICharTermAttribute>();
                ICharTermAttribute sinkTok = sink.AddAttribute<ICharTermAttribute>();
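                // the sink must replay exactly the tokens that the modulo filter produces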
                for (int i = 0; stream.IncrementToken(); i++)
                {
                    assertTrue(sink.IncrementToken());
                    assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.Equals(sinkTok) == true);
                }

                //simulate two fields, each being analyzed once, for 20 documents
                for (int j = 0; j < modCounts.Length; j++)
                {
                    int tfPos = 0;
                    long start = Environment.TickCount;
                    for (int i = 0; i < 20; i++)
                    {
                        stream = new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString())));
                        IPositionIncrementAttribute posIncrAtt = stream.GetAttribute<IPositionIncrementAttribute>();
                        while (stream.IncrementToken())
                        {
                            tfPos += posIncrAtt.PositionIncrement;
                        }
                        stream     = new ModuloTokenFilter(this, new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))), modCounts[j]);
                        posIncrAtt = stream.GetAttribute<IPositionIncrementAttribute>();
                        while (stream.IncrementToken())
                        {
                            tfPos += posIncrAtt.PositionIncrement;
                        }
                    }
                    long finish = Environment.TickCount;
                    Console.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
                    int sinkPos = 0;
                    //simulate one field with one sink
                    start = Environment.TickCount;
                    for (int i = 0; i < 20; i++)
                    {
                        teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))));
                        sink      = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, modCounts[j]));
                        IPositionIncrementAttribute posIncrAtt = teeStream.GetAttribute<IPositionIncrementAttribute>();
                        while (teeStream.IncrementToken())
                        {
                            sinkPos += posIncrAtt.PositionIncrement;
                        }
                        posIncrAtt = sink.GetAttribute<IPositionIncrementAttribute>();
                        while (sink.IncrementToken())
                        {
                            sinkPos += posIncrAtt.PositionIncrement;
                        }
                    }
                    finish = Environment.TickCount;
                    Console.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
                    assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);
                }
                Console.WriteLine("- End Tokens: " + tokCount[k] + "-----");
            }
        }