Exemplo n.º 1
0
        /// <summary>
        /// Attaches the same two sinks to two different tee sources and checks the
        /// tokens produced by both sources, by the <c>theFilter</c> sink, and by a
        /// <see cref="LowerCaseFilter"/> stacked on the first source after it is reset.
        /// </summary>
        public virtual void TestMultipleSources()
        {
            TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString())));

            TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.NewSinkTokenStream(dogFilter);
            TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.NewSinkTokenStream(theFilter);

            // source1 wraps tee1 in a CachingTokenFilter; it is Reset() and consumed a second time below.
            TokenStream source1 = new CachingTokenFilter(tee1);

            // NOTE(review): presumably this attribute lets AssertTokenStreamContents verify
            // that attributes are cleared between tokens - confirm against the test base class.
            tee1.AddAttribute<ICheckClearAttributesAttribute>();
            dogDetector.AddAttribute<ICheckClearAttributesAttribute>();
            theDetector.AddAttribute<ICheckClearAttributesAttribute>();

            // The second tee feeds the very same sink instances as the first one.
            TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer2.ToString())));

            tee2.AddSinkTokenStream(dogDetector);
            tee2.AddSinkTokenStream(theDetector);
            TokenStream source2 = tee2;

            AssertTokenStreamContents(source1, tokens1);
            AssertTokenStreamContents(source2, tokens2);

            // NOTE(review): dogDetector's output is not asserted here; its expected tokens
            // depend on dogFilter, which is not visible from this method.
            AssertTokenStreamContents(theDetector, new String[] { "The", "the", "The", "the" });

            source1.Reset();
            TokenStream lowerCasing = new LowerCaseFilter(source1);

            // NOTE(review): ToLower() is culture-sensitive; left unchanged on the assumption
            // that it mirrors what LowerCaseFilter does - confirm.
            String[] lowerCaseTokens = new String[tokens1.Length];
            for (int i = 0; i < tokens1.Length; i++)
            {
                lowerCaseTokens[i] = tokens1[i].ToLower();
            }

            // Previously the expected array was built but never compared against the stream.
            AssertTokenStreamContents(lowerCasing, lowerCaseTokens);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Builds a tee over the whitespace-tokenized <c>buffer1</c>, creates one sink
        /// with the no-argument overload and one with <c>theFilter</c>, and checks that
        /// the tee and the first sink both produce <c>tokens1</c>.
        /// </summary>
        public virtual void TestGeneral()
        {
            System.IO.StringReader input = new System.IO.StringReader(buffer1.ToString());
            WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(input);
            TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenizer);

            TokenStream defaultSink = tee.NewSinkTokenStream();
            // NOTE(review): this sink is registered but its contents are never asserted below.
            TokenStream theSink = tee.NewSinkTokenStream(theFilter);

            tee.AddAttribute<ICheckClearAttributesAttribute>();
            defaultSink.AddAttribute<ICheckClearAttributesAttribute>();
            theSink.AddAttribute<ICheckClearAttributesAttribute>();

            // The tee is consumed first, then the sink is checked against the same tokens.
            AssertTokenStreamContents(tee, tokens1);
            AssertTokenStreamContents(defaultSink, tokens1);
        }
		/// <summary>
		/// Builds a tee over the whitespace-tokenized <c>buffer1</c>, creates one sink
		/// with the no-argument overload and one with <c>theFilter</c>, and checks that
		/// the tee and the first sink both produce <c>tokens1</c>.
		/// </summary>
		public virtual void  TestGeneral()
		{
			System.IO.StringReader input = new System.IO.StringReader(buffer1.ToString());
			WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(input);
			TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenizer);

			TokenStream defaultSink = tee.NewSinkTokenStream();
			// NOTE(review): this sink is registered but its contents are never asserted below.
			TokenStream theSink = tee.NewSinkTokenStream(theFilter);

			tee.AddAttribute(typeof(CheckClearAttributesAttribute));
			defaultSink.AddAttribute(typeof(CheckClearAttributesAttribute));
			theSink.AddAttribute(typeof(CheckClearAttributesAttribute));

			// The tee is consumed first, then the sink is checked against the same tokens.
			AssertTokenStreamContents(tee, tokens1);
			AssertTokenStreamContents(defaultSink, tokens1);
		}
Exemplo n.º 4
0
        /// <summary>
        /// Not an explicit test, just useful to print out some info on performance.
        /// For each token count it first checks that a tee/sink pair yields the same
        /// terms as a <c>ModuloTokenFilter</c> over identical input, then times analysing
        /// the input as two independent streams versus one tee plus one sink.
        /// </summary>
        /// <remarks>
        /// Timings are written to the console in milliseconds. An assertion fails when
        /// the two approaches accumulate different position-increment totals.
        /// </remarks>
        public virtual void Performance()
        {
            int[] tokCount  = new int[] { 100, 500, 1000, 2000, 5000, 10000 };
            int[] modCounts = new int[] { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
            for (int k = 0; k < tokCount.Length; k++)
            {
                System.Text.StringBuilder buffer = new System.Text.StringBuilder();
                System.Console.Out.WriteLine("-----Tokens: " + tokCount[k] + "-----");
                for (int i = 0; i < tokCount[k]; i++)
                {
                    // Invariant casing keeps the generated input independent of the current culture.
                    buffer.Append(English.IntToEnglish(i).ToUpperInvariant()).Append(' ');
                }

                // Materialise the text once so the timed loops below do not also measure
                // repeated StringBuilder.ToString() calls (which the two-field side made
                // twice per iteration and the tee side only once).
                string text = buffer.ToString();

                //make sure we produce the same tokens
                TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(text))));
                TokenStream        sink      = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, 100));
                teeStream.ConsumeAllTokens();
                TokenStream    stream  = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(text))), 100);
                ITermAttribute tfTok   = stream.AddAttribute<ITermAttribute>();
                ITermAttribute sinkTok = sink.AddAttribute<ITermAttribute>();
                for (int i = 0; stream.IncrementToken(); i++)
                {
                    Assert.IsTrue(sink.IncrementToken());
                    Assert.IsTrue(tfTok.Equals(sinkTok), tfTok + " is not equal to " + sinkTok + " at token: " + i);
                }

                //simulate two fields, each being analyzed once, for 20 documents
                for (int j = 0; j < modCounts.Length; j++)
                {
                    int tfPos = 0;
                    // Stopwatch is used instead of DateTime.Now tick arithmetic for elapsed-time measurement.
                    System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
                    for (int i = 0; i < 20; i++)
                    {
                        stream = new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(text)));
                        IPositionIncrementAttribute posIncrAtt = stream.GetAttribute<IPositionIncrementAttribute>();
                        while (stream.IncrementToken())
                        {
                            tfPos += posIncrAtt.PositionIncrement;
                        }
                        stream     = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(text))), modCounts[j]);
                        posIncrAtt = stream.GetAttribute<IPositionIncrementAttribute>();
                        while (stream.IncrementToken())
                        {
                            tfPos += posIncrAtt.PositionIncrement;
                        }
                    }
                    timer.Stop();
                    System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + timer.ElapsedMilliseconds + " ms");

                    int sinkPos = 0;
                    //simulate one field with one sink
                    timer = System.Diagnostics.Stopwatch.StartNew();
                    for (int i = 0; i < 20; i++)
                    {
                        teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(text))));
                        sink      = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, modCounts[j]));
                        IPositionIncrementAttribute posIncrAtt = teeStream.GetAttribute<IPositionIncrementAttribute>();
                        while (teeStream.IncrementToken())
                        {
                            sinkPos += posIncrAtt.PositionIncrement;
                        }
                        // The sink is drained only after the tee has been fully consumed.
                        posIncrAtt = sink.GetAttribute<IPositionIncrementAttribute>();
                        while (sink.IncrementToken())
                        {
                            sinkPos += posIncrAtt.PositionIncrement;
                        }
                    }
                    timer.Stop();
                    System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + timer.ElapsedMilliseconds + " ms");

                    // Both approaches must have accumulated the same position-increment total.
                    Assert.IsTrue(sinkPos == tfPos, sinkPos + " does not equal: " + tfPos);
                }
                System.Console.Out.WriteLine("- End Tokens: " + tokCount[k] + "-----");
            }
        }
		/// <summary>
		/// Attaches the same two sinks to two different tee sources and checks the
		/// tokens produced by both sources, by the <c>theFilter</c> sink, and by a
		/// <see cref="LowerCaseFilter"/> stacked on the first source after it is reset.
		/// </summary>
		public virtual void  TestMultipleSources()
		{
			TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString())));
			TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.NewSinkTokenStream(dogFilter);
			TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.NewSinkTokenStream(theFilter);

			// source1 wraps tee1 in a CachingTokenFilter; it is Reset() and consumed a second time below.
			TokenStream source1 = new CachingTokenFilter(tee1);

			// NOTE(review): presumably this attribute lets AssertTokenStreamContents verify
			// that attributes are cleared between tokens - confirm against the test base class.
			tee1.AddAttribute(typeof(CheckClearAttributesAttribute));
			dogDetector.AddAttribute(typeof(CheckClearAttributesAttribute));
			theDetector.AddAttribute(typeof(CheckClearAttributesAttribute));

			// The second tee feeds the very same sink instances as the first one.
			TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer2.ToString())));
			tee2.AddSinkTokenStream(dogDetector);
			tee2.AddSinkTokenStream(theDetector);
			TokenStream source2 = tee2;

			AssertTokenStreamContents(source1, tokens1);
			AssertTokenStreamContents(source2, tokens2);

			// NOTE(review): dogDetector's output is not asserted here; its expected tokens
			// depend on dogFilter, which is not visible from this method.
			AssertTokenStreamContents(theDetector, new String[] { "The", "the", "The", "the" });

			source1.Reset();
			TokenStream lowerCasing = new LowerCaseFilter(source1);

			// NOTE(review): ToLower() is culture-sensitive; left unchanged on the assumption
			// that it mirrors what LowerCaseFilter does - confirm.
			String[] lowerCaseTokens = new String[tokens1.Length];
			for (int i = 0; i < tokens1.Length; i++)
			{
				lowerCaseTokens[i] = tokens1[i].ToLower();
			}

			// Previously the expected array was built but never compared against the stream.
			AssertTokenStreamContents(lowerCasing, lowerCaseTokens);
		}
		/// <summary>
		/// Not an explicit test, just useful to print out some info on performance.
		/// For each token count it first checks that a tee/sink pair yields the same
		/// terms as a <c>ModuloTokenFilter</c> over identical input, then times analysing
		/// the input as two independent streams versus one tee plus one sink.
		/// </summary>
		/// <remarks>
		/// Timings are written to the console in milliseconds. An assertion fails when
		/// the two approaches accumulate different position-increment totals.
		/// </remarks>
		public virtual void  Performance()
		{
			int[] tokCount = new int[]{100, 500, 1000, 2000, 5000, 10000};
			int[] modCounts = new int[]{1, 2, 5, 10, 20, 50, 100, 200, 500};
			for (int k = 0; k < tokCount.Length; k++)
			{
				System.Text.StringBuilder buffer = new System.Text.StringBuilder();
				System.Console.Out.WriteLine("-----Tokens: " + tokCount[k] + "-----");
				for (int i = 0; i < tokCount[k]; i++)
				{
					// Invariant casing keeps the generated input independent of the current culture.
					buffer.Append(English.IntToEnglish(i).ToUpperInvariant()).Append(' ');
				}

				// Materialise the text once so the timed loops below do not also measure
				// repeated StringBuilder.ToString() calls (which the two-field side made
				// twice per iteration and the tee side only once).
				string text = buffer.ToString();

				//make sure we produce the same tokens
				TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(text))));
				TokenStream sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, 100));
				teeStream.ConsumeAllTokens();
				TokenStream stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(text))), 100);
				TermAttribute tfTok = (TermAttribute) stream.AddAttribute(typeof(TermAttribute));
				TermAttribute sinkTok = (TermAttribute) sink.AddAttribute(typeof(TermAttribute));
				for (int i = 0; stream.IncrementToken(); i++)
				{
					Assert.IsTrue(sink.IncrementToken());
					Assert.IsTrue(tfTok.Equals(sinkTok), tfTok + " is not equal to " + sinkTok + " at token: " + i);
				}

				//simulate two fields, each being analyzed once, for 20 documents
				for (int j = 0; j < modCounts.Length; j++)
				{
					int tfPos = 0;
					// Stopwatch is used instead of DateTime.Now tick arithmetic for elapsed-time measurement.
					System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
					for (int i = 0; i < 20; i++)
					{
						stream = new StandardFilter(new StandardTokenizer(new System.IO.StringReader(text)));
						PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.GetAttribute(typeof(PositionIncrementAttribute));
						while (stream.IncrementToken())
						{
							tfPos += posIncrAtt.GetPositionIncrement();
						}
						stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(text))), modCounts[j]);
						posIncrAtt = (PositionIncrementAttribute) stream.GetAttribute(typeof(PositionIncrementAttribute));
						while (stream.IncrementToken())
						{
							tfPos += posIncrAtt.GetPositionIncrement();
						}
					}
					timer.Stop();
					System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + timer.ElapsedMilliseconds + " ms");

					int sinkPos = 0;
					//simulate one field with one sink
					timer = System.Diagnostics.Stopwatch.StartNew();
					for (int i = 0; i < 20; i++)
					{
						teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(text))));
						sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, modCounts[j]));
						PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) teeStream.GetAttribute(typeof(PositionIncrementAttribute));
						while (teeStream.IncrementToken())
						{
							sinkPos += posIncrAtt.GetPositionIncrement();
						}
						// The sink is drained only after the tee has been fully consumed.
						posIncrAtt = (PositionIncrementAttribute) sink.GetAttribute(typeof(PositionIncrementAttribute));
						while (sink.IncrementToken())
						{
							sinkPos += posIncrAtt.GetPositionIncrement();
						}
					}
					timer.Stop();
					System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + timer.ElapsedMilliseconds + " ms");

					// Both approaches must have accumulated the same position-increment total.
					Assert.IsTrue(sinkPos == tfPos, sinkPos + " does not equal: " + tfPos);
				}
				System.Console.Out.WriteLine("- End Tokens: " + tokCount[k] + "-----");
			}
		}