Example #1
        private void TestTeeSinkCustomToken(int api)
        {
            TokenStream stream = new WhitespaceTokenizer(new System.IO.StringReader(doc));

            stream = new PartOfSpeechTaggingFilter(stream);
            stream = new LowerCaseFilter(stream);
            stream = new StopFilter(stream, stopwords);

            SinkTokenizer sink    = new SinkTokenizer();
            TokenStream   stream1 = new PartOfSpeechAnnotatingFilter(sink);

            stream = new TeeTokenFilter(stream, sink);
            stream = new PartOfSpeechAnnotatingFilter(stream);

            switch (api)
            {
            case 0:
                ConsumeStreamNewAPI(stream);
                ConsumeStreamNewAPI(stream1);
                break;

            case 1:
                ConsumeStreamOldAPI(stream);
                ConsumeStreamOldAPI(stream1);
                break;

            case 2:
                ConsumeStreamVeryOldAPI(stream);
                ConsumeStreamVeryOldAPI(stream1);
                break;
            }
        }
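The pattern above tees one analyzed stream into a SinkTokenizer so that a second consumer (here, another PartOfSpeechAnnotatingFilter) can read the same tokens without re-tokenizing the input. A minimal sketch of just that wiring, against the same Lucene.Net 2.x API the tests use (the input text is illustrative):

        TokenStream source = new WhitespaceTokenizer(
            new System.IO.StringReader("The quick brown fox"));
        SinkTokenizer sink = new SinkTokenizer();   // caches every token it is fed
        source = new TeeTokenFilter(source, sink);  // tee: copy each token into the sink

        Token reusable = new Token();
        while (source.Next(reusable) != null)
        {
            // drain the main stream; as a side effect the sink fills up
        }

        // the sink can now be consumed (or wrapped in more filters) on its own
        for (Token t = sink.Next(reusable); t != null; t = sink.Next(reusable))
        {
            System.Console.Out.WriteLine(t.Term());
        }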
Example #2
        public virtual void TestMultipleSources()
        {
            SinkTokenizer theDetector   = new AnonymousClassSinkTokenizer1(this, null);
            SinkTokenizer dogDetector   = new AnonymousClassSinkTokenizer2(this, null);
            TokenStream   source1       = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString())), theDetector), dogDetector));
            TokenStream   source2       = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer2.ToString())), theDetector), dogDetector);
            int           i             = 0;
            Token         reusableToken = new Token();

            for (Token nextToken = source1.Next(reusableToken); nextToken != null; nextToken = source1.Next(reusableToken))
            {
                Assert.IsTrue(nextToken.Term().Equals(tokens1[i]), nextToken.Term() + " is not equal to " + tokens1[i]);
                i++;
            }
            Assert.IsTrue(i == tokens1.Length, i + " does not equal: " + tokens1.Length);
            Assert.IsTrue(theDetector.GetTokens().Count == 2, "theDetector Size: " + theDetector.GetTokens().Count + " is not: " + 2);
            Assert.IsTrue(dogDetector.GetTokens().Count == 1, "dogDetector Size: " + dogDetector.GetTokens().Count + " is not: " + 1);
            i = 0;
            for (Token nextToken = source2.Next(reusableToken); nextToken != null; nextToken = source2.Next(reusableToken))
            {
                Assert.IsTrue(nextToken.Term().Equals(tokens2[i]), nextToken.Term() + " is not equal to " + tokens2[i]);
                i++;
            }
            Assert.IsTrue(i == tokens2.Length, i + " does not equal: " + tokens2.Length);
            Assert.IsTrue(theDetector.GetTokens().Count == 4, "theDetector Size: " + theDetector.GetTokens().Count + " is not: " + 4);
            Assert.IsTrue(dogDetector.GetTokens().Count == 2, "dogDetector Size: " + dogDetector.GetTokens().Count + " is not: " + 2);
            i = 0;
            for (Token nextToken = theDetector.Next(reusableToken); nextToken != null; nextToken = theDetector.Next(reusableToken))
            {
                Assert.IsTrue(nextToken.Term().ToUpper().Equals("The".ToUpper()), nextToken.Term() + " is not equal to " + "The");
                i++;
            }
            Assert.IsTrue(i == theDetector.GetTokens().Count, i + " does not equal: " + theDetector.GetTokens().Count);
            i = 0;
            for (Token nextToken = dogDetector.Next(reusableToken); nextToken != null; nextToken = dogDetector.Next(reusableToken))
            {
                Assert.IsTrue(nextToken.Term().ToUpper().Equals("Dogs".ToUpper()), nextToken.Term() + " is not equal to " + "Dogs");
                i++;
            }
            Assert.IsTrue(i == dogDetector.GetTokens().Count, i + " does not equal: " + dogDetector.GetTokens().Count);
            source1.Reset();
            TokenStream lowerCasing = new LowerCaseFilter(source1);

            i = 0;
            for (Token nextToken = lowerCasing.Next(reusableToken); nextToken != null; nextToken = lowerCasing.Next(reusableToken))
            {
                Assert.IsTrue(nextToken.Term().Equals(tokens1[i].ToLower()), nextToken.Term() + " is not equal to " + tokens1[i].ToLower());
                i++;
            }
            Assert.IsTrue(i == tokens1.Length, i + " does not equal: " + tokens1.Length);
        }
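Note the CachingTokenFilter around source1: the first pass through the stream records each token, so the later source1.Reset() replays the cached tokens into LowerCaseFilter instead of re-reading the already exhausted underlying reader. The same idea in isolation (a sketch; the input text is illustrative):

            TokenStream cached = new CachingTokenFilter(
                new WhitespaceTokenizer(new System.IO.StringReader("One Two Three")));
            Token reusable = new Token();
            while (cached.Next(reusable) != null)
            {
                // first pass: consume and cache
            }
            cached.Reset();                                    // rewind onto the cache
            TokenStream lowered = new LowerCaseFilter(cached); // second pass reads the cache
            for (Token t = lowered.Next(reusable); t != null; t = lowered.Next(reusable))
            {
                System.Console.Out.WriteLine(t.Term());        // one, two, three
            }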
Example #3
        public virtual void Test()
        {
            SinkTokenizer sink1         = new AnonymousClassSinkTokenizer(this, null);
            TokenStream   source        = new TeeTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString())), sink1);
            int           i             = 0;
            Token         reusableToken = new Token();

            for (Token nextToken = source.Next(reusableToken); nextToken != null; nextToken = source.Next(reusableToken))
            {
                Assert.IsTrue(nextToken.Term().Equals(tokens1[i]), nextToken.Term() + " is not equal to " + tokens1[i]);
                i++;
            }
            Assert.IsTrue(i == tokens1.Length, i + " does not equal: " + tokens1.Length);
            Assert.IsTrue(sink1.GetTokens().Count == 2, "sink1 Size: " + sink1.GetTokens().Count + " is not: " + 2);
            i = 0;
            for (Token token = sink1.Next(reusableToken); token != null; token = sink1.Next(reusableToken))
            {
                Assert.IsTrue(token.Term().ToUpper().Equals("The".ToUpper()), token.Term() + " is not equal to " + "The");
                i++;
            }
            Assert.IsTrue(i == sink1.GetTokens().Count, i + " does not equal: " + sink1.GetTokens().Count);
        }
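The AnonymousClassSinkTokenizer helpers used by these tests are not shown in the listings. Judging from the assertions, they override SinkTokenizer.Add so that only matching tokens are cached. A plausible named equivalent for the "the"-detecting sink (the class name is ours, and we assume Add is overridable in this port, as the anonymous subclasses imply):

        class TheDetectorSink : SinkTokenizer
        {
            // cache only tokens whose term is "the", case-insensitively,
            // which matches what the assertions expect of theDetector/sink1
            public override void Add(Token t)
            {
                if (t != null && t.Term().ToLower().Equals("the"))
                {
                    base.Add(t);
                }
            }
        }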
Example #4
		/// <summary> Not an explicit test; just useful for printing some performance info.
		/// </summary>
		/// <throws>  Exception </throws>
		public virtual void Performance()
		{
			int[] tokCount = new int[]{100, 500, 1000, 2000, 5000, 10000};
			int[] modCounts = new int[]{1, 2, 5, 10, 20, 50, 100, 200, 500};
			for (int k = 0; k < tokCount.Length; k++)
			{
				System.Text.StringBuilder buffer = new System.Text.StringBuilder();
				System.Console.Out.WriteLine("-----Tokens: " + tokCount[k] + "-----");
				for (int i = 0; i < tokCount[k]; i++)
				{
					buffer.Append(English.IntToEnglish(i).ToUpper()).Append(' ');
				}
				//make sure we produce the same tokens
				ModuloSinkTokenizer sink = new ModuloSinkTokenizer(this, tokCount[k], 100);
				Token reusableToken = new Token();
				TokenStream stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), sink);
				while (stream.Next(reusableToken) != null)
				{
					// drain the stream so the sink collects its tokens
				}
				stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), 100);
				System.Collections.IList tmp = new System.Collections.ArrayList();
				for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
				{
					tmp.Add(nextToken.Clone());
				}
				System.Collections.IList sinkList = sink.GetTokens();
				Assert.IsTrue(tmp.Count == sinkList.Count, "tmp Size: " + tmp.Count + " is not: " + sinkList.Count);
				for (int i = 0; i < tmp.Count; i++)
				{
					Token tfTok = (Token) tmp[i];
					Token sinkTok = (Token) sinkList[i];
					Assert.IsTrue(tfTok.Term().Equals(sinkTok.Term()), tfTok.Term() + " is not equal to " + sinkTok.Term() + " at token: " + i);
				}
				//simulate two fields, each being analyzed once, for 20 documents
				
				for (int j = 0; j < modCounts.Length; j++)
				{
					int tfPos = 0;
					long start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
					for (int i = 0; i < 20; i++)
					{
						stream = new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString())));
						for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
						{
							tfPos += nextToken.GetPositionIncrement();
						}
						stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), modCounts[j]);
						for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
						{
							tfPos += nextToken.GetPositionIncrement();
						}
					}
					long finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
					System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
					int sinkPos = 0;
					//simulate one field with one sink
					start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
					for (int i = 0; i < 20; i++)
					{
						sink = new ModuloSinkTokenizer(this, tokCount[k], modCounts[j]);
						stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), sink);
						for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
						{
							sinkPos += nextToken.GetPositionIncrement();
						}
						stream = sink;
						for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
						{
							sinkPos += nextToken.GetPositionIncrement();
						}
					}
					finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
					System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
					Assert.IsTrue(sinkPos == tfPos, sinkPos + " does not equal: " + tfPos);
				}
				System.Console.Out.WriteLine("- End Tokens: " + tokCount[k] + "-----");
			}
		}
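ModuloTokenFilter and ModuloSinkTokenizer are private helpers of this benchmark and are likewise not shown. From the way they are used, ModuloTokenFilter passes through every modCount-th token of its input; the benchmark then checks that analyzing the text twice ("Two fields") and analyzing it once while teeing into a sink ("Tee fields") accumulate the same total position increment. A sketch of the filter's likely shape (details assumed; the outer-class "this" parameter left over from the Java-to-C# conversion is dropped):

		class ModuloTokenFilter : TokenFilter
		{
			private readonly int modCount;
			private int count;

			public ModuloTokenFilter(TokenStream input, int modCount) : base(input)
			{
				this.modCount = modCount;
			}

			// return only every modCount-th token from the wrapped stream
			public override Token Next(Token reusableToken)
			{
				for (Token t = input.Next(reusableToken); t != null; t = input.Next(reusableToken))
				{
					bool keep = (count % modCount == 0);
					count++;
					if (keep)
					{
						return t;
					}
				}
				return null;
			}
		}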