Пример #1
0
        /// <summary>
        /// Runs two whitespace-tokenized sources (built from buffer1 and buffer2) through the
        /// same pair of tee'd sinks. Checks that each source yields the terms in tokens1 /
        /// tokens2, that the sink token counts are 2 and 1 after the first source and 4 and 2
        /// after the second, that the sinks replay only "The" / "Dogs" terms (ignoring case),
        /// and that the cached first source can be reset and replayed through a LowerCaseFilter.
        /// </summary>
        public virtual void  TestMultipleSources()
        {
            SinkTokenizer theDetector   = new AnonymousClassSinkTokenizer1(this, null);
            SinkTokenizer dogDetector   = new AnonymousClassSinkTokenizer2(this, null);
            TokenStream   source1       = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString())), theDetector), dogDetector));
            TokenStream   source2       = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer2.ToString())), theDetector), dogDetector);
            int           i             = 0;
            Token         reusableToken = new Token();

            // First source: terms must match tokens1 one-for-one.
            for (Token nextToken = source1.Next(reusableToken); nextToken != null; nextToken = source1.Next(reusableToken))
            {
                Assert.IsTrue(nextToken.Term().Equals(tokens1[i]), nextToken.Term() + " is not equal to " + tokens1[i]);
                i++;
            }
            Assert.IsTrue(i == tokens1.Length, i + " does not equal: " + tokens1.Length);
            Assert.IsTrue(theDetector.GetTokens().Count == 2, "theDetector Size: " + theDetector.GetTokens().Count + " is not: " + 2);
            Assert.IsTrue(dogDetector.GetTokens().Count == 1, "dogDetector Size: " + dogDetector.GetTokens().Count + " is not: " + 1);

            // Second source: terms must match tokens2, and the shared sinks keep accumulating.
            i = 0;
            for (Token nextToken = source2.Next(reusableToken); nextToken != null; nextToken = source2.Next(reusableToken))
            {
                Assert.IsTrue(nextToken.Term().Equals(tokens2[i]), nextToken.Term() + " is not equal to " + tokens2[i]);
                i++;
            }
            Assert.IsTrue(i == tokens2.Length, i + " does not equal: " + tokens2.Length);
            Assert.IsTrue(theDetector.GetTokens().Count == 4, "theDetector Size: " + theDetector.GetTokens().Count + " is not: " + 4);
            Assert.IsTrue(dogDetector.GetTokens().Count == 2, "dogDetector Size: " + dogDetector.GetTokens().Count + " is not: " + 2);

            // Replay each sink. Comparisons are ordinal and case-insensitive so the outcome
            // does not depend on the current culture's casing rules.
            i = 0;
            for (Token nextToken = theDetector.Next(reusableToken); nextToken != null; nextToken = theDetector.Next(reusableToken))
            {
                Assert.IsTrue(System.String.Equals(nextToken.Term(), "The", System.StringComparison.OrdinalIgnoreCase), nextToken.Term() + " is not equal to " + "The");
                i++;
            }
            Assert.IsTrue(i == theDetector.GetTokens().Count, i + " does not equal: " + theDetector.GetTokens().Count);
            i = 0;
            for (Token nextToken = dogDetector.Next(reusableToken); nextToken != null; nextToken = dogDetector.Next(reusableToken))
            {
                Assert.IsTrue(System.String.Equals(nextToken.Term(), "Dogs", System.StringComparison.OrdinalIgnoreCase), nextToken.Term() + " is not equal to " + "Dogs");
                i++;
            }
            Assert.IsTrue(i == dogDetector.GetTokens().Count, i + " does not equal: " + dogDetector.GetTokens().Count);

            // Rewind the cached first source and replay it through a lower-casing filter.
            source1.Reset();
            TokenStream lowerCasing = new LowerCaseFilter(source1);

            i = 0;
            for (Token nextToken = lowerCasing.Next(reusableToken); nextToken != null; nextToken = lowerCasing.Next(reusableToken))
            {
                Assert.IsTrue(nextToken.Term().Equals(tokens1[i].ToLower()), nextToken.Term() + " is not equal to " + tokens1[i].ToLower());
                i++;
            }
            Assert.IsTrue(i == tokens1.Length, i + " does not equal: " + tokens1.Length);
        }
Пример #2
0
 /// <summary>
 /// Forwards <paramref name="t"/> to the base implementation only when it is non-null and
 /// its term equals "The", ignoring case; any other token is dropped.
 /// </summary>
 /// <param name="t">Candidate token; may be null.</param>
 public override void  Add(Token t)
 {
     // Ordinal, case-insensitive comparison: independent of the current culture and
     // avoids allocating upper-cased copies of both strings on every token.
     if (t != null && System.String.Equals(t.Term(), "The", System.StringComparison.OrdinalIgnoreCase))
     {
         base.Add(t);
     }
 }
Пример #3
0
		/// <summary>
		/// Builds tokens with the four constructor forms used here - no arguments, two ints,
		/// two ints plus an int, and two ints plus a string - and checks the resulting term,
		/// type, flags and ToString() output.
		/// </summary>
		public virtual void  TestCtor()
		{
			Token t = new Token();
			char[] content = "hello".ToCharArray();
			t.SetTermBuffer(content, 0, content.Length);
			char[] buf = t.TermBuffer();
			// Identity check: the token must not expose the caller's array as its own buffer.
			// (An element-wise inequality check would pass merely because the lengths differ.)
			Assert.AreNotSame(content, buf);
			Assert.AreEqual("hello", t.Term());
			Assert.AreEqual("word", t.Type());
			Assert.AreEqual(0, t.GetFlags());
			
			t = new Token(6, 22);
			t.SetTermBuffer(content, 0, content.Length);
			Assert.AreEqual("hello", t.Term());
			Assert.AreEqual("(hello,6,22)", t.ToString());
			Assert.AreEqual("word", t.Type());
			Assert.AreEqual(0, t.GetFlags());
			
			t = new Token(6, 22, 7);
			t.SetTermBuffer(content, 0, content.Length);
			Assert.AreEqual("hello", t.Term());
			Assert.AreEqual("(hello,6,22)", t.ToString());
			Assert.AreEqual(7, t.GetFlags());
			
			t = new Token(6, 22, "junk");
			t.SetTermBuffer(content, 0, content.Length);
			Assert.AreEqual("hello", t.Term());
			Assert.AreEqual("(hello,6,22,type=junk)", t.ToString());
			Assert.AreEqual(0, t.GetFlags());
		}
Пример #4
0
        /// <summary>
        /// Builds tokens with the four constructor forms used here - no arguments, two ints,
        /// two ints plus an int, and two ints plus a string - and checks the resulting term,
        /// type, flags and ToString() output.
        /// </summary>
        public virtual void  TestCtor()
        {
            Token t = new Token();

            char[] content = "hello".ToCharArray();
            t.SetTermBuffer(content, 0, content.Length);
            char[] buf = t.TermBuffer();
            // Identity check: the token must not expose the caller's array as its own buffer.
            // (An element-wise inequality check would pass merely because the lengths differ.)
            Assert.AreNotSame(content, buf);
            Assert.AreEqual("hello", t.Term());
            Assert.AreEqual("word", t.Type());
            Assert.AreEqual(0, t.GetFlags());

            t = new Token(6, 22);
            t.SetTermBuffer(content, 0, content.Length);
            Assert.AreEqual("hello", t.Term());
            Assert.AreEqual("(hello,6,22)", t.ToString());
            Assert.AreEqual("word", t.Type());
            Assert.AreEqual(0, t.GetFlags());

            t = new Token(6, 22, 7);
            t.SetTermBuffer(content, 0, content.Length);
            Assert.AreEqual("hello", t.Term());
            Assert.AreEqual("(hello,6,22)", t.ToString());
            Assert.AreEqual(7, t.GetFlags());

            t = new Token(6, 22, "junk");
            t.SetTermBuffer(content, 0, content.Length);
            Assert.AreEqual("hello", t.Term());
            Assert.AreEqual("(hello,6,22,type=junk)", t.ToString());
            Assert.AreEqual(0, t.GetFlags());
        }
Пример #5
0
			/// <summary>
			/// Forwards <paramref name="t"/> to the base implementation only when it is non-null and
			/// its term equals "The", ignoring case; any other token is dropped.
			/// </summary>
			/// <param name="t">Candidate token; may be null.</param>
			public override void  Add(Token t)
			{
				// Ordinal, case-insensitive comparison: independent of the current culture and
				// avoids allocating upper-cased copies of both strings on every token.
				if (t != null && System.String.Equals(t.Term(), "The", System.StringComparison.OrdinalIgnoreCase))
				{
					base.Add(t);
				}
			}
Пример #6
0
        /// <summary>
        /// Mixes the string-based accessors (TermText / SetTermText) with the char[]-based
        /// ones (SetTermBuffer / TermBuffer) on one token and checks they stay in agreement.
        /// </summary>
        public virtual void  TestMixedStringArray()
        {
            Token t = new Token("hello", 0, 5);

            // Assert.AreEqual takes (expected, actual); keeping that order makes a failure
            // message report the right value as "expected".
            Assert.AreEqual("hello", t.TermText());
            Assert.AreEqual(5, t.TermLength());
            Assert.AreEqual("hello", t.Term());
            t.SetTermText("hello2");
            Assert.AreEqual(6, t.TermLength());
            Assert.AreEqual("hello2", t.Term());
            t.SetTermBuffer("hello3".ToCharArray(), 0, 6);
            Assert.AreEqual("hello3", t.TermText());

            // Make sure if we get the buffer and change a character
            // that TermText() reflects the change
            char[] buffer = t.TermBuffer();
            buffer[1] = 'o';
            Assert.AreEqual("hollo3", t.TermText());
        }
Пример #7
0
		/// <summary>
		/// Requests every term-buffer size from 0 up to 1999 on a token holding "hello" and
		/// checks, after each request, that the buffer is at least that large and that the
		/// term text is still "hello".
		/// </summary>
		public virtual void  TestResize()
		{
			Token token = new Token();
			char[] seed = "hello".ToCharArray();
			token.SetTermBuffer(seed, 0, seed.Length);

			int requested = 0;
			while (requested < 2000)
			{
				token.ResizeTermBuffer(requested);
				char[] resized = token.TermBuffer();
				Assert.IsTrue(resized.Length >= requested);
				Assert.AreEqual("hello", token.Term());
				requested++;
			}
		}
Пример #8
0
        /// <summary>
        /// Requests every term-buffer size from 0 up to 1999 on a token holding "hello" and
        /// checks, after each request, that the buffer is at least that large and that the
        /// term text is still "hello".
        /// </summary>
        public virtual void  TestResize()
        {
            Token token = new Token();
            char[] seed = "hello".ToCharArray();
            token.SetTermBuffer(seed, 0, seed.Length);

            int requested = 0;
            while (requested < 2000)
            {
                token.ResizeTermBuffer(requested);
                char[] resized = token.TermBuffer();
                Assert.IsTrue(resized.Length >= requested);
                Assert.AreEqual("hello", token.Term());
                requested++;
            }
        }
Пример #9
0
        /// <summary>
        /// Tees a whitespace-tokenized stream over buffer1 into a single sink, checks that the
        /// main stream yields the terms in tokens1, that the sink then holds two tokens, and
        /// that replaying the sink yields only terms equal to "The" (ignoring case).
        /// </summary>
        public virtual void  Test()
        {
            SinkTokenizer sink1         = new AnonymousClassSinkTokenizer(this, null);
            TokenStream   source        = new TeeTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString())), sink1);
            int           i             = 0;
            Token         reusableToken = new Token();

            for (Token nextToken = source.Next(reusableToken); nextToken != null; nextToken = source.Next(reusableToken))
            {
                Assert.IsTrue(nextToken.Term().Equals(tokens1[i]), nextToken.Term() + " is not equal to " + tokens1[i]);
                i++;
            }
            Assert.IsTrue(i == tokens1.Length, i + " does not equal: " + tokens1.Length);
            Assert.IsTrue(sink1.GetTokens().Count == 2, "sink1 Size: " + sink1.GetTokens().Count + " is not: " + 2);
            i = 0;
            for (Token token = sink1.Next(reusableToken); token != null; token = sink1.Next(reusableToken))
            {
                // Ordinal, case-insensitive comparison so the result does not depend on the
                // current culture's casing rules.
                Assert.IsTrue(System.String.Equals(token.Term(), "The", System.StringComparison.OrdinalIgnoreCase), token.Term() + " is not equal to " + "The");
                i++;
            }
            Assert.IsTrue(i == sink1.GetTokens().Count, i + " does not equal: " + sink1.GetTokens().Count);
        }
Пример #10
0
        /// <summary>
        /// Copies a token via TestSimpleAttributeImpls.AssertCopyIsEqual in three states -
        /// empty, holding the term "hello", and carrying a payload - and checks that the copy
        /// has equal content but does not share the term buffer or payload instance.
        /// </summary>
        public virtual void  TestCopyTo()
        {
            // Stage 1: an empty token and its copy both report an empty term.
            Token original  = new Token();
            Token duplicate = (Token)TestSimpleAttributeImpls.AssertCopyIsEqual(original);
            Assert.AreEqual("", original.Term());
            Assert.AreEqual("", duplicate.Term());

            // Stage 2: same term text, but the copy owns a distinct buffer.
            original = new Token(0, 5);
            char[] helloChars = "hello".ToCharArray();
            original.SetTermBuffer(helloChars, 0, 5);
            char[] originalBuffer = original.TermBuffer();
            duplicate = (Token)TestSimpleAttributeImpls.AssertCopyIsEqual(original);
            Assert.AreEqual(original.Term(), duplicate.Term());
            Assert.AreNotSame(originalBuffer, duplicate.TermBuffer());

            // Stage 3: the payload is equal on the copy but is a different instance.
            byte[]  payloadBytes = new byte[] { 1, 2, 3, 4 };
            Payload payload      = new Payload(payloadBytes);
            original.SetPayload(payload);
            duplicate = (Token)TestSimpleAttributeImpls.AssertCopyIsEqual(original);
            Assert.AreEqual(payload, duplicate.GetPayload());
            Assert.AreNotSame(payload, duplicate.GetPayload());
        }
        /// <summary>
        /// Resets <paramref name="stream"/> and drains it with the reusable-token Next(Token)
        /// call, checking each term against <c>results</c> in order. A token whose payload is
        /// the single byte PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION must have the
        /// term "tokenstream"; every other token must not.
        /// </summary>
        /// <param name="stream">The token stream to consume.</param>
        private static void  ConsumeStreamOldAPI(TokenStream stream)
        {
            stream.Reset();

            int position = 0;
            for (Token current = stream.Next(new Token()); current != null; current = stream.Next(current))
            {
                System.String text    = current.Term();
                Payload       payload = current.GetPayload();

                bool isTokenStreamTerm = "tokenstream".Equals(text);
                bool markedProperNoun  = payload != null
                                         && payload.GetData().Length == 1
                                         && payload.GetData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION;

                if (markedProperNoun)
                {
                    Assert.IsTrue(isTokenStreamTerm, "only TokenStream is a proper noun");
                }
                else
                {
                    Assert.IsFalse(isTokenStreamTerm, "all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)");
                }

                Assert.AreEqual(results[position], text);
                position++;
            }
        }
Пример #12
0
		/// <summary>
		/// Grows a token's term in two ways - doubling it 20 times (through each SetTermBuffer
		/// overload) and extending it one character at a time for 20000 rounds - checking the
		/// term length and text on every round and the final length and buffer size afterwards.
		/// </summary>
		public virtual void  TestGrow()
		{
			// Doubling growth, term supplied as a char[] slice.
			Token token = new Token();
			System.Text.StringBuilder text = new System.Text.StringBuilder("ab");
			for (int round = 0; round < 20; round++)
			{
				System.String snapshot = text.ToString();
				char[] chars = snapshot.ToCharArray();
				token.SetTermBuffer(chars, 0, chars.Length);
				Assert.AreEqual(text.Length, token.TermLength());
				Assert.AreEqual(snapshot, token.Term());
				text.Append(snapshot);
			}
			Assert.AreEqual(1048576, token.TermLength());
			Assert.AreEqual(1179654, token.TermBuffer().Length);
			
			// Doubling growth, term supplied as a string with offset and length.
			token = new Token();
			text = new System.Text.StringBuilder("ab");
			for (int round = 0; round < 20; round++)
			{
				System.String snapshot = text.ToString();
				token.SetTermBuffer(snapshot, 0, snapshot.Length);
				Assert.AreEqual(snapshot.Length, token.TermLength());
				Assert.AreEqual(snapshot, token.Term());
				text.Append(snapshot);
			}
			Assert.AreEqual(1048576, token.TermLength());
			Assert.AreEqual(1179654, token.TermBuffer().Length);
			
			// Doubling growth, term supplied as a whole string.
			token = new Token();
			text = new System.Text.StringBuilder("ab");
			for (int round = 0; round < 20; round++)
			{
				System.String snapshot = text.ToString();
				token.SetTermBuffer(snapshot);
				Assert.AreEqual(snapshot.Length, token.TermLength());
				Assert.AreEqual(snapshot, token.Term());
				text.Append(snapshot);
			}
			Assert.AreEqual(1048576, token.TermLength());
			Assert.AreEqual(1179654, token.TermBuffer().Length);
			
			// Slow growth: one extra character per round.
			token = new Token();
			text = new System.Text.StringBuilder("a");
			for (int round = 0; round < 20000; round++)
			{
				System.String snapshot = text.ToString();
				token.SetTermBuffer(snapshot);
				Assert.AreEqual(snapshot.Length, token.TermLength());
				Assert.AreEqual(snapshot, token.Term());
				text.Append("a");
			}
			Assert.AreEqual(20000, token.TermLength());
			Assert.AreEqual(20167, token.TermBuffer().Length);
			
			// Slow growth again, on a fresh token (same steps as the previous section).
			token = new Token();
			text = new System.Text.StringBuilder("a");
			for (int round = 0; round < 20000; round++)
			{
				System.String snapshot = text.ToString();
				token.SetTermBuffer(snapshot);
				Assert.AreEqual(snapshot.Length, token.TermLength());
				Assert.AreEqual(snapshot, token.Term());
				text.Append("a");
			}
			Assert.AreEqual(20000, token.TermLength());
			Assert.AreEqual(20167, token.TermBuffer().Length);
		}
Пример #13
0
		/// <summary>
		/// Resets <paramref name="stream"/> and drains it with the reusable-token Next(Token)
		/// call, checking each term against <c>results</c> in order. A token whose payload is
		/// the single byte PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION must have the
		/// term "tokenstream"; every other token must not.
		/// </summary>
		/// <param name="stream">The token stream to consume.</param>
		private static void  ConsumeStreamOldAPI(TokenStream stream)
		{
			stream.Reset();
			
			int position = 0;
			for (Token current = stream.Next(new Token()); current != null; current = stream.Next(current))
			{
				System.String text = current.Term();
				Payload payload = current.GetPayload();
				
				bool isTokenStreamTerm = "tokenstream".Equals(text);
				bool markedProperNoun = payload != null
					&& payload.GetData().Length == 1
					&& payload.GetData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION;
				
				if (markedProperNoun)
				{
					Assert.IsTrue(isTokenStreamTerm, "only TokenStream is a proper noun");
				}
				else
				{
					Assert.IsFalse(isTokenStreamTerm, "all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)");
				}
				
				Assert.AreEqual(results[position], text);
				position++;
			}
		}
Пример #14
0
        // TermAttribute:

        /// <summary>
        /// Returns the term text by forwarding the call to <c>delegate_Renamed</c>.
        /// </summary>
        /// <returns>Whatever <c>delegate_Renamed.Term()</c> returns.</returns>
        public System.String Term()
        {
            return delegate_Renamed.Term();
        }
Пример #15
0
        /// <summary>
        /// Not an explicit test, just useful to print out some info on performance.
        /// For each token count it first checks that a TeeTokenFilter feeding a
        /// ModuloSinkTokenizer collects the same terms as a standalone ModuloTokenFilter.
        /// It then, for each modulo count, times 20 rounds of analyzing the text as two
        /// separate streams against 20 rounds of analyzing it once through a tee and
        /// replaying the sink, writes both timings to the console, and asserts that the two
        /// approaches accumulate the same total position increment.
        /// </summary>
        public virtual void  Performance()
        {
            int[] tokCount  = new int[] { 100, 500, 1000, 2000, 5000, 10000 };
            int[] modCounts = new int[] { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
            for (int k = 0; k < tokCount.Length; k++)
            {
                System.Text.StringBuilder buffer = new System.Text.StringBuilder();
                System.Console.Out.WriteLine("-----Tokens: " + tokCount[k] + "-----");
                for (int i = 0; i < tokCount[k]; i++)
                {
                    buffer.Append(English.IntToEnglish(i).ToUpper()).Append(' ');
                }
                //make sure we produce the same tokens
                ModuloSinkTokenizer sink  = new ModuloSinkTokenizer(this, tokCount[k], 100);
                Token       reusableToken = new Token();
                TokenStream stream        = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), sink);
                // Drain the tee so the sink gets populated; the tokens themselves are not needed.
                while (stream.Next(reusableToken) != null)
                {
                }
                stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), 100);
                System.Collections.IList tmp = new System.Collections.ArrayList();
                for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                {
                    // Clone because the same reusable token instance is passed to every Next call.
                    tmp.Add(nextToken.Clone());
                }
                System.Collections.IList sinkList = sink.GetTokens();
                Assert.IsTrue(tmp.Count == sinkList.Count, "tmp Size: " + tmp.Count + " is not: " + sinkList.Count);
                for (int i = 0; i < tmp.Count; i++)
                {
                    Token tfTok   = (Token)tmp[i];
                    Token sinkTok = (Token)sinkList[i];
                    Assert.IsTrue(tfTok.Term().Equals(sinkTok.Term()) == true, tfTok.Term() + " is not equal to " + sinkTok.Term() + " at token: " + i);
                }
                //simulate two fields, each being analyzed once, for 20 documents

                for (int j = 0; j < modCounts.Length; j++)
                {
                    int tfPos = 0;
                    // Stopwatch measures elapsed time independently of wall-clock adjustments,
                    // unlike differences of DateTime.Now.
                    System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
                    for (int i = 0; i < 20; i++)
                    {
                        stream = new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString())));
                        for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                        {
                            tfPos += nextToken.GetPositionIncrement();
                        }
                        stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), modCounts[j]);
                        for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                        {
                            tfPos += nextToken.GetPositionIncrement();
                        }
                    }
                    timer.Stop();
                    System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + timer.ElapsedMilliseconds + " ms");
                    int sinkPos = 0;
                    //simulate one field with one sink
                    timer = System.Diagnostics.Stopwatch.StartNew();
                    for (int i = 0; i < 20; i++)
                    {
                        sink   = new ModuloSinkTokenizer(this, tokCount[k], modCounts[j]);
                        stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), sink);
                        for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                        {
                            sinkPos += nextToken.GetPositionIncrement();
                        }
                        // Replay the tokens the sink captured during the pass above.
                        stream = sink;
                        for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                        {
                            sinkPos += nextToken.GetPositionIncrement();
                        }
                    }
                    timer.Stop();
                    System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + timer.ElapsedMilliseconds + " ms");
                    Assert.IsTrue(sinkPos == tfPos, sinkPos + " does not equal: " + tfPos);
                }
                System.Console.Out.WriteLine("- End Tokens: " + tokCount[k] + "-----");
            }
        }
Пример #16
0
        /// <summary>
        /// Grows a token's term in two ways - doubling it 20 times (through each SetTermBuffer
        /// overload) and extending it one character at a time for 20000 rounds - checking the
        /// term length and text on every round and the final length and buffer size afterwards.
        /// </summary>
        public virtual void  TestGrow()
        {
            // Doubling growth, term supplied as a char[] slice.
            Token token = new Token();
            System.Text.StringBuilder text = new System.Text.StringBuilder("ab");
            for (int round = 0; round < 20; round++)
            {
                System.String snapshot = text.ToString();
                char[] chars = snapshot.ToCharArray();
                token.SetTermBuffer(chars, 0, chars.Length);
                Assert.AreEqual(text.Length, token.TermLength());
                Assert.AreEqual(snapshot, token.Term());
                text.Append(snapshot);
            }
            Assert.AreEqual(1048576, token.TermLength());
            Assert.AreEqual(1179654, token.TermBuffer().Length);

            // Doubling growth, term supplied as a string with offset and length.
            token = new Token();
            text  = new System.Text.StringBuilder("ab");
            for (int round = 0; round < 20; round++)
            {
                System.String snapshot = text.ToString();
                token.SetTermBuffer(snapshot, 0, snapshot.Length);
                Assert.AreEqual(snapshot.Length, token.TermLength());
                Assert.AreEqual(snapshot, token.Term());
                text.Append(snapshot);
            }
            Assert.AreEqual(1048576, token.TermLength());
            Assert.AreEqual(1179654, token.TermBuffer().Length);

            // Doubling growth, term supplied as a whole string.
            token = new Token();
            text  = new System.Text.StringBuilder("ab");
            for (int round = 0; round < 20; round++)
            {
                System.String snapshot = text.ToString();
                token.SetTermBuffer(snapshot);
                Assert.AreEqual(snapshot.Length, token.TermLength());
                Assert.AreEqual(snapshot, token.Term());
                text.Append(snapshot);
            }
            Assert.AreEqual(1048576, token.TermLength());
            Assert.AreEqual(1179654, token.TermBuffer().Length);

            // Slow growth: one extra character per round.
            token = new Token();
            text  = new System.Text.StringBuilder("a");
            for (int round = 0; round < 20000; round++)
            {
                System.String snapshot = text.ToString();
                token.SetTermBuffer(snapshot);
                Assert.AreEqual(snapshot.Length, token.TermLength());
                Assert.AreEqual(snapshot, token.Term());
                text.Append("a");
            }
            Assert.AreEqual(20000, token.TermLength());
            Assert.AreEqual(20167, token.TermBuffer().Length);

            // Slow growth again, on a fresh token (same steps as the previous section).
            token = new Token();
            text  = new System.Text.StringBuilder("a");
            for (int round = 0; round < 20000; round++)
            {
                System.String snapshot = text.ToString();
                token.SetTermBuffer(snapshot);
                Assert.AreEqual(snapshot.Length, token.TermLength());
                Assert.AreEqual(snapshot, token.Term());
                text.Append("a");
            }
            Assert.AreEqual(20000, token.TermLength());
            Assert.AreEqual(20167, token.TermBuffer().Length);
        }
Пример #17
0
		/// <summary>
		/// Mixes the string-based accessors (TermText / SetTermText) with the char[]-based
		/// ones (SetTermBuffer / TermBuffer) on one token and checks they stay in agreement.
		/// </summary>
		public virtual void  TestMixedStringArray()
		{
			Token t = new Token("hello", 0, 5);
			// Assert.AreEqual takes (expected, actual); keeping that order makes a failure
			// message report the right value as "expected".
			Assert.AreEqual("hello", t.TermText());
			Assert.AreEqual(5, t.TermLength());
			Assert.AreEqual("hello", t.Term());
			t.SetTermText("hello2");
			Assert.AreEqual(6, t.TermLength());
			Assert.AreEqual("hello2", t.Term());
			t.SetTermBuffer("hello3".ToCharArray(), 0, 6);
			Assert.AreEqual("hello3", t.TermText());
			
			// Make sure if we get the buffer and change a character
			// that TermText() reflects the change
			char[] buffer = t.TermBuffer();
			buffer[1] = 'o';
			Assert.AreEqual("hollo3", t.TermText());
		}
Пример #18
0
		/// <summary>
		/// Copies a token via TestSimpleAttributeImpls.AssertCopyIsEqual in three states -
		/// empty, holding the term "hello", and carrying a payload - and checks that the copy
		/// has equal content but does not share the term buffer or payload instance.
		/// </summary>
		public virtual void  TestCopyTo()
		{
			// Stage 1: an empty token and its copy both report an empty term.
			Token original = new Token();
			Token duplicate = (Token) TestSimpleAttributeImpls.AssertCopyIsEqual(original);
			Assert.AreEqual("", original.Term());
			Assert.AreEqual("", duplicate.Term());
			
			// Stage 2: same term text, but the copy owns a distinct buffer.
			original = new Token(0, 5);
			char[] helloChars = "hello".ToCharArray();
			original.SetTermBuffer(helloChars, 0, 5);
			char[] originalBuffer = original.TermBuffer();
			duplicate = (Token) TestSimpleAttributeImpls.AssertCopyIsEqual(original);
			Assert.AreEqual(original.Term(), duplicate.Term());
			Assert.AreNotSame(originalBuffer, duplicate.TermBuffer());
			
			// Stage 3: the payload is equal on the copy but is a different instance.
			byte[] payloadBytes = new byte[]{1, 2, 3, 4};
			Payload payload = new Payload(payloadBytes);
			original.SetPayload(payload);
			duplicate = (Token) TestSimpleAttributeImpls.AssertCopyIsEqual(original);
			Assert.AreEqual(payload, duplicate.GetPayload());
			Assert.AreNotSame(payload, duplicate.GetPayload());
		}