public virtual void TestMultipleSources()
{
    SinkTokenizer theDetector = new AnonymousClassSinkTokenizer1(this, null);
    SinkTokenizer dogDetector = new AnonymousClassSinkTokenizer2(this, null);
    // source1 is wrapped in a CachingTokenFilter so it can be reset and replayed below.
    TokenStream source1 = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString())), theDetector), dogDetector));
    TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer2.ToString())), theDetector), dogDetector);

    int i = 0;
    Token reusableToken = new Token();
    for (Token nextToken = source1.Next(reusableToken); nextToken != null; nextToken = source1.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().Equals(tokens1[i]), nextToken.Term() + " is not equal to " + tokens1[i]);
        i++;
    }
    Assert.IsTrue(i == tokens1.Length, i + " does not equal: " + tokens1.Length);
    Assert.IsTrue(theDetector.GetTokens().Count == 2, "theDetector Size: " + theDetector.GetTokens().Count + " is not: " + 2);
    Assert.IsTrue(dogDetector.GetTokens().Count == 1, "dogDetector Size: " + dogDetector.GetTokens().Count + " is not: " + 1);

    i = 0;
    for (Token nextToken = source2.Next(reusableToken); nextToken != null; nextToken = source2.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().Equals(tokens2[i]), nextToken.Term() + " is not equal to " + tokens2[i]);
        i++;
    }
    Assert.IsTrue(i == tokens2.Length, i + " does not equal: " + tokens2.Length);
    // Both sinks have now collected tokens teed from both sources.
    Assert.IsTrue(theDetector.GetTokens().Count == 4, "theDetector Size: " + theDetector.GetTokens().Count + " is not: " + 4);
    Assert.IsTrue(dogDetector.GetTokens().Count == 2, "dogDetector Size: " + dogDetector.GetTokens().Count + " is not: " + 2);

    i = 0;
    for (Token nextToken = theDetector.Next(reusableToken); nextToken != null; nextToken = theDetector.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().ToUpper().Equals("The".ToUpper()), nextToken.Term() + " is not equal to " + "The");
        i++;
    }
    Assert.IsTrue(i == theDetector.GetTokens().Count, i + " does not equal: " + theDetector.GetTokens().Count);

    i = 0;
    for (Token nextToken = dogDetector.Next(reusableToken); nextToken != null; nextToken = dogDetector.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().ToUpper().Equals("Dogs".ToUpper()), nextToken.Term() + " is not equal to " + "Dogs");
        i++;
    }
    Assert.IsTrue(i == dogDetector.GetTokens().Count, i + " does not equal: " + dogDetector.GetTokens().Count);

    // Replay the cached source1 through a LowerCaseFilter.
    source1.Reset();
    TokenStream lowerCasing = new LowerCaseFilter(source1);
    i = 0;
    for (Token nextToken = lowerCasing.Next(reusableToken); nextToken != null; nextToken = lowerCasing.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().Equals(tokens1[i].ToLower()), nextToken.Term() + " is not equal to " + tokens1[i].ToLower());
        i++;
    }
    Assert.IsTrue(i == tokens1.Length, i + " does not equal: " + tokens1.Length);
}
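// A minimal sketch (not part of the original suite) of the Tee/Sink pattern the test above
// exercises: a TeeTokenFilter copies every token it forwards into its SinkTokenizer, so a
// single analysis pass can feed several consumers. The helper name SketchTeeSinkUsage and
// the use of SinkTokenizer's no-arg constructor are assumptions for illustration.
private static void SketchTeeSinkUsage()
{
    SinkTokenizer sink = new SinkTokenizer(); // assumed no-arg ctor; buffers teed tokens
    TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader("The quick brown fox")), sink);
    Token reusableToken = new Token();
    while (source.Next(reusableToken) != null)
    {
        // Consuming source also fills sink with a copy of each token it produced.
    }
    // sink can now be consumed as a TokenStream of its own, or inspected via GetTokens().
}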
public override void Add(Token t)
{
    // Only pass tokens whose term is "the" (case-insensitive) through to the sink.
    if (t != null && t.Term().ToUpper().Equals("The".ToUpper()))
    {
        base.Add(t);
    }
}
public virtual void TestCtor()
{
    Token t = new Token();
    char[] content = "hello".ToCharArray();
    t.SetTermBuffer(content, 0, content.Length);
    char[] buf = t.TermBuffer();
    // SetTermBuffer copies the input, so the token's buffer is a different array.
    Assert.AreNotEqual(t.TermBuffer(), content);
    Assert.AreEqual("hello", t.Term());
    Assert.AreEqual("word", t.Type());
    Assert.AreEqual(0, t.GetFlags());

    t = new Token(6, 22);
    t.SetTermBuffer(content, 0, content.Length);
    Assert.AreEqual("hello", t.Term());
    Assert.AreEqual("(hello,6,22)", t.ToString());
    Assert.AreEqual("word", t.Type());
    Assert.AreEqual(0, t.GetFlags());

    t = new Token(6, 22, 7);
    t.SetTermBuffer(content, 0, content.Length);
    Assert.AreEqual("hello", t.Term());
    Assert.AreEqual("(hello,6,22)", t.ToString());
    Assert.AreEqual(7, t.GetFlags());

    t = new Token(6, 22, "junk");
    t.SetTermBuffer(content, 0, content.Length);
    Assert.AreEqual("hello", t.Term());
    Assert.AreEqual("(hello,6,22,type=junk)", t.ToString());
    Assert.AreEqual(0, t.GetFlags());
}
public virtual void TestMixedStringArray()
{
    Token t = new Token("hello", 0, 5);
    Assert.AreEqual(t.TermText(), "hello");
    Assert.AreEqual(t.TermLength(), 5);
    Assert.AreEqual(t.Term(), "hello");
    t.SetTermText("hello2");
    Assert.AreEqual(t.TermLength(), 6);
    Assert.AreEqual(t.Term(), "hello2");
    t.SetTermBuffer("hello3".ToCharArray(), 0, 6);
    Assert.AreEqual(t.TermText(), "hello3");

    // Make sure that if we get the buffer and change a character,
    // TermText() reflects the change.
    char[] buffer = t.TermBuffer();
    buffer[1] = 'o';
    Assert.AreEqual(t.TermText(), "hollo3");
}
public virtual void TestResize()
{
    Token t = new Token();
    char[] content = "hello".ToCharArray();
    t.SetTermBuffer(content, 0, content.Length);
    for (int i = 0; i < 2000; i++)
    {
        t.ResizeTermBuffer(i);
        // Resizing may over-allocate but never yields less than the requested size,
        // and it must preserve the existing term.
        Assert.IsTrue(i <= t.TermBuffer().Length);
        Assert.AreEqual("hello", t.Term());
    }
}
public virtual void Test()
{
    SinkTokenizer sink1 = new AnonymousClassSinkTokenizer(this, null);
    TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString())), sink1);
    int i = 0;
    Token reusableToken = new Token();
    for (Token nextToken = source.Next(reusableToken); nextToken != null; nextToken = source.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().Equals(tokens1[i]), nextToken.Term() + " is not equal to " + tokens1[i]);
        i++;
    }
    Assert.IsTrue(i == tokens1.Length, i + " does not equal: " + tokens1.Length);
    Assert.IsTrue(sink1.GetTokens().Count == 2, "sink1 Size: " + sink1.GetTokens().Count + " is not: " + 2);

    i = 0;
    for (Token token = sink1.Next(reusableToken); token != null; token = sink1.Next(reusableToken))
    {
        Assert.IsTrue(token.Term().ToUpper().Equals("The".ToUpper()), token.Term() + " is not equal to " + "The");
        i++;
    }
    Assert.IsTrue(i == sink1.GetTokens().Count, i + " does not equal: " + sink1.GetTokens().Count);
}
public virtual void TestCopyTo()
{
    Token t = new Token();
    Token copy = (Token) TestSimpleAttributeImpls.AssertCopyIsEqual(t);
    Assert.AreEqual("", t.Term());
    Assert.AreEqual("", copy.Term());

    t = new Token(0, 5);
    char[] content = "hello".ToCharArray();
    t.SetTermBuffer(content, 0, 5);
    char[] buf = t.TermBuffer();
    copy = (Token) TestSimpleAttributeImpls.AssertCopyIsEqual(t);
    Assert.AreEqual(t.Term(), copy.Term());
    // The copy must own its own term buffer, not share the original's.
    Assert.AreNotSame(buf, copy.TermBuffer());

    Payload pl = new Payload(new byte[] { 1, 2, 3, 4 });
    t.SetPayload(pl);
    copy = (Token) TestSimpleAttributeImpls.AssertCopyIsEqual(t);
    // Payloads are cloned on copy: equal in content, distinct in identity.
    Assert.AreEqual(pl, copy.GetPayload());
    Assert.AreNotSame(pl, copy.GetPayload());
}
private static void ConsumeStreamOldAPI(TokenStream stream)
{
    stream.Reset();
    Token reusableToken = new Token();
    int i = 0;
    while ((reusableToken = stream.Next(reusableToken)) != null)
    {
        System.String term = reusableToken.Term();
        Payload p = reusableToken.GetPayload();
        if (p != null && p.GetData().Length == 1 && p.GetData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION)
        {
            Assert.IsTrue("tokenstream".Equals(term), "only TokenStream is a proper noun");
        }
        else
        {
            Assert.IsFalse("tokenstream".Equals(term), "all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)");
        }
        Assert.AreEqual(results[i], term);
        i++;
    }
}
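// A hedged sketch of how a token would acquire the payload that ConsumeStreamOldAPI checks:
// an annotating filter sets a one-byte payload on recognized tokens. The helper name below is
// an assumption; Token.SetPayload and the Payload(byte[]) constructor are the same APIs used
// in TestCopyTo, and PROPER_NOUN_ANNOTATION is assumed to be a byte constant, as its
// comparison against GetData()[0] above suggests.
private static Token SketchAnnotateAsProperNoun(Token token)
{
    // Attach the proper-noun marker so downstream consumers can detect it via GetPayload().
    token.SetPayload(new Payload(new byte[] { PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION }));
    return token;
}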
public virtual void TestGrow()
{
    Token t = new Token();
    System.Text.StringBuilder buf = new System.Text.StringBuilder("ab");
    // buf doubles each pass, so the last SetTermBuffer call (i = 19) sees 2 * 2^19 = 1,048,576 chars.
    for (int i = 0; i < 20; i++)
    {
        char[] content = buf.ToString().ToCharArray();
        t.SetTermBuffer(content, 0, content.Length);
        Assert.AreEqual(buf.Length, t.TermLength());
        Assert.AreEqual(buf.ToString(), t.Term());
        buf.Append(buf.ToString());
    }
    Assert.AreEqual(1048576, t.TermLength());
    // The backing buffer over-allocates as it grows: 1048576 + (1048576 >> 3) + 6 = 1179654.
    Assert.AreEqual(1179654, t.TermBuffer().Length);

    // Now as a string, first variant.
    t = new Token();
    buf = new System.Text.StringBuilder("ab");
    for (int i = 0; i < 20; i++)
    {
        System.String content = buf.ToString();
        t.SetTermBuffer(content, 0, content.Length);
        Assert.AreEqual(content.Length, t.TermLength());
        Assert.AreEqual(content, t.Term());
        buf.Append(content);
    }
    Assert.AreEqual(1048576, t.TermLength());
    Assert.AreEqual(1179654, t.TermBuffer().Length);

    // Now as a string, second variant.
    t = new Token();
    buf = new System.Text.StringBuilder("ab");
    for (int i = 0; i < 20; i++)
    {
        System.String content = buf.ToString();
        t.SetTermBuffer(content);
        Assert.AreEqual(content.Length, t.TermLength());
        Assert.AreEqual(content, t.Term());
        buf.Append(content);
    }
    Assert.AreEqual(1048576, t.TermLength());
    Assert.AreEqual(1179654, t.TermBuffer().Length);

    // Test for slow growth to a long term.
    t = new Token();
    buf = new System.Text.StringBuilder("a");
    for (int i = 0; i < 20000; i++)
    {
        System.String content = buf.ToString();
        t.SetTermBuffer(content);
        Assert.AreEqual(content.Length, t.TermLength());
        Assert.AreEqual(content, t.Term());
        buf.Append("a");
    }
    Assert.AreEqual(20000, t.TermLength());
    Assert.AreEqual(20167, t.TermBuffer().Length);

    // Test for slow growth to a long term.
    t = new Token();
    buf = new System.Text.StringBuilder("a");
    for (int i = 0; i < 20000; i++)
    {
        System.String content = buf.ToString();
        t.SetTermBuffer(content);
        Assert.AreEqual(content.Length, t.TermLength());
        Assert.AreEqual(content, t.Term());
        buf.Append("a");
    }
    Assert.AreEqual(20000, t.TermLength());
    Assert.AreEqual(20167, t.TermBuffer().Length);
}
// TermAttribute:
public System.String Term()
{
    return delegate_Renamed.Term();
}
/// <summary> Not an explicit test, just useful for printing out some info on performance.</summary>
/// <throws> Exception </throws>
public virtual void Performance()
{
    int[] tokCount = new int[] { 100, 500, 1000, 2000, 5000, 10000 };
    int[] modCounts = new int[] { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
    for (int k = 0; k < tokCount.Length; k++)
    {
        System.Text.StringBuilder buffer = new System.Text.StringBuilder();
        System.Console.Out.WriteLine("-----Tokens: " + tokCount[k] + "-----");
        for (int i = 0; i < tokCount[k]; i++)
        {
            buffer.Append(English.IntToEnglish(i).ToUpper()).Append(' ');
        }

        // Make sure the tee/sink chain produces the same tokens as the plain modulo filter.
        ModuloSinkTokenizer sink = new ModuloSinkTokenizer(this, tokCount[k], 100);
        Token reusableToken = new Token();
        TokenStream stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), sink);
        while (stream.Next(reusableToken) != null)
        {
        }
        stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), 100);
        System.Collections.IList tmp = new System.Collections.ArrayList();
        for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
        {
            tmp.Add(nextToken.Clone());
        }
        System.Collections.IList sinkList = sink.GetTokens();
        Assert.IsTrue(tmp.Count == sinkList.Count, "tmp Size: " + tmp.Count + " is not: " + sinkList.Count);
        for (int i = 0; i < tmp.Count; i++)
        {
            Token tfTok = (Token) tmp[i];
            Token sinkTok = (Token) sinkList[i];
            Assert.IsTrue(tfTok.Term().Equals(sinkTok.Term()), tfTok.Term() + " is not equal to " + sinkTok.Term() + " at token: " + i);
        }

        // Simulate two fields, each being analyzed once, for 20 documents.
        for (int j = 0; j < modCounts.Length; j++)
        {
            int tfPos = 0;
            long start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
            for (int i = 0; i < 20; i++)
            {
                stream = new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString())));
                for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                {
                    tfPos += nextToken.GetPositionIncrement();
                }
                stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), modCounts[j]);
                for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                {
                    tfPos += nextToken.GetPositionIncrement();
                }
            }
            long finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
            System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");

            // Simulate one field with one sink.
            int sinkPos = 0;
            start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
            for (int i = 0; i < 20; i++)
            {
                sink = new ModuloSinkTokenizer(this, tokCount[k], modCounts[j]);
                stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), sink);
                for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                {
                    sinkPos += nextToken.GetPositionIncrement();
                }
                stream = sink;
                for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                {
                    sinkPos += nextToken.GetPositionIncrement();
                }
            }
            finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
            System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
            Assert.IsTrue(sinkPos == tfPos, sinkPos + " does not equal: " + tfPos);
        }
        System.Console.Out.WriteLine("- End Tokens: " + tokCount[k] + "-----");
    }
}