private void TestTeeSinkCustomToken(int api)
{
    // Source chain: tokenize, tag, lower-case, and stop-filter the document.
    TokenStream stream = new WhitespaceTokenizer(new System.IO.StringReader(doc));
    stream = new PartOfSpeechTaggingFilter(stream);
    stream = new LowerCaseFilter(stream);
    stream = new StopFilter(stream, stopwords);

    // The tee copies every token into the sink; stream1 re-annotates the
    // sink's copy independently of the primary chain.
    SinkTokenizer sink = new SinkTokenizer();
    TokenStream stream1 = new PartOfSpeechAnnotatingFilter(sink);
    stream = new TeeTokenFilter(stream, sink);
    stream = new PartOfSpeechAnnotatingFilter(stream);

    // Exercise the same pipeline through each generation of the consumer API.
    switch (api)
    {
        case 0:
            ConsumeStreamNewAPI(stream);
            ConsumeStreamNewAPI(stream1);
            break;
        case 1:
            ConsumeStreamOldAPI(stream);
            ConsumeStreamOldAPI(stream1);
            break;
        case 2:
            ConsumeStreamVeryOldAPI(stream);
            ConsumeStreamVeryOldAPI(stream1);
            break;
    }
}
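// The ConsumeStream* helpers above are defined elsewhere in this fixture and
// are not reproduced here. As a hedged illustration only (the method name
// DrainStream is hypothetical, not the fixture's helper), a consumer built on
// the reusable-token Next(Token) API used throughout this file could look like:
private static void DrainStream(TokenStream stream)
{
    Token reusableToken = new Token();
    // Next(Token) may reuse the supplied Token and returns null once the
    // stream is exhausted.
    for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
    {
        // A real consumer would inspect nextToken.Term(), offsets, etc.
    }
}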
public virtual void TestMultipleSources()
{
    // Both detectors are shared by two sources, so their token counts grow
    // as each source is consumed.
    SinkTokenizer theDetector = new AnonymousClassSinkTokenizer1(this, null);
    SinkTokenizer dogDetector = new AnonymousClassSinkTokenizer2(this, null);
    // source1 is cached so it can be reset and replayed further below.
    TokenStream source1 = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString())), theDetector), dogDetector));
    TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer2.ToString())), theDetector), dogDetector);

    int i = 0;
    Token reusableToken = new Token();
    for (Token nextToken = source1.Next(reusableToken); nextToken != null; nextToken = source1.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().Equals(tokens1[i]), nextToken.Term() + " is not equal to " + tokens1[i]);
        i++;
    }
    Assert.IsTrue(i == tokens1.Length, i + " does not equal: " + tokens1.Length);
    Assert.IsTrue(theDetector.GetTokens().Count == 2, "theDetector Size: " + theDetector.GetTokens().Count + " is not: " + 2);
    Assert.IsTrue(dogDetector.GetTokens().Count == 1, "dogDetector Size: " + dogDetector.GetTokens().Count + " is not: " + 1);

    i = 0;
    for (Token nextToken = source2.Next(reusableToken); nextToken != null; nextToken = source2.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().Equals(tokens2[i]), nextToken.Term() + " is not equal to " + tokens2[i]);
        i++;
    }
    Assert.IsTrue(i == tokens2.Length, i + " does not equal: " + tokens2.Length);
    Assert.IsTrue(theDetector.GetTokens().Count == 4, "theDetector Size: " + theDetector.GetTokens().Count + " is not: " + 4);
    Assert.IsTrue(dogDetector.GetTokens().Count == 2, "dogDetector Size: " + dogDetector.GetTokens().Count + " is not: " + 2);

    i = 0;
    for (Token nextToken = theDetector.Next(reusableToken); nextToken != null; nextToken = theDetector.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().ToUpper().Equals("The".ToUpper()), nextToken.Term() + " is not equal to " + "The");
        i++;
    }
    Assert.IsTrue(i == theDetector.GetTokens().Count, i + " does not equal: " + theDetector.GetTokens().Count);

    i = 0;
    for (Token nextToken = dogDetector.Next(reusableToken); nextToken != null; nextToken = dogDetector.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().ToUpper().Equals("Dogs".ToUpper()), nextToken.Term() + " is not equal to " + "Dogs");
        i++;
    }
    Assert.IsTrue(i == dogDetector.GetTokens().Count, i + " does not equal: " + dogDetector.GetTokens().Count);

    // Replay the cached stream through an additional filter.
    source1.Reset();
    TokenStream lowerCasing = new LowerCaseFilter(source1);
    i = 0;
    for (Token nextToken = lowerCasing.Next(reusableToken); nextToken != null; nextToken = lowerCasing.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().Equals(tokens1[i].ToLower()), nextToken.Term() + " is not equal to " + tokens1[i].ToLower());
        i++;
    }
    Assert.IsTrue(i == tokens1.Length, i + " does not equal: " + tokens1.Length);
}
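// Hedged usage sketch (the method name CountTeedTokens is hypothetical): the
// wiring exercised above lets a single tokenization pass feed both a primary
// consumer and one or more sinks. Note that a sink is only populated while the
// tee'd source is drained, which is why the assertions above run only after
// each source has been fully consumed.
private static int CountTeedTokens(System.String text)
{
    SinkTokenizer sink = new SinkTokenizer();
    TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(text)), sink);
    Token reusableToken = new Token();
    while (source.Next(reusableToken) != null)
    {
        // Draining the primary stream also copies each token into the sink.
    }
    return sink.GetTokens().Count;
}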
public virtual void Test()
{
    SinkTokenizer sink1 = new AnonymousClassSinkTokenizer(this, null);
    TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString())), sink1);
    int i = 0;
    Token reusableToken = new Token();
    for (Token nextToken = source.Next(reusableToken); nextToken != null; nextToken = source.Next(reusableToken))
    {
        Assert.IsTrue(nextToken.Term().Equals(tokens1[i]), nextToken.Term() + " is not equal to " + tokens1[i]);
        i++;
    }
    Assert.IsTrue(i == tokens1.Length, i + " does not equal: " + tokens1.Length);
    Assert.IsTrue(sink1.GetTokens().Count == 2, "sink1 Size: " + sink1.GetTokens().Count + " is not: " + 2);
    i = 0;
    for (Token token = sink1.Next(reusableToken); token != null; token = sink1.Next(reusableToken))
    {
        Assert.IsTrue(token.Term().ToUpper().Equals("The".ToUpper()), token.Term() + " is not equal to " + "The");
        i++;
    }
    Assert.IsTrue(i == sink1.GetTokens().Count, i + " does not equal: " + sink1.GetTokens().Count);
}
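// The AnonymousClassSinkTokenizer* types are ports of Java anonymous classes
// defined elsewhere in this file. A hedged sketch of what sink1's filtering
// plausibly looks like, assuming SinkTokenizer exposes an overridable
// Add(Token) hook (the class name TheDetectorSink is hypothetical): it keeps
// only tokens whose term is "The", matching the case-insensitive assertions above.
class TheDetectorSink : SinkTokenizer
{
    public override void Add(Token t)
    {
        // Capture only the tokens the assertions above expect ("The"/"the").
        if (t != null && t.Term().ToUpper().Equals("The".ToUpper()))
        {
            base.Add(t);
        }
    }
}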
/// <summary> Not an explicit test, just useful to print out some info on performance
/// </summary>
/// <throws> Exception </throws>
public virtual void Performance()
{
    int[] tokCount = new int[]{100, 500, 1000, 2000, 5000, 10000};
    int[] modCounts = new int[]{1, 2, 5, 10, 20, 50, 100, 200, 500};
    for (int k = 0; k < tokCount.Length; k++)
    {
        System.Text.StringBuilder buffer = new System.Text.StringBuilder();
        System.Console.Out.WriteLine("-----Tokens: " + tokCount[k] + "-----");
        for (int i = 0; i < tokCount[k]; i++)
        {
            buffer.Append(English.IntToEnglish(i).ToUpper()).Append(' ');
        }
        //make sure we produce the same tokens
        ModuloSinkTokenizer sink = new ModuloSinkTokenizer(this, tokCount[k], 100);
        Token reusableToken = new Token();
        TokenStream stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), sink);
        while (stream.Next(reusableToken) != null)
        {
        }
        stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), 100);
        System.Collections.IList tmp = new System.Collections.ArrayList();
        for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
        {
            tmp.Add(nextToken.Clone());
        }
        System.Collections.IList sinkList = sink.GetTokens();
        Assert.IsTrue(tmp.Count == sinkList.Count, "tmp Size: " + tmp.Count + " is not: " + sinkList.Count);
        for (int i = 0; i < tmp.Count; i++)
        {
            Token tfTok = (Token) tmp[i];
            Token sinkTok = (Token) sinkList[i];
            Assert.IsTrue(tfTok.Term().Equals(sinkTok.Term()), tfTok.Term() + " is not equal to " + sinkTok.Term() + " at token: " + i);
        }

        //simulate two fields, each being analyzed once, for 20 documents
        for (int j = 0; j < modCounts.Length; j++)
        {
            int tfPos = 0;
            long start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
            for (int i = 0; i < 20; i++)
            {
                stream = new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString())));
                for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                {
                    tfPos += nextToken.GetPositionIncrement();
                }
                stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), modCounts[j]);
                for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                {
                    tfPos += nextToken.GetPositionIncrement();
                }
            }
            long finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
            System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
            int sinkPos = 0;
            //simulate one field with one sink
            start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
            for (int i = 0; i < 20; i++)
            {
                sink = new ModuloSinkTokenizer(this, tokCount[k], modCounts[j]);
                stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), sink);
                for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                {
                    sinkPos += nextToken.GetPositionIncrement();
                }
                stream = sink;
                for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                {
                    sinkPos += nextToken.GetPositionIncrement();
                }
            }
            finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
            System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
            Assert.IsTrue(sinkPos == tfPos, sinkPos + " does not equal: " + tfPos);
        }
        System.Console.Out.WriteLine("- End Tokens: " + tokCount[k] + "-----");
    }
}