public virtual void TestMultipleSources() { TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString()))); TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.NewSinkTokenStream(dogFilter); TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.NewSinkTokenStream(theFilter); TokenStream source1 = new CachingTokenFilter(tee1); tee1.AddAttribute <ICheckClearAttributesAttribute>(); dogDetector.AddAttribute <ICheckClearAttributesAttribute>(); theDetector.AddAttribute <ICheckClearAttributesAttribute>(); TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer2.ToString()))); tee2.AddSinkTokenStream(dogDetector); tee2.AddSinkTokenStream(theDetector); TokenStream source2 = tee2; AssertTokenStreamContents(source1, tokens1); AssertTokenStreamContents(source2, tokens2); AssertTokenStreamContents(theDetector, new String[] { "The", "the", "The", "the" }); source1.Reset(); TokenStream lowerCasing = new LowerCaseFilter(source1); String[] lowerCaseTokens = new String[tokens1.Length]; for (int i = 0; i < tokens1.Length; i++) { lowerCaseTokens[i] = tokens1[i].ToLower(); } }
public virtual void TestGeneral() { TeeSinkTokenFilter source = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString()))); TokenStream sink1 = source.NewSinkTokenStream(); TokenStream sink2 = source.NewSinkTokenStream(theFilter); source.AddAttribute <ICheckClearAttributesAttribute>(); sink1.AddAttribute <ICheckClearAttributesAttribute>(); sink2.AddAttribute <ICheckClearAttributesAttribute>(); AssertTokenStreamContents(source, tokens1); AssertTokenStreamContents(sink1, tokens1); }
public virtual void TestGeneral() { TeeSinkTokenFilter source = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString()))); TokenStream sink1 = source.NewSinkTokenStream(); TokenStream sink2 = source.NewSinkTokenStream(theFilter); source.AddAttribute(typeof(CheckClearAttributesAttribute)); sink1.AddAttribute(typeof(CheckClearAttributesAttribute)); sink2.AddAttribute(typeof(CheckClearAttributesAttribute)); AssertTokenStreamContents(source, tokens1); AssertTokenStreamContents(sink1, tokens1); }
/// <summary> Not an explicit test, just useful to print out some info on performance /// /// </summary> /// <throws> Exception </throws> public virtual void Performance() { int[] tokCount = new int[] { 100, 500, 1000, 2000, 5000, 10000 }; int[] modCounts = new int[] { 1, 2, 5, 10, 20, 50, 100, 200, 500 }; for (int k = 0; k < tokCount.Length; k++) { System.Text.StringBuilder buffer = new System.Text.StringBuilder(); System.Console.Out.WriteLine("-----Tokens: " + tokCount[k] + "-----"); for (int i = 0; i < tokCount[k]; i++) { buffer.Append(English.IntToEnglish(i).ToUpper()).Append(' '); } //make sure we produce the same tokens TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString())))); TokenStream sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, 100)); teeStream.ConsumeAllTokens(); TokenStream stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString()))), 100); ITermAttribute tfTok = stream.AddAttribute <ITermAttribute>(); ITermAttribute sinkTok = sink.AddAttribute <ITermAttribute>(); for (int i = 0; stream.IncrementToken(); i++) { Assert.IsTrue(sink.IncrementToken()); Assert.IsTrue(tfTok.Equals(sinkTok) == true, tfTok + " is not equal to " + sinkTok + " at token: " + i); } //simulate two fields, each being analyzed once, for 20 documents for (int j = 0; j < modCounts.Length; j++) { int tfPos = 0; long start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond); for (int i = 0; i < 20; i++) { stream = new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString()))); IPositionIncrementAttribute posIncrAtt = stream.GetAttribute <IPositionIncrementAttribute>(); while (stream.IncrementToken()) { tfPos += posIncrAtt.PositionIncrement; } stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString()))), modCounts[j]); posIncrAtt = stream.GetAttribute <IPositionIncrementAttribute>(); while (stream.IncrementToken()) { tfPos += posIncrAtt.PositionIncrement; } } long finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond); System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms"); int sinkPos = 0; //simulate one field with one sink start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond); for (int i = 0; i < 20; i++) { teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString())))); sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, modCounts[j])); IPositionIncrementAttribute posIncrAtt = teeStream.GetAttribute <IPositionIncrementAttribute>(); while (teeStream.IncrementToken()) { sinkPos += posIncrAtt.PositionIncrement; } //System.out.println("Modulo--------"); posIncrAtt = sink.GetAttribute <IPositionIncrementAttribute>(); while (sink.IncrementToken()) { sinkPos += posIncrAtt.PositionIncrement; } } finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond); System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms"); Assert.IsTrue(sinkPos == tfPos, sinkPos + " does not equal: " + tfPos); } System.Console.Out.WriteLine("- End Tokens: " + tokCount[k] + "-----"); } }
public virtual void TestMultipleSources() { TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString()))); TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.NewSinkTokenStream(dogFilter); TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.NewSinkTokenStream(theFilter); TokenStream source1 = new CachingTokenFilter(tee1); tee1.AddAttribute(typeof(CheckClearAttributesAttribute)); dogDetector.AddAttribute(typeof(CheckClearAttributesAttribute)); theDetector.AddAttribute(typeof(CheckClearAttributesAttribute)); TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer2.ToString()))); tee2.AddSinkTokenStream(dogDetector); tee2.AddSinkTokenStream(theDetector); TokenStream source2 = tee2; AssertTokenStreamContents(source1, tokens1); AssertTokenStreamContents(source2, tokens2); AssertTokenStreamContents(theDetector, new String[] { "The", "the", "The", "the" }); source1.Reset(); TokenStream lowerCasing = new LowerCaseFilter(source1); String[] lowerCaseTokens = new String[tokens1.Length]; for (int i = 0; i < tokens1.Length; i++) lowerCaseTokens[i] = tokens1[i].ToLower(); }
/// <summary> Not an explicit test, just useful to print out some info on performance /// /// </summary> /// <throws> Exception </throws> public virtual void Performance() { int[] tokCount = new int[]{100, 500, 1000, 2000, 5000, 10000}; int[] modCounts = new int[]{1, 2, 5, 10, 20, 50, 100, 200, 500}; for (int k = 0; k < tokCount.Length; k++) { System.Text.StringBuilder buffer = new System.Text.StringBuilder(); System.Console.Out.WriteLine("-----Tokens: " + tokCount[k] + "-----"); for (int i = 0; i < tokCount[k]; i++) { buffer.Append(English.IntToEnglish(i).ToUpper()).Append(' '); } //make sure we produce the same tokens TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString())))); TokenStream sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, 100)); teeStream.ConsumeAllTokens(); TokenStream stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), 100); TermAttribute tfTok = (TermAttribute) stream.AddAttribute(typeof(TermAttribute)); TermAttribute sinkTok = (TermAttribute) sink.AddAttribute(typeof(TermAttribute)); for (int i = 0; stream.IncrementToken(); i++) { Assert.IsTrue(sink.IncrementToken()); Assert.IsTrue(tfTok.Equals(sinkTok) == true, tfTok + " is not equal to " + sinkTok + " at token: " + i); } //simulate two fields, each being analyzed once, for 20 documents for (int j = 0; j < modCounts.Length; j++) { int tfPos = 0; long start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond); for (int i = 0; i < 20; i++) { stream = new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))); PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.GetAttribute(typeof(PositionIncrementAttribute)); while (stream.IncrementToken()) { tfPos += posIncrAtt.GetPositionIncrement(); } stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), modCounts[j]); posIncrAtt = (PositionIncrementAttribute) stream.GetAttribute(typeof(PositionIncrementAttribute)); while (stream.IncrementToken()) { tfPos += posIncrAtt.GetPositionIncrement(); } } long finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond); System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms"); int sinkPos = 0; //simulate one field with one sink start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond); for (int i = 0; i < 20; i++) { teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString())))); sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, modCounts[j])); PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) teeStream.GetAttribute(typeof(PositionIncrementAttribute)); while (teeStream.IncrementToken()) { sinkPos += posIncrAtt.GetPositionIncrement(); } //System.out.println("Modulo--------"); posIncrAtt = (PositionIncrementAttribute) sink.GetAttribute(typeof(PositionIncrementAttribute)); while (sink.IncrementToken()) { sinkPos += posIncrAtt.GetPositionIncrement(); } } finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond); System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms"); Assert.IsTrue(sinkPos == tfPos, sinkPos + " does not equal: " + tfPos); } System.Console.Out.WriteLine("- End Tokens: " + tokCount[k] + "-----"); } }