private void FillCache(/* in */ Token reusableToken)
{
    // Pull every token from the input stream and store a private copy;
    // Clone() is required because the producer may reuse the Token instance.
    for (Token nextToken = input.Next(reusableToken); nextToken != null; nextToken = input.Next(reusableToken))
    {
        cache.Add(nextToken.Clone());
    }
}
/// <summary>
/// Override this method to cache only certain tokens, or new tokens based
/// on the old tokens.
/// </summary>
/// <param name="t">The <see cref="Lucene.Net.Analysis.Token" /> to add to the sink</param>
public virtual void Add(Token t)
{
    if (t == null)
    {
        return;
    }
    lst.Add((Token) t.Clone());
}
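// A minimal sketch (not part of these sources) of the kind of override the
// summary above describes: a sink that caches only sufficiently long tokens.
// MinLengthSinkTokenizer and minLength are hypothetical names; the sketch
// assumes the enclosing SinkTokenizer class with its protected lst field and
// virtual Add shown above, and Token.TermLength() from this codebase's Token API.
public class MinLengthSinkTokenizer : SinkTokenizer
{
    private readonly int minLength; // hypothetical threshold

    public MinLengthSinkTokenizer(int minLength)
    {
        this.minLength = minLength;
    }

    public override void Add(Token t)
    {
        // Cache a private copy only when the term is long enough.
        if (t != null && t.TermLength() >= minLength)
        {
            lst.Add((Token) t.Clone());
        }
    }
}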
public SingleTokenTokenStream(Token token)
{
    Debug.Assert(token != null, "Token was null!");
    _singleToken = (Token) token.Clone();

    // ReSharper disable DoNotCallOverridableMethodsInConstructor
    _tokenAtt = (AttributeImpl) AddAttribute(typeof(TermAttribute));
    // ReSharper restore DoNotCallOverridableMethodsInConstructor

    Debug.Assert(_tokenAtt is Token || _tokenAtt.GetType().Name.Equals(typeof(TokenWrapper).Name),
        "Token attribute is the wrong type! Type was: " + _tokenAtt.GetType().Name
        + " but expected " + typeof(TokenWrapper).Name);
}
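// Minimal usage sketch for SingleTokenTokenStream (illustrative values only;
// assumes the Token(string, int, int) constructor available in this codebase):
Token token = new Token("synonym", 0, 7);
SingleTokenTokenStream stream = new SingleTokenTokenStream(token);
// ... consume the single-token stream, e.g. as a field's analyzer output ...
stream.SetToken(new Token("other", 0, 5)); // swap in a new token for reuse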
/// <summary>Returns the next token out of the list of cached tokens.</summary>
/// <returns>The next <see cref="Lucene.Net.Analysis.Token" /> in the sink.</returns>
/// <throws>IOException</throws>
public override Token Next(/* in */ Token reusableToken)
{
    System.Diagnostics.Debug.Assert(reusableToken != null);
    if (iter == null)
    {
        iter = lst.GetEnumerator();
    }
    if (iter.MoveNext())
    {
        Token nextToken = (Token) iter.Current;
        return (Token) nextToken.Clone();
    }
    return null;
}
public override Token Next(Token reusableToken)
{
    System.Diagnostics.Debug.Assert(reusableToken != null);
    if (iter == null)
    {
        iter = lst.GetEnumerator();
    }
    // Since this TokenStream can be reset, we have to keep the cached tokens
    // immutable, so a clone is returned instead of the stored instance.
    if (iter.MoveNext())
    {
        Token nextToken = iter.Current;
        return (Token) nextToken.Clone();
    }
    return null;
}
public override Token Next(/* in */ Token reusableToken)
{
    System.Diagnostics.Debug.Assert(reusableToken != null);
    if (cache == null)
    {
        // Fill the cache lazily on the first call.
        cache = new System.Collections.ArrayList();
        FillCache(reusableToken);
        iterator = cache.GetEnumerator();
    }
    if (!iterator.MoveNext())
    {
        // The cache is exhausted; signal end of stream.
        return null;
    }
    Token nextToken = (Token) iterator.Current;
    return (Token) nextToken.Clone();
}
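// Usage sketch: cache an expensive analysis chain once and replay it.
// Assumes CachingTokenFilter.Reset() rewinds the iterator over the cache,
// as in the corresponding Lucene class; the input text is illustrative.
TokenStream source = new StandardFilter(
    new StandardTokenizer(new System.IO.StringReader("some field text")));
CachingTokenFilter cached = new CachingTokenFilter(source);

Token reusableToken = new Token();
while (cached.Next(reusableToken) != null)
{
    // First pass pulls from the source and fills the cache.
}
cached.Reset();
while (cached.Next(reusableToken) != null)
{
    // Second pass replays cached clones without re-tokenizing.
}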
/// <summary>
/// Override this method to cache only certain tokens, or new tokens based
/// on the old tokens.
/// </summary>
/// <param name="t">The <see cref="Lucene.Net.Analysis.Token" /> to add to the sink</param>
public virtual void Add(Token t)
{
    if (t == null)
    {
        return;
    }
    lst.Add((Token) t.Clone());
}
public override System.Object Clone()
{
    return new TokenWrapper((Token) delegate_Renamed.Clone());
}
/// <summary>
/// Not an explicit test, just useful to print out some info on performance.
/// </summary>
/// <throws>Exception</throws>
public virtual void Performance()
{
    int[] tokCount = new int[] {100, 500, 1000, 2000, 5000, 10000};
    int[] modCounts = new int[] {1, 2, 5, 10, 20, 50, 100, 200, 500};
    for (int k = 0; k < tokCount.Length; k++)
    {
        System.Text.StringBuilder buffer = new System.Text.StringBuilder();
        System.Console.Out.WriteLine("-----Tokens: " + tokCount[k] + "-----");
        for (int i = 0; i < tokCount[k]; i++)
        {
            buffer.Append(English.IntToEnglish(i).ToUpper()).Append(' ');
        }

        // Make sure we produce the same tokens.
        ModuloSinkTokenizer sink = new ModuloSinkTokenizer(this, tokCount[k], 100);
        Token reusableToken = new Token();
        TokenStream stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), sink);
        while (stream.Next(reusableToken) != null)
        {
        }
        stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), 100);
        System.Collections.IList tmp = new System.Collections.ArrayList();
        for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
        {
            tmp.Add(nextToken.Clone());
        }
        System.Collections.IList sinkList = sink.GetTokens();
        Assert.IsTrue(tmp.Count == sinkList.Count, "tmp Size: " + tmp.Count + " is not: " + sinkList.Count);
        for (int i = 0; i < tmp.Count; i++)
        {
            Token tfTok = (Token) tmp[i];
            Token sinkTok = (Token) sinkList[i];
            Assert.IsTrue(tfTok.Term().Equals(sinkTok.Term()), tfTok.Term() + " is not equal to " + sinkTok.Term() + " at token: " + i);
        }

        // Simulate two fields, each being analyzed once, for 20 documents.
        for (int j = 0; j < modCounts.Length; j++)
        {
            int tfPos = 0;
            long start = DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond;
            for (int i = 0; i < 20; i++)
            {
                stream = new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString())));
                for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                {
                    tfPos += nextToken.GetPositionIncrement();
                }
                stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), modCounts[j]);
                for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                {
                    tfPos += nextToken.GetPositionIncrement();
                }
            }
            long finish = DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond;
            System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");

            // Simulate one field with one sink.
            int sinkPos = 0;
            start = DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond;
            for (int i = 0; i < 20; i++)
            {
                sink = new ModuloSinkTokenizer(this, tokCount[k], modCounts[j]);
                stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), sink);
                for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                {
                    sinkPos += nextToken.GetPositionIncrement();
                }
                stream = sink;
                for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                {
                    sinkPos += nextToken.GetPositionIncrement();
                }
            }
            finish = DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond;
            System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
            Assert.IsTrue(sinkPos == tfPos, sinkPos + " does not equal: " + tfPos);
        }
        System.Console.Out.WriteLine("- End Tokens: " + tokCount[k] + "-----");
    }
}
public void SetToken(Token token)
{
    _singleToken = (Token) token.Clone();
}
public override void Add(Token t)
{
    // Cache only every modCount-th token; always advance the counter.
    if (t != null && count % modCount == 0)
    {
        lst.Add(t.Clone());
    }
    count++;
}
public virtual void TestPerformance()
{
    int[] tokCount = new int[] {100, 500, 1000, 2000, 5000, 10000};
    int[] modCounts = new int[] {1, 2, 5, 10, 20, 50, 100, 200, 500};
    for (int k = 0; k < tokCount.Length; k++)
    {
        System.Text.StringBuilder buffer = new System.Text.StringBuilder();
        System.Console.Out.WriteLine("-----Tokens: " + tokCount[k] + "-----");
        for (int i = 0; i < tokCount[k]; i++)
        {
            buffer.Append(English.IntToEnglish(i).ToUpper()).Append(' ');
        }

        // Make sure we produce the same tokens.
        ModuloSinkTokenizer sink = new ModuloSinkTokenizer(this, tokCount[k], 100);
        Token next = new Token();
        TokenStream result = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), sink);
        while ((next = result.Next(next)) != null)
        {
        }
        result = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), 100);
        next = new Token();
        System.Collections.IList tmp = new System.Collections.ArrayList();
        while ((next = result.Next(next)) != null)
        {
            tmp.Add(next.Clone());
        }
        System.Collections.IList sinkList = sink.GetTokens();
        Assert.IsTrue(tmp.Count == sinkList.Count, "tmp Size: " + tmp.Count + " is not: " + sinkList.Count);
        for (int i = 0; i < tmp.Count; i++)
        {
            Token tfTok = (Token) tmp[i];
            Token sinkTok = (Token) sinkList[i];
            Assert.IsTrue(tfTok.TermText().Equals(sinkTok.TermText()), tfTok.TermText() + " is not equal to " + sinkTok.TermText() + " at token: " + i);
        }

        // Simulate two fields, each being analyzed once, for 20 documents.
        for (int j = 0; j < modCounts.Length; j++)
        {
            int tfPos = 0;
            // Milliseconds since the Unix epoch (port of Java's System.currentTimeMillis()).
            long start = (System.DateTime.Now.Ticks - 621355968000000000) / 10000;
            for (int i = 0; i < 20; i++)
            {
                next = new Token();
                result = new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString())));
                while ((next = result.Next(next)) != null)
                {
                    tfPos += next.GetPositionIncrement();
                }
                next = new Token();
                result = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), modCounts[j]);
                while ((next = result.Next(next)) != null)
                {
                    tfPos += next.GetPositionIncrement();
                }
            }
            long finish = (System.DateTime.Now.Ticks - 621355968000000000) / 10000;
            System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");

            // Simulate one field with one sink.
            int sinkPos = 0;
            start = (System.DateTime.Now.Ticks - 621355968000000000) / 10000;
            for (int i = 0; i < 20; i++)
            {
                sink = new ModuloSinkTokenizer(this, tokCount[k], modCounts[j]);
                next = new Token();
                result = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), sink);
                while ((next = result.Next(next)) != null)
                {
                    sinkPos += next.GetPositionIncrement();
                }
                result = sink;
                while ((next = result.Next(next)) != null)
                {
                    sinkPos += next.GetPositionIncrement();
                }
            }
            finish = (System.DateTime.Now.Ticks - 621355968000000000) / 10000;
            System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
            Assert.IsTrue(sinkPos == tfPos, sinkPos + " does not equal: " + tfPos);
        }
        System.Console.Out.WriteLine("- End Tokens: " + tokCount[k] + "-----");
    }
}