public virtual void TestStopListPositions()
{
    bool defaultEnable = StopFilter.GetEnablePositionIncrementsDefault();
    StopFilter.SetEnablePositionIncrementsDefault(true);
    try
    {
        System.Collections.Hashtable stopWordsSet = new System.Collections.Hashtable();
        stopWordsSet.Add("good", "good");
        stopWordsSet.Add("test", "test");
        stopWordsSet.Add("analyzer", "analyzer");
        StopAnalyzer newStop = new StopAnalyzer(stopWordsSet);
        System.IO.StringReader reader = new System.IO.StringReader("This is a good test of the english stop analyzer with positions");
        int[] expectedIncr = new int[] { 1, 1, 1, 3, 1, 1, 1, 2, 1 };
        TokenStream stream = newStop.TokenStream("test", reader);
        Assert.IsNotNull(stream);
        int i = 0;
        TermAttribute termAtt = (TermAttribute) stream.GetAttribute(typeof(TermAttribute));
        PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.AddAttribute(typeof(PositionIncrementAttribute));
        while (stream.IncrementToken())
        {
            System.String text = termAtt.Term();
            Assert.IsFalse(stopWordsSet.Contains(text));
            Assert.AreEqual(expectedIncr[i++], posIncrAtt.GetPositionIncrement());
        }
    }
    finally
    {
        StopFilter.SetEnablePositionIncrementsDefault(defaultEnable);
    }
}
/// <summary> Returns the next input Token whose term() is not a stop word.</summary>
public override bool IncrementToken()
{
    // return the first non-stop word found
    int skippedPositions = 0;
    while (input.IncrementToken())
    {
        if (!stopWords.Contains(termAtt.TermBuffer(), 0, termAtt.TermLength()))
        {
            if (enablePositionIncrements)
            {
                posIncrAtt.SetPositionIncrement(posIncrAtt.GetPositionIncrement() + skippedPositions);
            }
            return true;
        }
        skippedPositions += posIncrAtt.GetPositionIncrement();
    }
    // reached EOS -- return false
    return false;
}
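// Illustrative sketch, not part of the Lucene.Net sources: a minimal consumer for the filter above,
// showing how enablePositionIncrements folds skipped stop words into the increment of the next kept
// token. The method name, stop set, field name and sample text are invented for the example; the API
// calls mirror the surrounding tests.
public virtual void DemoStopFilterPositionIncrements()
{
    bool defaultEnable = StopFilter.GetEnablePositionIncrementsDefault();
    StopFilter.SetEnablePositionIncrementsDefault(true);
    try
    {
        System.Collections.Hashtable stopWordsSet = new System.Collections.Hashtable();
        stopWordsSet.Add("the", "the");
        stopWordsSet.Add("of", "of");
        StopAnalyzer analyzer = new StopAnalyzer(stopWordsSet);
        TokenStream stream = analyzer.TokenStream("body", new System.IO.StringReader("the story of the filter"));
        TermAttribute termAtt = (TermAttribute) stream.GetAttribute(typeof(TermAttribute));
        PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.AddAttribute(typeof(PositionIncrementAttribute));
        while (stream.IncrementToken())
        {
            // Expected output: "story" with increment 2 (one stop word skipped before it),
            // then "filter" with increment 3 (two stop words skipped before it).
            System.Console.Out.WriteLine(termAtt.Term() + " +" + posIncrAtt.GetPositionIncrement());
        }
    }
    finally
    {
        StopFilter.SetEnablePositionIncrementsDefault(defaultEnable);
    }
}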
private void DoTestStopPositons(StopFilter stpf, bool enableIncrements)
{
    Log("---> test with enable-increments-" + (enableIncrements ? "enabled" : "disabled"));
    stpf.SetEnablePositionIncrements(enableIncrements);
    TermAttribute termAtt = (TermAttribute) stpf.GetAttribute(typeof(TermAttribute));
    PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stpf.GetAttribute(typeof(PositionIncrementAttribute));
    for (int i = 0; i < 20; i += 3)
    {
        Assert.IsTrue(stpf.IncrementToken());
        Log("Token " + i + ": " + stpf);
        System.String w = English.IntToEnglish(i).Trim();
        Assert.AreEqual(w, termAtt.Term(), "expecting token " + i + " to be " + w);
        Assert.AreEqual(enableIncrements ? (i == 0 ? 1 : 3) : 1, posIncrAtt.GetPositionIncrement(), "all but the first token must have a position increment of 3 when increments are enabled");
    }
    Assert.IsFalse(stpf.IncrementToken());
}
public override bool IncrementToken()
{
    if (input.IncrementToken())
    {
        System.String token = new System.String(termAtt.TermBuffer(), 0, termAtt.TermLength());
        if (!nopayload.Contains(token))
        {
            if (entities.Contains(token))
            {
                payloadAtt.SetPayload(new Payload(System.Text.UTF8Encoding.UTF8.GetBytes(token + ":Entity:" + pos)));
            }
            else
            {
                payloadAtt.SetPayload(new Payload(System.Text.UTF8Encoding.UTF8.GetBytes(token + ":Noise:" + pos)));
            }
        }
        pos += posIncrAtt.GetPositionIncrement();
        return true;
    }
    return false;
}
public virtual void TestStopList()
{
    System.Collections.Hashtable stopWordsSet = new System.Collections.Hashtable();
    stopWordsSet.Add("good", "good");
    stopWordsSet.Add("test", "test");
    stopWordsSet.Add("analyzer", "analyzer");
    StopAnalyzer newStop = new StopAnalyzer(stopWordsSet);
    System.IO.StringReader reader = new System.IO.StringReader("This is a good test of the english stop analyzer");
    TokenStream stream = newStop.TokenStream("test", reader);
    Assert.IsNotNull(stream);
    TermAttribute termAtt = (TermAttribute) stream.GetAttribute(typeof(TermAttribute));
    PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.AddAttribute(typeof(PositionIncrementAttribute));
    while (stream.IncrementToken())
    {
        System.String text = termAtt.Term();
        Assert.IsFalse(stopWordsSet.Contains(text));
        Assert.AreEqual(1, posIncrAtt.GetPositionIncrement()); // by default the stop filter does not apply position increments
    }
}
/// <summary> Not an explicit test, just useful to print out some info on performance
/// </summary>
/// <throws> Exception </throws>
public virtual void Performance()
{
    int[] tokCount = new int[] { 100, 500, 1000, 2000, 5000, 10000 };
    int[] modCounts = new int[] { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
    for (int k = 0; k < tokCount.Length; k++)
    {
        System.Text.StringBuilder buffer = new System.Text.StringBuilder();
        System.Console.Out.WriteLine("-----Tokens: " + tokCount[k] + "-----");
        for (int i = 0; i < tokCount[k]; i++)
        {
            buffer.Append(English.IntToEnglish(i).ToUpper()).Append(' ');
        }
        // make sure we produce the same tokens
        TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))));
        TokenStream sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, 100));
        teeStream.ConsumeAllTokens();
        TokenStream stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), 100);
        TermAttribute tfTok = (TermAttribute) stream.AddAttribute(typeof(TermAttribute));
        TermAttribute sinkTok = (TermAttribute) sink.AddAttribute(typeof(TermAttribute));
        for (int i = 0; stream.IncrementToken(); i++)
        {
            Assert.IsTrue(sink.IncrementToken());
            Assert.IsTrue(tfTok.Equals(sinkTok) == true, tfTok + " is not equal to " + sinkTok + " at token: " + i);
        }

        // simulate two fields, each being analyzed once, for 20 documents
        for (int j = 0; j < modCounts.Length; j++)
        {
            int tfPos = 0;
            long start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
            for (int i = 0; i < 20; i++)
            {
                stream = new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString())));
                PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.GetAttribute(typeof(PositionIncrementAttribute));
                while (stream.IncrementToken())
                {
                    tfPos += posIncrAtt.GetPositionIncrement();
                }
                stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), modCounts[j]);
                posIncrAtt = (PositionIncrementAttribute) stream.GetAttribute(typeof(PositionIncrementAttribute));
                while (stream.IncrementToken())
                {
                    tfPos += posIncrAtt.GetPositionIncrement();
                }
            }
            long finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
            System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");

            int sinkPos = 0;
            // simulate one field with one sink
            start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
            for (int i = 0; i < 20; i++)
            {
                teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))));
                sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, modCounts[j]));
                PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) teeStream.GetAttribute(typeof(PositionIncrementAttribute));
                while (teeStream.IncrementToken())
                {
                    sinkPos += posIncrAtt.GetPositionIncrement();
                }
                //System.out.println("Modulo--------");
                posIncrAtt = (PositionIncrementAttribute) sink.GetAttribute(typeof(PositionIncrementAttribute));
                while (sink.IncrementToken())
                {
                    sinkPos += posIncrAtt.GetPositionIncrement();
                }
            }
            finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
            System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
            Assert.IsTrue(sinkPos == tfPos, sinkPos + " does not equal: " + tfPos);
        }
        System.Console.Out.WriteLine("- End Tokens: " + tokCount[k] + "-----");
    }
}
public override void ProcessFields(Fieldable[] fields, int count)
{
    fieldState.Reset(docState.doc.GetBoost());

    int maxFieldLength = docState.maxFieldLength;

    bool doInvert = consumer.Start(fields, count);

    for (int i = 0; i < count; i++)
    {
        Fieldable field = fields[i];

        // TODO FI: this should be "genericized" to querying
        // consumer if it wants to see this particular field
        // tokenized.
        if (field.IsIndexed() && doInvert)
        {
            bool anyToken;

            if (fieldState.length > 0)
            {
                fieldState.position += docState.analyzer.GetPositionIncrementGap(fieldInfo.name);
            }

            if (!field.IsTokenized())
            {
                // un-tokenized field
                System.String stringValue = field.StringValue();
                int valueLength = stringValue.Length;
                perThread.singleTokenTokenStream.Reinit(stringValue, 0, valueLength);
                fieldState.attributeSource = perThread.singleTokenTokenStream;
                consumer.Start(field);

                bool success = false;
                try
                {
                    consumer.Add();
                    success = true;
                }
                finally
                {
                    if (!success)
                    {
                        docState.docWriter.SetAborting();
                    }
                }
                fieldState.offset += valueLength;
                fieldState.length++;
                fieldState.position++;
                anyToken = valueLength > 0;
            }
            else
            {
                // tokenized field
                TokenStream stream;
                TokenStream streamValue = field.TokenStreamValue();

                if (streamValue != null)
                {
                    stream = streamValue;
                }
                else
                {
                    // the field does not have a TokenStream,
                    // so we have to obtain one from the analyzer
                    System.IO.TextReader reader; // find or make Reader
                    System.IO.TextReader readerValue = field.ReaderValue();

                    if (readerValue != null)
                    {
                        reader = readerValue;
                    }
                    else
                    {
                        System.String stringValue = field.StringValue();
                        if (stringValue == null)
                        {
                            throw new System.ArgumentException("field must have either TokenStream, String or Reader value");
                        }
                        perThread.stringReader.Init(stringValue);
                        reader = perThread.stringReader;
                    }

                    // Tokenize field and add to postingTable
                    stream = docState.analyzer.ReusableTokenStream(fieldInfo.name, reader);
                }

                // reset the TokenStream to the first token
                stream.Reset();

                int startLength = fieldState.length;

                // deprecated
                bool allowMinus1Position = docState.allowMinus1Position;

                try
                {
                    int offsetEnd = fieldState.offset - 1;

                    bool hasMoreTokens = stream.IncrementToken();

                    fieldState.attributeSource = stream;

                    OffsetAttribute offsetAttribute = (OffsetAttribute) fieldState.attributeSource.AddAttribute(typeof(OffsetAttribute));
                    PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) fieldState.attributeSource.AddAttribute(typeof(PositionIncrementAttribute));

                    consumer.Start(field);

                    for (; ; )
                    {
                        // If we hit an exception in stream.next below
                        // (which is fairly common, eg if analyzer
                        // chokes on a given document), then it's
                        // non-aborting and (above) this one document
                        // will be marked as deleted, but still
                        // consume a docID
                        if (!hasMoreTokens)
                        {
                            break;
                        }

                        int posIncr = posIncrAttribute.GetPositionIncrement();
                        fieldState.position += posIncr;
                        if (allowMinus1Position || fieldState.position > 0)
                        {
                            fieldState.position--;
                        }

                        if (posIncr == 0)
                        {
                            fieldState.numOverlap++;
                        }

                        bool success = false;
                        try
                        {
                            // If we hit an exception in here, we abort
                            // all buffered documents since the last
                            // flush, on the likelihood that the
                            // internal state of the consumer is now
                            // corrupt and should not be flushed to a
                            // new segment:
                            consumer.Add();
                            success = true;
                        }
                        finally
                        {
                            if (!success)
                            {
                                docState.docWriter.SetAborting();
                            }
                        }
                        fieldState.position++;
                        offsetEnd = fieldState.offset + offsetAttribute.EndOffset();
                        if (++fieldState.length >= maxFieldLength)
                        {
                            if (docState.infoStream != null)
                            {
                                docState.infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached for field " + fieldInfo.name + ", ignoring following tokens");
                            }
                            break;
                        }

                        hasMoreTokens = stream.IncrementToken();
                    }

                    // trigger streams to perform end-of-stream operations
                    stream.End();

                    fieldState.offset += offsetAttribute.EndOffset();
                    anyToken = fieldState.length > startLength;
                }
                finally
                {
                    stream.Close();
                }
            }

            if (anyToken)
            {
                fieldState.offset += docState.analyzer.GetOffsetGap(field);
            }
            fieldState.boost *= field.GetBoost();
        }

        // LUCENE-2387: don't hang onto the field, so GC can
        // reclaim
        fields[i] = null;
    }

    consumer.Finish();
    endConsumer.Finish();
}