/// <summary>
/// Scans ahead in the input token stream, running each token's chars through
/// the synonym FST to find the longest synonym rule matching at the current
/// position.  On return, <c>inputSkipCount</c> is set to the number of input
/// tokens the caller must consume, and if a rule matched its output is handed
/// to <c>AddOutput</c>.  Tokens read past the current position are captured
/// into the <c>futureInputs</c> lookahead ring buffer for later replay.
/// </summary>
private void Parse()
{
    //System.out.println("\nS: parse");
    Debug.Assert(inputSkipCount == 0);

    int curNextRead = nextRead;

    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;
    int matchEndOffset = -1;

    // Output accumulated along the current FST path; may extend beyond the
    // longest *final* match recorded in matchOutput above.
    BytesRef pendingOutput = fst.Outputs.NoOutput;
    fst.GetFirstArc(scratchArc);
    Debug.Assert(scratchArc.Output == fst.Outputs.NoOutput);

    int tokenCount = 0;

    while (true)
    {
        // Pull next token's chars:
        char[] buffer;
        int bufferLen;
        //System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);

        int inputEndOffset = 0;

        if (curNextRead == nextWrite)
        {
            // We used up our lookahead buffer of input tokens
            // -- pull next real input token:
            if (finished)
            {
                break;
            }
            else
            {
                //System.out.println("  input.incrToken");
                Debug.Assert(futureInputs[nextWrite].consumed);
                // Not correct: a syn match whose output is longer
                // than its input can set future inputs keepOrig
                // to true:
                //assert !futureInputs[nextWrite].keepOrig;
                if (input.IncrementToken())
                {
                    buffer = termAtt.Buffer();
                    bufferLen = termAtt.Length;
                    PendingInput pendingInput = futureInputs[nextWrite];
                    lastStartOffset = pendingInput.startOffset = offsetAtt.StartOffset();
                    lastEndOffset = pendingInput.endOffset = offsetAtt.EndOffset();
                    inputEndOffset = pendingInput.endOffset;
                    //System.out.println("  new token=" + new String(buffer, 0, bufferLen));
                    if (nextRead != nextWrite)
                    {
                        // We are scanning ahead of the read position, so
                        // buffer this token's full state for later replay.
                        Capture();
                    }
                    else
                    {
                        pendingInput.consumed = false;
                    }
                }
                else
                {
                    // No more input tokens
                    //System.out.println("  set end");
                    finished = true;
                    break;
                }
            }
        }
        else
        {
            // Still in our lookahead buffer: reuse the previously captured term.
            buffer = futureInputs[curNextRead].term.Chars;
            bufferLen = futureInputs[curNextRead].term.Length;
            inputEndOffset = futureInputs[curNextRead].endOffset;
            //System.out.println("  old token=" + new String(buffer, 0, bufferLen));
        }

        tokenCount++;

        // Run each char in this token through the FST:
        int bufUpto = 0;
        while (bufUpto < bufferLen)
        {
            // Iterate by code point so surrogate pairs are matched as one unit.
            int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);
            if (fst.FindTargetArc(ignoreCase ? Character.ToLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
            {
                // No arc for this char: no rule can continue matching here.
                //System.out.println("    stop");
                goto byTokenBreak;
            }

            // Accum the output
            pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
            bufUpto += Character.CharCount(codePoint);
        }

        // OK, entire token matched; now see if this is a final
        // state:
        if (scratchArc.Final)
        {
            matchOutput = fst.Outputs.Add(pendingOutput, scratchArc.NextFinalOutput);
            matchInputLength = tokenCount;
            matchEndOffset = inputEndOffset;
            //System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
        }

        // See if the FST wants to continue matching (ie, needs to
        // see the next input token):
        if (fst.FindTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null)
        {
            // No further rules can match here; we're done
            // searching for matching rules starting at the
            // current input position.
            break;
        }
        else
        {
            // More matching is possible -- accum the output (if
            // any) of the WORD_SEP arc:
            pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
            if (nextRead == nextWrite)
            {
                Capture();
            }
        }

        curNextRead = RollIncr(curNextRead);
    }

byTokenBreak:

    if (nextRead == nextWrite && !finished)
    {
        // NOTE(review): nothing was captured for the slot we just consumed,
        // so advance the write pointer past it — confirm against
        // Capture()/RollIncr ring-buffer semantics.
        //System.out.println("  skip write slot=" + nextWrite);
        nextWrite = RollIncr(nextWrite);
    }

    if (matchOutput != null)
    {
        //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
        inputSkipCount = matchInputLength;
        AddOutput(matchOutput, matchInputLength, matchEndOffset);
    }
    else if (nextRead != nextWrite)
    {
        // Even though we had no match here, we set to 1
        // because we need to skip current input token before
        // trying to match again:
        inputSkipCount = 1;
    }
    else
    {
        Debug.Assert(finished);
    }

    //System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
}
/// <summary>
/// Inverts all instances of a document's fields: for each indexed field,
/// obtains a <c>TokenStream</c>, validates every token's position increment
/// and (optionally) offsets, and feeds each token to <c>Consumer.Add()</c>.
/// Accumulates cross-instance position/offset state in <c>FieldState</c> and
/// marks the writer as aborting if the consumer fails mid-document.
/// </summary>
/// <param name="fields"> all instances of this field in the document </param>
/// <param name="count"> number of valid entries in <paramref name="fields"/> </param>
public override void ProcessFields(IndexableField[] fields, int count)
{
    FieldState.Reset();

    bool doInvert = Consumer.Start(fields, count);

    for (int i = 0; i < count; i++)
    {
        IndexableField field = fields[i];
        IndexableFieldType fieldType = field.FieldType();

        // TODO FI: this should be "genericized" to querying
        // consumer if it wants to see this particular field
        // tokenized.
        if (fieldType.Indexed && doInvert)
        {
            bool analyzed = fieldType.Tokenized && DocState.Analyzer != null;

            // if the field omits norms, the boost cannot be indexed.
            if (fieldType.OmitNorms && field.GetBoost() != 1.0f)
            {
                throw new System.NotSupportedException("You cannot set an index-time boost: norms are omitted for field '" + field.Name() + "'");
            }

            // only bother checking offsets if something will consume them.
            // TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
            bool checkOffsets = fieldType.IndexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
            int lastStartOffset = 0;

            if (i > 0)
            {
                // Subsequent instances of the same field get a position gap.
                FieldState.Position_Renamed += analyzed ? DocState.Analyzer.GetPositionIncrementGap(fieldInfo.Name) : 0;
            }

            /*
             * To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
             * when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses,
             * but rather a finally that takes note of the problem.
             */
            bool succeededInProcessingField = false;

            TokenStream stream = field.GetTokenStream(DocState.Analyzer);
            // reset the TokenStream to the first token
            stream.Reset();

            try
            {
                bool hasMoreTokens = stream.IncrementToken();

                FieldState.AttributeSource_Renamed = stream;

                IOffsetAttribute offsetAttribute = FieldState.AttributeSource_Renamed.AddAttribute<IOffsetAttribute>();
                IPositionIncrementAttribute posIncrAttribute = FieldState.AttributeSource_Renamed.AddAttribute<IPositionIncrementAttribute>();

                if (hasMoreTokens)
                {
                    Consumer.Start(field);

                    do
                    {
                        // If we hit an exception in stream.next below
                        // (which is fairly common, eg if analyzer
                        // chokes on a given document), then it's
                        // non-aborting and (above) this one document
                        // will be marked as deleted, but still
                        // consume a docID

                        int posIncr = posIncrAttribute.PositionIncrement;
                        if (posIncr < 0)
                        {
                            throw new System.ArgumentException("position increment must be >=0 (got " + posIncr + ") for field '" + field.Name() + "'");
                        }
                        if (FieldState.Position_Renamed == 0 && posIncr == 0)
                        {
                            throw new System.ArgumentException("first position increment must be > 0 (got 0) for field '" + field.Name() + "'");
                        }
                        int position = FieldState.Position_Renamed + posIncr;
                        if (position > 0)
                        {
                            // NOTE: confusing: this "mirrors" the
                            // position++ we do below
                            position--;
                        }
                        else if (position < 0)
                        {
                            throw new System.ArgumentException("position overflow for field '" + field.Name() + "'");
                        }

                        // position is legal, we can safely place it in fieldState now.
                        // not sure if anything will use fieldState after non-aborting exc...
                        FieldState.Position_Renamed = position;

                        if (posIncr == 0)
                        {
                            FieldState.NumOverlap_Renamed++;
                        }

                        if (checkOffsets)
                        {
                            int startOffset = FieldState.Offset_Renamed + offsetAttribute.StartOffset();
                            int endOffset = FieldState.Offset_Renamed + offsetAttribute.EndOffset();
                            if (startOffset < 0 || endOffset < startOffset)
                            {
                                throw new System.ArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, " + "startOffset=" + startOffset + ",endOffset=" + endOffset + " for field '" + field.Name() + "'");
                            }
                            if (startOffset < lastStartOffset)
                            {
                                throw new System.ArgumentException("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " for field '" + field.Name() + "'");
                            }
                            lastStartOffset = startOffset;
                        }

                        bool success = false;
                        try
                        {
                            // If we hit an exception in here, we abort
                            // all buffered documents since the last
                            // flush, on the likelihood that the
                            // internal state of the consumer is now
                            // corrupt and should not be flushed to a
                            // new segment:
                            Consumer.Add();
                            success = true;
                        }
                        finally
                        {
                            if (!success)
                            {
                                DocState.DocWriter.SetAborting();
                            }
                        }
                        FieldState.Length_Renamed++;
                        FieldState.Position_Renamed++;
                    } while (stream.IncrementToken());
                }
                // trigger streams to perform end-of-stream operations
                stream.End();
                // TODO: maybe add some safety? then again, its already checked
                // when we come back around to the field...
                FieldState.Position_Renamed += posIncrAttribute.PositionIncrement;
                FieldState.Offset_Renamed += offsetAttribute.EndOffset();

                if (DocState.MaxTermPrefix != null)
                {
                    string msg = "Document contains at least one immense term in field=\"" + fieldInfo.Name + "\" (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + DocState.MaxTermPrefix + "...'";
                    if (DocState.InfoStream.IsEnabled("IW"))
                    {
                        DocState.InfoStream.Message("IW", "ERROR: " + msg);
                    }
                    DocState.MaxTermPrefix = null;
                    throw new System.ArgumentException(msg);
                }

                /* if success was false above there is an exception coming through and we won't get here.*/
                succeededInProcessingField = true;
            }
            finally
            {
                if (!succeededInProcessingField)
                {
                    // An analysis exception is propagating: close the stream
                    // without letting a secondary failure mask it.
                    IOUtils.CloseWhileHandlingException(stream);
                }
                else
                {
                    stream.Dispose();
                }
                if (!succeededInProcessingField && DocState.InfoStream.IsEnabled("DW"))
                {
                    DocState.InfoStream.Message("DW", "An exception was thrown while processing field " + fieldInfo.Name);
                }
            }

            FieldState.Offset_Renamed += analyzed ? DocState.Analyzer.GetOffsetGap(fieldInfo.Name) : 0;
            FieldState.Boost_Renamed *= field.GetBoost();
        }

        // LUCENE-2387: don't hang onto the field, so GC can
        // reclaim
        fields[i] = null;
    }

    Consumer.Finish();
    EndConsumer.Finish();
}
// offsetsAreCorrect also validates:
//   - graph offsets are correct (all tokens leaving from
//     pos X have the same startOffset; all tokens
//     arriving to pos Y have the same endOffset)
//   - offsets only move forwards (startOffset >=
//     lastStartOffset)
/// <summary>
/// Consumes <paramref name="ts"/> and asserts that the produced tokens match
/// the expected terms, offsets, types, position increments/lengths, and
/// keyword flags.  Each non-null expected array/value enables the
/// corresponding check; attributes are deliberately poisoned with bogus
/// values before every IncrementToken() call to verify the stream clears
/// its own state.  Also verifies the Reset()/IncrementToken()/End()/Dispose()
/// contract and (via <paramref name="offsetsAreCorrect"/>) graph-offset
/// consistency.
/// </summary>
public static void AssertTokenStreamContents(TokenStream ts, string[] output, int[] startOffsets, int[] endOffsets, string[] types, int[] posIncrements, int[] posLengths, int? finalOffset, int? finalPosInc, bool[] keywordAtts, bool offsetsAreCorrect)
{
    Assert.IsNotNull(output);
    var checkClearAtt = ts.AddAttribute<ICheckClearAttributesAttribute>();

    ICharTermAttribute termAtt = null;
    if (output.Length > 0)
    {
        Assert.IsTrue(ts.HasAttribute<ICharTermAttribute>(), "has no CharTermAttribute");
        termAtt = ts.GetAttribute<ICharTermAttribute>();
    }

    IOffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null)
    {
        Assert.IsTrue(ts.HasAttribute<IOffsetAttribute>(), "has no OffsetAttribute");
        offsetAtt = ts.GetAttribute<IOffsetAttribute>();
    }

    ITypeAttribute typeAtt = null;
    if (types != null)
    {
        Assert.IsTrue(ts.HasAttribute<ITypeAttribute>(), "has no TypeAttribute");
        typeAtt = ts.GetAttribute<ITypeAttribute>();
    }

    IPositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null || finalPosInc != null)
    {
        Assert.IsTrue(ts.HasAttribute<IPositionIncrementAttribute>(), "has no PositionIncrementAttribute");
        posIncrAtt = ts.GetAttribute<IPositionIncrementAttribute>();
    }

    IPositionLengthAttribute posLengthAtt = null;
    if (posLengths != null)
    {
        Assert.IsTrue(ts.HasAttribute<IPositionLengthAttribute>(), "has no PositionLengthAttribute");
        posLengthAtt = ts.GetAttribute<IPositionLengthAttribute>();
    }

    IKeywordAttribute keywordAtt = null;
    if (keywordAtts != null)
    {
        Assert.IsTrue(ts.HasAttribute<IKeywordAttribute>(), "has no KeywordAttribute");
        keywordAtt = ts.GetAttribute<IKeywordAttribute>();
    }

    // Maps position to the start/end offset:
    IDictionary<int?, int?> posToStartOffset = new Dictionary<int?, int?>();
    IDictionary<int?, int?> posToEndOffset = new Dictionary<int?, int?>();

    ts.Reset();
    int pos = -1;
    int lastStartOffset = 0;
    for (int i = 0; i < output.Length; i++)
    {
        // extra safety to enforce, that the state is not preserved and also assign bogus values
        ts.ClearAttributes();
        termAtt.SetEmpty().Append("bogusTerm");
        if (offsetAtt != null)
        {
            offsetAtt.SetOffset(14584724, 24683243);
        }
        if (typeAtt != null)
        {
            typeAtt.Type = "bogusType";
        }
        if (posIncrAtt != null)
        {
            posIncrAtt.PositionIncrement = 45987657;
        }
        if (posLengthAtt != null)
        {
            posLengthAtt.PositionLength = 45987653;
        }
        if (keywordAtt != null)
        {
            keywordAtt.Keyword = (i & 1) == 0;
        }

        // Reading this property has a side effect: it clears the
        // "clear was called" flag so the next IncrementToken() must set it again.
        bool reset = checkClearAtt.AndResetClearCalled; // reset it, because we called clearAttribute() before
        Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
        Assert.IsTrue(reset, "ClearAttributes() was not called correctly in TokenStream chain");

        Assert.AreEqual(output[i], termAtt.ToString(), "term " + i + ", output[i] = " + output[i] + ", termAtt = " + termAtt.ToString());
        if (startOffsets != null)
        {
            Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset(), "startOffset " + i);
        }
        if (endOffsets != null)
        {
            Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset(), "endOffset " + i);
        }
        if (types != null)
        {
            Assert.AreEqual(types[i], typeAtt.Type, "type " + i);
        }
        if (posIncrements != null)
        {
            Assert.AreEqual(posIncrements[i], posIncrAtt.PositionIncrement, "posIncrement " + i);
        }
        if (posLengths != null)
        {
            Assert.AreEqual(posLengths[i], posLengthAtt.PositionLength, "posLength " + i);
        }
        if (keywordAtts != null)
        {
            Assert.AreEqual(keywordAtts[i], keywordAtt.Keyword, "keywordAtt " + i);
        }

        // we can enforce some basic things about a few attributes even if the caller doesn't check:
        if (offsetAtt != null)
        {
            int startOffset = offsetAtt.StartOffset();
            int endOffset = offsetAtt.EndOffset();
            if (finalOffset != null)
            {
                Assert.IsTrue(startOffset <= (int)finalOffset, "startOffset must be <= finalOffset");
                Assert.IsTrue(endOffset <= (int)finalOffset, "endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + (int)finalOffset);
            }

            if (offsetsAreCorrect)
            {
                Assert.IsTrue(offsetAtt.StartOffset() >= lastStartOffset, "offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset);
                lastStartOffset = offsetAtt.StartOffset();
            }

            if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null)
            {
                // Validate offset consistency in the graph, ie
                // all tokens leaving from a certain pos have the
                // same startOffset, and all tokens arriving to a
                // certain pos have the same endOffset:
                int posInc = posIncrAtt.PositionIncrement;
                pos += posInc;

                int posLength = posLengthAtt.PositionLength;

                if (!posToStartOffset.ContainsKey(pos))
                {
                    // First time we've seen a token leaving from this position:
                    posToStartOffset[pos] = startOffset;
                    //System.out.println("  + s " + pos + " -> " + startOffset);
                }
                else
                {
                    // We've seen a token leaving from this position
                    // before; verify the startOffset is the same:
                    //System.out.println("  + vs " + pos + " -> " + startOffset);
                    Assert.AreEqual((int)posToStartOffset[pos], startOffset, "pos=" + pos + " posLen=" + posLength + " token=" + termAtt);
                }

                int endPos = pos + posLength;

                if (!posToEndOffset.ContainsKey(endPos))
                {
                    // First time we've seen a token arriving to this position:
                    posToEndOffset[endPos] = endOffset;
                    //System.out.println("  + e " + endPos + " -> " + endOffset);
                }
                else
                {
                    // We've seen a token arriving to this position
                    // before; verify the endOffset is the same:
                    //System.out.println("  + ve " + endPos + " -> " + endOffset);
                    Assert.AreEqual((int)posToEndOffset[endPos], endOffset, "pos=" + pos + " posLen=" + posLength + " token=" + termAtt);
                }
            }
        }
        if (posIncrAtt != null)
        {
            if (i == 0)
            {
                Assert.IsTrue(posIncrAtt.PositionIncrement >= 1, "first posIncrement must be >= 1");
            }
            else
            {
                Assert.IsTrue(posIncrAtt.PositionIncrement >= 0, "posIncrement must be >= 0");
            }
        }
        if (posLengthAtt != null)
        {
            Assert.IsTrue(posLengthAtt.PositionLength >= 1, "posLength must be >= 1");
        }
    }

    if (ts.IncrementToken())
    {
        Assert.Fail("TokenStream has more tokens than expected (expected count=" + output.Length + "); extra token=" + termAtt);
    }

    // repeat our extra safety checks for End()
    ts.ClearAttributes();
    if (termAtt != null)
    {
        termAtt.SetEmpty().Append("bogusTerm");
    }
    if (offsetAtt != null)
    {
        offsetAtt.SetOffset(14584724, 24683243);
    }
    if (typeAtt != null)
    {
        typeAtt.Type = "bogusType";
    }
    if (posIncrAtt != null)
    {
        posIncrAtt.PositionIncrement = 45987657;
    }
    if (posLengthAtt != null)
    {
        posLengthAtt.PositionLength = 45987653;
    }

    // Property read is for its side effect (clearing the flag before End()).
    var reset_ = checkClearAtt.AndResetClearCalled; // reset it, because we called clearAttribute() before

    ts.End();
    Assert.IsTrue(checkClearAtt.AndResetClearCalled, "super.End()/ClearAttributes() was not called correctly in End()");

    if (finalOffset != null)
    {
        Assert.AreEqual((int)finalOffset, offsetAtt.EndOffset(), "finalOffset");
    }
    if (offsetAtt != null)
    {
        Assert.IsTrue(offsetAtt.EndOffset() >= 0, "finalOffset must be >= 0");
    }
    if (finalPosInc != null)
    {
        Assert.AreEqual((int)finalPosInc, posIncrAtt.PositionIncrement, "finalPosInc");
    }

    ts.Dispose();
}
/*
 * Need to worry about multiple scenarios:
 *  - need to go for the longest match
 *    a b => foo      #shouldn't match if "a b" is followed by "c d"
 *    a b c d => bar
 *  - need to backtrack - retry matches for tokens already read
 *    a b c d => foo
 *      b c => bar
 *    If the input stream is "a b c x", one will consume "a b c d"
 *    trying to match the first rule... all but "a" should be
 *    pushed back so a match may be made on "b c".
 *  - don't try and match generated tokens (thus need separate queue)
 *    matching is not recursive.
 *  - handle optional generation of original tokens in all these cases,
 *    merging token streams to preserve token positions.
 *  - preserve original positionIncrement of first matched token
 */
/// <summary>
/// Produces the next token: either a previously generated synonym token, a
/// pass-through of an unmatched input token, or — after a synonym rule
/// matches — the first of a freshly generated, position-merged sequence of
/// synonym (and optionally original) tokens.
/// </summary>
/// <returns> false at end of stream, otherwise true with this stream's attributes populated </returns>
public override bool IncrementToken()
{
    while (true)
    {
        // if there are any generated tokens, return them... don't try any
        // matches against them, as we specifically don't want recursion.
        if (replacement != null && replacement.MoveNext())
        {
            Copy(this, replacement.Current);
            return true;
        }

        // common case fast-path of first token not matching anything
        AttributeSource firstTok = NextTok();
        if (firstTok == null)
        {
            return false;
        }
        var termAtt = firstTok.AddAttribute<ICharTermAttribute>();
        SlowSynonymMap result = map.submap != null ? map.submap.Get(termAtt.Buffer(), 0, termAtt.Length) : null;
        if (result == null)
        {
            Copy(this, firstTok);
            return true;
        }

        // fast-path failed, clone ourselves if needed
        if (firstTok == this)
        {
            firstTok = CloneAttributes();
        }
        // OK, we matched a token, so find the longest match.
        matched = new LinkedList<AttributeSource>();

        result = Match(result);

        if (result == null)
        {
            // no match, simply return the first token read.
            Copy(this, firstTok);
            return true;
        }

        // reuse, or create new one each time?
        List<AttributeSource> generated = new List<AttributeSource>(result.synonyms.Length + matched.Count + 1);

        //
        // there was a match... let's generate the new tokens, merging
        // in the matched tokens (position increments need adjusting)
        //
        AttributeSource lastTok = matched.Count == 0 ? firstTok : matched.Last.Value;
        bool includeOrig = result.IncludeOrig;

        AttributeSource origTok = includeOrig ? firstTok : null;
        IPositionIncrementAttribute firstPosIncAtt = firstTok.AddAttribute<IPositionIncrementAttribute>();
        int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
        int repPos = 0; // curr position in replacement token stream
        int pos = 0; // current position in merged token stream

        for (int i = 0; i < result.synonyms.Length; i++)
        {
            Token repTok = result.synonyms[i];
            AttributeSource newTok = firstTok.CloneAttributes();
            ICharTermAttribute newTermAtt = newTok.AddAttribute<ICharTermAttribute>();
            IOffsetAttribute newOffsetAtt = newTok.AddAttribute<IOffsetAttribute>();
            IPositionIncrementAttribute newPosIncAtt = newTok.AddAttribute<IPositionIncrementAttribute>();

            // The synonym spans the whole matched region: keep the start
            // offset of the first matched token, extend to the last one's end.
            IOffsetAttribute lastOffsetAtt = lastTok.AddAttribute<IOffsetAttribute>();
            newOffsetAtt.SetOffset(newOffsetAtt.StartOffset(), lastOffsetAtt.EndOffset());
            newTermAtt.CopyBuffer(repTok.Buffer(), 0, repTok.Length);
            repPos += repTok.PositionIncrement;
            if (i == 0) // make position of first token equal to original
            {
                repPos = origPos;
            }

            // if necessary, insert original tokens and adjust position increment
            while (origTok != null && origPos <= repPos)
            {
                IPositionIncrementAttribute origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                origPosInc.PositionIncrement = origPos - pos;
                generated.Add(origTok);
                pos += origPosInc.PositionIncrement;
                if (matched.Count == 0)
                {
                    origTok = null;
                }
                else
                {
                    origTok = matched.First.Value;
                    matched.RemoveFirst();
                }
                if (origTok != null)
                {
                    origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                    origPos += origPosInc.PositionIncrement;
                }
            }

            newPosIncAtt.PositionIncrement = repPos - pos;
            generated.Add(newTok);
            pos += newPosIncAtt.PositionIncrement;
        }

        // finish up any leftover original tokens
        while (origTok != null)
        {
            IPositionIncrementAttribute origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
            origPosInc.PositionIncrement = origPos - pos;
            generated.Add(origTok);
            pos += origPosInc.PositionIncrement;
            if (matched.Count == 0)
            {
                origTok = null;
            }
            else
            {
                // FIX: this branch previously re-tested matched.Count > 0 in a
                // ternary even though the enclosing else guarantees it; the dead
                // condition is removed to match the identical stanza above.
                origTok = matched.First.Value;
                matched.RemoveFirst();
            }
            if (origTok != null)
            {
                origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                origPos += origPosInc.PositionIncrement;
            }
        }

        // what if we replaced a longer sequence with a shorter one?
        // a/0 b/5 => foo/0
        // should I re-create the gap on the next buffered token?
        replacement = generated.GetEnumerator();
        // Now return to the top of the loop to read and return the first
        // generated token.. The reason this is done is that we may have generated
        // nothing at all, and may need to continue with more matching logic.
    }
}
/// <summary>
/// Returns the next token in the stream, or null at EOS.
/// Emits all n-grams of the current input token, from <c>minGram</c> to
/// <c>maxGram</c> code points; when the buffered term is exhausted, pulls the
/// next input token.  Behavior differs by match version: 4.4+ iterates by
/// code point and preserves the original token's offsets, the legacy path
/// iterates by char index and slides offsets per gram when they are legal.
/// </summary>
public override bool IncrementToken()
{
    while (true)
    {
        if (curTermBuffer == null)
        {
            // Need a new input token to gram-ify.
            if (!input.IncrementToken())
            {
                return false;
            }
            else
            {
                // Snapshot the term and its attributes; the attribute source
                // is reused, so the buffer must be cloned.
                curTermBuffer = (char[])termAtt.Buffer().Clone();
                curTermLength = termAtt.Length;
                curCodePointCount = charUtils.CodePointCount(termAtt.ToString());
                curGramSize = minGram;
                curPos = 0;
                curPosInc = posIncAtt.PositionIncrement;
                curPosLen = posLenAtt.PositionLength;
                tokStart = offsetAtt.StartOffset();
                tokEnd = offsetAtt.EndOffset();
                // if length by start + end offsets doesn't match the term text then assume
                // this is a synonym and don't adjust the offsets.
                hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
            }
        }
#pragma warning disable 612, 618
        if (version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
        {
            // 4.4+ semantics: all gram sizes at one position, then advance.
            if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount)
            {
                ++curPos;
                curGramSize = minGram;
            }
            if ((curPos + curGramSize) <= curCodePointCount)
            {
                ClearAttributes();
                // Translate code-point position/size into char indices.
                int start = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                termAtt.CopyBuffer(curTermBuffer, start, end - start);
                // Only the first gram of a token carries the position increment.
                posIncAtt.PositionIncrement = curPosInc;
                curPosInc = 0;
                posLenAtt.PositionLength = curPosLen;
                offsetAtt.SetOffset(tokStart, tokEnd);
                curGramSize++;
                return true;
            }
        }
        else
        {
            // Legacy (pre-4.4) semantics: all positions for one gram size,
            // then grow the gram size; offsets slide unless deemed illegal.
            while (curGramSize <= maxGram)
            {
                while (curPos + curGramSize <= curTermLength) // while there is input
                {
                    ClearAttributes();
                    termAtt.CopyBuffer(curTermBuffer, curPos, curGramSize);
                    if (hasIllegalOffsets)
                    {
                        offsetAtt.SetOffset(tokStart, tokEnd);
                    }
                    else
                    {
                        offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                    }
                    curPos++;
                    return true;
                }
                curGramSize++; // increase n-gram size
                curPos = 0;
            }
        }
        // Current term exhausted; loop around to fetch the next input token.
        curTermBuffer = null;
    }
}
/// <summary>
/// Asserts that two <c>Fields</c> instances are equivalent: same field names
/// in the same order, same terms per field with equal total term frequencies,
/// and — when positions are available — equal per-document frequencies,
/// positions, and (if present on both) offsets.  Doc IDs themselves are
/// deliberately NOT compared, as the two sources may assign them differently.
/// </summary>
public static void VerifyEquals(Fields d1, Fields d2)
{
    if (d1 == null)
    {
        // A null Fields is considered equal to null or an empty Fields.
        Assert.IsTrue(d2 == null || d2.Size == 0);
        return;
    }
    Assert.IsTrue(d2 != null);

    // Walk both field enumerations in lock-step.
    IEnumerator<string> fieldsEnum2 = d2.GetEnumerator();

    foreach (string field1 in d1)
    {
        fieldsEnum2.MoveNext();
        string field2 = fieldsEnum2.Current;
        Assert.AreEqual(field1, field2);

        Terms terms1 = d1.Terms(field1);
        Assert.IsNotNull(terms1);
        TermsEnum termsEnum1 = terms1.Iterator(null);

        Terms terms2 = d2.Terms(field2);
        Assert.IsNotNull(terms2);
        TermsEnum termsEnum2 = terms2.Iterator(null);

        DocsAndPositionsEnum dpEnum1 = null;
        DocsAndPositionsEnum dpEnum2 = null;
        DocsEnum dEnum1 = null;
        DocsEnum dEnum2 = null;

        BytesRef term1;
        while ((term1 = termsEnum1.Next()) != null)
        {
            BytesRef term2 = termsEnum2.Next();
            Assert.AreEqual(term1, term2);
            Assert.AreEqual(termsEnum1.TotalTermFreq(), termsEnum2.TotalTermFreq());

            dpEnum1 = termsEnum1.DocsAndPositions(null, dpEnum1);
            dpEnum2 = termsEnum2.DocsAndPositions(null, dpEnum2);
            if (dpEnum1 != null)
            {
                // Positions are available: compare freq, positions, offsets.
                Assert.IsNotNull(dpEnum2);
                int docID1 = dpEnum1.NextDoc();
                dpEnum2.NextDoc();
                // docIDs are not supposed to be equal
                //int docID2 = dpEnum2.NextDoc();
                //Assert.AreEqual(docID1, docID2);
                Assert.IsTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);

                int freq1 = dpEnum1.Freq();
                int freq2 = dpEnum2.Freq();
                Assert.AreEqual(freq1, freq2);
                IOffsetAttribute offsetAtt1 = dpEnum1.Attributes().HasAttribute<IOffsetAttribute>() ? dpEnum1.Attributes().GetAttribute<IOffsetAttribute>() : null;
                IOffsetAttribute offsetAtt2 = dpEnum2.Attributes().HasAttribute<IOffsetAttribute>() ? dpEnum2.Attributes().GetAttribute<IOffsetAttribute>() : null;

                // Either both sides expose offsets, or neither does.
                if (offsetAtt1 != null)
                {
                    Assert.IsNotNull(offsetAtt2);
                }
                else
                {
                    Assert.IsNull(offsetAtt2);
                }

                for (int posUpto = 0; posUpto < freq1; posUpto++)
                {
                    int pos1 = dpEnum1.NextPosition();
                    int pos2 = dpEnum2.NextPosition();
                    Assert.AreEqual(pos1, pos2);
                    if (offsetAtt1 != null)
                    {
                        Assert.AreEqual(offsetAtt1.StartOffset(), offsetAtt2.StartOffset());
                        Assert.AreEqual(offsetAtt1.EndOffset(), offsetAtt2.EndOffset());
                    }
                }
                // Each term is expected to occur in exactly one document here.
                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum1.NextDoc());
                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum2.NextDoc());
            }
            else
            {
                // No positions: fall back to comparing frequencies only.
                dEnum1 = TestUtil.Docs(Random(), termsEnum1, null, dEnum1, DocsEnum.FLAG_FREQS);
                dEnum2 = TestUtil.Docs(Random(), termsEnum2, null, dEnum2, DocsEnum.FLAG_FREQS);
                Assert.IsNotNull(dEnum1);
                Assert.IsNotNull(dEnum2);
                int docID1 = dEnum1.NextDoc();
                dEnum2.NextDoc();
                // docIDs are not supposed to be equal
                //int docID2 = dEnum2.NextDoc();
                //Assert.AreEqual(docID1, docID2);
                Assert.IsTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);
                int freq1 = dEnum1.Freq();
                int freq2 = dEnum2.Freq();
                Assert.AreEqual(freq1, freq2);
                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dEnum1.NextDoc());
                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dEnum2.NextDoc());
            }
        }

        // d2 must not have extra terms in this field.
        Assert.IsNull(termsEnum2.Next());
    }

    // d2 must not have extra fields.
    Assert.IsFalse(fieldsEnum2.MoveNext());
}
/// <summary>
/// Iterates over the given token stream and adds the resulting terms to the index;
/// Equivalent to adding a tokenized, indexed, termVectorStored, unstored,
/// Lucene <see cref="Documents.Field"/>.
/// Finally closes the token stream. Note that untokenized keywords can be added with this method via
/// <see cref="KeywordTokenStream{T}(ICollection{T})"/>, the Lucene <c>KeywordTokenizer</c> or similar utilities.
/// </summary>
/// <param name="fieldName"> a name to be associated with the text </param>
/// <param name="stream"> the token stream to retrieve tokens from. </param>
/// <param name="boost"> the boost factor for hits for this field </param>
/// <param name="positionIncrementGap"> the position increment gap if fields with the same name are added more than once </param>
/// <param name="offsetGap"> the offset gap if fields with the same name are added more than once </param>
/// <seealso cref="Documents.Field.Boost(float)"/>
public virtual void AddField(string fieldName, TokenStream stream, float boost, int positionIncrementGap, int offsetGap)
{
    // NOTE(review): the previous version wrapped this body in
    // catch (Exception) { throw; } and the Dispose call in
    // catch (IOException) { throw; } — both are no-op rethrows
    // (a bare `throw;` preserves the stack trace) and have been removed.
    try
    {
        if (fieldName == null)
        {
            throw new System.ArgumentException("fieldName must not be null");
        }
        if (stream == null)
        {
            throw new System.ArgumentException("token stream must not be null");
        }
        if (boost <= 0.0f)
        {
            throw new System.ArgumentException("boost factor must be greater than 0.0");
        }

        int numTokens = 0;
        int numOverlapTokens = 0;
        int pos = -1;
        BytesRefHash terms;
        SliceByteStartArray sliceArray;
        Info info = null;
        long sumTotalTermFreq = 0;
        int offset = 0;

        if (fields.TryGetValue(fieldName, out info))
        {
            // Field already exists: continue from its accumulated state,
            // applying the position/offset gaps between instances.
            numTokens = info.numTokens;
            numOverlapTokens = info.numOverlapTokens;
            pos = info.lastPosition + positionIncrementGap;
            offset = info.lastOffset + offsetGap;
            terms = info.terms;
            boost *= info.boost;
            sliceArray = info.sliceArray;
            sumTotalTermFreq = info.sumTotalTermFreq;
        }
        else
        {
            sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY);
            terms = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray);
        }

        if (!fieldInfos.ContainsKey(fieldName))
        {
            fieldInfos[fieldName] = new FieldInfo(fieldName, true, fieldInfos.Count, false, false, false, this.storeOffsets ? FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, null, null, null);
        }

        ITermToBytesRefAttribute termAtt = stream.GetAttribute<ITermToBytesRefAttribute>();
        IPositionIncrementAttribute posIncrAttribute = stream.AddAttribute<IPositionIncrementAttribute>();
        IOffsetAttribute offsetAtt = stream.AddAttribute<IOffsetAttribute>();
        BytesRef @ref = termAtt.BytesRef;

        stream.Reset();

        while (stream.IncrementToken())
        {
            termAtt.FillBytesRef();
            numTokens++;
            int posIncr = posIncrAttribute.PositionIncrement;
            if (posIncr == 0)
            {
                numOverlapTokens++;
            }
            pos += posIncr;
            int ord = terms.Add(@ref);
            if (ord < 0)
            {
                // Term already in the hash: negative return encodes (-ord - 1);
                // continue appending to its existing postings slice.
                ord = (-ord) - 1;
                postingsWriter.Reset(sliceArray.end[ord]);
            }
            else
            {
                sliceArray.start[ord] = postingsWriter.StartNewSlice();
            }
            sliceArray.freq[ord]++;
            sumTotalTermFreq++;

            // The position is always written; offsets follow only when stored.
            // (Previously WriteInt(pos) was duplicated in both branches.)
            postingsWriter.WriteInt(pos);
            if (storeOffsets)
            {
                postingsWriter.WriteInt(offsetAtt.StartOffset() + offset);
                postingsWriter.WriteInt(offsetAtt.EndOffset() + offset);
            }
            sliceArray.end[ord] = postingsWriter.CurrentOffset;
        }
        stream.End();

        // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
        if (numTokens > 0)
        {
            fields[fieldName] = new Info(terms, sliceArray, numTokens, numOverlapTokens, boost, pos, offsetAtt.EndOffset() + offset, sumTotalTermFreq);
            sortedFields = null; // invalidate sorted view, if any
        }
    }
    finally
    {
        // Always release the stream, even when analysis throws.
        if (stream != null)
        {
            stream.Dispose();
        }
    }
}
public override bool IncrementToken() { while (true) { if (curTermBuffer == null) { if (!input.IncrementToken()) { return(false); } else { curTermBuffer = (char[])termAtt.Buffer().Clone(); curTermLength = termAtt.Length; curCodePointCount = charUtils.CodePointCount(termAtt.ToString()); curGramSize = minGram; tokStart = offsetAtt.StartOffset(); tokEnd = offsetAtt.EndOffset(); if (version.OnOrAfter(LuceneVersion.LUCENE_44)) { // Never update offsets updateOffsets = false; } else { // if length by start + end offsets doesn't match the term text then assume // this is a synonym and don't adjust the offsets. updateOffsets = (tokStart + curTermLength) == tokEnd; } savePosIncr += posIncrAtt.PositionIncrement; savePosLen = posLenAtt.PositionLength; } } if (curGramSize <= maxGram) // if we have hit the end of our n-gram size range, quit { if (curGramSize <= curCodePointCount) // if the remaining input is too short, we can't generate any n-grams { // grab gramSize chars from front or back int start = side == Side.FRONT ? 0 : charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize); int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize); ClearAttributes(); if (updateOffsets) { offsetAtt.SetOffset(tokStart + start, tokStart + end); } else { offsetAtt.SetOffset(tokStart, tokEnd); } // first ngram gets increment, others don't if (curGramSize == minGram) { posIncrAtt.PositionIncrement = savePosIncr; savePosIncr = 0; } else { posIncrAtt.PositionIncrement = 0; } posLenAtt.PositionLength = savePosLen; termAtt.CopyBuffer(curTermBuffer, start, end - start); curGramSize++; return(true); } } curTermBuffer = null; } }
/*
 * much of this complexity revolves around handling the special case of a
 * "lone cjk character" where cjktokenizer would output a unigram. this
 * is also the only time we ever have to captureState.
 */
/// <summary>
/// Forms bigrams from buffered CJK codepoints, optionally interleaving unigrams
/// (when <c>outputUnigrams</c> is set). Non-CJK tokens pass through unchanged.
/// One token is emitted per call; the codepoint buffer, <c>ngramState</c> and
/// <c>loneState</c> fields carry state across calls.
/// </summary>
public override bool IncrementToken()
{
    while (true)
    {
        if (HasBufferedBigram)
        {
            // case 1: we have multiple remaining codepoints buffered,
            // so we can emit a bigram here.
            if (outputUnigrams)
            {
                // when also outputting unigrams, we output the unigram first,
                // then rewind back to revisit the bigram.
                // so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
                // the logic in hasBufferedUnigram ensures we output the C,
                // even though it did actually have adjacent CJK characters.
                if (ngramState)
                {
                    FlushBigram();
                }
                else
                {
                    FlushUnigram();
                    index--; // rewind so the next call revisits this codepoint as a bigram start
                }
                ngramState = !ngramState; // alternate unigram/bigram emission
            }
            else
            {
                FlushBigram();
            }
            return(true);
        }
        else if (DoNext())
        {
            // case 2: look at the token type. should we form any n-grams?
            string type = typeAtt.Type;
            if (type == doHan || type == doHiragana || type == doKatakana || type == doHangul)
            {
                // acceptable CJK type: we form n-grams from these.
                // as long as the offsets are aligned, we just add these to our current buffer.
                // otherwise, we clear the buffer and start over.
                if (offsetAtt.StartOffset() != lastEndOffset) // unaligned, clear queue
                {
                    if (HasBufferedUnigram)
                    {
                        // we have a buffered unigram, and we peeked ahead to see if we could form
                        // a bigram, but we can't, because the offsets are unaligned. capture the state
                        // of this peeked data to be revisited next time thru the loop, and dump our unigram.
                        loneState = CaptureState();
                        FlushUnigram();
                        return(true);
                    }
                    index = 0;
                    bufferLen = 0;
                }
                Refill();
            }
            else
            {
                // not a CJK type: we just return these as-is.
                if (HasBufferedUnigram)
                {
                    // we have a buffered unigram, and we peeked ahead to see if we could form
                    // a bigram, but we can't, because its not a CJK type. capture the state
                    // of this peeked data to be revisited next time thru the loop, and dump our unigram.
                    loneState = CaptureState();
                    FlushUnigram();
                    return(true);
                }
                return(true);
            }
        }
        else
        {
            // case 3: we have only zero or 1 codepoints buffered,
            // so not enough to form a bigram. But, we also have no
            // more input. So if we have a buffered codepoint, emit
            // a unigram, otherwise, its end of stream.
            if (HasBufferedUnigram)
            {
                FlushUnigram(); // flush our remaining unigram
                return(true);
            }
            return(false);
        }
    }
}
// we only check a few core attributes here. // TODO: test other things public virtual void assertEquals(string s, TokenStream left, TokenStream right) { left.Reset(); right.Reset(); ICharTermAttribute leftTerm = left.AddAttribute <ICharTermAttribute>(); ICharTermAttribute rightTerm = right.AddAttribute <ICharTermAttribute>(); IOffsetAttribute leftOffset = left.AddAttribute <IOffsetAttribute>(); IOffsetAttribute rightOffset = right.AddAttribute <IOffsetAttribute>(); IPositionIncrementAttribute leftPos = left.AddAttribute <IPositionIncrementAttribute>(); IPositionIncrementAttribute rightPos = right.AddAttribute <IPositionIncrementAttribute>(); while (left.IncrementToken()) { assertTrue("wrong number of tokens for input: " + s, right.IncrementToken()); assertEquals("wrong term text for input: " + s, leftTerm.ToString(), rightTerm.ToString()); assertEquals("wrong position for input: " + s, leftPos.PositionIncrement, rightPos.PositionIncrement); assertEquals("wrong start offset for input: " + s, leftOffset.StartOffset(), rightOffset.StartOffset()); assertEquals("wrong end offset for input: " + s, leftOffset.EndOffset(), rightOffset.EndOffset()); } ; assertFalse("wrong number of tokens for input: " + s, right.IncrementToken()); left.End(); right.End(); assertEquals("wrong final offset for input: " + s, leftOffset.EndOffset(), rightOffset.EndOffset()); left.Dispose(); right.Dispose(); }
public virtual void ToDot() { @in.Reset(); WriteHeader(); // TODO: is there some way to tell dot that it should // make the "main path" a straight line and have the // non-sausage arcs not affect node placement... int pos = -1; int lastEndPos = -1; while (@in.IncrementToken()) { bool isFirst = pos == -1; int posInc = PosIncAtt.PositionIncrement; if (isFirst && posInc == 0) { // TODO: hmm are TS's still allowed to do this...? Console.Error.WriteLine("WARNING: first posInc was 0; correcting to 1"); posInc = 1; } if (posInc > 0) { // New node: pos += posInc; WriteNode(pos, Convert.ToString(pos)); } if (posInc > 1) { // Gap! WriteArc(lastEndPos, pos, null, "dotted"); } if (isFirst) { WriteNode(-1, null); WriteArc(-1, pos, null, null); } string arcLabel = TermAtt.ToString(); if (OffsetAtt != null) { int startOffset = OffsetAtt.StartOffset(); int endOffset = OffsetAtt.EndOffset(); //System.out.println("start=" + startOffset + " end=" + endOffset + " len=" + inputText.length()); if (InputText != null) { arcLabel += " / " + InputText.Substring(startOffset, endOffset - startOffset); } else { arcLabel += " / " + startOffset + "-" + endOffset; } } WriteArc(pos, pos + PosLengthAtt.PositionLength, arcLabel, null); lastEndPos = pos + PosLengthAtt.PositionLength; } @in.End(); if (lastEndPos != -1) { // TODO: should we output any final text (from end // offsets) on this arc...? WriteNode(-2, null); WriteArc(lastEndPos, -2, null, null); } WriteTrailer(); }
public override bool IncrementToken() { if (hasMoreTokensInClone) { int start = breaker.Current(); int end = breaker.Next(); if (end != BreakIterator.DONE) { clonedToken.CopyTo(this); termAtt.CopyBuffer(clonedTermAtt.Buffer(), start, end - start); if (hasIllegalOffsets) { offsetAtt.SetOffset(clonedOffsetAtt.StartOffset(), clonedOffsetAtt.EndOffset()); } else { offsetAtt.SetOffset(clonedOffsetAtt.StartOffset() + start, clonedOffsetAtt.StartOffset() + end); } if (handlePosIncr) { posAtt.PositionIncrement = 1; } return(true); } hasMoreTokensInClone = false; } if (!input.IncrementToken()) { return(false); } if (termAtt.Length == 0 || !Regex.IsMatch(termAtt.ToString().Substring(0, 1), @"\p{IsThai}")) { return(true); } hasMoreTokensInClone = true; // if length by start + end offsets doesn't match the term text then assume // this is a synonym and don't adjust the offsets. hasIllegalOffsets = offsetAtt.EndOffset() - offsetAtt.StartOffset() != termAtt.Length; // we lazy init the cloned token, as in ctor not all attributes may be added if (clonedToken == null) { clonedToken = CloneAttributes(); clonedTermAtt = clonedToken.GetAttribute <ICharTermAttribute>(); clonedOffsetAtt = clonedToken.GetAttribute <IOffsetAttribute>(); } else { this.CopyTo(clonedToken); } // reinit CharacterIterator charIterator.SetText(clonedTermAtt.Buffer(), 0, clonedTermAtt.Length); breaker.SetText(new string(charIterator.Text, charIterator.Start, charIterator.Length)); int end2 = breaker.Next(); if (end2 != BreakIterator.DONE) { termAtt.Length = end2; if (hasIllegalOffsets) { offsetAtt.SetOffset(clonedOffsetAtt.StartOffset(), clonedOffsetAtt.EndOffset()); } else { offsetAtt.SetOffset(clonedOffsetAtt.StartOffset(), clonedOffsetAtt.StartOffset() + end2); } // position increment keeps as it is for first token return(true); } return(false); }