// we only check a few core attributes here.
// TODO: test other things
public virtual void assertEquals(string s, TokenStream left, TokenStream right) {
    left.Reset();
    right.Reset();
    ICharTermAttribute leftTerm = left.AddAttribute<ICharTermAttribute>();
    ICharTermAttribute rightTerm = right.AddAttribute<ICharTermAttribute>();
    IOffsetAttribute leftOffset = left.AddAttribute<IOffsetAttribute>();
    IOffsetAttribute rightOffset = right.AddAttribute<IOffsetAttribute>();
    IPositionIncrementAttribute leftPos = left.AddAttribute<IPositionIncrementAttribute>();
    IPositionIncrementAttribute rightPos = right.AddAttribute<IPositionIncrementAttribute>();
    while (left.IncrementToken()) {
        assertTrue("wrong number of tokens for input: " + s, right.IncrementToken());
        assertEquals("wrong term text for input: " + s, leftTerm.ToString(), rightTerm.ToString());
        assertEquals("wrong position for input: " + s, leftPos.PositionIncrement, rightPos.PositionIncrement);
        assertEquals("wrong start offset for input: " + s, leftOffset.StartOffset(), rightOffset.StartOffset());
        assertEquals("wrong end offset for input: " + s, leftOffset.EndOffset(), rightOffset.EndOffset());
    }
    assertFalse("wrong number of tokens for input: " + s, right.IncrementToken());
    left.End();
    right.End();
    assertEquals("wrong final offset for input: " + s, leftOffset.EndOffset(), rightOffset.EndOffset());
    left.Dispose();
    right.Dispose();
}
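// Hypothetical usage sketch (not in the original source; analyzer names are
// placeholders): compare two token streams produced from the same input.
//
//   string s = "fast wi fi network is down";
//   assertEquals(s,
//       analyzerA.TokenStream("field", new StringReader(s)),
//       analyzerB.TokenStream("field", new StringReader(s)));
//
// The assertion messages include the input string, so a mismatch in term
// text, position increment, or offsets points back to the offending input.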
public override bool IncrementToken() {
    if (endState != null) {
        return false;
    }
    if (!input.IncrementToken()) {
        return false;
    }
    int skippedPositions = 0;
    while (true) {
        if (stopWords.Contains(termAtt.Buffer(), 0, termAtt.Length)) {
            int posInc = posIncAtt.PositionIncrement;
            int endOffset = offsetAtt.EndOffset();
            // This token may be a stopword, if it's not end:
            State sav = CaptureState();
            if (input.IncrementToken()) {
                // It was a stopword; skip it
                skippedPositions += posInc;
            } else {
                ClearAttributes();
                input.End();
                endState = CaptureState();
                int finalEndOffset = offsetAtt.EndOffset();
                Debug.Assert(finalEndOffset >= endOffset);
                if (finalEndOffset > endOffset) {
                    // OK there was a token separator after the
                    // stopword, so it was a stopword
                    return false;
                } else {
                    // No token separator after final token that
                    // looked like a stop-word; don't filter it:
                    RestoreState(sav);
                    posIncAtt.PositionIncrement = skippedPositions + posIncAtt.PositionIncrement;
                    keywordAtt.Keyword = true;
                    return true;
                }
            }
        } else {
            // Not a stopword; return the current token:
            posIncAtt.PositionIncrement = skippedPositions + posIncAtt.PositionIncrement;
            return true;
        }
    }
}
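// Illustrative behavior (assumed from the logic above, not stated in the
// source): with "to" as a stopword, input "go to " drops the trailing "to"
// because a token separator follows it, while "go to" (no trailing separator)
// keeps "to" and marks it as a keyword, since the user may still be typing a
// longer word such as "tokyo".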
protected internal override object Highlight(string text, IEnumerable<string> matchedTokens, string prefixToken) {
    TokenStream ts = queryAnalyzer.TokenStream("text", new StringReader(text));
    try {
        ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
        IOffsetAttribute offsetAtt = ts.AddAttribute<IOffsetAttribute>();
        ts.Reset();
        List<LookupHighlightFragment> fragments = new List<LookupHighlightFragment>();
        int upto = 0;
        while (ts.IncrementToken()) {
            string token = termAtt.ToString();
            int startOffset = offsetAtt.StartOffset();
            int endOffset = offsetAtt.EndOffset();
            if (upto < startOffset) {
                fragments.Add(new LookupHighlightFragment(text.Substring(upto, startOffset - upto), false));
                upto = startOffset;
            } else if (upto > startOffset) {
                continue;
            }
            if (matchedTokens.Contains(token)) {
                // Token matches.
                fragments.Add(new LookupHighlightFragment(text.Substring(startOffset, endOffset - startOffset), true));
                upto = endOffset;
            } else if (prefixToken != null && token.StartsWith(prefixToken, StringComparison.InvariantCulture)) {
                fragments.Add(new LookupHighlightFragment(text.Substring(startOffset, prefixToken.Length), true));
                if (prefixToken.Length < token.Length) {
                    fragments.Add(new LookupHighlightFragment(text.Substring(startOffset + prefixToken.Length, token.Length - prefixToken.Length), false));
                }
                upto = endOffset;
            }
        }
        ts.End();
        int endOffset2 = offsetAtt.EndOffset();
        if (upto < endOffset2) {
            fragments.Add(new LookupHighlightFragment(text.Substring(upto), false));
        }
        return fragments;
    } finally {
        IOUtils.CloseWhileHandlingException(ts);
    }
}
public override bool IncrementToken() {
    if (!input.IncrementToken()) {
        return false;
    }
    char[] termBuffer = termAtt.Buffer();
    int len = termAtt.Length;
    // TODO: Is this the right behavior or should we return false? Currently, " " returns true, so I think this should
    // also return true
    if (len == 0) {
        return true;
    }
    int start = 0;
    int end = 0;
    int endOff = 0;
    // eat the first characters
    for (start = 0; start < len && char.IsWhiteSpace(termBuffer[start]); start++) {
    }
    // eat the end characters
    for (end = len; end >= start && char.IsWhiteSpace(termBuffer[end - 1]); end--) {
        endOff++;
    }
    if (start > 0 || end < len) {
        if (start < end) {
            termAtt.CopyBuffer(termBuffer, start, (end - start));
        } else {
            termAtt.SetEmpty();
        }
        if (updateOffsets && len == offsetAtt.EndOffset() - offsetAtt.StartOffset()) {
            int newStart = offsetAtt.StartOffset() + start;
            int newEnd = offsetAtt.EndOffset() - (start < end ? endOff : 0);
            offsetAtt.SetOffset(newStart, newEnd);
        }
    }
    return true;
}
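// Worked example (assumed, based on the trimming logic above): for the token
// "  foo " with offsets [10, 16), start advances to 2 and end stops at 5, so
// the term becomes "foo"; with updateOffsets enabled the offsets are adjusted
// to [12, 15). An all-whitespace token is emptied rather than dropped.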
public virtual void TestSupplementaryCharacters() {
    string s = TestUtil.RandomUnicodeString(Random(), 10);
    int codePointCount = Character.CodePointCount(s, 0, s.Length);
    int minGram = TestUtil.NextInt(Random(), 1, 3);
    int maxGram = TestUtil.NextInt(Random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new NGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
    ICharTermAttribute termAtt = tk.AddAttribute<ICharTermAttribute>();
    IOffsetAttribute offsetAtt = tk.AddAttribute<IOffsetAttribute>();
    tk.Reset();
    for (int start = 0; start < codePointCount; ++start) {
        for (int end = start + minGram; end <= Math.Min(codePointCount, start + maxGram); ++end) {
            assertTrue(tk.IncrementToken());
            assertEquals(0, offsetAtt.StartOffset());
            assertEquals(s.Length, offsetAtt.EndOffset());
            int startIndex = Character.OffsetByCodePoints(s, 0, start);
            int endIndex = Character.OffsetByCodePoints(s, 0, end);
            assertEquals(s.Substring(startIndex, endIndex - startIndex), termAtt.ToString());
        }
    }
    assertFalse(tk.IncrementToken());
}
public override bool IncrementToken() {
    if (inPhrase) {
        inPhrase = false;
        termAtt.SetEmpty().Append("phrase2");
        offsetAtt.SetOffset(savedStart, savedEnd);
        return true;
    } else {
        while (input.IncrementToken()) {
            if (termAtt.ToString().Equals("phrase")) {
                inPhrase = true;
                savedStart = offsetAtt.StartOffset();
                savedEnd = offsetAtt.EndOffset();
                termAtt.SetEmpty().Append("phrase1");
                offsetAtt.SetOffset(savedStart, savedEnd);
                return true;
            } else if (!termAtt.ToString().Equals("stop")) {
                return true;
            }
        }
    }
    return false;
}
internal static void TestNGrams(int minGram, int maxGram, string s, string nonTokenChars, bool edgesOnly) {
    // convert the string to code points
    int[] codePoints = toCodePoints(s);
    int[] offsets = new int[codePoints.Length + 1];
    for (int i = 0; i < codePoints.Length; ++i) {
        offsets[i + 1] = offsets[i] + Character.CharCount(codePoints[i]);
    }
    TokenStream grams = new NGramTokenizerAnonymousInnerClassHelper(TEST_VERSION_CURRENT, new StringReader(s), minGram, maxGram, edgesOnly, nonTokenChars);
    ICharTermAttribute termAtt = grams.AddAttribute<ICharTermAttribute>();
    IPositionIncrementAttribute posIncAtt = grams.AddAttribute<IPositionIncrementAttribute>();
    IPositionLengthAttribute posLenAtt = grams.AddAttribute<IPositionLengthAttribute>();
    IOffsetAttribute offsetAtt = grams.AddAttribute<IOffsetAttribute>();
    grams.Reset();
    for (int start = 0; start < codePoints.Length; ++start) {
        for (int end = start + minGram; end <= start + maxGram && end <= codePoints.Length; ++end) {
            if (edgesOnly && start > 0 && isTokenChar(nonTokenChars, codePoints[start - 1])) {
                // not on an edge
                goto nextGramContinue;
            }
            for (int j = start; j < end; ++j) {
                if (!isTokenChar(nonTokenChars, codePoints[j])) {
                    goto nextGramContinue;
                }
            }
            assertTrue(grams.IncrementToken());
            assertArrayEquals(Arrays.CopyOfRange(codePoints, start, end), toCodePoints(termAtt.ToString()));
            assertEquals(1, posIncAtt.PositionIncrement);
            assertEquals(1, posLenAtt.PositionLength);
            assertEquals(offsets[start], offsetAtt.StartOffset());
            assertEquals(offsets[end], offsetAtt.EndOffset());
            // the label below emulates Java's "continue nextGram" over the gram loop
            nextGramContinue: ;
        }
    }
    assertFalse(grams.IncrementToken());
    grams.End();
    assertEquals(s.Length, offsetAtt.StartOffset());
    assertEquals(s.Length, offsetAtt.EndOffset());
}
internal void WriteOffsets(int termID, int offsetAccum) {
    Debug.Assert(HasOffsets);
    int startOffset = offsetAccum + OffsetAttribute.StartOffset();
    int endOffset = offsetAccum + OffsetAttribute.EndOffset();
    FreqProxPostingsArray postings = (FreqProxPostingsArray)TermsHashPerField.PostingsArray;
    Debug.Assert(startOffset - postings.LastOffsets[termID] >= 0);
    TermsHashPerField.WriteVInt(1, startOffset - postings.LastOffsets[termID]);
    TermsHashPerField.WriteVInt(1, endOffset - startOffset);
    postings.LastOffsets[termID] = startOffset;
}
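// Worked example (illustrative): offsets are delta-encoded as VInts. If a
// term's previous start offset was 4 and the next occurrence spans [10, 14),
// the values written are 10 - 4 = 6 (start delta) and 14 - 10 = 4 (length),
// and LastOffsets[termID] becomes 10. Small deltas keep the VInts short.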
private Token GetNextPrefixInputToken(Token token) {
    if (!prefix.IncrementToken()) {
        return null;
    }
    token.CopyBuffer(p_termAtt.Buffer(), 0, p_termAtt.Length);
    token.PositionIncrement = p_posIncrAtt.PositionIncrement;
    token.Flags = p_flagsAtt.Flags;
    token.SetOffset(p_offsetAtt.StartOffset(), p_offsetAtt.EndOffset());
    token.Type = p_typeAtt.Type;
    token.Payload = p_payloadAtt.Payload;
    return token;
}
/// <summary>
/// Refills buffers with new data from the current token.
/// </summary>
private void Refill() {
    // compact buffers to keep them smallish if they become large
    // just a safety check, but technically we only need the last codepoint
    if (bufferLen > 64) {
        int last = bufferLen - 1;
        buffer[0] = buffer[last];
        startOffset[0] = startOffset[last];
        endOffset[0] = endOffset[last];
        bufferLen = 1;
        index -= last;
    }
    char[] termBuffer = termAtt.Buffer();
    int len = termAtt.Length;
    int start = offsetAtt.StartOffset();
    int end = offsetAtt.EndOffset();
    int newSize = bufferLen + len;
    buffer = ArrayUtil.Grow(buffer, newSize);
    startOffset = ArrayUtil.Grow(startOffset, newSize);
    endOffset = ArrayUtil.Grow(endOffset, newSize);
    lastEndOffset = end;
    if (end - start != len) {
        // crazy offsets (modified by synonym or charfilter): just preserve
        for (int i = 0, cp = 0; i < len; i += Character.CharCount(cp)) {
            cp = buffer[bufferLen] = Character.CodePointAt(termBuffer, i, len);
            startOffset[bufferLen] = start;
            endOffset[bufferLen] = end;
            bufferLen++;
        }
    } else {
        // normal offsets
        for (int i = 0, cp = 0, cpLen = 0; i < len; i += cpLen) {
            cp = buffer[bufferLen] = Character.CodePointAt(termBuffer, i, len);
            cpLen = Character.CharCount(cp);
            startOffset[bufferLen] = start;
            start = endOffset[bufferLen] = start + cpLen;
            bufferLen++;
        }
    }
}
private Token GetNextSuffixInputToken(Token token) {
    if (!suffix.IncrementToken()) {
        return null;
    }
    token.CopyBuffer(termAtt.Buffer(), 0, termAtt.Length);
    token.PositionIncrement = posIncrAtt.PositionIncrement;
    token.Flags = flagsAtt.Flags;
    token.SetOffset(offsetAtt.StartOffset(), offsetAtt.EndOffset());
    token.Type = typeAtt.Type;
    token.Payload = payloadAtt.Payload;
    return token;
}
public virtual void Test() {
    string test = "The quick red fox jumped over the lazy brown dogs";
    TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false));
    int count = 0;
    IPayloadAttribute payloadAtt = nptf.GetAttribute<IPayloadAttribute>();
    IOffsetAttribute offsetAtt = nptf.GetAttribute<IOffsetAttribute>();
    nptf.Reset();
    while (nptf.IncrementToken()) {
        BytesRef pay = payloadAtt.Payload;
        assertTrue("pay is null and it shouldn't be", pay != null);
        byte[] data = pay.Bytes;
        int start = PayloadHelper.DecodeInt(data, 0);
        assertTrue(start + " does not equal: " + offsetAtt.StartOffset(), start == offsetAtt.StartOffset());
        int end = PayloadHelper.DecodeInt(data, 4);
        assertTrue(end + " does not equal: " + offsetAtt.EndOffset(), end == offsetAtt.EndOffset());
        count++;
    }
    assertTrue(count + " does not equal: " + 10, count == 10);
}
public override sealed bool IncrementToken() {
    if (input.IncrementToken()) {
        byte[] data = new byte[8];
        PayloadHelper.EncodeInt(offsetAtt.StartOffset(), data, 0);
        PayloadHelper.EncodeInt(offsetAtt.EndOffset(), data, 4);
        BytesRef payload = new BytesRef(data);
        payAtt.Payload = payload;
        return true;
    } else {
        return false;
    }
}
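// Layout note (derived from the code above): each payload is 8 bytes, the
// start offset encoded in bytes 0-3 and the end offset in bytes 4-7, so the
// test in the earlier method can recover both with
// PayloadHelper.DecodeInt(data, 0) and PayloadHelper.DecodeInt(data, 4).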
/// <inheritdoc/>
public override bool IncrementToken() {
    while (!exhausted && input.IncrementToken()) {
        char[] term = termAttribute.Buffer();
        int termLength = termAttribute.Length;
        lastEndOffset = offsetAttribute.EndOffset();
        if (termLength > 0 && term[termLength - 1] == '-') {
            // a hyphenated word
            // capture the state of the first token only
            if (savedState == null) {
                savedState = CaptureState();
            }
            hyphenated.Append(term, 0, termLength - 1);
        } else if (savedState == null) {
            // not part of a hyphenated word.
            return true;
        } else {
            // the final portion of a hyphenated word
            hyphenated.Append(term, 0, termLength);
            Unhyphenate();
            return true;
        }
    }
    exhausted = true;
    if (savedState != null) {
        // the final term ends with a hyphen
        // add back the hyphen, for backwards compatibility.
        hyphenated.Append('-');
        Unhyphenate();
        return true;
    }
    return false;
}
public virtual void TestOffsets() {
    TokenStream stream = (new KeywordAnalyzer()).TokenStream("field", new StringReader("abcd"));
    try {
        IOffsetAttribute offsetAtt = stream.AddAttribute<IOffsetAttribute>();
        stream.Reset();
        assertTrue(stream.IncrementToken());
        assertEquals(0, offsetAtt.StartOffset());
        assertEquals(4, offsetAtt.EndOffset());
        assertFalse(stream.IncrementToken());
        stream.End();
    } finally {
        IOUtils.CloseWhileHandlingException(stream);
    }
}
public virtual void TestOtherLetterOffset() {
    string s = "a天b";
    ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));
    int correctStartOffset = 0;
    int correctEndOffset = 1;
    IOffsetAttribute offsetAtt = tokenizer.GetAttribute<IOffsetAttribute>();
    tokenizer.Reset();
    while (tokenizer.IncrementToken()) {
        assertEquals(correctStartOffset, offsetAtt.StartOffset());
        assertEquals(correctEndOffset, offsetAtt.EndOffset());
        correctStartOffset++;
        correctEndOffset++;
    }
    tokenizer.End();
    tokenizer.Dispose();
}
public virtual void TestFilterTokens() {
    SnowballFilter filter = new SnowballFilter(new TestTokenStream(this), "English");
    ICharTermAttribute termAtt = filter.GetAttribute<ICharTermAttribute>();
    IOffsetAttribute offsetAtt = filter.GetAttribute<IOffsetAttribute>();
    ITypeAttribute typeAtt = filter.GetAttribute<ITypeAttribute>();
    IPayloadAttribute payloadAtt = filter.GetAttribute<IPayloadAttribute>();
    IPositionIncrementAttribute posIncAtt = filter.GetAttribute<IPositionIncrementAttribute>();
    IFlagsAttribute flagsAtt = filter.GetAttribute<IFlagsAttribute>();
    filter.IncrementToken();
    assertEquals("accent", termAtt.ToString());
    assertEquals(2, offsetAtt.StartOffset());
    assertEquals(7, offsetAtt.EndOffset());
    assertEquals("wrd", typeAtt.Type);
    assertEquals(3, posIncAtt.PositionIncrement);
    assertEquals(77, flagsAtt.Flags);
    assertEquals(new BytesRef(new byte[] { 0, 1, 2, 3 }), payloadAtt.Payload);
}
public override sealed bool IncrementToken() {
    if (multiToken > 0) {
        termAtt.SetEmpty().Append("multi" + (multiToken + 1));
        offsetAtt.SetOffset(prevStartOffset, prevEndOffset);
        typeAtt.Type = prevType;
        posIncrAtt.PositionIncrement = 0;
        multiToken--;
        return true;
    } else {
        bool next = input.IncrementToken();
        if (!next) {
            return false;
        }
        prevType = typeAtt.Type;
        prevStartOffset = offsetAtt.StartOffset();
        prevEndOffset = offsetAtt.EndOffset();
        string text = termAtt.ToString();
        if (text.Equals("triplemulti")) {
            multiToken = 2;
            return true;
        } else if (text.Equals("multi")) {
            multiToken = 1;
            return true;
        } else {
            return true;
        }
    }
}
/// <summary>
/// Constructs a compound token.
/// </summary>
private void GramToken() {
    buffer.Append(termAttribute.Buffer(), 0, termAttribute.Length);
    int endOffset = offsetAttribute.EndOffset();
    ClearAttributes();
    var length = buffer.Length;
    var termText = termAttribute.Buffer();
    if (length > termText.Length) {
        termText = termAttribute.ResizeBuffer(length);
    }
    buffer.GetChars(0, length, termText, 0);
    termAttribute.Length = length;
    posIncAttribute.PositionIncrement = 0;
    posLenAttribute.PositionLength = 2; // bigram
    offsetAttribute.SetOffset(lastStartOffset, endOffset);
    typeAttribute.Type = GRAM_TYPE;
    buffer.Length = 0;
}
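// Illustrative result (assumed from the logic above): given adjacent tokens
// "中" [0,1) and "文" [1,2), the compound token is "中文" with offsets [0,2),
// position increment 0 (it overlaps the first unigram) and position length 2.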
public override void ProcessFields(IndexableField[] fields, int count) {
    FieldState.Reset();
    bool doInvert = Consumer.Start(fields, count);
    for (int i = 0; i < count; i++) {
        IndexableField field = fields[i];
        IndexableFieldType fieldType = field.FieldType();
        // TODO FI: this should be "genericized" to querying
        // consumer if it wants to see this particular field
        // tokenized.
        if (fieldType.Indexed && doInvert) {
            bool analyzed = fieldType.Tokenized && DocState.Analyzer != null;
            // if the field omits norms, the boost cannot be indexed.
            if (fieldType.OmitNorms && field.GetBoost() != 1.0f) {
                throw new System.NotSupportedException("You cannot set an index-time boost: norms are omitted for field '" + field.Name() + "'");
            }
            // only bother checking offsets if something will consume them.
            // TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
            bool checkOffsets = fieldType.IndexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
            int lastStartOffset = 0;
            if (i > 0) {
                FieldState.Position_Renamed += analyzed ? DocState.Analyzer.GetPositionIncrementGap(fieldInfo.Name) : 0;
            }
            /*
             * To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
             * when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses,
             * but rather a finally that takes note of the problem.
             */
            bool succeededInProcessingField = false;
            TokenStream stream = field.GetTokenStream(DocState.Analyzer);
            // reset the TokenStream to the first token
            stream.Reset();
            try {
                bool hasMoreTokens = stream.IncrementToken();
                FieldState.AttributeSource_Renamed = stream;
                IOffsetAttribute offsetAttribute = FieldState.AttributeSource_Renamed.AddAttribute<IOffsetAttribute>();
                IPositionIncrementAttribute posIncrAttribute = FieldState.AttributeSource_Renamed.AddAttribute<IPositionIncrementAttribute>();
                if (hasMoreTokens) {
                    Consumer.Start(field);
                    do {
                        // If we hit an exception in stream.next below
                        // (which is fairly common, eg if analyzer
                        // chokes on a given document), then it's
                        // non-aborting and (above) this one document
                        // will be marked as deleted, but still
                        // consume a docID
                        int posIncr = posIncrAttribute.PositionIncrement;
                        if (posIncr < 0) {
                            throw new System.ArgumentException("position increment must be >=0 (got " + posIncr + ") for field '" + field.Name() + "'");
                        }
                        if (FieldState.Position_Renamed == 0 && posIncr == 0) {
                            throw new System.ArgumentException("first position increment must be > 0 (got 0) for field '" + field.Name() + "'");
                        }
                        int position = FieldState.Position_Renamed + posIncr;
                        if (position > 0) {
                            // NOTE: confusing: this "mirrors" the
                            // position++ we do below
                            position--;
                        } else if (position < 0) {
                            throw new System.ArgumentException("position overflow for field '" + field.Name() + "'");
                        }
                        // position is legal, we can safely place it in fieldState now.
                        // not sure if anything will use fieldState after non-aborting exc...
                        FieldState.Position_Renamed = position;
                        if (posIncr == 0) {
                            FieldState.NumOverlap_Renamed++;
                        }
                        if (checkOffsets) {
                            int startOffset = FieldState.Offset_Renamed + offsetAttribute.StartOffset();
                            int endOffset = FieldState.Offset_Renamed + offsetAttribute.EndOffset();
                            if (startOffset < 0 || endOffset < startOffset) {
                                throw new System.ArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, " + "startOffset=" + startOffset + ",endOffset=" + endOffset + " for field '" + field.Name() + "'");
                            }
                            if (startOffset < lastStartOffset) {
                                throw new System.ArgumentException("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " for field '" + field.Name() + "'");
                            }
                            lastStartOffset = startOffset;
                        }
                        bool success = false;
                        try {
                            // If we hit an exception in here, we abort
                            // all buffered documents since the last
                            // flush, on the likelihood that the
                            // internal state of the consumer is now
                            // corrupt and should not be flushed to a
                            // new segment:
                            Consumer.Add();
                            success = true;
                        } finally {
                            if (!success) {
                                DocState.DocWriter.SetAborting();
                            }
                        }
                        FieldState.Length_Renamed++;
                        FieldState.Position_Renamed++;
                    } while (stream.IncrementToken());
                }
                // trigger streams to perform end-of-stream operations
                stream.End();
                // TODO: maybe add some safety? then again, its already checked
                // when we come back around to the field...
                FieldState.Position_Renamed += posIncrAttribute.PositionIncrement;
                FieldState.Offset_Renamed += offsetAttribute.EndOffset();
                if (DocState.MaxTermPrefix != null) {
                    string msg = "Document contains at least one immense term in field=\"" + fieldInfo.Name + "\" (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + DocState.MaxTermPrefix + "...'";
                    if (DocState.InfoStream.IsEnabled("IW")) {
                        DocState.InfoStream.Message("IW", "ERROR: " + msg);
                    }
                    DocState.MaxTermPrefix = null;
                    throw new System.ArgumentException(msg);
                }
                /* if success was false above there is an exception coming through and we won't get here. */
                succeededInProcessingField = true;
            } finally {
                if (!succeededInProcessingField) {
                    IOUtils.CloseWhileHandlingException(stream);
                } else {
                    stream.Dispose();
                }
                if (!succeededInProcessingField && DocState.InfoStream.IsEnabled("DW")) {
                    DocState.InfoStream.Message("DW", "An exception was thrown while processing field " + fieldInfo.Name);
                }
            }
            FieldState.Offset_Renamed += analyzed ? DocState.Analyzer.GetOffsetGap(fieldInfo.Name) : 0;
            FieldState.Boost_Renamed *= field.GetBoost();
        }
        // LUCENE-2387: don't hang onto the field, so GC can
        // reclaim
        fields[i] = null;
    }
    Consumer.Finish();
    EndConsumer.Finish();
}
public virtual void ToDot() {
    @in.Reset();
    WriteHeader();
    // TODO: is there some way to tell dot that it should
    // make the "main path" a straight line and have the
    // non-sausage arcs not affect node placement...
    int pos = -1;
    int lastEndPos = -1;
    while (@in.IncrementToken()) {
        bool isFirst = pos == -1;
        int posInc = PosIncAtt.PositionIncrement;
        if (isFirst && posInc == 0) {
            // TODO: hmm are TS's still allowed to do this...?
            Console.Error.WriteLine("WARNING: first posInc was 0; correcting to 1");
            posInc = 1;
        }
        if (posInc > 0) {
            // New node:
            pos += posInc;
            WriteNode(pos, Convert.ToString(pos));
        }
        if (posInc > 1) {
            // Gap!
            WriteArc(lastEndPos, pos, null, "dotted");
        }
        if (isFirst) {
            WriteNode(-1, null);
            WriteArc(-1, pos, null, null);
        }
        string arcLabel = TermAtt.ToString();
        if (OffsetAtt != null) {
            int startOffset = OffsetAtt.StartOffset();
            int endOffset = OffsetAtt.EndOffset();
            //System.out.println("start=" + startOffset + " end=" + endOffset + " len=" + inputText.length());
            if (InputText != null) {
                arcLabel += " / " + InputText.Substring(startOffset, endOffset - startOffset);
            } else {
                arcLabel += " / " + startOffset + "-" + endOffset;
            }
        }
        WriteArc(pos, pos + PosLengthAtt.PositionLength, arcLabel, null);
        lastEndPos = pos + PosLengthAtt.PositionLength;
    }
    @in.End();
    if (lastEndPos != -1) {
        // TODO: should we output any final text (from end
        // offsets) on this arc...?
        WriteNode(-2, null);
        WriteArc(lastEndPos, -2, null, null);
    }
    WriteTrailer();
}
// For the output string: separate positions with a space,
// and separate multiple tokens at each position with a
// /. If a token should have end offset != the input
// token's end offset then add :X to it:
// TODO: we should probably refactor this guy to use/take analyzer,
// the tests are a little messy
private void Verify(string input, string output) {
    if (VERBOSE) {
        Console.WriteLine("TEST: verify input=" + input + " expectedOutput=" + output);
    }
    tokensIn.Reader = new StringReader(input);
    tokensOut.Reset();
    string[] expected = output.Split(new string[] { " " }, StringSplitOptions.RemoveEmptyEntries);
    int expectedUpto = 0;
    while (tokensOut.IncrementToken()) {
        if (VERBOSE) {
            Console.WriteLine("  incr token=" + termAtt.ToString() + " posIncr=" + posIncrAtt.PositionIncrement + " startOff=" + offsetAtt.StartOffset() + " endOff=" + offsetAtt.EndOffset());
        }
        assertTrue(expectedUpto < expected.Length);
        int startOffset = offsetAtt.StartOffset();
        int endOffset = offsetAtt.EndOffset();
        string[] expectedAtPos = expected[expectedUpto++].Split(new string[] { "/" }, StringSplitOptions.RemoveEmptyEntries);
        for (int atPos = 0; atPos < expectedAtPos.Length; atPos++) {
            if (atPos > 0) {
                assertTrue(tokensOut.IncrementToken());
                if (VERBOSE) {
                    Console.WriteLine("  incr token=" + termAtt.ToString() + " posIncr=" + posIncrAtt.PositionIncrement + " startOff=" + offsetAtt.StartOffset() + " endOff=" + offsetAtt.EndOffset());
                }
            }
            int colonIndex = expectedAtPos[atPos].IndexOf(':');
            int underbarIndex = expectedAtPos[atPos].IndexOf('_');
            string expectedToken;
            int expectedEndOffset;
            int expectedPosLen;
            if (colonIndex != -1) {
                expectedToken = expectedAtPos[atPos].Substring(0, colonIndex);
                if (underbarIndex != -1) {
                    expectedEndOffset = int.Parse(expectedAtPos[atPos].Substring(1 + colonIndex, underbarIndex - (1 + colonIndex)));
                    expectedPosLen = int.Parse(expectedAtPos[atPos].Substring(1 + underbarIndex));
                } else {
                    expectedEndOffset = int.Parse(expectedAtPos[atPos].Substring(1 + colonIndex));
                    expectedPosLen = 1;
                }
            } else {
                expectedToken = expectedAtPos[atPos];
                expectedEndOffset = endOffset;
                expectedPosLen = 1;
            }
            assertEquals(expectedToken, termAtt.ToString());
            assertEquals(atPos == 0 ? 1 : 0, posIncrAtt.PositionIncrement);
            // start/end offset of all tokens at same pos should
            // be the same:
            assertEquals(startOffset, offsetAtt.StartOffset());
            assertEquals(expectedEndOffset, offsetAtt.EndOffset());
            assertEquals(expectedPosLen, posLenAtt.PositionLength);
        }
    }
    tokensOut.End();
    tokensOut.Dispose();
    if (VERBOSE) {
        Console.WriteLine("  incr: END");
    }
    assertEquals(expectedUpto, expected.Length);
}
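// Example of the expected-output syntax (derived from the comment above,
// values hypothetical): an output string like "a foo/bar:7 b" asserts that
// the first position holds "a", the next holds both "foo" and "bar" (with
// "bar" ending at offset 7), and the last holds "b"; an "_N" suffix after
// the offset would additionally set an expected position length of N.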
/// <summary>
/// Returns the next token in the stream, or null at EOS.
/// </summary>
public override bool IncrementToken() {
    while (true) {
        if (curTermBuffer == null) {
            if (!input.IncrementToken()) {
                return false;
            } else {
                curTermBuffer = (char[])termAtt.Buffer().Clone();
                curTermLength = termAtt.Length;
                curCodePointCount = charUtils.CodePointCount(termAtt.ToString());
                curGramSize = minGram;
                curPos = 0;
                curPosInc = posIncAtt.PositionIncrement;
                curPosLen = posLenAtt.PositionLength;
                tokStart = offsetAtt.StartOffset();
                tokEnd = offsetAtt.EndOffset();
                // if length by start + end offsets doesn't match the term text then assume
                // this is a synonym and don't adjust the offsets.
                hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
            }
        }
        if (version.OnOrAfter(LuceneVersion.LUCENE_44)) {
            if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) {
                ++curPos;
                curGramSize = minGram;
            }
            if ((curPos + curGramSize) <= curCodePointCount) {
                ClearAttributes();
                int start = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                termAtt.CopyBuffer(curTermBuffer, start, end - start);
                posIncAtt.PositionIncrement = curPosInc;
                curPosInc = 0;
                posLenAtt.PositionLength = curPosLen;
                offsetAtt.SetOffset(tokStart, tokEnd);
                curGramSize++;
                return true;
            }
        } else {
            while (curGramSize <= maxGram) {
                // while there is input
                while (curPos + curGramSize <= curTermLength) {
                    ClearAttributes();
                    termAtt.CopyBuffer(curTermBuffer, curPos, curGramSize);
                    if (hasIllegalOffsets) {
                        offsetAtt.SetOffset(tokStart, tokEnd);
                    } else {
                        offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                    }
                    curPos++;
                    return true;
                }
                curGramSize++; // increase n-gram size
                curPos = 0;
            }
        }
        curTermBuffer = null;
    }
}
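// Illustrative output (assumed): for the input token "abc" with minGram = 1
// and maxGram = 2, the 4.4+ branch above emits a, ab, b, bc, c -- all gram
// sizes at one code-point position before sliding forward -- and every gram
// keeps the original token's offsets.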
/// <summary>
/// Retrieve suggestions.
/// </summary>
public virtual List<LookupResult> DoLookup(string key, IEnumerable<BytesRef> contexts, int num) {
    if (contexts != null) {
        throw new System.ArgumentException("this suggester doesn't support contexts");
    }
    TokenStream ts = queryAnalyzer.TokenStream("", key.ToString());
    try {
        ITermToBytesRefAttribute termBytesAtt = ts.AddAttribute<ITermToBytesRefAttribute>();
        IOffsetAttribute offsetAtt = ts.AddAttribute<IOffsetAttribute>();
        IPositionLengthAttribute posLenAtt = ts.AddAttribute<IPositionLengthAttribute>();
        IPositionIncrementAttribute posIncAtt = ts.AddAttribute<IPositionIncrementAttribute>();
        ts.Reset();
        var lastTokens = new BytesRef[grams];
        //System.out.println("lookup: key='" + key + "'");
        // Run full analysis, but save only the
        // last 1gram, last 2gram, etc.:
        BytesRef tokenBytes = termBytesAtt.BytesRef;
        int maxEndOffset = -1;
        bool sawRealToken = false;
        while (ts.IncrementToken()) {
            termBytesAtt.FillBytesRef();
            sawRealToken |= tokenBytes.Length > 0;
            // TODO: this is somewhat iffy; today, ShingleFilter
            // sets posLen to the gram count; maybe we should make
            // a separate dedicated att for this?
            int gramCount = posLenAtt.PositionLength;
            Debug.Assert(gramCount <= grams);
            // Safety: make sure the recalculated count "agrees":
            if (CountGrams(tokenBytes) != gramCount) {
                throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes));
            }
            maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset());
            lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
        }
        ts.End();
        if (!sawRealToken) {
            throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
        }
        // Carefully fill last tokens with _ tokens;
        // ShingleFilter apparently won't emit "only hole"
        // tokens:
        int endPosInc = posIncAtt.PositionIncrement;
        // Note this will also be true if input is the empty
        // string (in which case we saw no tokens and
        // maxEndOffset is still -1), which in fact works out OK
        // because we fill the unigram with an empty BytesRef
        // below:
        bool lastTokenEnded = offsetAtt.EndOffset() > maxEndOffset || endPosInc > 0;
        //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());
        if (lastTokenEnded) {
            //System.out.println("  lastTokenEnded");
            // If user hit space after the last token, then
            // "upgrade" all tokens. This way "foo " will suggest
            // all bigrams starting w/ foo, and not any unigrams
            // starting with "foo":
            for (int i = grams - 1; i > 0; i--) {
                BytesRef token = lastTokens[i - 1];
                if (token == null) {
                    continue;
                }
                token.Grow(token.Length + 1);
                token.Bytes[token.Length] = separator;
                token.Length++;
                lastTokens[i] = token;
            }
            lastTokens[0] = new BytesRef();
        }
        var arc = new FST.Arc<long?>();
        var bytesReader = fst.BytesReader;
        // Try highest order models first, and if they return
        // results, return that; else, fallback:
        double backoff = 1.0;
        List<LookupResult> results = new List<LookupResult>(num);
        // We only add a given suffix once, from the highest
        // order model that saw it; for subsequent lower order
        // models we skip it:
        var seen = new HashSet<BytesRef>();
        for (int gram = grams - 1; gram >= 0; gram--) {
            BytesRef token = lastTokens[gram];
            // Don't make unigram predictions from empty string:
            if (token == null || (token.Length == 0 && key.Length > 0)) {
                // Input didn't have enough tokens:
                //System.out.println("  gram=" + gram + ": skip: not enough input");
                continue;
            }
            if (endPosInc > 0 && gram <= endPosInc) {
                // Skip hole-only predictions; in theory we
                // shouldn't have to do this, but we'd need to fix
                // ShingleFilter to produce only-hole tokens:
                //System.out.println("  break: only holes now");
                break;
            }
            //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());
            // TODO: we could add fuzziness here
            // match the prefix portion exactly
            //Pair<Long,BytesRef> prefixOutput = null;
            long? prefixOutput = null;
            prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
            //System.out.println("  prefixOutput=" + prefixOutput);
            if (prefixOutput == null) {
                // This model never saw this prefix, e.g. the
                // trigram model never saw context "purple mushroom"
                backoff *= ALPHA;
                continue;
            }
            // TODO: we could do this division at build time, and
            // bake it into the FST?
            // Denominator for computing scores from current
            // model's predictions:
            long contextCount = totTokens;
            BytesRef lastTokenFragment = null;
            for (int i = token.Length - 1; i >= 0; i--) {
                if (token.Bytes[token.Offset + i] == separator) {
                    BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                    long? output = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToIntsRef(context, new IntsRef()));
                    Debug.Assert(output != null);
                    contextCount = DecodeWeight(output);
                    lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                    break;
                }
            }
            BytesRef finalLastToken;
            if (lastTokenFragment == null) {
                finalLastToken = BytesRef.DeepCopyOf(token);
            } else {
                finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
            }
            Debug.Assert(finalLastToken.Offset == 0);
            CharsRef spare = new CharsRef();
            // complete top-N
            Util.Fst.Util.TopResults<long?> completions = null;
            try {
                // Because we store multiple models in one FST
                // (1gram, 2gram, 3gram), we must restrict the
                // search so that it only considers the current
                // model. For highest order model, this is not
                // necessary since all completions in the FST
                // must be from this model, but for lower order
                // models we have to filter out the higher order
                // ones:
                // Must do num+seen.size() for queue depth because we may
                // reject up to seen.size() paths in acceptResult():
                Util.Fst.Util.TopNSearcher<long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparator, seen, finalLastToken);
                // since this search is initialized with a single start node
                // it is okay to start with an empty input path here
                searcher.AddStartPaths(arc, prefixOutput, true, new IntsRef());
                completions = searcher.Search();
                Debug.Assert(completions.IsComplete);
            } catch (IOException bogus) {
                throw new Exception(bogus.Message, bogus);
            }
            int prefixLength = token.Length;
            BytesRef suffix = new BytesRef(8);
            //System.out.println("  " + completions.length + " completions");
            foreach (Util.Fst.Util.Result<long?> completion in completions) {
                token.Length = prefixLength;
                // append suffix
                Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                token.Append(suffix);
                //System.out.println("    completion " + token.utf8ToString());
                // Skip this path if a higher-order model already
                // saw/predicted its last token:
                BytesRef lastToken = token;
                for (int i = token.Length - 1; i >= 0; i--) {
                    if (token.Bytes[token.Offset + i] == separator) {
                        Debug.Assert(token.Length - i - 1 > 0);
                        lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                        break;
                    }
                }
                if (seen.Contains(lastToken)) {
                    //System.out.println("      skip dup " + lastToken.utf8ToString());
                    goto nextCompletionContinue;
                }
                seen.Add(BytesRef.DeepCopyOf(lastToken));
                spare.Grow(token.Length);
                UnicodeUtil.UTF8toUTF16(token, spare);
                // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                // return numbers that are greater than long.MaxValue, which results in a negative long number.
                LookupResult result = new LookupResult(spare.ToString(), (long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount));
                results.Add(result);
                Debug.Assert(results.Count == seen.Count);
                //System.out.println("  add result=" + result);
                nextCompletionContinue: ;
            }
            backoff *= ALPHA;
        }
        results.Sort(new ComparatorAnonymousInnerClassHelper(this));
        if (results.Count > num) {
            results.SubList(num, results.Count).Clear();
        }
        return results;
    } finally {
        IOUtils.CloseWhileHandlingException(ts);
    }
}
public override bool IncrementToken() {
    while (true) {
        if (curTermBuffer == null) {
            if (!input.IncrementToken()) {
                return false;
            } else {
                curTermBuffer = (char[])termAtt.Buffer().Clone();
                curTermLength = termAtt.Length;
                curCodePointCount = charUtils.CodePointCount(termAtt.ToString());
                curGramSize = minGram;
                tokStart = offsetAtt.StartOffset();
                tokEnd = offsetAtt.EndOffset();
                if (version.OnOrAfter(LuceneVersion.LUCENE_44)) {
                    // Never update offsets
                    updateOffsets = false;
                } else {
                    // if length by start + end offsets doesn't match the term text then assume
                    // this is a synonym and don't adjust the offsets.
                    updateOffsets = (tokStart + curTermLength) == tokEnd;
                }
                savePosIncr += posIncrAtt.PositionIncrement;
                savePosLen = posLenAtt.PositionLength;
            }
        }
        // if we have hit the end of our n-gram size range, quit
        if (curGramSize <= maxGram) {
            // if the remaining input is too short, we can't generate any n-grams
            if (curGramSize <= curCodePointCount) {
                // grab gramSize chars from front or back
                int start = side == Side.FRONT ? 0 : charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize);
                int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                ClearAttributes();
                if (updateOffsets) {
                    offsetAtt.SetOffset(tokStart + start, tokStart + end);
                } else {
                    offsetAtt.SetOffset(tokStart, tokEnd);
                }
                // first ngram gets increment, others don't
                if (curGramSize == minGram) {
                    posIncrAtt.PositionIncrement = savePosIncr;
                    savePosIncr = 0;
                } else {
                    posIncrAtt.PositionIncrement = 0;
                }
                posLenAtt.PositionLength = savePosLen;
                termAtt.CopyBuffer(curTermBuffer, start, end - start);
                curGramSize++;
                return true;
            }
        }
        curTermBuffer = null;
    }
}
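// Illustrative output (assumed): for the token "hello" with minGram = 1,
// maxGram = 3 and Side.FRONT, the filter above emits "h", "he", "hel"; with
// Side.BACK it emits "o", "lo", "llo". Only the first gram carries the saved
// position increment; the rest stack at the same position.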
private void Parse() {
    //System.out.println("\nS: parse");
    Debug.Assert(inputSkipCount == 0);
    int curNextRead = nextRead;
    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;
    int matchEndOffset = -1;
    BytesRef pendingOutput = fst.Outputs.NoOutput;
    fst.GetFirstArc(scratchArc);
    Debug.Assert(scratchArc.Output == fst.Outputs.NoOutput);
    int tokenCount = 0;
    while (true) {
        // Pull next token's chars:
        char[] buffer;
        int bufferLen;
        //System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);
        int inputEndOffset = 0;
        if (curNextRead == nextWrite) {
            // We used up our lookahead buffer of input tokens
            // -- pull next real input token:
            if (finished) {
                break;
            } else {
                //System.out.println("  input.incrToken");
                Debug.Assert(futureInputs[nextWrite].consumed);
                // Not correct: a syn match whose output is longer
                // than its input can set future inputs keepOrig
                // to true:
                //assert !futureInputs[nextWrite].keepOrig;
                if (input.IncrementToken()) {
                    buffer = termAtt.Buffer();
                    bufferLen = termAtt.Length;
                    PendingInput pendingInput = futureInputs[nextWrite];
                    lastStartOffset = pendingInput.startOffset = offsetAtt.StartOffset();
                    lastEndOffset = pendingInput.endOffset = offsetAtt.EndOffset();
                    inputEndOffset = pendingInput.endOffset;
                    //System.out.println("  new token=" + new String(buffer, 0, bufferLen));
                    if (nextRead != nextWrite) {
                        Capture();
                    } else {
                        pendingInput.consumed = false;
                    }
                } else {
                    // No more input tokens
                    //System.out.println("  set end");
                    finished = true;
                    break;
                }
            }
        } else {
            // Still in our lookahead
            buffer = futureInputs[curNextRead].term.Chars;
            bufferLen = futureInputs[curNextRead].term.Length;
            inputEndOffset = futureInputs[curNextRead].endOffset;
            //System.out.println("  old token=" + new String(buffer, 0, bufferLen));
        }
        tokenCount++;
        // Run each char in this token through the FST:
        int bufUpto = 0;
        while (bufUpto < bufferLen) {
            int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);
            if (fst.FindTargetArc(ignoreCase ? Character.ToLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) {
                //System.out.println("    stop");
                goto byTokenBreak;
            }
            // Accum the output
            pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
            //System.out.println("    char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
            bufUpto += Character.CharCount(codePoint);
        }
        // OK, entire token matched; now see if this is a final
        // state:
        if (scratchArc.Final) {
            matchOutput = fst.Outputs.Add(pendingOutput, scratchArc.NextFinalOutput);
            matchInputLength = tokenCount;
            matchEndOffset = inputEndOffset;
            //System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
        }
        // See if the FST wants to continue matching (ie, needs to
        // see the next input token):
        if (fst.FindTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
            // No further rules can match here; we're done
            // searching for matching rules starting at the
            // current input position.
            break;
        } else {
            // More matching is possible -- accum the output (if
            // any) of the WORD_SEP arc:
            pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
            if (nextRead == nextWrite) {
                Capture();
            }
        }
        curNextRead = RollIncr(curNextRead);
    }
    byTokenBreak:
    if (nextRead == nextWrite && !finished) {
        //System.out.println("  skip write slot=" + nextWrite);
        nextWrite = RollIncr(nextWrite);
    }
    if (matchOutput != null) {
        //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
        inputSkipCount = matchInputLength;
        AddOutput(matchOutput, matchInputLength, matchEndOffset);
    } else if (nextRead != nextWrite) {
        // Even though we had no match here, we set to 1
        // because we need to skip current input token before
        // trying to match again:
        inputSkipCount = 1;
    } else {
        Debug.Assert(finished);
    }
    //System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
}
public static void VerifyEquals(Fields d1, Fields d2) {
    if (d1 == null) {
        Assert.IsTrue(d2 == null || d2.Size == 0);
        return;
    }
    Assert.IsTrue(d2 != null);
    IEnumerator<string> fieldsEnum2 = d2.GetEnumerator();
    foreach (string field1 in d1) {
        fieldsEnum2.MoveNext();
        string field2 = fieldsEnum2.Current;
        Assert.AreEqual(field1, field2);
        Terms terms1 = d1.Terms(field1);
        Assert.IsNotNull(terms1);
        TermsEnum termsEnum1 = terms1.Iterator(null);
        Terms terms2 = d2.Terms(field2);
        Assert.IsNotNull(terms2);
        TermsEnum termsEnum2 = terms2.Iterator(null);
        DocsAndPositionsEnum dpEnum1 = null;
        DocsAndPositionsEnum dpEnum2 = null;
        DocsEnum dEnum1 = null;
        DocsEnum dEnum2 = null;
        BytesRef term1;
        while ((term1 = termsEnum1.Next()) != null) {
            BytesRef term2 = termsEnum2.Next();
            Assert.AreEqual(term1, term2);
            Assert.AreEqual(termsEnum1.TotalTermFreq(), termsEnum2.TotalTermFreq());
            dpEnum1 = termsEnum1.DocsAndPositions(null, dpEnum1);
            dpEnum2 = termsEnum2.DocsAndPositions(null, dpEnum2);
            if (dpEnum1 != null) {
                Assert.IsNotNull(dpEnum2);
                int docID1 = dpEnum1.NextDoc();
                dpEnum2.NextDoc();
                // docIDs are not supposed to be equal
                //int docID2 = dpEnum2.NextDoc();
                //Assert.AreEqual(docID1, docID2);
                Assert.IsTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);
                int freq1 = dpEnum1.Freq();
                int freq2 = dpEnum2.Freq();
                Assert.AreEqual(freq1, freq2);
                IOffsetAttribute offsetAtt1 = dpEnum1.Attributes().HasAttribute<IOffsetAttribute>() ? dpEnum1.Attributes().GetAttribute<IOffsetAttribute>() : null;
                IOffsetAttribute offsetAtt2 = dpEnum2.Attributes().HasAttribute<IOffsetAttribute>() ? dpEnum2.Attributes().GetAttribute<IOffsetAttribute>() : null;
                if (offsetAtt1 != null) {
                    Assert.IsNotNull(offsetAtt2);
                } else {
                    Assert.IsNull(offsetAtt2);
                }
                for (int posUpto = 0; posUpto < freq1; posUpto++) {
                    int pos1 = dpEnum1.NextPosition();
                    int pos2 = dpEnum2.NextPosition();
                    Assert.AreEqual(pos1, pos2);
                    if (offsetAtt1 != null) {
                        Assert.AreEqual(offsetAtt1.StartOffset(), offsetAtt2.StartOffset());
                        Assert.AreEqual(offsetAtt1.EndOffset(), offsetAtt2.EndOffset());
                    }
                }
                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum1.NextDoc());
                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum2.NextDoc());
            } else {
                dEnum1 = TestUtil.Docs(Random(), termsEnum1, null, dEnum1, DocsEnum.FLAG_FREQS);
                dEnum2 = TestUtil.Docs(Random(), termsEnum2, null, dEnum2, DocsEnum.FLAG_FREQS);
                Assert.IsNotNull(dEnum1);
                Assert.IsNotNull(dEnum2);
                int docID1 = dEnum1.NextDoc();
                dEnum2.NextDoc();
                // docIDs are not supposed to be equal
                //int docID2 = dEnum2.NextDoc();
                //Assert.AreEqual(docID1, docID2);
                Assert.IsTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);
                int freq1 = dEnum1.Freq();
                int freq2 = dEnum2.Freq();
                Assert.AreEqual(freq1, freq2);
                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dEnum1.NextDoc());
                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dEnum2.NextDoc());
            }
        }
        Assert.IsNull(termsEnum2.Next());
    }
    Assert.IsFalse(fieldsEnum2.MoveNext());
}
/// <summary>
/// Iterates over the given token stream and adds the resulting terms to the index;
/// Equivalent to adding a tokenized, indexed, termVectorStored, unstored,
/// Lucene <see cref="Documents.Field"/>.
/// Finally closes the token stream. Note that untokenized keywords can be added with this method via
/// <see cref="KeywordTokenStream{T}(ICollection{T})"/>, the Lucene <c>KeywordTokenizer</c> or similar utilities.
/// </summary>
/// <param name="fieldName"> a name to be associated with the text </param>
/// <param name="stream"> the token stream to retrieve tokens from. </param>
/// <param name="boost"> the boost factor for hits for this field </param>
/// <param name="positionIncrementGap"> the position increment gap if fields with the same name are added more than once </param>
/// <param name="offsetGap"> the offset gap if fields with the same name are added more than once </param>
/// <seealso cref="Documents.Field.Boost(float)"/>
public virtual void AddField(string fieldName, TokenStream stream, float boost, int positionIncrementGap, int offsetGap) {
    try {
        if (fieldName == null) {
            throw new System.ArgumentException("fieldName must not be null");
        }
        if (stream == null) {
            throw new System.ArgumentException("token stream must not be null");
        }
        if (boost <= 0.0f) {
            throw new System.ArgumentException("boost factor must be greater than 0.0");
        }
        int numTokens = 0;
        int numOverlapTokens = 0;
        int pos = -1;
        BytesRefHash terms;
        SliceByteStartArray sliceArray;
        Info info = null;
        long sumTotalTermFreq = 0;
        int offset = 0;
        if (fields.TryGetValue(fieldName, out info)) {
            numTokens = info.numTokens;
            numOverlapTokens = info.numOverlapTokens;
            pos = info.lastPosition + positionIncrementGap;
            offset = info.lastOffset + offsetGap;
            terms = info.terms;
            boost *= info.boost;
            sliceArray = info.sliceArray;
            sumTotalTermFreq = info.sumTotalTermFreq;
        } else {
            sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY);
            terms = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray);
        }
        if (!fieldInfos.ContainsKey(fieldName)) {
            fieldInfos[fieldName] = new FieldInfo(fieldName, true, fieldInfos.Count, false, false, false, this.storeOffsets ? FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, null, null, null);
        }
        ITermToBytesRefAttribute termAtt = stream.GetAttribute<ITermToBytesRefAttribute>();
        IPositionIncrementAttribute posIncrAttribute = stream.AddAttribute<IPositionIncrementAttribute>();
        IOffsetAttribute offsetAtt = stream.AddAttribute<IOffsetAttribute>();
        BytesRef @ref = termAtt.BytesRef;
        stream.Reset();
        while (stream.IncrementToken()) {
            termAtt.FillBytesRef();
            // if (DEBUG) System.err.println("token='" + term + "'");
            numTokens++;
            int posIncr = posIncrAttribute.PositionIncrement;
            if (posIncr == 0) {
                numOverlapTokens++;
            }
            pos += posIncr;
            int ord = terms.Add(@ref);
            if (ord < 0) {
                ord = (-ord) - 1;
                postingsWriter.Reset(sliceArray.end[ord]);
            } else {
                sliceArray.start[ord] = postingsWriter.StartNewSlice();
            }
            sliceArray.freq[ord]++;
            sumTotalTermFreq++;
            if (!storeOffsets) {
                postingsWriter.WriteInt(pos);
            } else {
                postingsWriter.WriteInt(pos);
                postingsWriter.WriteInt(offsetAtt.StartOffset() + offset);
                postingsWriter.WriteInt(offsetAtt.EndOffset() + offset);
            }
            sliceArray.end[ord] = postingsWriter.CurrentOffset;
        }
        stream.End();
        // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
        if (numTokens > 0) {
            fields[fieldName] = new Info(terms, sliceArray, numTokens, numOverlapTokens, boost, pos, offsetAtt.EndOffset() + offset, sumTotalTermFreq);
            sortedFields = null; // invalidate sorted view, if any
        }
    } catch (Exception /*e*/) {
        // can never happen
        throw;
    } finally {
        try {
            if (stream != null) {
                stream.Dispose();
            }
        } catch (IOException /*e2*/) {
            throw;
        }
    }
}
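// Hypothetical usage sketch (analyzer and text are placeholders, not from the
// source): index a single field into a MemoryIndex and score a query against it.
//
//   var index = new MemoryIndex();
//   index.AddField("body", analyzer.TokenStream("body", new StringReader("some text")), 1.0f, 0, 1);
//   float score = index.Search(new TermQuery(new Term("body", "text")));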
/*
 * Need to worry about multiple scenarios:
 *  - need to go for the longest match
 *    a b => foo      #shouldn't match if "a b" is followed by "c d"
 *    a b c d => bar
 *  - need to backtrack - retry matches for tokens already read
 *    a b c d => foo
 *    b c => bar
 *    If the input stream is "a b c x", one will consume "a b c d"
 *    trying to match the first rule... all but "a" should be
 *    pushed back so a match may be made on "b c".
 *  - don't try and match generated tokens (thus need separate queue)
 *    matching is not recursive.
 *  - handle optional generation of original tokens in all these cases,
 *    merging token streams to preserve token positions.
 *  - preserve original positionIncrement of first matched token
 */
public override bool IncrementToken() {
    while (true) {
        // if there are any generated tokens, return them... don't try any
        // matches against them, as we specifically don't want recursion.
        if (replacement != null && replacement.MoveNext()) {
            Copy(this, replacement.Current);
            return true;
        }
        // common case fast-path of first token not matching anything
        AttributeSource firstTok = NextTok();
        if (firstTok == null) {
            return false;
        }
        var termAtt = firstTok.AddAttribute<ICharTermAttribute>();
        SlowSynonymMap result = map.submap != null ? map.submap.Get(termAtt.Buffer(), 0, termAtt.Length) : null;
        if (result == null) {
            Copy(this, firstTok);
            return true;
        }
        // fast-path failed, clone ourselves if needed
        if (firstTok == this) {
            firstTok = CloneAttributes();
        }
        // OK, we matched a token, so find the longest match.
        matched = new LinkedList<AttributeSource>();
        result = Match(result);
        if (result == null) {
            // no match, simply return the first token read.
            Copy(this, firstTok);
            return true;
        }
        // reuse, or create new one each time?
        List<AttributeSource> generated = new List<AttributeSource>(result.synonyms.Length + matched.Count + 1);
        //
        // there was a match... let's generate the new tokens, merging
        // in the matched tokens (position increments need adjusting)
        //
        AttributeSource lastTok = matched.Count == 0 ? firstTok : matched.Last.Value;
        bool includeOrig = result.IncludeOrig;
        AttributeSource origTok = includeOrig ? firstTok : null;
        IPositionIncrementAttribute firstPosIncAtt = firstTok.AddAttribute<IPositionIncrementAttribute>();
        int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
        int repPos = 0; // curr position in replacement token stream
        int pos = 0; // current position in merged token stream
        for (int i = 0; i < result.synonyms.Length; i++) {
            Token repTok = result.synonyms[i];
            AttributeSource newTok = firstTok.CloneAttributes();
            ICharTermAttribute newTermAtt = newTok.AddAttribute<ICharTermAttribute>();
            IOffsetAttribute newOffsetAtt = newTok.AddAttribute<IOffsetAttribute>();
            IPositionIncrementAttribute newPosIncAtt = newTok.AddAttribute<IPositionIncrementAttribute>();
            IOffsetAttribute lastOffsetAtt = lastTok.AddAttribute<IOffsetAttribute>();
            newOffsetAtt.SetOffset(newOffsetAtt.StartOffset(), lastOffsetAtt.EndOffset());
            newTermAtt.CopyBuffer(repTok.Buffer(), 0, repTok.Length);
            repPos += repTok.PositionIncrement;
            if (i == 0) {
                // make position of first token equal to original
                repPos = origPos;
            }
            // if necessary, insert original tokens and adjust position increment
            while (origTok != null && origPos <= repPos) {
                IPositionIncrementAttribute origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                origPosInc.PositionIncrement = origPos - pos;
                generated.Add(origTok);
                pos += origPosInc.PositionIncrement;
                //origTok = matched.Count == 0 ? null : matched.RemoveFirst();
                if (matched.Count == 0) {
                    origTok = null;
                } else {
                    origTok = matched.First.Value;
                    matched.RemoveFirst();
                }
                if (origTok != null) {
                    origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                    origPos += origPosInc.PositionIncrement;
                }
            }
            newPosIncAtt.PositionIncrement = repPos - pos;
            generated.Add(newTok);
            pos += newPosIncAtt.PositionIncrement;
        }
        // finish up any leftover original tokens
        while (origTok != null) {
            IPositionIncrementAttribute origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
            origPosInc.PositionIncrement = origPos - pos;
            generated.Add(origTok);
            pos += origPosInc.PositionIncrement;
            //origTok = matched.Count == 0 ? null : matched.RemoveFirst();
            if (matched.Count == 0) {
                origTok = null;
            } else {
                origTok = matched.First.Value;
                matched.RemoveFirst();
            }
            if (origTok != null) {
                origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                origPos += origPosInc.PositionIncrement;
            }
        }
        // what if we replaced a longer sequence with a shorter one?
        // a/0 b/5 => foo/0
        // should I re-create the gap on the next buffered token?
        replacement = generated.GetEnumerator();
        // Now return to the top of the loop to read and return the first
        // generated token.. The reason this is done is that we may have generated
        // nothing at all, and may need to continue with more matching logic.
    }
}
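// Worked example (assumed, following the scenario comments above): with the
// rules "a b => foo" and "a b c d => bar", input "a b c d" must emit only
// "bar" (longest match wins), while input "a b c x" emits "foo" and then
// resumes matching from "c". Generated tokens such as "foo" are never
// re-matched, since they come back through the separate replacement queue.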
// offsetsAreCorrect also validates:
//   - graph offsets are correct (all tokens leaving from
//     pos X have the same startOffset; all tokens
//     arriving to pos Y have the same endOffset)
//   - offsets only move forwards (startOffset >=
//     lastStartOffset)
public static void AssertTokenStreamContents(TokenStream ts, string[] output, int[] startOffsets, int[] endOffsets, string[] types, int[] posIncrements, int[] posLengths, int? finalOffset, int? finalPosInc, bool[] keywordAtts, bool offsetsAreCorrect) {
    Assert.IsNotNull(output);
    var checkClearAtt = ts.AddAttribute<ICheckClearAttributesAttribute>();
    ICharTermAttribute termAtt = null;
    if (output.Length > 0) {
        Assert.IsTrue(ts.HasAttribute<ICharTermAttribute>(), "has no CharTermAttribute");
        termAtt = ts.GetAttribute<ICharTermAttribute>();
    }
    IOffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null) {
        Assert.IsTrue(ts.HasAttribute<IOffsetAttribute>(), "has no OffsetAttribute");
        offsetAtt = ts.GetAttribute<IOffsetAttribute>();
    }
    ITypeAttribute typeAtt = null;
    if (types != null) {
        Assert.IsTrue(ts.HasAttribute<ITypeAttribute>(), "has no TypeAttribute");
        typeAtt = ts.GetAttribute<ITypeAttribute>();
    }
    IPositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null || finalPosInc != null) {
        Assert.IsTrue(ts.HasAttribute<IPositionIncrementAttribute>(), "has no PositionIncrementAttribute");
        posIncrAtt = ts.GetAttribute<IPositionIncrementAttribute>();
    }
    IPositionLengthAttribute posLengthAtt = null;
    if (posLengths != null) {
        Assert.IsTrue(ts.HasAttribute<IPositionLengthAttribute>(), "has no PositionLengthAttribute");
        posLengthAtt = ts.GetAttribute<IPositionLengthAttribute>();
    }
    IKeywordAttribute keywordAtt = null;
    if (keywordAtts != null) {
        Assert.IsTrue(ts.HasAttribute<IKeywordAttribute>(), "has no KeywordAttribute");
        keywordAtt = ts.GetAttribute<IKeywordAttribute>();
    }
    // Maps position to the start/end offset:
    IDictionary<int?, int?> posToStartOffset = new Dictionary<int?, int?>();
    IDictionary<int?, int?> posToEndOffset = new Dictionary<int?, int?>();
    ts.Reset();
    int pos = -1;
    int lastStartOffset = 0;
    for (int i = 0; i < output.Length; i++) {
        // extra safety to enforce, that the state is not preserved and also assign bogus values
        ts.ClearAttributes();
        termAtt.SetEmpty().Append("bogusTerm");
        if (offsetAtt != null) {
            offsetAtt.SetOffset(14584724, 24683243);
        }
        if (typeAtt != null) {
            typeAtt.Type = "bogusType";
        }
        if (posIncrAtt != null) {
            posIncrAtt.PositionIncrement = 45987657;
        }
        if (posLengthAtt != null) {
            posLengthAtt.PositionLength = 45987653;
        }
        if (keywordAtt != null) {
            keywordAtt.Keyword = (i & 1) == 0;
        }
        bool reset = checkClearAtt.AndResetClearCalled; // reset it, because we called clearAttribute() before
        Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
        Assert.IsTrue(reset, "ClearAttributes() was not called correctly in TokenStream chain");
        Assert.AreEqual(output[i], termAtt.ToString(), "term " + i + ", output[i] = " + output[i] + ", termAtt = " + termAtt.ToString());
        if (startOffsets != null) {
            Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset(), "startOffset " + i);
        }
        if (endOffsets != null) {
            Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset(), "endOffset " + i);
        }
        if (types != null) {
            Assert.AreEqual(types[i], typeAtt.Type, "type " + i);
        }
        if (posIncrements != null) {
            Assert.AreEqual(posIncrements[i], posIncrAtt.PositionIncrement, "posIncrement " + i);
        }
        if (posLengths != null) {
            Assert.AreEqual(posLengths[i], posLengthAtt.PositionLength, "posLength " + i);
        }
        if (keywordAtts != null) {
            Assert.AreEqual(keywordAtts[i], keywordAtt.Keyword, "keywordAtt " + i);
        }
        // we can enforce some basic things about a few attributes even if the caller doesn't check:
        if (offsetAtt != null) {
            int startOffset = offsetAtt.StartOffset();
            int endOffset = offsetAtt.EndOffset();
            if (finalOffset != null) {
                Assert.IsTrue(startOffset <= (int)finalOffset, "startOffset must be <= finalOffset");
                Assert.IsTrue(endOffset <= (int)finalOffset, "endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + (int)finalOffset);
            }
            if (offsetsAreCorrect) {
                Assert.IsTrue(offsetAtt.StartOffset() >= lastStartOffset, "offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset);
                lastStartOffset = offsetAtt.StartOffset();
            }
            if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
                // Validate offset consistency in the graph, ie
                // all tokens leaving from a certain pos have the
                // same startOffset, and all tokens arriving to a
                // certain pos have the same endOffset:
                int posInc = posIncrAtt.PositionIncrement;
                pos += posInc;
                int posLength = posLengthAtt.PositionLength;
                if (!posToStartOffset.ContainsKey(pos)) {
                    // First time we've seen a token leaving from this position:
                    posToStartOffset[pos] = startOffset;
                    //System.out.println("  + s " + pos + " -> " + startOffset);
                } else {
                    // We've seen a token leaving from this position
                    // before; verify the startOffset is the same:
                    //System.out.println("  + vs " + pos + " -> " + startOffset);
                    Assert.AreEqual((int)posToStartOffset[pos], startOffset, "pos=" + pos + " posLen=" + posLength + " token=" + termAtt);
                }
                int endPos = pos + posLength;
                if (!posToEndOffset.ContainsKey(endPos)) {
                    // First time we've seen a token arriving to this position:
                    posToEndOffset[endPos] = endOffset;
                    //System.out.println("  + e " + endPos + " -> " + endOffset);
                } else {
                    // We've seen a token arriving to this position
                    // before; verify the endOffset is the same:
                    //System.out.println("  + ve " + endPos + " -> " + endOffset);
                    Assert.AreEqual((int)posToEndOffset[endPos], endOffset, "pos=" + pos + " posLen=" + posLength + " token=" + termAtt);
                }
            }
        }
        if (posIncrAtt != null) {
            if (i == 0) {
                Assert.IsTrue(posIncrAtt.PositionIncrement >= 1, "first posIncrement must be >= 1");
            } else {
                Assert.IsTrue(posIncrAtt.PositionIncrement >= 0, "posIncrement must be >= 0");
            }
        }
        if (posLengthAtt != null) {
            Assert.IsTrue(posLengthAtt.PositionLength >= 1, "posLength must be >= 1");
        }
    }
    if (ts.IncrementToken()) {
        Assert.Fail("TokenStream has more tokens than expected (expected count=" + output.Length + "); extra token=" + termAtt);
    }
    // repeat our extra safety checks for End()
    ts.ClearAttributes();
    if (termAtt != null) {
        termAtt.SetEmpty().Append("bogusTerm");
    }
    if (offsetAtt != null) {
        offsetAtt.SetOffset(14584724, 24683243);
    }
    if (typeAtt != null) {
        typeAtt.Type = "bogusType";
    }
    if (posIncrAtt != null) {
        posIncrAtt.PositionIncrement = 45987657;
    }
    if (posLengthAtt != null) {
        posLengthAtt.PositionLength = 45987653;
    }
    var reset_ = checkClearAtt.AndResetClearCalled; // reset it, because we called clearAttribute() before
    ts.End();
    Assert.IsTrue(checkClearAtt.AndResetClearCalled, "super.End()/ClearAttributes() was not called correctly in End()");
    if (finalOffset != null) {
        Assert.AreEqual((int)finalOffset, offsetAtt.EndOffset(), "finalOffset");
    }
    if (offsetAtt != null) {
        Assert.IsTrue(offsetAtt.EndOffset() >= 0, "finalOffset must be >= 0");
    }
    if (finalPosInc != null) {
        Assert.AreEqual((int)finalPosInc, posIncrAtt.PositionIncrement, "finalPosInc");
    }
    ts.Dispose();
}
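// Hypothetical usage sketch (values invented for illustration): check a
// whitespace tokenizer's output for "foo bar", including offsets, types,
// position increments/lengths, and the final offset after End():
//
//   AssertTokenStreamContents(ts,
//       new[] { "foo", "bar" },         // output
//       new[] { 0, 4 },                 // startOffsets
//       new[] { 3, 7 },                 // endOffsets
//       new[] { "word", "word" },       // types
//       new[] { 1, 1 },                 // posIncrements
//       new[] { 1, 1 },                 // posLengths
//       7,                              // finalOffset
//       null, null, true);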