/// <summary>
/// Advances the wrapped stream by one token and hands each word of the term
/// buffer to <c>ProcessWord</c>. Words are delimited by any character
/// <c>&lt;= ' '</c> or by <c>'.'</c>.
/// </summary>
/// <returns>
/// <c>false</c> when the wrapped input has no further tokens; otherwise <c>true</c>.
/// </returns>
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }

    char[] chars = termAtt.Buffer();
    int length = termAtt.Length;

    // A snapshot of the untouched term is only taken when a word limit is
    // configured, so the term can be restored if that limit is exceeded.
    char[] snapshot = null;
    if (maxWordCount < DEFAULT_MAX_WORD_COUNT)
    {
        snapshot = new char[length];
        Array.Copy(chars, 0, snapshot, 0, length);
    }

    // Terms at or above the length limit are passed through unmodified.
    if (length >= maxTokenLength)
    {
        return true;
    }

    int words = 0;
    int wordStart = 0;
    int pos = 0;
    while (pos < length)
    {
        char ch = chars[pos];
        bool isDelimiter = ch <= ' ' || ch == '.';

        // A delimiter only closes a word when at least one character has
        // accumulated since the current word start.
        if (isDelimiter && pos > wordStart)
        {
            ProcessWord(chars, wordStart, pos - wordStart, words);
            words++;
            wordStart = pos + 1;
            // The character right after the delimiter is skipped by the scan
            // (it still belongs to the next word, which starts at wordStart).
            pos++;
        }

        pos++;
    }

    // Trailing word that was not terminated by a delimiter.
    if (wordStart < length)
    {
        ProcessWord(chars, wordStart, length - wordStart, words);
        words++;
    }

    // Too many words: put the original term text back.
    if (words > maxWordCount)
    {
        termAtt.CopyBuffer(snapshot, 0, length);
    }

    return true;
}
/// <summary>
/// Emits the next edge n-gram of the current input token, pulling a new token
/// from the wrapped stream whenever the current one is exhausted.
/// For each input token, grams of size <c>minGram</c> up to <c>maxGram</c>
/// (measured in code points) are produced from the front or the back of the
/// term, depending on <c>side</c>.
/// </summary>
/// <returns>
/// <c>true</c> when a gram was written to the attributes; <c>false</c> once the
/// wrapped input has no more tokens.
/// </returns>
public override bool IncrementToken()
{
    while (true)
    {
        if (curTermBuffer == null)
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            // Array.Clone() returns object, so the cast back to char[] is required.
            curTermBuffer = (char[])termAtt.Buffer().Clone();
            curTermLength = termAtt.Length;
            curCodePointCount = charUtils.CodePointCount(termAtt);
            curGramSize = minGram;
            tokStart = offsetAtt.StartOffset();
            tokEnd = offsetAtt.EndOffset();
            if (version.OnOrAfter(Version.LUCENE_44))
            {
                // Never update offsets
                updateOffsets = false;
            }
            else
            {
                // if length by start + end offsets doesn't match the term text then assume
                // this is a synonym and don't adjust the offsets.
                updateOffsets = (tokStart + curTermLength) == tokEnd;
            }
            // Accumulated (+=) so that increments of input tokens that yielded
            // no gram are carried over to the next emitted gram.
            savePosIncr += posIncrAtt.PositionIncrement;
            savePosLen = posLenAtt.PositionLength;
        }

        // Emit only while the gram size is within the configured range and the
        // term still has enough code points to fill a gram of that size.
        if (curGramSize <= maxGram && curGramSize <= curCodePointCount)
        {
            // grab gramSize chars from front or back
            int start = side == Side.FRONT
                ? 0
                : charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize);
            int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);

            ClearAttributes();
            if (updateOffsets)
            {
                offsetAtt.SetOffset(tokStart + start, tokStart + end);
            }
            else
            {
                offsetAtt.SetOffset(tokStart, tokEnd);
            }

            // first ngram gets increment, others don't
            if (curGramSize == minGram)
            {
                posIncrAtt.PositionIncrement = savePosIncr;
                savePosIncr = 0;
            }
            else
            {
                posIncrAtt.PositionIncrement = 0;
            }
            posLenAtt.PositionLength = savePosLen;
            termAtt.CopyBuffer(curTermBuffer, start, end - start);
            curGramSize++;
            return true;
        }

        // Current token is exhausted (or too short); fetch the next one.
        curTermBuffer = null;
    }
}
/// <summary>
/// Advances to the next n-gram of the current input token, pulling a new token
/// from the wrapped stream whenever the current one is exhausted.
/// </summary>
/// <remarks>
/// For <c>Version.LUCENE_44</c> and later, grams are enumerated per start
/// position (all sizes from <c>minGram</c> to <c>maxGram</c> at one position
/// before moving on), sizes are counted in code points, and every gram keeps
/// the offsets of the original token. For earlier versions, grams are
/// enumerated per size (all positions for one size before growing), slicing is
/// done by <c>char</c> index, and offsets are narrowed to the gram unless the
/// token's offsets do not match its term length.
/// </remarks>
/// <returns>
/// <c>true</c> when a gram was written to the attributes; <c>false</c> at end of stream.
/// </returns>
public override bool IncrementToken()
{
    while (true)
    {
        if (curTermBuffer == null)
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            // Array.Clone() returns object, so the cast back to char[] is required.
            curTermBuffer = (char[])termAtt.Buffer().Clone();
            curTermLength = termAtt.Length;
            curCodePointCount = charUtils.CodePointCount(termAtt);
            curGramSize = minGram;
            curPos = 0;
            curPosInc = posIncAtt.PositionIncrement;
            curPosLen = posLenAtt.PositionLength;
            tokStart = offsetAtt.StartOffset();
            tokEnd = offsetAtt.EndOffset();
            // if length by start + end offsets doesn't match the term text then assume
            // this is a synonym and don't adjust the offsets.
            hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
        }

        if (version.OnOrAfter(Version.LUCENE_44))
        {
            // Sizes at this position are exhausted (or no longer fit):
            // move to the next start position and restart at the smallest size.
            if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount)
            {
                ++curPos;
                curGramSize = minGram;
            }

            if ((curPos + curGramSize) <= curCodePointCount)
            {
                ClearAttributes();
                int start = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                termAtt.CopyBuffer(curTermBuffer, start, end - start);
                // Only the first gram of a token carries its position increment.
                posIncAtt.PositionIncrement = curPosInc;
                curPosInc = 0;
                posLenAtt.PositionLength = curPosLen;
                offsetAtt.SetOffset(tokStart, tokEnd);
                curGramSize++;
                return true;
            }
        }
        else
        {
            while (curGramSize <= maxGram)
            {
                // Emit one gram per call while the current size still fits at curPos.
                if (curPos + curGramSize <= curTermLength)
                {
                    ClearAttributes();
                    termAtt.CopyBuffer(curTermBuffer, curPos, curGramSize);
                    if (hasIllegalOffsets)
                    {
                        offsetAtt.SetOffset(tokStart, tokEnd);
                    }
                    else
                    {
                        offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                    }
                    curPos++;
                    return true;
                }

                curGramSize++; // increase n-gram size
                curPos = 0;
            }
        }

        // Current token is exhausted; fetch the next one.
        curTermBuffer = null;
    }
}