/// <summary>
/// Saves this information to form the left part of a gram.
/// </summary>
private void SaveTermBuffer()
{
    buffer.Length = 0;
    buffer.Append(termAttribute.Buffer(), 0, termAttribute.Length);
    buffer.Append(SEPARATOR);
    lastStartOffset = offsetAttribute.StartOffset();
    lastWasCommon = Common;
}
/// <summary>
/// <inheritdoc/>
/// </summary>
public override bool IncrementToken()
{
    while (input.IncrementToken())
    {
        char[] term = termAttribute.Buffer();
        int length = termAttribute.Length;
        int posIncrement = posIncAttribute.PositionIncrement;

        if (posIncrement > 0)
        {
            previous.Clear();
        }

        bool duplicate = (posIncrement == 0 && previous.Contains(term, 0, length));

        // Clone the term, and add it to the set of seen terms.
        char[] saved = new char[length];
        Array.Copy(term, 0, saved, 0, length);
        previous.Add(saved);

        if (!duplicate)
        {
            return true;
        }
    }
    return false;
}
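// Hedged usage sketch (not part of the original sources): how a consumer typically
// drives an IncrementToken-based stream such as the duplicate-removing filter above.
// "analyzer" stands in for whatever Analyzer wraps the filter; the snippets in this
// section use an older port style (Buffer()/StartOffset() as methods), so attribute
// and method names may differ slightly across Lucene.NET versions.
using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;

internal static class TokenStreamDemo
{
    // Prints every token the chain emits. Reset/IncrementToken/End/Dispose
    // is the TokenStream consumption contract.
    internal static void DumpTokens(Analyzer analyzer, string text)
    {
        using (TokenStream ts = analyzer.GetTokenStream("field", new StringReader(text)))
        {
            ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
            ts.Reset();
            while (ts.IncrementToken())
            {
                Console.WriteLine(termAtt.ToString());
            }
            ts.End();
        }
    }
}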
public override bool IncrementToken()
{
    if (!done)
    {
        ClearAttributes();
        done = true;
        int upto = 0;
        char[] buffer = termAtt.Buffer();
        while (true)
        {
            int length = input.Read(buffer, upto, buffer.Length - upto);
            // .NET's TextReader.Read returns 0 at end of input (Java's Reader
            // returns -1), so treat anything non-positive as end-of-stream.
            if (length <= 0)
            {
                break;
            }
            upto += length;
            if (upto == buffer.Length)
            {
                buffer = termAtt.ResizeBuffer(1 + buffer.Length);
            }
        }
        termAtt.Length = upto;
        finalOffset = CorrectOffset(upto);
        offsetAtt.SetOffset(CorrectOffset(0), finalOffset);
        return true;
    }
    return false;
}
/// <summary>
/// Increments the <seealso cref="TokenStream"/> with a <seealso cref="CharTermAttribute"/> without the elided start.
/// </summary>
public override bool IncrementToken()
{
    if (input.IncrementToken())
    {
        char[] termBuffer = termAtt.Buffer();
        int termLength = termAtt.Length;
        int index = -1;
        for (int i = 0; i < termLength; i++)
        {
            char ch = termBuffer[i];
            if (ch == '\'' || ch == '\u2019')
            {
                index = i;
                break;
            }
        }

        // An apostrophe has been found. If the prefix is an article, strip it off.
        if (index >= 0 && articles.Contains(termBuffer, 0, index))
        {
            termAtt.CopyBuffer(termBuffer, index + 1, termLength - (index + 1));
        }

        return true;
    }
    else
    {
        return false;
    }
}
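// Illustrative sketch (not from the original sources): the same elision rule applied to
// a plain string, with a hypothetical article set. "l'avion" becomes "avion" because
// "l" is an article; "aujourd'hui" stays intact because "aujourd" is not.
using System;
using System.Collections.Generic;

internal static class ElisionDemo
{
    private static readonly HashSet<string> Articles =
        new HashSet<string>(StringComparer.OrdinalIgnoreCase) { "l", "m", "t", "qu", "n", "s", "j" };

    internal static string StripElision(string token)
    {
        // Find the first ASCII or typographic apostrophe, as the filter above does.
        int index = token.IndexOfAny(new[] { '\'', '\u2019' });
        if (index >= 0 && Articles.Contains(token.Substring(0, index)))
        {
            return token.Substring(index + 1); // strip the article and the apostrophe
        }
        return token;
    }
}
// StripElision("l'avion") == "avion"; StripElision("aujourd'hui") == "aujourd'hui"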
public override bool IncrementToken()
{
    if (endState != null)
    {
        return false;
    }
    if (!Input.IncrementToken())
    {
        return false;
    }

    int skippedPositions = 0;
    while (true)
    {
        if (stopWords.Contains(termAtt.Buffer(), 0, termAtt.Length))
        {
            int posInc = posIncAtt.PositionIncrement;
            int endOffset = offsetAtt.EndOffset();

            // This may be a stopword: capture state in case it turns out to be the last token.
            State sav = CaptureState();
            if (Input.IncrementToken())
            {
                // It was a stopword; skip it.
                skippedPositions += posInc;
            }
            else
            {
                ClearAttributes();
                Input.End();
                endState = CaptureState();
                int finalEndOffset = offsetAtt.EndOffset();
                Debug.Assert(finalEndOffset >= endOffset);
                if (finalEndOffset > endOffset)
                {
                    // There was a token separator after the stopword, so it really was a stopword.
                    return false;
                }
                else
                {
                    // No token separator after the final token that looked like a stopword; don't filter it:
                    RestoreState(sav);
                    posIncAtt.PositionIncrement = skippedPositions + posIncAtt.PositionIncrement;
                    keywordAtt.Keyword = true;
                    return true;
                }
            }
        }
        else
        {
            // Not a stopword; return the current token:
            posIncAtt.PositionIncrement = skippedPositions + posIncAtt.PositionIncrement;
            return true;
        }
    }
}
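// Hedged behavior note (inferred from the logic above, typical of suggest/autocomplete
// stop filtering): a trailing "the" with no separator after it is kept and marked as a
// keyword, because the user may still be typing a longer word such as "theater", while
// "the " followed by a separator is dropped as a genuine stopword.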
private void AssertEquals(CharTermAttribute term, string expected)
{
    assertEquals(expected.Length, term.Length);
    char[] buffer = term.Buffer();
    for (int chIDX = 0; chIDX < expected.Length; chIDX++)
    {
        assertEquals(expected[chIDX], buffer[chIDX]);
    }
}
public override bool IncrementToken()
{
    ClearAttributes();

    // Termination of this loop is guaranteed by the fact that every iteration
    // either advances the buffer (calls Consume()) or increases gramSize.
    while (true)
    {
        // Compact the buffer when the window approaches its end.
        if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted)
        {
            Array.Copy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
            bufferEnd -= bufferStart;
            lastCheckedChar -= bufferStart;
            lastNonTokenChar -= bufferStart;
            bufferStart = 0;

            // Fill in the remaining space.
            exhausted = !charUtils.Fill(charBuffer, input, buffer.Length - bufferEnd);
            // Convert to code points.
            bufferEnd += charUtils.ToCodePoints(charBuffer.Buffer, 0, charBuffer.Length, buffer, bufferEnd);
        }

        // Should we move to the next offset?
        if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd)
        {
            if (bufferStart + 1 + minGram > bufferEnd)
            {
                Debug.Assert(exhausted);
                return false;
            }
            Consume();
            gramSize = minGram;
        }

        UpdateLastNonTokenChar();

        // Retry if the gram to be emitted would contain a non-token char, or if we only
        // emit edge grams and this gram doesn't start right after a non-token char.
        bool termContainsNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
        bool isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
        if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar)
        {
            Consume();
            gramSize = minGram;
            continue;
        }

        int length = charUtils.ToChars(buffer, bufferStart, gramSize, termAtt.Buffer(), 0);
        termAtt.Length = length;
        posIncAtt.PositionIncrement = 1;
        posLenAtt.PositionLength = 1;
        offsetAtt.SetOffset(CorrectOffset(offset), CorrectOffset(offset + length));
        ++gramSize;
        return true;
    }
}
public override bool IncrementToken()
{
    if (input.IncrementToken())
    {
        int len = termAtt.Length;
        if (marker != NOMARKER)
        {
            len++;
            termAtt.ResizeBuffer(len);
            termAtt.Buffer()[len - 1] = marker;
        }
        Reverse(matchVersion, termAtt.Buffer(), 0, len);
        termAtt.Length = len;
        return true;
    }
    else
    {
        return false;
    }
}
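// Illustrative sketch (not from the original sources): what the filter above does to a
// single term. The marker is appended at the end and then the whole buffer is reversed,
// so the marker ends up at the front of the reversed term.
using System;

internal static class ReverseMarkerDemo
{
    internal static string ReverseWithMarker(string term, char marker)
    {
        char[] buf = (term + marker).ToCharArray(); // append the marker, as the filter does
        Array.Reverse(buf);                         // reverse the whole buffer, marker included
        return new string(buf);
    }
}
// ReverseWithMarker("foo", '\u0001') == "\u0001oof"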
public override bool IncrementToken()
{
    if (Index <= 'z')
    {
        ClearAttributes();
        TermAtt.Length = 1;
        TermAtt.Buffer()[0] = (char)Index++;
        return true;
    }
    return false;
}
public override bool IncrementToken()
{
    if (input.IncrementToken())
    {
        charUtils.ToLower(termAtt.Buffer(), 0, termAtt.Length);
        return true;
    }
    else
    {
        return false;
    }
}
public override bool IncrementToken()
{
    if (!input.IncrementToken())
    {
        return false;
    }

    char[] termBuffer = termAtt.Buffer();
    int termBufferLength = termAtt.Length;
    char[] backup = null;

    if (maxWordCount < DEFAULT_MAX_WORD_COUNT)
    {
        // Make a backup in case we exceed the word count.
        backup = new char[termBufferLength];
        Array.Copy(termBuffer, 0, backup, 0, termBufferLength);
    }

    if (termBufferLength < maxTokenLength)
    {
        int wordCount = 0;
        int lastWordStart = 0;
        for (int i = 0; i < termBufferLength; i++)
        {
            char c = termBuffer[i];
            if (c <= ' ' || c == '.')
            {
                int len = i - lastWordStart;
                if (len > 0)
                {
                    ProcessWord(termBuffer, lastWordStart, len, wordCount++);
                    lastWordStart = i + 1;
                    i++;
                }
            }
        }

        // Process the last word.
        if (lastWordStart < termBufferLength)
        {
            ProcessWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++);
        }

        if (wordCount > maxWordCount)
        {
            termAtt.CopyBuffer(backup, 0, termBufferLength);
        }
    }
    return true;
}
public override bool Accept()
{
    return words.Contains(termAtt.Buffer(), 0, termAtt.Length);
}
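// Hedged usage sketch (not from the original sources): the kind of set backing the
// Accept() check above. CharArraySet matches a char[] range directly, without
// allocating a string; constructor overloads vary across Lucene.NET versions.
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;

internal static class KeepWordsDemo
{
    internal static bool Demo()
    {
        CharArraySet words = new CharArraySet(LuceneVersion.LUCENE_48, 2, true); // true = ignore case
        words.Add("keep");
        words.Add("hold");

        char[] buffer = "keeper".ToCharArray();
        // Matches the first four chars ("keep") without a substring allocation.
        return words.Contains(buffer, 0, 4);
    }
}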
public override bool IncrementToken()
{
    while (true)
    {
        if (curTermBuffer == null)
        {
            if (!input.IncrementToken())
            {
                return false;
            }
            else
            {
                curTermBuffer = (char[])termAtt.Buffer().Clone();
                curTermLength = termAtt.Length;
                curCodePointCount = charUtils.CodePointCount(termAtt);
                curGramSize = minGram;
                tokStart = offsetAtt.StartOffset();
                tokEnd = offsetAtt.EndOffset();
                if (version.OnOrAfter(Version.LUCENE_44))
                {
                    // Never update offsets.
                    updateOffsets = false;
                }
                else
                {
                    // If the length implied by the start and end offsets doesn't match the
                    // term text, assume this is a synonym and don't adjust the offsets.
                    updateOffsets = (tokStart + curTermLength) == tokEnd;
                }
                savePosIncr += posIncrAtt.PositionIncrement;
                savePosLen = posLenAtt.PositionLength;
            }
        }
        if (curGramSize <= maxGram) // once we pass the end of the n-gram size range, quit
        {
            if (curGramSize <= curCodePointCount) // the remaining input must be long enough for this gram size
            {
                // Grab gramSize chars from the front or the back.
                int start = side == Side.FRONT ? 0 : charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize);
                int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                ClearAttributes();
                if (updateOffsets)
                {
                    offsetAtt.SetOffset(tokStart + start, tokStart + end);
                }
                else
                {
                    offsetAtt.SetOffset(tokStart, tokEnd);
                }
                // The first n-gram gets the increment; the others don't.
                if (curGramSize == minGram)
                {
                    posIncrAtt.PositionIncrement = savePosIncr;
                    savePosIncr = 0;
                }
                else
                {
                    posIncrAtt.PositionIncrement = 0;
                }
                posLenAtt.PositionLength = savePosLen;
                termAtt.CopyBuffer(curTermBuffer, start, end - start);
                curGramSize++;
                return true;
            }
        }
        curTermBuffer = null;
    }
}
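// Illustrative sketch (not from the original sources): front edge n-grams computed on a
// plain string, ignoring the code-point and offset bookkeeping the filter above performs.
// For "lucene" with minGram=1 and maxGram=3 it yields "l", "lu", "luc".
using System.Collections.Generic;

internal static class EdgeNGramDemo
{
    internal static IEnumerable<string> FrontEdgeNGrams(string term, int minGram, int maxGram)
    {
        for (int size = minGram; size <= maxGram && size <= term.Length; size++)
        {
            yield return term.Substring(0, size); // grow the prefix one unit at a time
        }
    }
}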
public override bool IncrementToken()
{
    ClearAttributes();
    int length = 0;
    int start = -1; // this variable is always initialized
    int end = -1;
    char[] buffer = termAtt.Buffer();
    while (true)
    {
        if (bufferIndex >= dataLen)
        {
            offset += dataLen;
            charUtils.Fill(ioBuffer, input); // read supplementary-char aware with CharacterUtils
            if (ioBuffer.Length == 0)
            {
                dataLen = 0; // so the next offset += dataLen won't decrement offset
                if (length > 0)
                {
                    break;
                }
                else
                {
                    finalOffset = CorrectOffset(offset);
                    return false;
                }
            }
            dataLen = ioBuffer.Length;
            bufferIndex = 0;
        }
        // Use CharacterUtils here to support the pre-3.1 UTF-16 code unit behavior if the char-based methods are gone.
        int c = charUtils.CodePointAt(ioBuffer.Buffer, bufferIndex, ioBuffer.Length);
        int charCount = Character.CharCount(c);
        bufferIndex += charCount;

        if (IsTokenChar(c)) // if it's a token char
        {
            if (length == 0) // start of token
            {
                Debug.Assert(start == -1);
                start = offset + bufferIndex - charCount;
                end = start;
            }
            else if (length >= buffer.Length - 1) // check if a supplementary could run out of bounds
            {
                buffer = termAtt.ResizeBuffer(2 + length); // make sure a supplementary fits in the buffer
            }
            end += charCount;
            length += Character.ToChars(Normalize(c), buffer, length); // buffer it, normalized
            if (length >= MAX_WORD_LEN) // buffer overflow! check for >= because a surrogate pair could break an == test
            {
                break;
            }
        }
        else if (length > 0) // at a non-letter with buffered chars
        {
            break; // return 'em
        }
    }

    termAtt.Length = length;
    Debug.Assert(start != -1);
    offsetAtt.SetOffset(CorrectOffset(start), finalOffset = CorrectOffset(end));
    return true;
}
/// <summary>
/// Returns <c>true</c> for the next n-gram in the stream, or <c>false</c> at EOS.
/// </summary>
public override bool IncrementToken()
{
    while (true)
    {
        if (curTermBuffer == null)
        {
            if (!input.IncrementToken())
            {
                return false;
            }
            else
            {
                curTermBuffer = (char[])termAtt.Buffer().Clone();
                curTermLength = termAtt.Length;
                curCodePointCount = charUtils.CodePointCount(termAtt);
                curGramSize = minGram;
                curPos = 0;
                curPosInc = posIncAtt.PositionIncrement;
                curPosLen = posLenAtt.PositionLength;
                tokStart = offsetAtt.StartOffset();
                tokEnd = offsetAtt.EndOffset();
                // If the length implied by the start and end offsets doesn't match the
                // term text, assume this is a synonym and don't adjust the offsets.
                hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
            }
        }
        if (version.OnOrAfter(Version.LUCENE_44))
        {
            if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount)
            {
                ++curPos;
                curGramSize = minGram;
            }
            if ((curPos + curGramSize) <= curCodePointCount)
            {
                ClearAttributes();
                int start = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                termAtt.CopyBuffer(curTermBuffer, start, end - start);
                posIncAtt.PositionIncrement = curPosInc;
                curPosInc = 0;
                posLenAtt.PositionLength = curPosLen;
                offsetAtt.SetOffset(tokStart, tokEnd);
                curGramSize++;
                return true;
            }
        }
        else
        {
            while (curGramSize <= maxGram)
            {
                while (curPos + curGramSize <= curTermLength) // while there is input
                {
                    ClearAttributes();
                    termAtt.CopyBuffer(curTermBuffer, curPos, curGramSize);
                    if (hasIllegalOffsets)
                    {
                        offsetAtt.SetOffset(tokStart, tokEnd);
                    }
                    else
                    {
                        offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                    }
                    curPos++;
                    return true;
                }
                curGramSize++; // increase n-gram size
                curPos = 0;
            }
        }
        curTermBuffer = null;
    }
}
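// Illustrative sketch (not from the original sources): the post-4.4 emission order of the
// filter above, on a plain string and without code-point or offset handling. All gram
// sizes at one position are emitted before the window advances, so "abc" with minGram=1
// and maxGram=2 yields "a", "ab", "b", "bc", "c".
using System.Collections.Generic;

internal static class NGramDemo
{
    internal static IEnumerable<string> NGrams(string term, int minGram, int maxGram)
    {
        for (int pos = 0; pos + minGram <= term.Length; pos++) // slide the window start
        {
            for (int size = minGram; size <= maxGram && pos + size <= term.Length; size++)
            {
                yield return term.Substring(pos, size); // emit grams of growing size
            }
        }
    }
}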