Пример #1
0
        /// <summary>
        /// Advances the wrapped stream by one token and hands every word of the term to
        /// <c>ProcessWord</c>. Words are delimited by <c>'.'</c> or by any character that
        /// compares less than or equal to a space.
        /// </summary>
        /// <returns>
        /// <c>false</c> when the wrapped stream has no further tokens; otherwise <c>true</c>.
        /// </returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] buffer = termAtt.Buffer();
            int length = termAtt.Length;

            // A snapshot of the term is taken only when maxWordCount is below
            // DEFAULT_MAX_WORD_COUNT; it is copied back if the word count is exceeded.
            char[] snapshot = null;
            if (maxWordCount < DEFAULT_MAX_WORD_COUNT)
            {
                snapshot = new char[length];
                Array.Copy(buffer, snapshot, length);
            }

            // Terms whose length reaches maxTokenLength are passed through untouched.
            if (length >= maxTokenLength)
            {
                return true;
            }

            int wordsSeen = 0;
            int wordStart = 0;
            int pos = 0;

            while (pos < length)
            {
                char ch = buffer[pos];
                bool isSeparator = ch <= ' ' || ch == '.';
                int pending = pos - wordStart;

                if (isSeparator && pending > 0)
                {
                    // presumably ProcessWord edits the buffer in place - confirm against its definition
                    ProcessWord(buffer, wordStart, pending, wordsSeen);
                    wordsSeen++;
                    wordStart = pos + 1;

                    // Step over the separator AND the character right after it: that
                    // character is never tested as a separator, it simply becomes the
                    // first character of the next word.
                    pos += 2;
                }
                else
                {
                    pos++;
                }
            }

            // Whatever follows the last recognised separator is the final word.
            int tail = length - wordStart;
            if (tail > 0)
            {
                ProcessWord(buffer, wordStart, tail, wordsSeen);
                wordsSeen++;
            }

            // Too many words: put the snapshot taken above back into the term.
            if (wordsSeen > maxWordCount)
            {
                termAtt.CopyBuffer(snapshot, 0, length);
            }

            return true;
        }
Пример #2
0
 /// <summary>
 /// Advances to the next edge n-gram.
 /// <para>
 /// When no term is currently buffered, the next token is pulled from the wrapped
 /// stream and its text, length, code point count, offsets, position increment and
 /// position length are captured. One gram is then emitted per call, with sizes
 /// running from <c>minGram</c> up to <c>maxGram</c> (measured in code points) and
 /// taken from the front or the back of the term depending on <c>side</c>.
 /// </para>
 /// </summary>
 /// <returns>
 /// <c>true</c> if a gram was written to the attributes; <c>false</c> once the
 /// wrapped stream is exhausted.
 /// </returns>
 public override bool IncrementToken()
 {
     while (true)
     {
         if (curTermBuffer == null)
         {
             if (!input.IncrementToken())
             {
                 return false;
             }

             // Array.Clone() is declared to return object, so the copy has to be cast
             // back to char[] before it can be stored and sliced below.
             curTermBuffer     = (char[])termAtt.Buffer().Clone();
             curTermLength     = termAtt.Length();
             curCodePointCount = charUtils.codePointCount(termAtt);
             curGramSize       = minGram;
             tokStart          = offsetAtt.StartOffset();
             tokEnd            = offsetAtt.EndOffset();
             if (version.OnOrAfter(Version.LUCENE_44))
             {
                 // Never update offsets
                 updateOffsets = false;
             }
             else
             {
                 // if length by start + end offsets doesn't match the term text then assume
                 // this is a synonym and don't adjust the offsets.
                 updateOffsets = (tokStart + curTermLength) == tokEnd;
             }

             // Accumulated rather than assigned: the increment of an input token that
             // yields no gram is carried over to the next gram that is emitted.
             savePosIncr += posIncrAtt.PositionIncrement;
             savePosLen   = posLenAtt.PositionLength;
         }

         // A gram is produced only while the size is within the configured range and
         // the term has at least that many code points.
         if (curGramSize <= maxGram && curGramSize <= curCodePointCount)
         {
             // grab gramSize code points from front or back
             int start = side == Side.FRONT ? 0 : charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize);
             int end   = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
             ClearAttributes();
             if (updateOffsets)
             {
                 offsetAtt.SetOffset(tokStart + start, tokStart + end);
             }
             else
             {
                 offsetAtt.SetOffset(tokStart, tokEnd);
             }

             // first ngram gets increment, others don't
             if (curGramSize == minGram)
             {
                 posIncrAtt.PositionIncrement = savePosIncr;
                 savePosIncr = 0;
             }
             else
             {
                 posIncrAtt.PositionIncrement = 0;
             }
             posLenAtt.PositionLength = savePosLen;
             termAtt.CopyBuffer(curTermBuffer, start, end - start);
             curGramSize++;
             return true;
         }

         // Nothing (more) to emit for this term: drop it so the next pass pulls a new token.
         curTermBuffer = null;
     }
 }
Пример #3
0
 /// <summary>
 /// Advances to the next n-gram of the current term, pulling a new token from the
 /// wrapped stream whenever the current term is exhausted.
 /// <para>
 /// For <c>Version.LUCENE_44</c> and later, grams are produced position by position
 /// (all sizes from <c>minGram</c> to <c>maxGram</c> at one start position before
 /// moving to the next), positions and sizes are counted in code points, every gram
 /// keeps the original token's offsets, and only the first gram of a token carries
 /// the token's position increment.
 /// </para>
 /// <para>
 /// For earlier versions, grams are produced size by size (all positions for one
 /// size before the size grows), positions and sizes are counted in <c>char</c>
 /// units, and offsets are narrowed to the gram unless the token's offsets do not
 /// match its term length.
 /// </para>
 /// </summary>
 /// <returns>
 /// <c>true</c> if a gram was written to the attributes; <c>false</c> at the end of
 /// the wrapped stream.
 /// </returns>
 public override bool IncrementToken()
 {
     while (true)
     {
         if (curTermBuffer == null)
         {
             if (!input.IncrementToken())
             {
                 return false;
             }

             // Array.Clone() is declared to return object, so the copy has to be cast
             // back to char[] before it can be stored and sliced below.
             curTermBuffer     = (char[])termAtt.Buffer().Clone();
             curTermLength     = termAtt.Length;
             curCodePointCount = charUtils.CodePointCount(termAtt);
             curGramSize       = minGram;
             curPos            = 0;
             curPosInc         = posIncAtt.PositionIncrement;
             curPosLen         = posLenAtt.PositionLength;
             tokStart          = offsetAtt.StartOffset();
             tokEnd            = offsetAtt.EndOffset();
             // if length by start + end offsets doesn't match the term text then assume
             // this is a synonym and don't adjust the offsets.
             hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
         }

         if (version.OnOrAfter(Version.LUCENE_44))
         {
             // Sizes for the current start position are used up (or no longer fit):
             // move one code point to the right and start again at the smallest size.
             if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount)
             {
                 ++curPos;
                 curGramSize = minGram;
             }

             if ((curPos + curGramSize) <= curCodePointCount)
             {
                 ClearAttributes();
                 int start = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                 int end   = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                 termAtt.CopyBuffer(curTermBuffer, start, end - start);

                 // Only the first gram of a token keeps the token's increment;
                 // later grams of the same token get 0.
                 posIncAtt.PositionIncrement = curPosInc;
                 curPosInc = 0;
                 posLenAtt.PositionLength = curPosLen;
                 offsetAtt.SetOffset(tokStart, tokEnd);
                 curGramSize++;
                 return true;
             }
         }
         else
         {
             while (curGramSize <= maxGram)
             {
                 if (curPos + curGramSize <= curTermLength) // there is still input for this size
                 {
                     ClearAttributes();
                     termAtt.CopyBuffer(curTermBuffer, curPos, curGramSize);
                     if (hasIllegalOffsets)
                     {
                         offsetAtt.SetOffset(tokStart, tokEnd);
                     }
                     else
                     {
                         offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                     }
                     curPos++;
                     return true;
                 }

                 curGramSize++; // increase n-gram size
                 curPos = 0;
             }
         }

         // Nothing (more) to emit for this term: drop it so the next pass pulls a new token.
         curTermBuffer = null;
     }
 }