Пример #1
0
        /// <summary>
        /// Advances this filter to its next n-gram, pulling a fresh token from the
        /// wrapped input whenever the current term has no grams left to emit.
        /// </summary>
        /// <returns>
        /// <c>true</c> when a gram has been written to the attributes; <c>false</c>
        /// once the wrapped input reports that it has no more tokens.
        /// </returns>
        public override sealed bool IncrementToken()
        {
            for (;;)
            {
                if (curTermBuffer == null)
                {
                    // No term in progress: fetch the next one, or stop for good.
                    if (!m_input.IncrementToken())
                    {
                        return false;
                    }

                    // Snapshot the incoming token into fields, since the attributes
                    // themselves are overwritten for every gram emitted below.
                    curTermBuffer = (char[])termAtt.Buffer.Clone();
                    curTermLength = termAtt.Length;
                    curCodePointCount = charUtils.CodePointCount(termAtt.ToString());
                    curGramSize = minGram;
                    curPos = 0;
                    curPosInc = posIncAtt.PositionIncrement;
                    curPosLen = posLenAtt.PositionLength;
                    tokStart = offsetAtt.StartOffset;
                    tokEnd = offsetAtt.EndOffset;

                    // When the offset span disagrees with the term length, treat the
                    // token as a synonym and leave its offsets unadjusted.
                    hasIllegalOffsets = tokEnd != (tokStart + curTermLength);
                }

#pragma warning disable 612, 618
                bool isLucene44OrLater = version.OnOrAfter(LuceneVersion.LUCENE_44);
#pragma warning restore 612, 618

                if (isLucene44OrLater)
                {
                    // Grams sharing a start position are emitted in increasing size.
                    // Once the size range or the term is exhausted for this position,
                    // step to the next position and restart at the minimum size.
                    bool positionExhausted = curGramSize > maxGram
                        || (curPos + curGramSize) > curCodePointCount;
                    if (positionExhausted)
                    {
                        ++curPos;
                        curGramSize = minGram;
                    }

                    bool gramFits = (curPos + curGramSize) <= curCodePointCount;
                    if (gramFits)
                    {
                        ClearAttributes();

                        // curPos and curGramSize are counted in code points; convert
                        // them to char offsets within the buffered term.
                        int gramStart = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                        int gramEnd = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, gramStart, curGramSize);
                        termAtt.CopyBuffer(curTermBuffer, gramStart, gramEnd - gramStart);

                        // Only the first gram of a term carries the saved increment.
                        posIncAtt.PositionIncrement = curPosInc;
                        curPosInc = 0;
                        posLenAtt.PositionLength = curPosLen;

                        // Every gram reports the offsets of the original token.
                        offsetAtt.SetOffset(tokStart, tokEnd);

                        curGramSize++;
                        return true;
                    }
                }
                else
                {
                    // Legacy ordering: every gram of one size across the whole term,
                    // then the next size. Lengths here are counted in chars, and the
                    // position attributes are not assigned in this branch.
                    while (curGramSize <= maxGram)
                    {
                        if (curPos + curGramSize <= curTermLength) // there is input left
                        {
                            ClearAttributes();
                            termAtt.CopyBuffer(curTermBuffer, curPos, curGramSize);

                            int offsetStart = hasIllegalOffsets ? tokStart : tokStart + curPos;
                            int offsetEnd = hasIllegalOffsets ? tokEnd : tokStart + curPos + curGramSize;
                            offsetAtt.SetOffset(offsetStart, offsetEnd);

                            curPos++;
                            return true;
                        }

                        // This size is done for the term; move on to the next size.
                        curGramSize++;
                        curPos = 0;
                    }
                }

                // The current term produced everything it can; force a refill.
                curTermBuffer = null;
            }
        }
Пример #2
0
        /// <summary>
        /// Advances this filter to its next edge n-gram, pulling a fresh token from
        /// the wrapped input whenever the current term has no grams left to emit.
        /// </summary>
        /// <returns>
        /// <c>true</c> when a gram has been written to the attributes; <c>false</c>
        /// once the wrapped input reports that it has no more tokens.
        /// </returns>
        public override sealed bool IncrementToken()
        {
            for (;;)
            {
                if (curTermBuffer == null)
                {
                    // No term in progress: fetch the next one, or stop for good.
                    if (!m_input.IncrementToken())
                    {
                        return false;
                    }

                    // Snapshot the incoming token into fields, since the attributes
                    // themselves are overwritten for every gram emitted below.
                    curTermBuffer = (char[])termAtt.Buffer.Clone();
                    curTermLength = termAtt.Length;
                    curCodePointCount = charUtils.CodePointCount(termAtt);
                    curGramSize = minGram;
                    tokStart = offsetAtt.StartOffset;
                    tokEnd = offsetAtt.EndOffset;

#pragma warning disable 612, 618
                    bool isLucene44OrLater = version.OnOrAfter(LuceneVersion.LUCENE_44);
#pragma warning restore 612, 618

                    // From 4.4 on, offsets are never updated. Before that, they are
                    // updated only when the offset span matches the term length;
                    // otherwise the token is assumed to be a synonym.
                    updateOffsets = !isLucene44OrLater && (tokStart + curTermLength) == tokEnd;

                    // Accumulate rather than assign: the increment of a term that
                    // yields no gram is carried over to the next emitted gram.
                    savePosIncr += posIncrAtt.PositionIncrement;
                    savePosLen = posLenAtt.PositionLength;
                }

                // Emit only while inside the configured size range and while the
                // term is long enough (in code points) for a gram of this size.
                if (curGramSize <= maxGram && curGramSize <= curCodePointCount)
                {
                    // Take curGramSize code points from the front or the back.
                    int gramStart = 0;
                    if (side != Side.FRONT)
                    {
                        gramStart = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize);
                    }
                    int gramEnd = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, gramStart, curGramSize);

                    ClearAttributes();

                    int offsetStart = updateOffsets ? tokStart + gramStart : tokStart;
                    int offsetEnd = updateOffsets ? tokStart + gramEnd : tokEnd;
                    offsetAtt.SetOffset(offsetStart, offsetEnd);

                    // Only the first gram of a term carries the saved increment.
                    bool isFirstGram = curGramSize == minGram;
                    posIncrAtt.PositionIncrement = isFirstGram ? savePosIncr : 0;
                    if (isFirstGram)
                    {
                        savePosIncr = 0;
                    }

                    posLenAtt.PositionLength = savePosLen;
                    termAtt.CopyBuffer(curTermBuffer, gramStart, gramEnd - gramStart);
                    curGramSize++;
                    return true;
                }

                // The current term produced everything it can; force a refill.
                curTermBuffer = null;
            }
        }