예제 #1
0
 /// <summary>
 /// Stores the current term so it can form the left part of a gram: <c>buffer</c> is
 /// reset to hold the term text followed by <c>SEPARATOR</c>, and the term's start
 /// offset and the current value of <c>Common</c> are recorded.
 /// </summary>
 private void SaveTermBuffer()
 {
     // Discard whatever was buffered before and write "<term><SEPARATOR>".
     buffer.Length = 0;
     char[] termChars  = termAttribute.Buffer();
     int    termLength = termAttribute.Length;
     buffer.Append(termChars, 0, termLength);
     buffer.Append(SEPARATOR);

     // Keep the bookkeeping that belongs to the saved term.
     lastStartOffset = offsetAttribute.StartOffset();
     lastWasCommon   = Common;
 }
예제 #2
0
        /// <inheritdoc/>
        /// <remarks>
        /// Advances the wrapped stream, skipping every token that has a position increment
        /// of 0 and whose text is already present in <c>previous</c>. The set is cleared
        /// whenever a token with a positive position increment is seen.
        /// </remarks>
        public override bool IncrementToken()
        {
            while (input.IncrementToken())
            {
                char[] termChars  = termAttribute.Buffer();
                int    termLength = termAttribute.Length;
                int    increment  = posIncAttribute.PositionIncrement;

                // A positive increment means a new position: forget the terms seen so far.
                if (increment > 0)
                {
                    previous.Clear();
                }

                bool alreadySeen = increment == 0 && previous.Contains(termChars, 0, termLength);

                // Add a copy of the term text to the set of seen terms.
                char[] copy = new char[termLength];
                Array.Copy(termChars, copy, termLength);
                previous.Add(copy);

                if (alreadySeen)
                {
                    continue;
                }
                return true;
            }
            return false;
        }
예제 #3
0
 /// <summary>
 /// Emits the whole input as one token. While <c>done</c> is not set, the call reads
 /// everything from <c>input</c> into the term buffer (growing the buffer whenever it
 /// fills up), sets the term length and the offsets, and marks the tokenizer as done.
 /// </summary>
 /// <returns>
 /// <c>true</c> for the call that produces the token; <c>false</c> while <c>done</c> is set.
 /// </returns>
 public override bool IncrementToken()
 {
     if (done)
     {
         return false;
     }

     ClearAttributes();
     done = true;
     int    upto   = 0;
     char[] buffer = termAtt.Buffer();
     while (true)
     {
         int length = input.Read(buffer, upto, buffer.Length - upto);
         // A .NET TextReader reports end of input with 0, whereas Java-style readers
         // return -1. Testing only for -1 would loop forever at end of input, so stop
         // on any non-positive count.
         // NOTE(review): assumes input is a System.IO.TextReader - confirm.
         if (length <= 0)
         {
             break;
         }
         upto += length;
         if (upto == buffer.Length)
         {
             // Buffer is full: grow it so the next read always has room for at least one char.
             buffer = termAtt.ResizeBuffer(1 + buffer.Length);
         }
     }
     termAtt.Length = upto;
     finalOffset    = CorrectOffset(upto);
     offsetAtt.SetOffset(CorrectOffset(0), finalOffset);
     return true;
 }
예제 #4
0
        /// <summary>
        /// Advances the wrapped <see cref="TokenStream"/> and, when the text before the first
        /// apostrophe of the term is contained in <c>articles</c>, replaces the
        /// <see cref="CharTermAttribute"/> content with the text after that apostrophe.
        /// </summary>
        /// <returns><c>true</c> if the wrapped stream produced a token, otherwise <c>false</c>.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] chars = termAtt.Buffer();
            int    count = termAtt.Length;

            // Walk to the first apostrophe (ASCII ' or U+2019); ends at count when there is none.
            int apostrophe = 0;
            while (apostrophe < count && chars[apostrophe] != '\'' && chars[apostrophe] != '\u2019')
            {
                apostrophe++;
            }

            // Strip the prefix only when an apostrophe exists and the prefix is an article.
            if (apostrophe < count && articles.Contains(chars, 0, apostrophe))
            {
                int restStart = apostrophe + 1;
                termAtt.CopyBuffer(chars, restStart, count - restStart);
            }

            return true;
        }
예제 #5
0
        /// <summary>
        /// Returns the next token that is not a stop word, adding the position increments of
        /// skipped stop words to the returned token. A stop word that is the very last token
        /// is kept (and flagged as a keyword) when the final end offset reported after
        /// <c>Input.End()</c> is not greater than the token's own end offset.
        /// </summary>
        /// <returns>
        /// <c>true</c> when a token is available; <c>false</c> when the input is exhausted or
        /// the end state has already been captured.
        /// </returns>
        public override bool IncrementToken()
        {
            if (endState != null || !Input.IncrementToken())
            {
                return false;
            }

            int skipped = 0;

            while (stopWords.Contains(termAtt.Buffer(), 0, termAtt.Length))
            {
                int candidateInc = posIncAtt.PositionIncrement;
                int candidateEnd = offsetAtt.EndOffset();
                // Whether this token gets dropped depends on what follows it, so keep its state.
                State candidate = CaptureState();

                if (Input.IncrementToken())
                {
                    // Another token follows: the candidate was a stop word, skip it.
                    skipped += candidateInc;
                    continue;
                }

                // The candidate was the last token: capture the end state and compare offsets.
                ClearAttributes();
                Input.End();
                endState = CaptureState();
                int finalEndOffset = offsetAtt.EndOffset();
                Debug.Assert(finalEndOffset >= candidateEnd);
                if (finalEndOffset > candidateEnd)
                {
                    // Something followed the stop word (a token separator), so it is filtered.
                    return false;
                }

                // Nothing followed the final token that looked like a stop word: emit it.
                RestoreState(candidate);
                posIncAtt.PositionIncrement = skipped + posIncAtt.PositionIncrement;
                keywordAtt.Keyword          = true;
                return true;
            }

            // Current token is not a stop word; account for the skipped positions and emit it.
            posIncAtt.PositionIncrement = skipped + posIncAtt.PositionIncrement;
            return true;
        }
 /// <summary>
 /// Asserts that <paramref name="term"/> has the same length as <paramref name="expected"/>
 /// and that its buffer matches <paramref name="expected"/> character by character.
 /// </summary>
 /// <param name="term">The term attribute under test.</param>
 /// <param name="expected">The text the term is expected to hold.</param>
 private void AssertEquals(CharTermAttribute term, string expected)
 {
     int expectedLength = expected.Length;
     assertEquals(expectedLength, term.Length);

     char[] actual   = term.Buffer();
     int    position = 0;
     foreach (char expectedChar in expected)
     {
         assertEquals(expectedChar, actual[position]);
         position++;
     }
 }
예제 #7
0
        /// <summary>
        /// Produces the next n-gram from the code point buffer. An emitted token covers
        /// <c>gramSize</c> code points starting at <c>bufferStart</c>; afterwards
        /// <c>gramSize</c> is incremented so the following call tries a gram that is one
        /// code point longer.
        /// </summary>
        /// <returns>
        /// <c>true</c> when a gram was written to the attributes; <c>false</c> when
        /// <c>bufferStart + 1 + minGram</c> exceeds <c>bufferEnd</c>.
        /// </returns>
        public override bool IncrementToken()
        {
            ClearAttributes();

            // termination of this loop is guaranteed by the fact that every iteration
            // either advances the buffer (calls consume()) or increases gramSize
            while (true)
            {
                // compact: when fewer than maxGram + 1 entries remain ahead of bufferStart and
                // the input is not exhausted, move the unread tail to the front of the buffer
                // and shift every index that points into it by the same amount
                if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted)
                {
                    Array.Copy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
                    bufferEnd        -= bufferStart;
                    lastCheckedChar  -= bufferStart;
                    lastNonTokenChar -= bufferStart;
                    bufferStart       = 0;

                    // fill in remaining space
                    exhausted = !charUtils.Fill(charBuffer, input, buffer.Length - bufferEnd);
                    // convert to code points
                    bufferEnd += charUtils.toCodePoints(charBuffer.Buffer, 0, charBuffer.Length, buffer, bufferEnd);
                }

                // should we go to the next offset?
                if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd)
                {
                    // stop when the next start position could not hold even a minGram-sized gram
                    if (bufferStart + 1 + minGram > bufferEnd)
                    {
                        Debug.Assert(exhausted);
                        return(false);
                    }
                    // NOTE(review): consume() is defined elsewhere; presumably it advances
                    // bufferStart/offset by one code point - confirm
                    consume();
                    gramSize = minGram;
                }

                updateLastNonTokenChar();

                // retry if the token to be emitted was going to not only contain token chars
                bool termContainsNonTokenChar         = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
                // in edges-only mode a gram is skipped unless the entry just before bufferStart
                // is the last recorded non-token char
                bool isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
                if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar)
                {
                    consume();
                    gramSize = minGram;
                    continue;
                }

                // write the gram's code points to the term buffer as chars and fill in the attributes
                int length = charUtils.toChars(buffer, bufferStart, gramSize, termAtt.Buffer(), 0);
                termAtt.Length = length;
                posIncAtt.PositionIncrement = 1;
                posLenAtt.PositionLength    = 1;
                offsetAtt.SetOffset(CorrectOffset(offset), CorrectOffset(offset + length));
                ++gramSize;
                return(true);
            }
        }
예제 #8
0
 /// <summary>
 /// Advances the wrapped stream and passes the current term to <c>reverse</c>. When
 /// <c>marker</c> differs from <c>NOMARKER</c>, the marker is first appended to the term
 /// and is included in the range handed to <c>reverse</c>.
 /// </summary>
 /// <returns><c>true</c> if the wrapped stream produced a token, otherwise <c>false</c>.</returns>
 public override bool IncrementToken()
 {
     if (!input.IncrementToken())
     {
         return false;
     }

     int reversedLength = termAtt.Length;
     if (marker != NOMARKER)
     {
         // Make room for one more char and place the marker right after the term text.
         reversedLength += 1;
         termAtt.ResizeBuffer(reversedLength);
         termAtt.Buffer()[reversedLength - 1] = marker;
     }

     reverse(matchVersion, termAtt.Buffer(), 0, reversedLength);
     termAtt.Length = reversedLength;
     return true;
 }
예제 #9
0
 /// <summary>
 /// Emits a one-character token holding the current value of <c>Index</c> and then
 /// advances <c>Index</c> by one. No token is produced once <c>Index</c> is past
 /// <c>'z'</c>.
 /// </summary>
 /// <returns><c>true</c> while <c>Index</c> is at most <c>'z'</c>, otherwise <c>false</c>.</returns>
 public override bool IncrementToken()
 {
     if (Index > 'z')
     {
         return false;
     }

     ClearAttributes();
     TermAtt.Length = 1;
     char[] termChars = TermAtt.Buffer();
     termChars[0] = (char)Index++;
     return true;
 }
예제 #10
0
 /// <summary>
 /// Advances the wrapped stream and passes the current term's characters (from index 0
 /// up to the term length) to <c>charUtils.ToLower</c>, which works on the term buffer
 /// directly.
 /// </summary>
 /// <returns><c>true</c> if the wrapped stream produced a token, otherwise <c>false</c>.</returns>
 public override bool IncrementToken()
 {
     // The stream method is the PascalCase IncrementToken (the same member this method
     // overrides); the previous camelCase call was a leftover Java spelling.
     if (!input.IncrementToken())
     {
         return false;
     }

     charUtils.ToLower(termAtt.Buffer(), 0, termAtt.Length);
     return true;
 }
예제 #11
0
        /// <summary>
        /// Advances the wrapped stream and runs <c>ProcessWord</c> on every word of the
        /// current term, where words are delimited by characters at or below a space or by
        /// <c>'.'</c>. Terms whose length is not below <c>maxTokenLength</c> are left alone,
        /// and the original text is restored when more than <c>maxWordCount</c> words were
        /// processed.
        /// </summary>
        /// <returns><c>true</c> if the wrapped stream produced a token, otherwise <c>false</c>.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] chars     = termAtt.Buffer();
            int    charCount = termAtt.Length;

            // Keep an untouched copy only when a word limit below the default is configured,
            // so the term can be restored if that limit is exceeded.
            char[] original = null;
            if (maxWordCount < DEFAULT_MAX_WORD_COUNT)
            {
                original = new char[charCount];
                Array.Copy(chars, 0, original, 0, charCount);
            }

            // Terms at or above the length limit pass through unchanged.
            if (charCount >= maxTokenLength)
            {
                return true;
            }

            int words     = 0;
            int wordStart = 0;
            int pos       = 0;
            while (pos < charCount)
            {
                char c = chars[pos];
                bool isSeparator = c <= ' ' || c == '.';
                if (isSeparator && pos > wordStart)
                {
                    ProcessWord(chars, wordStart, pos - wordStart, words++);
                    wordStart = pos + 1;
                    // Skip one extra character after a completed word.
                    pos++;
                }
                pos++;
            }

            // Handle the trailing word, if any characters remain after the last word start.
            if (wordStart < charCount)
            {
                ProcessWord(chars, wordStart, charCount - wordStart, words++);
            }

            if (words > maxWordCount)
            {
                termAtt.CopyBuffer(original, 0, charCount);
            }

            return true;
        }
예제 #12
0
 /// <summary>
 /// Reports whether the current term is contained in <c>words</c>.
 /// </summary>
 /// <returns>The result of looking up the term's characters in <c>words</c>.</returns>
 public override bool Accept()
 {
     char[] termChars  = termAtt.Buffer();
     int    termLength = termAtt.Length;
     return words.Contains(termChars, 0, termLength);
 }
예제 #13
0
 /// <summary>
 /// Emits the edge n-grams of the current input term, one per call, for gram sizes from
 /// <c>minGram</c> up to <c>maxGram</c> code points. Grams start at the front of the term
 /// when <c>side</c> is <c>Side.FRONT</c> and are taken from the end otherwise. When the
 /// current term yields no further gram, the next input token is fetched.
 /// </summary>
 /// <returns>
 /// <c>true</c> when a gram was written to the attributes; <c>false</c> when the wrapped
 /// stream has no more tokens.
 /// </returns>
 public override bool IncrementToken()
 {
     while (true)
     {
         if (curTermBuffer == null)
         {
             if (!input.IncrementToken())
             {
                 return false;
             }

             // Snapshot the incoming token; the attributes are overwritten for each gram.
             curTermBuffer     = termAtt.Buffer().Clone();
             // Length is a property of the term attribute, not a method.
             curTermLength     = termAtt.Length;
             curCodePointCount = charUtils.codePointCount(termAtt);
             curGramSize       = minGram;
             tokStart          = offsetAtt.StartOffset();
             tokEnd            = offsetAtt.EndOffset();
             if (version.OnOrAfter(Version.LUCENE_44))
             {
                 // Never update offsets
                 updateOffsets = false;
             }
             else
             {
                 // if length by start + end offsets doesn't match the term text then assume
                 // this is a synonym and don't adjust the offsets.
                 updateOffsets = (tokStart + curTermLength) == tokEnd;
             }
             // Accumulate, so increments of input tokens that produced no gram are not lost.
             savePosIncr += posIncrAtt.PositionIncrement;
             savePosLen   = posLenAtt.PositionLength;
         }

         // Emit a gram only while the size is within range and the term is long enough.
         if (curGramSize <= maxGram && curGramSize <= curCodePointCount)
         {
             // grab gramSize code points from front or back
             int start = side == Side.FRONT ? 0 : charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize);
             int end   = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
             ClearAttributes();
             if (updateOffsets)
             {
                 offsetAtt.SetOffset(tokStart + start, tokStart + end);
             }
             else
             {
                 offsetAtt.SetOffset(tokStart, tokEnd);
             }
             // first ngram gets increment, others don't
             if (curGramSize == minGram)
             {
                 posIncrAtt.PositionIncrement = savePosIncr;
                 savePosIncr = 0;
             }
             else
             {
                 posIncrAtt.PositionIncrement = 0;
             }
             posLenAtt.PositionLength = savePosLen;
             termAtt.CopyBuffer(curTermBuffer, start, end - start);
             curGramSize++;
             return true;
         }

         // Nothing more to emit for this term; fetch the next input token.
         curTermBuffer = null;
     }
 }
예제 #14
0
        /// <summary>
        /// Builds the next token from consecutive code points for which <c>isTokenChar</c>
        /// returns true. Each accepted code point is passed through <c>Normalize</c> and
        /// appended to the term buffer. The token ends at the first non-token code point
        /// after at least one char was collected, when the collected length reaches
        /// <c>MAX_WORD_LEN</c>, or when the input runs out.
        /// </summary>
        /// <returns>
        /// <c>true</c> when a token was produced; <c>false</c> when the input is exhausted and
        /// nothing was collected (in which case <c>finalOffset</c> is set from <c>offset</c>).
        /// </returns>
        public override bool IncrementToken()
        {
            ClearAttributes();
            int length      = 0;  // number of chars written to the term buffer so far
            int start       = -1; // this variable is always initialized
            int end_Renamed = -1; // end offset of the token; advanced per accepted code point

            char[] buffer = termAtt.Buffer();
            while (true)
            {
                // refill the I/O buffer once everything in it has been consumed
                if (bufferIndex >= dataLen)
                {
                    offset += dataLen;
                    charUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
                    if (ioBuffer.Length == 0)
                    {
                        dataLen = 0; // so next offset += dataLen won't decrement offset
                        if (length > 0)
                        {
                            // input ended in the middle of a token: emit what was collected
                            break;
                        }
                        else
                        {
                            finalOffset = CorrectOffset(offset);
                            return(false);
                        }
                    }
                    dataLen     = ioBuffer.Length;
                    bufferIndex = 0;
                }
                // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
                int c         = charUtils.codePointAt(ioBuffer.Buffer, bufferIndex, ioBuffer.Length);
                int charCount = Character.CharCount(c);
                bufferIndex += charCount;

                if (isTokenChar(c))  // if it's a token char
                {
                    if (length == 0) // start of token
                    {
                        Debug.Assert(start == -1);
                        // bufferIndex was already advanced past c, so step back by its char count
                        start       = offset + bufferIndex - charCount;
                        end_Renamed = start;
                    } // check if a supplementary could run out of bounds
                    else if (length >= buffer.Length - 1)
                    {
                        buffer = termAtt.ResizeBuffer(2 + length); // make sure a supplementary fits in the buffer
                    }
                    end_Renamed += charCount;
                    length      += Character.ToChars(Normalize(c), buffer, length); // buffer it, normalized
                    if (length >= MAX_WORD_LEN)                                     // buffer overflow! make sure to check for >= surrogate pair could break == test
                    {
                        break;
                    }
                } // at non-Letter w/ chars
                else if (length > 0)
                {
                    break; // return 'em
                }
            }

            termAtt.Length = length;
            Debug.Assert(start != -1);
            // finalOffset is updated to the corrected end offset of the token just emitted
            offsetAtt.SetOffset(CorrectOffset(start), finalOffset = CorrectOffset(end_Renamed));
            return(true);
        }
예제 #15
0
 /// <summary>
 /// Advances to the next n-gram of the current input term, fetching the next input token
 /// when the current term yields no further gram. Two strategies exist, selected by
 /// <c>version.OnOrAfter(Version.LUCENE_44)</c>: the newer one walks start positions and
 /// emits every gram size for each position, counting in code points and always reporting
 /// the original token offsets; the older one walks gram sizes and emits every position for
 /// each size, counting in chars and adjusting offsets unless <c>hasIllegalOffsets</c> is set.
 /// </summary>
 /// <returns>
 /// <c>true</c> when a gram was written to the attributes; <c>false</c> when the wrapped
 /// stream has no more tokens.
 /// </returns>
 public override bool IncrementToken()
 {
     while (true)
     {
         if (curTermBuffer == null)
         {
             if (!input.IncrementToken())
             {
                 return(false);
             }
             else
             {
                 // snapshot the incoming token; the attributes are overwritten for each gram
                 curTermBuffer     = termAtt.Buffer().Clone();
                 curTermLength     = termAtt.Length;
                 curCodePointCount = charUtils.CodePointCount(termAtt);
                 curGramSize       = minGram;
                 curPos            = 0;
                 curPosInc         = posIncAtt.PositionIncrement;
                 curPosLen         = posLenAtt.PositionLength;
                 tokStart          = offsetAtt.StartOffset();
                 tokEnd            = offsetAtt.EndOffset();
                 // if length by start + end offsets doesn't match the term text then assume
                 // this is a synonym and don't adjust the offsets.
                 hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
             }
         }
         if (version.OnOrAfter(Version.LUCENE_44))
         {
             // all sizes for the current position are done (or no longer fit): move to the
             // next start position and begin again at the smallest gram size
             if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount)
             {
                 ++curPos;
                 curGramSize = minGram;
             }
             if ((curPos + curGramSize) <= curCodePointCount)
             {
                 ClearAttributes();
                 // translate code point positions into char indices of the saved term
                 int start = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                 int end   = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                 termAtt.CopyBuffer(curTermBuffer, start, end - start);
                 // only the first gram of a term carries the term's position increment
                 posIncAtt.PositionIncrement = curPosInc;
                 curPosInc = 0;
                 posLenAtt.PositionLength = curPosLen;
                 offsetAtt.SetOffset(tokStart, tokEnd);
                 curGramSize++;
                 return(true);
             }
         }
         else
         {
             while (curGramSize <= maxGram)
             {
                 // this inner loop returns on its first iteration, so it acts as an "if"
                 while (curPos + curGramSize <= curTermLength) // while there is input
                 {
                     ClearAttributes();
                     termAtt.CopyBuffer(curTermBuffer, curPos, curGramSize);
                     if (hasIllegalOffsets)
                     {
                         offsetAtt.SetOffset(tokStart, tokEnd);
                     }
                     else
                     {
                         offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                     }
                     curPos++;
                     return(true);
                 }
                 curGramSize++; // increase n-gram size
                 curPos = 0;
             }
         }
         // nothing more to emit for this term; fetch the next input token
         curTermBuffer = null;
     }
 }