Пример #1
0
        /// <summary>Returns the next token in the stream, or null at EOS.
        /// <p/>Removes <tt>'s</tt> from the end of words.
        /// <p/>Removes dots from acronyms.
        /// </summary>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return(false);
            }

            char[] buffer       = termAtt.TermBuffer();
            int    bufferLength = termAtt.TermLength();

            System.String type = typeAtt.Type();

            if ((System.Object)type == (System.Object)APOSTROPHE_TYPE && bufferLength >= 2 && buffer[bufferLength - 2] == '\'' && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
            {
                // Strip last 2 characters off
                termAtt.SetTermLength(bufferLength - 2);
            }
            else if ((System.Object)type == (System.Object)ACRONYM_TYPE)
            {
                // remove dots
                int upto = 0;
                for (int i = 0; i < bufferLength; i++)
                {
                    char c = buffer[i];
                    if (c != '.')
                    {
                        buffer[upto++] = c;
                    }
                }
                termAtt.SetTermLength(upto);
            }

            return(true);
        }
Пример #2
0
 public override bool IncrementToken()
 {
     if (!done)
     {
         ClearAttributes();
         done = true;
         int    upto   = 0;
         char[] buffer = termAtt.TermBuffer();
         while (true)
         {
             int length = input.Read(buffer, upto, buffer.Length - upto);
             if (length == 0)
             {
                 break;
             }
             upto += length;
             if (upto == buffer.Length)
             {
                 buffer = termAtt.ResizeTermBuffer(1 + buffer.Length);
             }
         }
         termAtt.SetTermLength(upto);
         finalOffset = CorrectOffset(upto);
         offsetAtt.SetOffset(CorrectOffset(0), finalOffset);
         return(true);
     }
     return(false);
 }
Пример #3
0
        /*
         * (non-Javadoc)
         *
         * @see Lucene.Net.Analysis.TokenStream#next()
         */
        public override bool IncrementToken()
        {
            ClearAttributes();
            int posIncr = 1;

            while (true)
            {
                int tokenType = scanner.GetNextToken();

                if (tokenType == StandardTokenizerImpl.YYEOF)
                {
                    return(false);
                }

                if (scanner.Yylength() <= maxTokenLength)
                {
                    posIncrAtt.SetPositionIncrement(posIncr);
                    scanner.GetText(termAtt);
                    int start = scanner.Yychar();
                    offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.TermLength()));
                    // This 'if' should be removed in the next release. For now, it converts
                    // invalid acronyms to HOST. When removed, only the 'else' part should
                    // remain.
                    if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
                    {
                        if (replaceInvalidAcronym)
                        {
                            typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
                            termAtt.SetTermLength(termAtt.TermLength() - 1);                             // remove extra '.'
                        }
                        else
                        {
                            typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
                        }
                    }
                    else
                    {
                        typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
                    }
                    return(true);
                }
                // When we skip a too-long term, we still increment the
                // position increment
                else
                {
                    posIncr++;
                }
            }
        }
Пример #4
0
                public override bool IncrementToken()
                {
                    int count = input.Read((System.Char[])buffer, 0, buffer.Length);

                    if (done)
                    {
                        return(false);
                    }
                    else
                    {
                        ClearAttributes();
                        done = true;
                        if (count == 1)
                        {
                            termAtt.TermBuffer()[0] = buffer[0];
                            termAtt.SetTermLength(1);
                        }
                        else
                        {
                            termAtt.SetTermLength(0);
                        }
                        return(true);
                    }
                }
Пример #5
0
        public override bool IncrementToken()
        {
            ClearAttributes();
            int length = 0;
            int start  = bufferIndex;

            char[] buffer = termAtt.TermBuffer();
            while (true)
            {
                if (bufferIndex >= dataLen)
                {
                    offset += dataLen;
                    dataLen = input.Read((System.Char[])ioBuffer, 0, ioBuffer.Length);
                    if (dataLen <= 0)
                    {
                        dataLen = 0;                         // so next offset += dataLen won't decrement offset
                        if (length > 0)
                        {
                            break;
                        }
                        else
                        {
                            return(false);
                        }
                    }
                    bufferIndex = 0;
                }

                char c = ioBuffer[bufferIndex++];

                if (IsTokenChar(c))
                {
                    // if it's a token char

                    if (length == 0)
                    {
                        // start of token
                        start = offset + bufferIndex - 1;
                    }
                    else if (length == buffer.Length)
                    {
                        buffer = termAtt.ResizeTermBuffer(1 + length);
                    }

                    buffer[length++] = Normalize(c);                     // buffer it, normalized

                    if (length == MAX_WORD_LEN)
                    {
                        // buffer overflow!
                        break;
                    }
                }
                else if (length > 0)
                {
                    // at non-Letter w/ chars
                    break;                     // return 'em
                }
            }

            termAtt.SetTermLength(length);
            offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
            return(true);
        }