示例#1
0
        /// <summary>
        /// Advances to the next token and lower-cases it in place.
        /// Tokens typed <c>APOSTROPHE_TYPE</c> that end in <c>'s</c>/<c>'S</c> lose those two
        /// characters; tokens typed <c>ACRONYM_TYPE</c> lose their dots. Tokens of any other
        /// type are skipped for as long as they are contained in <c>stopWords</c>.
        /// </summary>
        /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] chars     = termAtt.TermBuffer();
            int    count     = termAtt.TermLength();
            String tokenType = typeAtt.Type;

            bool isPossessive = tokenType == APOSTROPHE_TYPE
                                && count >= 2
                                && chars[count - 2] == '\''
                                && (chars[count - 1] == 's' || chars[count - 1] == 'S');

            if (isPossessive)
            {
                // Lower-case everything in front of the trailing 's, then drop the 's itself.
                int kept = count - 2;
                for (int pos = 0; pos < kept; pos++)
                {
                    chars[pos] = ToLower(chars[pos]);
                }
                termAtt.SetTermLength(kept);
                return true;
            }

            if (tokenType == ACRONYM_TYPE)
            {
                // Compact the buffer over the dots while lower-casing what remains.
                int write = 0;
                for (int read = 0; read < count; read++)
                {
                    char current = chars[read];
                    if (current == '.')
                    {
                        continue;
                    }
                    chars[write++] = ToLower(current);
                }
                termAtt.SetTermLength(write);
                return true;
            }

            // Plain tokens: keep pulling from the input until one is not a stop word.
            // Tokens fetched inside this loop do not go through the two type checks above.
            while (true)
            {
                for (int pos = 0; pos < count; pos++)
                {
                    chars[pos] = ToLower(chars[pos]);
                }

                if (!stopWords.Contains(chars, 0, count))
                {
                    return true;
                }

                if (!input.IncrementToken())
                {
                    return false;
                }

                // A stop word was consumed, so re-read the length and buffer for the new token.
                count = termAtt.TermLength();
                chars = termAtt.TermBuffer();
            }
        }
示例#2
0
        /// <summary>
        /// Advances the stream by one token and, when the <c>BestBetsWordForms</c> resources
        /// contain a non-empty string keyed by the token text, replaces the term with that string.
        /// </summary>
        /// <returns><c>true</c> if a token was produced; <c>false</c> when the input has no more tokens.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            string term        = new string(termAtt.TermBuffer(), 0, termAtt.TermLength());
            string replaceterm = BestBetsWordForms.ResourceManager.GetString(term);

            if (!string.IsNullOrEmpty(replaceterm))
            {
                // A replacement exists for this term, so swap it in.
                termAtt.SetTermBuffer(replaceterm);
            }

            return true;
        }
示例#3
0
                /// <summary>
                /// Passes each input token through and then emits one extra token "b" at the same
                /// position (position increment 0). A leading digit in the term sets the position
                /// increment, and only the first token of the stream carries a payload.
                /// </summary>
                /// <returns><c>true</c> when a token was produced; <c>false</c> at end of input.</returns>
                public override bool IncrementToken()
                {
                    if (state != null)
                    {
                        // Replay the captured token as the stacked "b" token, without a payload.
                        RestoreState(state);
                        payloadAtt.Payload = null;
                        posIncrAtt.PositionIncrement = 0;
                        termAtt.SetTermBuffer(new char[] { 'b' }, 0, 1);
                        state = null;
                        return true;
                    }

                    if (!input.IncrementToken())
                    {
                        return false;
                    }

                    char leading = termAtt.TermBuffer()[0];
                    if (System.Char.IsDigit(leading))
                    {
                        // The leading digit's offset from '0' becomes the position increment.
                        posIncrAtt.PositionIncrement = leading - '0';
                    }

                    if (first)
                    {
                        // Only the very first position gets a payload.
                        first = false;
                        payloadAtt.Payload = new Payload(new byte[] { 100 });
                    }

                    // Remember this token so the next call can emit its stacked companion.
                    state = CaptureState();
                    return true;
                }
示例#4
0
        /// <summary>
        /// Replaces each input token with the stems returned by <c>_stemmer</c>. The first stem
        /// takes the token's place and any further stems are emitted on later calls with a
        /// position increment of 0. When <c>_stemmer</c> returns no stems, the term is run
        /// through <c>_slovakStemmer</c> instead.
        /// </summary>
        /// <returns><c>true</c> when a token was produced; <c>false</c> at end of input.</returns>
        public override Boolean IncrementToken()
        {
            // Drain stems left over from the previous input token first.
            if (_buffer.Count > 0)
            {
                var nextStem = _buffer.Dequeue();

                RestoreState(_savedState);
                _posIncAtt.PositionIncrement = 0;
                _termAtt.SetTermBuffer(nextStem.Stem, 0, nextStem.StemLength);
                return true;
            }

            if (!input.IncrementToken())
            {
                return false;
            }

            var newTerms = _dedup
                               ? _stemmer.UniqueStems(_termAtt.Term)
                               : _stemmer.Stem(_termAtt.Term);

            foreach (var candidate in newTerms)
            {
                _buffer.Enqueue(candidate);
            }

            if (_buffer.Count == 0)
            {
                // Originally an unknown word was returned unchanged; now the SlovakStemmer is
                // applied to words not found in the dictionary (possible named entities).
                // The length check replaces building a throw-away string just to test for emptiness.
                int termLength = _termAtt.TermLength();
                if (termLength > 0)
                {
                    _slovakStemmer.Stem(_termAtt.TermBuffer(), termLength, out char[] stemmed, out var stemmedLength);
                    // SetTermBuffer(buffer, offset, length) already sets the term length.
                    _termAtt.SetTermBuffer(stemmed, 0, stemmedLength);
                }
                return true;
            }

            var stem = _buffer.Dequeue();

            _termAtt.SetTermBuffer(stem.Stem, 0, stem.StemLength);

            if (_buffer.Count > 0)
            {
                // More stems are pending: keep the token's attributes for the follow-up calls.
                _savedState = CaptureState();
            }

            return true;
        }
示例#5
0
            /// <summary>
            /// Emits the characters of each input term as individual one-character tokens.
            /// The first character of a newly fetched term gets a position increment of 2 and
            /// every other character 1; each emitted character occupies one unit of
            /// <c>_offsetInStream</c>, with one extra unit skipped when a new term is fetched.
            /// </summary>
            /// <returns><c>true</c> when a token was produced; <c>false</c> at end of input.</returns>
            public override bool IncrementToken()
            {
                bool needsNextTerm = _buffer == null || _offset >= _length;
                if (needsNextTerm && !_input.IncrementToken())
                {
                    return false;
                }

                int positionIncrement = 1;
                if (needsNextTerm)
                {
                    // Start over on the freshly fetched term.
                    _offset = 0;
                    _buffer = _termAttribute.TermBuffer();
                    _length = _termAttribute.TermLength();
                    _offsetInStream++;
                    positionIncrement = 2;
                }

                _offsetAttribute.SetOffset(_offsetInStream, _offsetInStream + 1);
                _offsetInStream++;

                _positionIncrementAttribute.PositionIncrement = positionIncrement;

                char current = _buffer[_offset++];
                _termAttribute.SetTermLength(1);
                _termAttribute.SetTermBuffer(current.ToString());

                return true;
            }
示例#6
0
 /* Emits the next n-gram of the current input term, walking gram sizes from
  * minGram up to maxGram and, within each size, every start position. A new
  * input term is fetched once the current one is used up. Returns false at
  * end of stream. */
 public override bool IncrementToken()
 {
     for (;;)
     {
         if (curTermBuffer == null)
         {
             if (!input.IncrementToken())
             {
                 return false;
             }

             // Snapshot the term, because the attribute is overwritten with each gram.
             curTermBuffer = (char[])termAtt.TermBuffer().Clone();
             curTermLength = termAtt.TermLength();
             curGramSize   = minGram;
             curPos        = 0;
             tokStart      = offsetAtt.StartOffset;
         }

         while (curGramSize <= maxGram)
         {
             if (curPos + curGramSize <= curTermLength)
             {
                 // A gram of the current size still fits at this position.
                 ClearAttributes();
                 termAtt.SetTermBuffer(curTermBuffer, curPos, curGramSize);
                 offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                 curPos++;
                 return true;
             }

             // This size is exhausted: move to the next size, restarting at position 0.
             curGramSize++;
             curPos = 0;
         }

         curTermBuffer = null;
     }
 }
示例#7
0
        /// <summary>
        /// Skips input tokens until one passes the filter: the token must not be in
        /// <c>stopTable</c>, and its first character must be either an upper/lower-case letter
        /// (with a term longer than one character) or of category <c>OtherLetter</c>.
        /// </summary>
        /// <returns><c>true</c> when an accepted token is available; <c>false</c> at end of input.</returns>
        public override bool IncrementToken()
        {
            while (input.IncrementToken())
            {
                char[] chars = termAtt.TermBuffer();
                int    count = termAtt.TermLength();

                // why not key off token type here assuming ChineseTokenizer comes first?
                if (stopTable.Contains(chars, 0, count))
                {
                    continue;
                }

                UnicodeCategory category = char.GetUnicodeCategory(chars[0]);

                if (category == UnicodeCategory.OtherLetter)
                {
                    // One Chinese char as one Chinese word.
                    // Chinese word extraction to be added later here.
                    return true;
                }

                bool isCasedLetter = category == UnicodeCategory.LowercaseLetter
                                     || category == UnicodeCategory.UppercaseLetter;

                // An English word/token must be longer than one character.
                if (isCasedLetter && count > 1)
                {
                    return true;
                }
            }

            return false;
        }
        /// <summary>Advances to the next input token and tidies its term text in place.
        /// <p/>Removes a trailing <tt>'s</tt> or <tt>'S</tt> from tokens typed as apostrophe words.
        /// <p/>Removes dots from tokens typed as acronyms.
        /// </summary>
        /// <returns><c>true</c> if a token is available, <c>false</c> at end of stream.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] chars = termAtt.TermBuffer();
            int    count = termAtt.TermLength();

            System.String tokenType = typeAtt.Type;

            // Token types are matched by reference identity, not by string content.
            bool isPossessive = object.ReferenceEquals(tokenType, APOSTROPHE_TYPE)
                                && count >= 2
                                && chars[count - 2] == '\''
                                && (chars[count - 1] == 's' || chars[count - 1] == 'S');

            if (isPossessive)
            {
                // Drop the last two characters.
                termAtt.SetTermLength(count - 2);
            }
            else if (object.ReferenceEquals(tokenType, ACRONYM_TYPE))
            {
                // Compact the buffer over the dots.
                int write = 0;
                for (int read = 0; read < count; read++)
                {
                    char current = chars[read];
                    if (current == '.')
                    {
                        continue;
                    }
                    chars[write++] = current;
                }
                termAtt.SetTermLength(write);
            }

            return true;
        }
示例#9
0
        /// <summary>
        /// Splits each term at the first occurrence of <c>delimiter</c>: the part before it
        /// stays as the term, and the part after it is passed to <c>encoder</c> and stored as
        /// the payload. Terms without the delimiter keep their text and get a null payload.
        /// </summary>
        /// <returns><c>true</c> when a token was produced; <c>false</c> at end of input.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] chars = termAtt.TermBuffer();
            int    count = termAtt.TermLength();

            // Scan forward to the first delimiter, or to the end of the term.
            int split = 0;
            while (split < count && chars[split] != delimiter)
            {
                split++;
            }

            if (split < count)
            {
                // Everything before the delimiter is the term, everything after is the payload.
                termAtt.SetTermBuffer(chars, 0, split);
                payAtt.Payload = encoder.Encode(chars, split + 1, (count - (split + 1)));
            }
            else
            {
                // No delimiter: the term is left as it is.
                payAtt.Payload = null;
            }

            return true;
        }
        /// <summary>
        /// Reads characters from <c>input</c> through <c>ioBuffer</c> and collects the next run of
        /// characters accepted by <c>IsTokenChar</c> into the term buffer, passing each one
        /// through <c>Normalize</c>. A run is cut off once it reaches <c>MAX_WORD_LEN</c> characters.
        /// </summary>
        /// <returns>
        /// <c>true</c> when a token was collected; <c>false</c> when the reader is exhausted and
        /// no token characters are pending.
        /// </returns>
        public override bool IncrementToken()
        {
            ClearAttributes();
            int length = 0;
            int start  = bufferIndex;

            char[] buffer = termAtt.TermBuffer();
            while (true)
            {
                // The current chunk is used up: account for it in offset and read the next one.
                if (bufferIndex >= dataLen)
                {
                    offset += dataLen;
                    dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                    if (dataLen <= 0)
                    {
                        dataLen = 0;                         // so next offset += dataLen won't decrement offset
                        if (length > 0)
                        {
                            // Input ended in the middle of a token: emit what was collected.
                            break;
                        }
                        return(false);
                    }
                    bufferIndex = 0;
                }

                char c = ioBuffer[bufferIndex++];

                if (IsTokenChar(c))
                {
                    // if it's a token char

                    if (length == 0)
                    {
                        // start of token: position of c counted across all chunks read so far
                        start = offset + bufferIndex - 1;
                    }
                    else if (length == buffer.Length)
                    {
                        // term buffer is full: grow it before appending
                        buffer = termAtt.ResizeTermBuffer(1 + length);
                    }

                    buffer[length++] = Normalize(c);                     // buffer it, normalized

                    if (length == MAX_WORD_LEN)
                    {
                        // buffer overflow! Emit the token at the maximum length.
                        break;
                    }
                }
                else if (length > 0)
                {
                    // at non-Letter w/ chars
                    break;                     // return 'em
                }
            }

            // Publish the collected length and the start/end offsets, each passed through CorrectOffset.
            termAtt.SetTermLength(length);
            offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
            return(true);
        }
        /// <summary>
        /// Emits queued tokens first. Otherwise pulls the next input token, copies its
        /// attributes into <c>wrapper</c>, hands it to <c>Decompose</c>, and emits the first
        /// token that leaves in <c>tokens</c>.
        /// </summary>
        /// <returns>
        /// <c>true</c> when a token was emitted; <c>false</c> at end of input or when
        /// <c>Decompose</c> left nothing in <c>tokens</c>.
        /// </returns>
        public sealed override bool IncrementToken()
        {
            // Pending tokens from an earlier decomposition go out first.
            if (tokens.Count > 0)
            {
                setToken((Token)tokens.First.Value);
                tokens.RemoveFirst();
                return true;
            }

            if (!input.IncrementToken())
            {
                return false;
            }

            // Mirror the current attributes into the reusable wrapper token.
            wrapper.SetTermBuffer(termAtt.TermBuffer(), 0, termAtt.TermLength());
            wrapper.StartOffset       = offsetAtt.StartOffset;
            wrapper.EndOffset         = offsetAtt.EndOffset;
            wrapper.Flags             = flagsAtt.Flags;
            wrapper.Type              = typeAtt.Type;
            wrapper.PositionIncrement = posIncAtt.PositionIncrement;
            wrapper.Payload           = payloadAtt.Payload;

            Decompose(wrapper);

            if (tokens.Count == 0)
            {
                return false;
            }

            setToken(tokens.First.Value);
            tokens.RemoveFirst();
            return true;
        }
示例#12
0
    /// <summary>
    /// Emits each input token followed by its synonyms from <c>SynonymEngine</c>. Synonyms
    /// are emitted on subsequent calls with a position increment of 0, on top of the
    /// attributes captured from the original token.
    /// </summary>
    /// <returns><c>true</c> when a token was produced; <c>false</c> only at end of input.</returns>
    public override bool IncrementToken()
    {
        // Pending synonyms of the previous token are emitted first.
        if (splittedQueue.Count > 0)
        {
            string splitted = splittedQueue.Dequeue();
            RestoreState(currentState);
            _termAttr.SetTermBuffer(splitted);
            _posAttr.PositionIncrement = 0;
            return true;
        }
        if (!input.IncrementToken())
        {
            return false;
        }
        var currentTerm = new string(_termAttr.TermBuffer(), 0, _termAttr.TermLength());
        IEnumerable<string> synonyms = SynonymEngine.GetSynonyms(currentTerm);

        if (synonyms == null)
        {
            // No synonyms is not end-of-stream: the token just read is still valid.
            // Returning false here would silently drop it and every token after it.
            return true;
        }
        foreach (string syn in synonyms)
        {
            if (!currentTerm.Equals(syn))
            {
                splittedQueue.Enqueue(syn);
            }
        }
        if (splittedQueue.Count > 0)
        {
            // Snapshot the original token so the queued synonyms can restore its attributes.
            currentState = CaptureState();
        }
        return true;
    }
示例#13
0
 /// <summary>
 /// On the first call, reads everything <c>input</c> delivers into the term buffer and
 /// emits it as a single token. Every later call returns <c>false</c>.
 /// </summary>
 /// <returns><c>true</c> for the single token; <c>false</c> afterwards.</returns>
 public override bool IncrementToken()
 {
     if (done)
     {
         return false;
     }

     ClearAttributes();
     done = true;

     int    filled = 0;
     char[] chars  = termAtt.TermBuffer();
     int    read;

     // Keep filling the free tail of the buffer until a read returns 0.
     while ((read = input.Read(chars, filled, chars.Length - filled)) != 0)
     {
         filled += read;
         if (filled == chars.Length)
         {
             // Buffer is full: grow it before the next read.
             chars = termAtt.ResizeTermBuffer(1 + chars.Length);
         }
     }

     termAtt.SetTermLength(filled);
     finalOffset = CorrectOffset(filled);
     offsetAtt.SetOffset(CorrectOffset(0), finalOffset);
     return true;
 }
示例#14
0
        /// <summary>
        /// Appends the suffix registered in <c>suffixByTokenType</c> for the token's type to
        /// the term text. Tokens whose type has no (or an empty) suffix pass through unchanged.
        /// </summary>
        /// <returns><c>true</c> when a token was produced; <c>false</c> at end of input.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                // reached end of stream
                return false;
            }

            if (suffixByTokenType == null)
            {
                return true;
            }

            char[] suffix;
            if (!suffixByTokenType.TryGetValue(typeAtt.Type, out suffix)
                || suffix == null
                || suffix.Length == 0)
            {
                // Nothing to append for this token type.
                return true;
            }

            char[] buffer = termAtt.TermBuffer();
            int    length = termAtt.TermLength();

            // The buffer must hold the term AND the whole suffix. Checking only against the
            // term length lets Array.Copy run past the end when the spare capacity is
            // smaller than the suffix.
            int required = length + suffix.Length;
            if (buffer.Length < required)
            {
                buffer = termAtt.ResizeTermBuffer(required);
            }

            Array.Copy(suffix, 0, buffer, length, suffix.Length);
            termAtt.SetTermLength(required);

            return true;
        }
示例#15
0
        /// <summary>
        /// Emits each input token and then the pieces returned by <c>GetSplittedWord</c> for
        /// it. The pieces come out on later calls with a position increment of 0, on top of
        /// the attributes captured from the original token.
        /// </summary>
        /// <returns><c>true</c> when a token was produced; <c>false</c> at end of input.</returns>
        public override bool IncrementToken()
        {
            // While terms remain in the queue, dequeue and return them one at a time.
            if (splittedQueue.Count > 0)
            {
                var queued = splittedQueue.Dequeue();
                RestoreState(currentState);
                termAtt.SetTermBuffer(queued);
                posAtt.PositionIncrement = 0;
                return true;
            }

            // At the end of the input, stop right away.
            if (!input.IncrementToken())
            {
                return false;
            }

            var currentTerm = new string(termAtt.TermBuffer(), 0, termAtt.TermLength());

            if (!string.IsNullOrEmpty(currentTerm))
            {
                var pieces = GetSplittedWord(currentTerm);

                // No words came back: nothing to process, carry on with the token as it is.
                bool nothingToQueue = pieces == null || pieces.Length == 0;
                if (nothingToQueue)
                {
                    return true;
                }

                foreach (var piece in pieces)
                {
                    splittedQueue.Enqueue(piece);
                }
            }

            currentState = CaptureState();
            return true;
        }
示例#16
0
        /// <summary>
        /// Reads characters from <c>input</c> through <c>_ioBuffer</c> and collects the next run
        /// of characters accepted by <c>Helper.IsTokenChar</c> into the term buffer, passing
        /// each one through <c>Helper.Normalize</c>. No maximum token length is enforced here.
        /// </summary>
        /// <returns>
        /// <c>true</c> when a token was collected; <c>false</c> when the reader is exhausted and
        /// no token characters are pending.
        /// </returns>
        public override bool IncrementToken()
        {
            ClearAttributes();

            int length = 0;
            int start  = _bufferIndex;

            char[] buffer = _termAtt.TermBuffer();
            while (true)
            {
                // The current chunk is used up: account for it in _offset and read the next one.
                if (_bufferIndex >= _dataLen)
                {
                    _offset += _dataLen;
                    _dataLen = input.Read(_ioBuffer, 0, _ioBuffer.Length);
                    if (_dataLen <= 0)
                    {
                        _dataLen = 0; // so next offset += dataLen won't decrement offset
                        if (length > 0)
                        {
                            // Input ended in the middle of a token: emit what was collected.
                            break;
                        }

                        return(false);
                    }
                    _bufferIndex = 0;
                }

                char c = _ioBuffer[_bufferIndex++];

                if (Helper.IsTokenChar(c))
                {
                    // if it's a token char

                    if (length == 0)
                    {
                        // start of token: position of c counted across all chunks read so far
                        start = _offset + _bufferIndex - 1;
                    }
                    else if (length == buffer.Length)
                    {
                        // term buffer is full: grow it before appending
                        buffer = _termAtt.ResizeTermBuffer(1 + length);
                    }

                    buffer[length++] = Helper.Normalize(c); // buffer it, normalized
                }
                else if (length > 0)
                {
                    // at non-Letter w/ chars
                    break; // return 'em
                }
            }

            // Publish the collected length and the start/end offsets, each passed through CorrectOffset.
            _termAtt.SetTermLength(length);
            _offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));

            return(true);
        }
示例#17
0
 /// <summary>
 /// Advances to the next token and replaces every non-empty term with the output of
 /// <c>stemmer.Stem</c>.
 /// </summary>
 /// <returns><c>true</c> when a token was produced; <c>false</c> at end of input.</returns>
 public override bool IncrementToken()
 {
     if (!input.IncrementToken())
     {
         return false;
     }

     // Checking the length avoids allocating a string per token just to test for emptiness.
     int termLength = termAtt.TermLength();
     if (termLength > 0)
     {
         stemmer.Stem(termAtt.TermBuffer(), termLength, out char[] newTerm, out var newLength);
         // SetTermBuffer(buffer, offset, length) already sets the term length.
         termAtt.SetTermBuffer(newTerm, 0, newLength);
     }
     return true;
 }
示例#18
0
 /// <summary>
 /// Advances to the next token and runs <c>normalizer.Normalize</c> over its term buffer,
 /// adopting the length it returns as the new term length.
 /// </summary>
 /// <returns><c>true</c> when a token was produced; <c>false</c> at end of input.</returns>
 public override bool IncrementToken()
 {
     if (!input.IncrementToken())
     {
         return false;
     }

     char[] chars      = termAtt.TermBuffer();
     int    normalized = normalizer.Normalize(chars, termAtt.TermLength());
     termAtt.SetTermLength(normalized);
     return true;
 }
        /// <summary>
        /// Advances to the next token and runs <c>_stemmer.Stem</c> over its term buffer,
        /// adopting the length it returns as the new term length.
        /// </summary>
        /// <returns><c>true</c> when a token was produced; <c>false</c> at end of input.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] chars = _termAttr.TermBuffer();
            _termAttr.SetTermLength(_stemmer.Stem(chars, _termAttr.TermLength()));
            return true;
        }
示例#20
0
 /// <summary>
 /// Advances to the next token and passes its term to <c>Reverse</c>. When <c>marker</c> is not
 /// <c>NOMARKER</c>, the marker character is appended to the term before the call.
 /// </summary>
 /// <returns><c>true</c> when a token was produced; <c>false</c> at end of input.</returns>
 public override bool IncrementToken()
 {
     if (!input.IncrementToken())
     {
         return false;
     }

     int length = termAtt.TermLength();
     if (marker != NOMARKER)
     {
         // Make room for one more character and put the marker at the end.
         length++;
         termAtt.ResizeTermBuffer(length);
         termAtt.TermBuffer()[length - 1] = marker;
     }

     Reverse(termAtt.TermBuffer(), length);
     termAtt.SetTermLength(length);
     return true;
 }
示例#21
0
 /// <summary>
 /// Advances to the next token and, when <c>stemmer.Stem</c> returns <c>true</c> for its
 /// term, replaces the term with <c>stemmer.ToString()</c>.
 /// </summary>
 /// <returns><c>true</c> when a token was produced; <c>false</c> at end of input.</returns>
 public override bool IncrementToken()
 {
     if (!input.IncrementToken())
     {
         return false;
     }

     bool stemmed = stemmer.Stem(termAtt.TermBuffer(), termAtt.TermLength());
     if (stemmed)
     {
         termAtt.SetTermBuffer(stemmer.ToString());
     }
     return true;
 }
示例#22
0
 /// <summary>
 /// Advances to the next token and sets its type to the upper-cased first character of
 /// the term buffer.
 /// </summary>
 /// <returns><c>true</c> when a token was produced; <c>false</c> at end of input.</returns>
 public override bool IncrementToken()
 {
     if (!input.IncrementToken())
     {
         return false;
     }

     char initial = termAtt.TermBuffer()[0];
     typeAtt.Type = char.ToUpper(initial).ToString();
     return true;
 }
示例#23
0
 /// <summary>
 /// Advances to the next token and runs <c>stemmer.Stem</c> over its term buffer, adopting
 /// the length it returns as the new term length.
 /// </summary>
 /// <returns><c>true</c> when a token was produced; <c>false</c> at end of input.</returns>
 public override bool IncrementToken()
 {
     if (!input.IncrementToken())
     {
         return false;
     }

     char[] chars       = termAtt.TermBuffer();
     int    stemmedSize = stemmer.Stem(chars, termAtt.TermLength());
     termAtt.SetTermLength(stemmedSize);
     return true;
 }
        /// <summary>
        /// Advances to the next token and, when <c>stemmer.Stem</c> returns <c>true</c> for its
        /// term, replaces the term with the stemmer's result buffer.
        /// </summary>
        /// <returns><c>true</c> when a token was produced; <c>false</c> at end of input.</returns>
        public override bool IncrementToken()
        {
            if (input.IncrementToken())
            {
                bool changed = stemmer.Stem(termAtt.TermBuffer(), 0, termAtt.TermLength());
                if (changed)
                {
                    termAtt.SetTermBuffer(stemmer.ResultBuffer, 0, stemmer.ResultLength);
                }
                return true;
            }

            return false;
        }
 /// <summary>
 /// Advances the <c>Prefix</c> stream and copies its current attributes (term, position
 /// increment, flags, offsets, type, payload) into <paramref name="token"/>.
 /// </summary>
 /// <param name="token">Token instance that receives the attribute values.</param>
 /// <returns>The filled <paramref name="token"/>, or <c>null</c> when <c>Prefix</c> has no more tokens.</returns>
 private Token GetNextPrefixInputToken(Token token)
 {
     if (Prefix.IncrementToken())
     {
         token.SetTermBuffer(_pTermAtt.TermBuffer(), 0, _pTermAtt.TermLength());
         token.PositionIncrement = _pPosIncrAtt.PositionIncrement;
         token.Flags = _pFlagsAtt.Flags;
         token.SetOffset(_pOffsetAtt.StartOffset, _pOffsetAtt.EndOffset);
         token.Type = _pTypeAtt.Type;
         token.Payload = _pPayloadAtt.Payload;
         return token;
     }

     return null;
 }
示例#26
0
 /// <summary>
 /// Advances this stream and copies its current attributes (term, position increment,
 /// flags, offsets, type, payload) into <paramref name="token"/>.
 /// </summary>
 /// <param name="token">Token instance that receives the attribute values.</param>
 /// <returns>The filled <paramref name="token"/>, or <c>null</c> when this stream has no more tokens.</returns>
 private Token GetNextToken(Token token)
 {
     if (this.IncrementToken())
     {
         token.SetTermBuffer(_termAtt.TermBuffer(), 0, _termAtt.TermLength());
         token.PositionIncrement = _posIncrAtt.PositionIncrement;
         token.Flags = _flagsAtt.Flags;
         token.SetOffset(_offsetAtt.StartOffset, _offsetAtt.EndOffset);
         token.Type = _typeAtt.Type;
         token.Payload = _payloadAtt.Payload;
         return token;
     }

     return null;
 }
        /// <summary>
        /// Advances to the next token and strips a trailing possessive: an apostrophe
        /// (<c>'</c>, U+2019 or U+FF07) followed by <c>s</c> or <c>S</c>.
        /// </summary>
        /// <returns><c>true</c> when a token was produced; <c>false</c> at end of input.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] chars = termAtt.TermBuffer();
            int    count = termAtt.TermLength();

            if (count >= 2)
            {
                char beforeLast = chars[count - 2];
                char last       = chars[count - 1];

                bool isApostrophe = beforeLast == '\'' || beforeLast == '\u2019' || beforeLast == '\uFF07';
                bool isLetterS    = last == 's' || last == 'S';

                if (isApostrophe && isLetterS)
                {
                    // Drop the last two characters.
                    termAtt.SetTermLength(count - 2);
                }
            }

            return true;
        }
        /// <summary>
        /// Advances to the next token and lower-cases its term in place.
        /// </summary>
        /// <remarks>
        /// Lower-casing uses the invariant culture so the produced terms do not depend on the
        /// culture of the calling thread. With culture-sensitive lower-casing, the same text
        /// can yield different terms under different cultures (for example 'I' under Turkish
        /// casing rules).
        /// </remarks>
        /// <returns><c>true</c> when a token was produced; <c>false</c> at end of input.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] buffer = termAtt.TermBuffer();
            int    length = termAtt.TermLength();
            for (int i = 0; i < length; i++)
            {
                buffer[i] = System.Char.ToLowerInvariant(buffer[i]);
            }

            return true;
        }
示例#29
0
        /*
         * Advances the stream and, when the text in front of the earliest apostrophe in the
         * term is contained in the articles set, removes that text together with the
         * apostrophe. Returns false at end of input.
         */
        public override sealed bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] termBuffer = termAtt.TermBuffer();
            int    termLength = termAtt.TermLength();

            // Find the earliest position at which any of the apostrophe characters occurs.
            int minPoz = int.MaxValue;
            foreach (char apos in apostrophes)
            {
                int found = Array.IndexOf(termBuffer, apos, 0, termLength);
                if (found >= 0 && found < minPoz)
                {
                    minPoz = found;
                }
            }

            // An apostrophe has been found. If the prefix is an article strip it off.
            bool hasApostrophe = minPoz != int.MaxValue;
            if (hasApostrophe && articles.Contains(termBuffer, 0, minPoz))
            {
                int keepFrom = minPoz + 1;
                termAtt.SetTermBuffer(termBuffer, keepFrom, termLength - keepFrom);
            }

            return true;
        }
示例#30
0
        /// <summary>
        /// Advances <c>_input</c> and copies its current attributes (term, position increment,
        /// flags, offsets, type, payload) into <paramref name="token"/>.
        /// </summary>
        /// <param name="token">Token instance that receives the attribute values.</param>
        /// <returns>The filled <paramref name="token"/>, or <c>null</c> when <c>_input</c> has no more tokens.</returns>
        private Token GetNextInputToken(Token token)
        {
            if (_input.IncrementToken())
            {
                token.SetTermBuffer(_inTermAtt.TermBuffer(), 0, _inTermAtt.TermLength());
                token.PositionIncrement = _inPosIncrAtt.PositionIncrement;
                token.Flags = _inFlagsAtt.Flags;
                token.SetOffset(_inOffsetAtt.StartOffset, _inOffsetAtt.EndOffset);
                token.Type = _inTypeAtt.Type;
                token.Payload = _inPayloadAtt.Payload;
                return token;
            }

            return null;
        }