示例#1
0
        /// <summary>
        /// Advances the wrapped stream and normalizes the current term in place.
        /// Apostrophe-typed tokens ending in <c>'s</c> / <c>'S</c> are lower-cased and lose that ending,
        /// acronym-typed tokens are lower-cased and lose their dots, and every other token is
        /// lower-cased and skipped for as long as it is contained in <c>stopWords</c>.
        /// </summary>
        /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] chars = termAtt.TermBuffer();
            int count = termAtt.TermLength();
            String tokenType = typeAtt.Type;

            bool isPossessive = tokenType == APOSTROPHE_TYPE
                                && count >= 2
                                && chars[count - 2] == '\''
                                && (chars[count - 1] == 's' || chars[count - 1] == 'S');

            if (isPossessive)
            {
                // Lower-case everything in front of the trailing 's, then cut the last two characters.
                int kept = count - 2;
                for (int k = 0; k < kept; k++)
                {
                    chars[k] = ToLower(chars[k]);
                }
                termAtt.SetTermLength(kept);
                return true;
            }

            if (tokenType == ACRONYM_TYPE)
            {
                // Compact the buffer over the dots while lower-casing the surviving characters.
                int write = 0;
                for (int read = 0; read < count; read++)
                {
                    char current = chars[read];
                    if (current == '.')
                    {
                        continue;
                    }
                    chars[write++] = ToLower(current);
                }
                termAtt.SetTermLength(write);
                return true;
            }

            while (true)
            {
                for (int k = 0; k < count; k++)
                {
                    chars[k] = ToLower(chars[k]);
                }

                if (!stopWords.Contains(chars, 0, count))
                {
                    return true;
                }

                // The current token is a stop word: pull the next one and re-read its length and buffer.
                if (!input.IncrementToken())
                {
                    return false;
                }
                count = termAtt.TermLength();
                chars = termAtt.TermBuffer();
            }
        }
示例#2
0
        /// <summary>
        /// Emits one stem per call. Stems produced by <c>_stemmer</c> for an input token are queued;
        /// the first one replaces the term and the rest are emitted on later calls with a position
        /// increment of 0. When <c>_stemmer</c> yields nothing, the term is passed through
        /// <c>_slovakStemmer</c> instead.
        /// </summary>
        /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
        public override Boolean IncrementToken()
        {
            // Stems left over from the previous input token are emitted before reading more input.
            if (_buffer.Count > 0)
            {
                var pending = _buffer.Dequeue();

                RestoreState(_savedState);
                _posIncAtt.PositionIncrement = 0;
                _termAtt.SetTermBuffer(pending.Stem, 0, pending.StemLength);
                return true;
            }

            if (!input.IncrementToken())
            {
                return false;
            }

            var candidates = _dedup
                                 ? _stemmer.UniqueStems(_termAtt.Term)
                                 : _stemmer.Stem(_termAtt.Term);

            foreach (var candidate in candidates)
            {
                _buffer.Enqueue(candidate);
            }

            if (_buffer.Count > 0)
            {
                var first = _buffer.Dequeue();
                _termAtt.SetTermBuffer(first.Stem, 0, first.StemLength);

                // Further stems remain: remember the attribute state so they can be emitted from it.
                if (_buffer.Count > 0)
                {
                    _savedState = CaptureState();
                }

                return true;
            }

            // Originally an unknown word was returned unchanged; now the Slovak stemmer is applied
            // to words not found in the dictionary (possible named entities).
            int termLength = _termAtt.TermLength();
            if (termLength > 0)
            {
                _slovakStemmer.Stem(_termAtt.TermBuffer(), termLength, out char[] stemmed, out var stemmedLength);
                _termAtt.SetTermBuffer(stemmed, 0, stemmedLength);
                _termAtt.SetTermLength(stemmedLength);
            }

            return true;
        }
        /// <summary>
        /// Reads the next token from the scanner into the term, offset, type and
        /// position-increment attributes. Tokens longer than <c>maxTokenLength</c> are skipped,
        /// and each skipped token adds one to the position increment of the next emitted token.
        /// </summary>
        /// <returns><c>false</c> when the scanner reports end of input; otherwise <c>true</c>.</returns>
        public override bool IncrementToken()
        {
            ClearAttributes();

            for (int positionGap = 1; ; positionGap++)
            {
                int kind = scanner.GetNextToken();
                if (kind == StandardTokenizerImpl.YYEOF)
                {
                    return false;
                }

                if (scanner.Yylength() > maxTokenLength)
                {
                    // Too-long term: skip it, but still let it count toward the position increment.
                    continue;
                }

                posIncrAtt.PositionIncrement = positionGap;
                scanner.GetText(termAtt);
                int begin = scanner.Yychar();
                offsetAtt.SetOffset(CorrectOffset(begin), CorrectOffset(begin + termAtt.TermLength()));

                if (kind != StandardTokenizerImpl.ACRONYM_DEP)
                {
                    typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[kind];
                }
                else if (replaceInvalidAcronym)
                {
                    // Transitional handling, meant to be removed in a later release:
                    // invalid acronyms are re-typed as HOST and lose their extra trailing '.'.
                    typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST];
                    termAtt.SetTermLength(termAtt.TermLength() - 1);
                }
                else
                {
                    typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
                }

                return true;
            }
        }
示例#4
0
 /// <summary>
 /// Emits the n-grams of each input term, one per call: for every gram size from
 /// <c>minGram</c> to <c>maxGram</c>, every start position in the term is visited in order.
 /// </summary>
 /// <returns><c>true</c> when an n-gram was emitted; <c>false</c> once the input is exhausted.</returns>
 public override bool IncrementToken()
 {
     while (true)
     {
         if (curTermBuffer == null)
         {
             if (!input.IncrementToken())
             {
                 return false;
             }

             // Take a private copy of the term so later attribute writes cannot disturb it.
             curTermBuffer = (char[])termAtt.TermBuffer().Clone();
             curTermLength = termAtt.TermLength();
             curGramSize   = minGram;
             curPos        = 0;
             tokStart      = offsetAtt.StartOffset;
         }

         while (curGramSize <= maxGram)
         {
             if (curPos + curGramSize <= curTermLength)
             {
                 int gramStart = curPos;
                 ClearAttributes();
                 termAtt.SetTermBuffer(curTermBuffer, gramStart, curGramSize);
                 offsetAtt.SetOffset(tokStart + gramStart, tokStart + gramStart + curGramSize);
                 curPos = gramStart + 1;
                 return true;
             }

             // No room left for this gram size: move to the next size and restart at the beginning.
             curGramSize++;
             curPos = 0;
         }

         // Every gram size has been produced for this term; fetch a new term on the next pass.
         curTermBuffer = null;
     }
 }
        /// <summary>
        /// Emits each input token and then, on later calls, every word returned for it by
        /// <c>GetSplittedWord</c>, with a position increment of 0.
        /// </summary>
        /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
        public override bool IncrementToken()
        {
            // While the queue holds words, hand them out one per call.
            if (splittedQueue.Count > 0)
            {
                var queuedWord = splittedQueue.Dequeue();
                RestoreState(currentState);
                termAtt.SetTermBuffer(queuedWord);
                posAtt.PositionIncrement = 0;
                return true;
            }

            // At the end of the input there is nothing more to do.
            if (!input.IncrementToken())
            {
                return false;
            }

            var term = new string(termAtt.TermBuffer(), 0, termAtt.TermLength());
            if (string.IsNullOrEmpty(term))
            {
                currentState = CaptureState();
                return true;
            }

            var parts = GetSplittedWord(term);

            // No words came back: emit the token as it is, without further processing.
            if (parts == null || parts.Length == 0)
            {
                return true;
            }

            foreach (var part in parts)
            {
                splittedQueue.Enqueue(part);
            }

            currentState = CaptureState();
            return true;
        }
示例#6
0
            /// <summary>
            /// Emits the characters of each input term as separate one-character tokens.
            /// The first character of an input term gets a position increment of 2, the following
            /// characters get 1; offsets are taken from the running <c>_offsetInStream</c> counter.
            /// </summary>
            /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
            public override bool IncrementToken()
            {
                int increment = 1;

                bool needsNextTerm = _buffer == null || _offset >= _length;
                if (needsNextTerm)
                {
                    if (!_input.IncrementToken())
                    {
                        return false;
                    }

                    _offset = 0;
                    _buffer = _termAttribute.TermBuffer();
                    _length = _termAttribute.TermLength();

                    // Starting a new input term costs one extra position and one extra stream offset.
                    increment = 2;
                    _offsetInStream++;
                }

                _offsetAttribute.SetOffset(_offsetInStream, _offsetInStream + 1);
                _offsetInStream++;

                _positionIncrementAttribute.PositionIncrement = increment;

                _termAttribute.SetTermLength(1);
                char current = _buffer[_offset++];
                _termAttribute.SetTermBuffer(current.ToString());

                return true;
            }
示例#7
0
    /// <summary>
    /// Emits each input token followed, on later calls, by its synonyms from
    /// <c>SynonymEngine.GetSynonyms</c>; synonyms equal to the term itself are skipped and
    /// the emitted synonyms get a position increment of 0.
    /// </summary>
    /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
    public override bool IncrementToken()
    {
        if (splittedQueue.Count > 0)
        {
            string queuedSynonym = splittedQueue.Dequeue();
            RestoreState(currentState);
            _termAttr.SetTermBuffer(queuedSynonym);
            _posAttr.PositionIncrement = 0;
            return true;
        }

        if (!input.IncrementToken())
        {
            return false;
        }

        var currentTerm = new string(_termAttr.TermBuffer(), 0, _termAttr.TermLength());
        IEnumerable<string> synonyms = SynonymEngine.GetSynonyms(currentTerm);

        if (synonyms == null)
        {
            // No synonym list for this term: the token itself must still be emitted.
            // Returning false here would report end-of-stream and drop it and everything after it.
            return true;
        }

        foreach (string syn in synonyms)
        {
            if (!currentTerm.Equals(syn))
            {
                splittedQueue.Enqueue(syn);
            }
        }

        if (splittedQueue.Count > 0)
        {
            // Remember this token's attributes so the queued synonyms are emitted from them.
            currentState = CaptureState();
        }

        return true;
    }
示例#8
0
        /// <summary>
        /// Skips input tokens until one passes the filter: the term must not be in
        /// <c>stopTable</c>, and its first character must be either an "other letter"
        /// or a cased letter with the term longer than one character.
        /// </summary>
        /// <returns><c>true</c> when a token passed; <c>false</c> once the input is exhausted.</returns>
        public override bool IncrementToken()
        {
            while (input.IncrementToken())
            {
                char[] chars = termAtt.TermBuffer();
                int count = termAtt.TermLength();

                // Open question from the original author: why not key off the token type here,
                // assuming ChineseTokenizer comes first?
                if (stopTable.Contains(chars, 0, count))
                {
                    continue;
                }

                UnicodeCategory category = char.GetUnicodeCategory(chars[0]);

                if (category == UnicodeCategory.OtherLetter)
                {
                    // One Chinese character counts as one Chinese word.
                    // Chinese word extraction is to be added here later.
                    return true;
                }

                bool isCasedLetter = category == UnicodeCategory.LowercaseLetter
                                     || category == UnicodeCategory.UppercaseLetter;

                // An English word/token must be longer than one character.
                if (isCasedLetter && count > 1)
                {
                    return true;
                }
            }

            return false;
        }
        /// <summary>
        /// Advances the wrapped stream and tidies the current term in place:
        /// a trailing <c>'s</c> / <c>'S</c> is cut from apostrophe-typed tokens and
        /// dots are removed from acronym-typed tokens.
        /// </summary>
        /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] chars = termAtt.TermBuffer();
            int count = termAtt.TermLength();
            System.String tokenType = typeAtt.Type;

            // Token types are compared by reference, not by string content.
            bool isPossessive = object.ReferenceEquals(tokenType, APOSTROPHE_TYPE)
                                && count >= 2
                                && chars[count - 2] == '\''
                                && (chars[count - 1] == 's' || chars[count - 1] == 'S');

            if (isPossessive)
            {
                // Drop the last two characters.
                termAtt.SetTermLength(count - 2);
            }
            else if (object.ReferenceEquals(tokenType, ACRONYM_TYPE))
            {
                // Compact the buffer over the dots.
                int write = 0;
                for (int read = 0; read < count; read++)
                {
                    char current = chars[read];
                    if (current == '.')
                    {
                        continue;
                    }
                    chars[write++] = current;
                }
                termAtt.SetTermLength(write);
            }

            return true;
        }
示例#10
0
        /// <summary>
        /// Advances the wrapped stream and splits the term at the first <c>delimiter</c>:
        /// the text before it becomes the term and the text after it is encoded as the payload.
        /// Without a delimiter the term is left unchanged and the payload is set to null.
        /// </summary>
        /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] chars = termAtt.TermBuffer();
            int count = termAtt.TermLength();

            // Find the first delimiter; everything needed is known once it is located.
            int split = -1;
            for (int k = 0; k < count; k++)
            {
                if (chars[k] == delimiter)
                {
                    split = k;
                    break;
                }
            }

            if (split < 0)
            {
                // No delimiter in this term.
                payAtt.Payload = null;
            }
            else
            {
                int payloadStart = split + 1;
                termAtt.SetTermBuffer(chars, 0, split);
                payAtt.Payload = encoder.Encode(chars, payloadStart, count - payloadStart);
            }

            return true;
        }
示例#11
0
        /// <summary>
        /// Advances the wrapped stream and, when the <c>BestBetsWordForms</c> resources hold a
        /// non-empty string keyed by the current term, replaces the term with that string.
        /// </summary>
        /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            string term = new string(termAtt.TermBuffer(), 0, termAtt.TermLength());
            string replaceterm = BestBetsWordForms.ResourceManager.GetString(term);

            if (!string.IsNullOrEmpty(replaceterm))
            {
                // The resource lookup matched, so the term is swapped for the stored form.
                termAtt.SetTermBuffer(replaceterm);
            }

            return true;
        }
        /// <summary>
        /// Emits the tokens queued in <c>tokens</c> one per call. When the queue is empty, the
        /// next input token is copied into <c>wrapper</c> and handed to <c>Decompose</c> to refill it.
        /// </summary>
        /// <returns>
        /// <c>false</c> when the input is exhausted or <c>Decompose</c> queued nothing; otherwise <c>true</c>.
        /// </returns>
        public sealed override bool IncrementToken()
        {
            if (tokens.Count == 0)
            {
                if (!input.IncrementToken())
                {
                    return false;
                }

                // Snapshot the current attribute values into the reusable wrapper token.
                wrapper.SetTermBuffer(termAtt.TermBuffer(), 0, termAtt.TermLength());
                wrapper.StartOffset       = offsetAtt.StartOffset;
                wrapper.EndOffset         = offsetAtt.EndOffset;
                wrapper.Flags             = flagsAtt.Flags;
                wrapper.Type              = typeAtt.Type;
                wrapper.PositionIncrement = posIncAtt.PositionIncrement;
                wrapper.Payload           = payloadAtt.Payload;

                Decompose(wrapper);

                if (tokens.Count == 0)
                {
                    return false;
                }
            }

            setToken(tokens.First.Value);
            tokens.RemoveFirst();
            return true;
        }
示例#13
0
        /// <summary>
        /// Advances the wrapped stream and appends to the term the suffix registered in
        /// <c>suffixByTokenType</c> for the token's type. Tokens whose type has no (or an empty)
        /// suffix are passed through unchanged.
        /// </summary>
        /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                // Reached the end of the stream.
                return false;
            }

            if (suffixByTokenType == null)
            {
                return true;
            }

            char[] suffix;
            if (!suffixByTokenType.TryGetValue(typeAtt.Type, out suffix) || suffix == null || suffix.Length == 0)
            {
                return true;
            }

            int length = termAtt.TermLength();
            int newLength = length + suffix.Length;
            char[] buffer = termAtt.TermBuffer();

            // Grow whenever the suffix does not fit. Checking only "buffer is already full"
            // lets Array.Copy throw when the buffer has some, but not enough, spare room.
            if (buffer.Length < newLength)
            {
                buffer = termAtt.ResizeTermBuffer(newLength);
            }

            Array.Copy(suffix, 0, buffer, length, suffix.Length);
            termAtt.SetTermLength(newLength);

            return true;
        }
示例#14
0
 /// <summary>
 /// Advances the wrapped stream and replaces a non-empty term with the output of <c>stemmer.Stem</c>.
 /// </summary>
 /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
 public override bool IncrementToken()
 {
     if (!input.IncrementToken())
     {
         return false;
     }

     int termLength = termAtt.TermLength();
     if (termLength > 0)
     {
         stemmer.Stem(termAtt.TermBuffer(), termLength, out char[] stemmed, out var stemmedLength);
         termAtt.SetTermBuffer(stemmed, 0, stemmedLength);
         termAtt.SetTermLength(stemmedLength);
     }

     return true;
 }
示例#15
0
 /// <summary>
 /// Advances the wrapped stream, runs <c>normalizer.Normalize</c> over the term buffer
 /// and sets the term length to the value it returns.
 /// </summary>
 /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
 public override bool IncrementToken()
 {
     if (!input.IncrementToken())
     {
         return false;
     }

     termAtt.SetTermLength(normalizer.Normalize(termAtt.TermBuffer(), termAtt.TermLength()));
     return true;
 }
        /// <summary>
        /// Advances the wrapped stream, runs <c>_stemmer.Stem</c> over the term buffer
        /// and sets the term length to the value it returns.
        /// </summary>
        /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            var stemmedLength = _stemmer.Stem(_termAttr.TermBuffer(), _termAttr.TermLength());
            _termAttr.SetTermLength(stemmedLength);
            return true;
        }
示例#17
0
 /// <summary>
 /// Advances the wrapped stream and, when <c>stemmer.Stem</c> returns true for the term,
 /// replaces the term with <c>stemmer.ToString()</c>.
 /// </summary>
 /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
 public override bool IncrementToken()
 {
     if (!input.IncrementToken())
     {
         return false;
     }

     bool stemmed = stemmer.Stem(termAtt.TermBuffer(), termAtt.TermLength());
     if (stemmed)
     {
         termAtt.SetTermBuffer(stemmer.ToString());
     }

     return true;
 }
示例#18
0
 /// <summary>
 /// Advances the wrapped stream, runs <c>stemmer.Stem</c> over the term buffer
 /// and sets the term length to the value it returns.
 /// </summary>
 /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
 public override bool IncrementToken()
 {
     bool hasToken = input.IncrementToken();
     if (hasToken)
     {
         termAtt.SetTermLength(stemmer.Stem(termAtt.TermBuffer(), termAtt.TermLength()));
     }

     return hasToken;
 }
        /// <summary>
        /// Advances the wrapped stream and, when <c>stemmer.Stem</c> returns true for the term,
        /// replaces the term with the stemmer's result buffer.
        /// </summary>
        /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
        public override bool IncrementToken()
        {
            if (input.IncrementToken())
            {
                bool stemmed = stemmer.Stem(termAtt.TermBuffer(), 0, termAtt.TermLength());
                if (stemmed)
                {
                    termAtt.SetTermBuffer(stemmer.ResultBuffer, 0, stemmer.ResultLength);
                }

                return true;
            }

            return false;
        }
 /// <summary>
 /// Advances the <c>Prefix</c> stream and copies its current attribute values into <paramref name="token"/>.
 /// </summary>
 /// <param name="token">Token instance that receives the attribute values.</param>
 /// <returns>The filled <paramref name="token"/>, or null when the prefix stream is exhausted.</returns>
 private Token GetNextPrefixInputToken(Token token)
 {
     if (Prefix.IncrementToken())
     {
         token.SetTermBuffer(_pTermAtt.TermBuffer(), 0, _pTermAtt.TermLength());
         token.SetOffset(_pOffsetAtt.StartOffset, _pOffsetAtt.EndOffset);
         token.PositionIncrement = _pPosIncrAtt.PositionIncrement;
         token.Type              = _pTypeAtt.Type;
         token.Flags             = _pFlagsAtt.Flags;
         token.Payload           = _pPayloadAtt.Payload;
         return token;
     }

     return null;
 }
示例#21
0
 /// <summary>
 /// Advances this stream and copies its current attribute values into <paramref name="token"/>.
 /// </summary>
 /// <param name="token">Token instance that receives the attribute values.</param>
 /// <returns>The filled <paramref name="token"/>, or null when this stream is exhausted.</returns>
 private Token GetNextToken(Token token)
 {
     if (this.IncrementToken())
     {
         token.SetTermBuffer(_termAtt.TermBuffer(), 0, _termAtt.TermLength());
         token.SetOffset(_offsetAtt.StartOffset, _offsetAtt.EndOffset);
         token.PositionIncrement = _posIncrAtt.PositionIncrement;
         token.Type              = _typeAtt.Type;
         token.Flags             = _flagsAtt.Flags;
         token.Payload           = _payloadAtt.Payload;
         return token;
     }

     return null;
 }
示例#22
0
        /// <summary>
        /// Advances the wrapped stream and, when the text in front of the term's first
        /// apostrophe is contained in <c>articles</c>, removes that text and the apostrophe.
        /// </summary>
        /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
        public override sealed bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] chars = termAtt.TermBuffer();
            int count = termAtt.TermLength();
            int apostropheKinds = apostrophes.Length;

            // Locate the earliest position that holds any of the configured apostrophe characters.
            int firstApostrophe = -1;
            for (int pos = 0; pos < count && firstApostrophe < 0; pos++)
            {
                for (int a = 0; a < apostropheKinds; a++)
                {
                    if (chars[pos] == apostrophes[a])
                    {
                        firstApostrophe = pos;
                        break;
                    }
                }
            }

            // An apostrophe was found: strip the prefix when it is a known article.
            if (firstApostrophe >= 0 && articles.Contains(chars, 0, firstApostrophe))
            {
                int restStart = firstApostrophe + 1;
                termAtt.SetTermBuffer(chars, restStart, count - restStart);
            }

            return true;
        }
        /// <summary>
        /// Advances the wrapped stream and lower-cases the term in place, one UTF-16 code unit at a time.
        /// </summary>
        /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] buffer = termAtt.TermBuffer();
            int    length = termAtt.TermLength();
            for (int i = 0; i < length; i++)
            {
                // Invariant casing keeps the produced terms independent of the thread's culture.
                // Char.ToLower(char) follows the current culture, so the same text could be
                // lower-cased differently on differently configured machines.
                buffer[i] = System.Char.ToLowerInvariant(buffer[i]);
            }

            return true;
        }
 /// <summary>
 /// Skips input tokens until one has a term length between <c>min</c> and <c>max</c>, both inclusive.
 /// </summary>
 /// <returns><c>true</c> when such a token was found; <c>false</c> once the input is exhausted.</returns>
 public override bool IncrementToken()
 {
     while (input.IncrementToken())
     {
         var termLen = termAtt.TermLength();
         bool outOfRange = termLen < min || termLen > max;
         if (!outOfRange)
         {
             return true;
         }
         // Note from the original author: the token is ignored here, but should each part of it be indexed?
     }

     // Reached the end of the stream.
     return false;
 }
示例#25
0
        /// <summary>
        /// Advances the <c>_input</c> stream and copies its current attribute values into <paramref name="token"/>.
        /// </summary>
        /// <param name="token">Token instance that receives the attribute values.</param>
        /// <returns>The filled <paramref name="token"/>, or null when the input stream is exhausted.</returns>
        private Token GetNextInputToken(Token token)
        {
            if (_input.IncrementToken())
            {
                token.SetTermBuffer(_inTermAtt.TermBuffer(), 0, _inTermAtt.TermLength());
                token.SetOffset(_inOffsetAtt.StartOffset, _inOffsetAtt.EndOffset);
                token.PositionIncrement = _inPosIncrAtt.PositionIncrement;
                token.Type              = _inTypeAtt.Type;
                token.Flags             = _inFlagsAtt.Flags;
                token.Payload           = _inPayloadAtt.Payload;
                return token;
            }

            return null;
        }
        /// <summary>
        /// Advances the wrapped stream and cuts a trailing possessive from the term: an
        /// apostrophe (<c>'</c>, U+2019 or U+FF07) followed by <c>s</c> or <c>S</c>.
        /// </summary>
        /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] chars = termAtt.TermBuffer();
            int count = termAtt.TermLength();

            if (count >= 2)
            {
                char beforeLast = chars[count - 2];
                char last = chars[count - 1];

                bool endsWithApostrophe = beforeLast == '\'' || beforeLast == '\u2019' || beforeLast == '\uFF07';
                bool endsWithS = last == 's' || last == 'S';

                if (endsWithApostrophe && endsWithS)
                {
                    // Drop the apostrophe and the letter after it.
                    termAtt.SetTermLength(count - 2);
                }
            }

            return true;
        }
示例#27
0
 /// <summary>
 /// Advances the wrapped stream and lower-cases the term in place, one UTF-16 code unit at a time.
 /// </summary>
 /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
 public sealed override bool IncrementToken()
 {
     if (!input.IncrementToken())
     {
         return false;
     }

     char[] chArray = termAtt.TermBuffer();
     int    chLen   = termAtt.TermLength();
     for (int i = 0; i < chLen; i++)
     {
         // Invariant casing keeps the produced terms independent of the thread's culture.
         // char.ToLower(char) follows the current culture, so the same text could be
         // lower-cased differently on differently configured machines.
         chArray[i] = char.ToLowerInvariant(chArray[i]);
     }

     return true;
 }
示例#28
0
        /// <summary>
        /// Skips input tokens whose term is already recorded in <c>previous</c>. The first
        /// unseen term is recorded and emitted, with the position increments of the skipped
        /// tokens added to its own.
        /// </summary>
        /// <returns><c>true</c> when an unseen term was found; <c>false</c> once the input is exhausted.</returns>
        public override bool IncrementToken()
        {
            int skipped = 0;

            while (input.IncrementToken())
            {
                string text = new string(termAtt.TermBuffer(), 0, termAtt.TermLength());

                if (previous.Contains(text))
                {
                    // Duplicate: drop it, but carry its position increment over to the next emitted token.
                    skipped += posIncAtt.PositionIncrement;
                    continue;
                }

                previous.Add(text);
                posIncAtt.PositionIncrement = posIncAtt.PositionIncrement + skipped;
                return true;
            }

            return false;
        }
示例#29
0
 /// <summary>
 /// Advances the wrapped stream and replaces every character of the term with the
 /// result of <c>lowerCase</c>, in place.
 /// </summary>
 /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
 public override bool IncrementToken()
 {
     if (!input.IncrementToken())
     {
         return false;
     }

     char[] chars = termAtt.TermBuffer();
     int count = termAtt.TermLength();

     // TODO: iterate code points to support supplementary characters.
     for (int k = 0; k < count; k++)
     {
         chars[k] = (char)lowerCase(chars[k]);
     }

     return true;
 }
示例#30
0
 /// <summary>
 /// Advances the wrapped stream, appends <c>marker</c> to the term unless it equals
 /// <c>NOMARKER</c>, and passes the term buffer and resulting length to <c>Reverse</c>.
 /// </summary>
 /// <returns><c>true</c> when a token is available; <c>false</c> once the input is exhausted.</returns>
 public override bool IncrementToken()
 {
     if (!input.IncrementToken())
     {
         return false;
     }

     int resultLength = termAtt.TermLength();

     bool hasMarker = marker != NOMARKER;
     if (hasMarker)
     {
         // Make room for one more character and place the marker after the last term character.
         termAtt.ResizeTermBuffer(resultLength + 1);
         termAtt.TermBuffer()[resultLength] = marker;
         resultLength++;
     }

     Reverse(termAtt.TermBuffer(), resultLength);
     termAtt.SetTermLength(resultLength);
     return true;
 }