Example No. 1
 public override bool IncrementToken()
 {
     if (input.IncrementToken())
     {
         int newlen = stemmer.Stem(termAtt.TermBuffer(), termAtt.TermLength());
         termAtt.SetTermLength(newlen);
         return true;
     }
     else
     {
         return false;
     }
 }
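
An override like the one above lives inside a TokenFilter subclass that has pulled a term attribute off the stream. Below is a minimal, self-contained sketch of such a class, assuming the Lucene.Net 3.0.x-era attribute API (AddAttribute<ITermAttribute>() from Lucene.Net.Analysis.Tokenattributes); SampleStemFilter and SampleStemmer are hypothetical names standing in for a real stemming implementation.

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;

public sealed class SampleStemFilter : TokenFilter
{
    private readonly ITermAttribute termAtt;
    private readonly SampleStemmer stemmer = new SampleStemmer();

    public SampleStemFilter(TokenStream input) : base(input)
    {
        // The term attribute exposes the token's char buffer for in-place edits.
        termAtt = AddAttribute<ITermAttribute>();
    }

    public override bool IncrementToken()
    {
        if (!input.IncrementToken())
        {
            return false;
        }
        // Stem the current term in place, then shrink it to the stemmed length.
        int newLen = stemmer.Stem(termAtt.TermBuffer(), termAtt.TermLength());
        termAtt.SetTermLength(newLen);
        return true;
    }
}

// Hypothetical stemmer: trims a trailing 's' purely as a placeholder for a real algorithm.
public sealed class SampleStemmer
{
    public int Stem(char[] buffer, int length)
    {
        return (length > 1 && buffer[length - 1] == 's') ? length - 1 : length;
    }
}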
Example No. 2
        public override bool IncrementToken()
        {
            if (_buffer.Any())
            {
                var nextStem = _buffer.Dequeue();

                RestoreState(_savedState);
                _posIncAtt.PositionIncrement = 0;
                _termAtt.SetTermBuffer(nextStem.Stem, 0, nextStem.StemLength);
                return true;
            }

            if (!input.IncrementToken())
            {
                return false;
            }

            var newTerms = _dedup
                               ? _stemmer.UniqueStems(_termAtt.Term)
                               : _stemmer.Stem(_termAtt.Term);

            foreach (var newTerm in newTerms)
            {
                _buffer.Enqueue(newTerm);
            }

            if (_buffer.Count == 0)
            {
                // originally: we do not know this word, return it unchanged
                // changed: apply SlovakStemmer on words not found in dictionary (possible named entities)
                var currentTerm = new string(_termAtt.TermBuffer(), 0, _termAtt.TermLength());
                if (!string.IsNullOrEmpty(currentTerm))
                {
                    _slovakStemmer.Stem(_termAtt.TermBuffer(), _termAtt.TermLength(), out char[] newTerm, out var newLength);
                    _termAtt.SetTermBuffer(newTerm, 0, newLength);
                    _termAtt.SetTermLength(newLength);
                }
                return true;
            }

            var stem = _buffer.Dequeue();

            _termAtt.SetTermBuffer(stem.Stem, 0, stem.StemLength);

            if (_buffer.Count > 0)
            {
                _savedState = CaptureState();
            }

            return true;
        }
Example No. 3
 private void SetTermText(string token)
 {
     // Record the term string
     if (termAtt.TermLength() < token.Length)
     {
         termAtt.SetTermBuffer(token);
     }
     else // Perform a copy to save on memory operations
     {
         char[] buf = termAtt.TermBuffer();
         token.CopyTo(0, buf, 0, token.Length);
     }
     termAtt.SetTermLength(token.Length);
 }
        ///<summary>
        /// (non-Javadoc)
        /// <see cref="Lucene.Net.Analysis.TokenStream.IncrementToken()" />
        ///</summary>
        public override bool IncrementToken()
        {
            ClearAttributes();
            int posIncr = 1;

            while (true)
            {
                int tokenType = scanner.GetNextToken();

                if (tokenType == StandardTokenizerImpl.YYEOF)
                {
                    return false;
                }

                if (scanner.Yylength() <= maxTokenLength)
                {
                    posIncrAtt.PositionIncrement = posIncr;
                    scanner.GetText(termAtt);
                    int start = scanner.Yychar();
                    offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.TermLength()));
                    // This 'if' should be removed in the next release. For now, it converts
                    // invalid acronyms to HOST. When removed, only the 'else' part should
                    // remain.
                    if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
                    {
                        if (replaceInvalidAcronym)
                        {
                            typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST];
                            termAtt.SetTermLength(termAtt.TermLength() - 1); // remove extra '.'
                        }
                        else
                        {
                            typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
                        }
                    }
                    else
                    {
                        typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[tokenType];
                    }
                    return true;
                }
                // When we skip a too-long term, we still increment the
                // position increment
                else
                {
                    posIncr++;
                }
            }
        }
        public override bool IncrementToken()
        {
            if (input.IncrementToken())
            {
                char[] buffer = termAtt.TermBuffer();
                int bufferLength = termAtt.TermLength();

                if (bufferLength >= 2 &&
                    (buffer[bufferLength - 2] == '\'' || buffer[bufferLength - 2] == '\u2019' || buffer[bufferLength - 2] == '\uFF07') &&
                    (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
                {
                    termAtt.SetTermLength(bufferLength - 2); // Strip last 2 characters off
                }
                return true;
            }
            return false;
        }
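
Filters like the possessive-stripping one above are normally consumed through an Analyzer that chains a tokenizer and several filters. A short wiring sketch follows, assuming the classic Lucene.Net 3.0.x API (StandardTokenizer, LowerCaseFilter, PorterStemFilter); the class name is illustrative only.

using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;

public sealed class StemmingAnalyzer : Analyzer
{
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        // The tokenizer produces raw terms; each filter then edits the shared term
        // buffer in place, exactly as the IncrementToken overrides in these examples do.
        TokenStream stream = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_30, reader);
        stream = new LowerCaseFilter(stream);
        return new PorterStemFilter(stream);
    }
}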
Example No. 6
                public override bool IncrementToken()
                {
                    int count = input.Read(buffer, 0, buffer.Length);

                    if (done)
                    {
                        return false;
                    }
                    else
                    {
                        ClearAttributes();
                        done = true;
                        if (count == 1)
                        {
                            termAtt.TermBuffer()[0] = buffer[0];
                            termAtt.SetTermLength(1);
                        }
                        else
                        {
                            termAtt.SetTermLength(0);
                        }
                        return true;
                    }
                }
Example No. 7
 public override bool IncrementToken()
 {
     if (input.IncrementToken())
     {
         var currentTerm = new string(termAtt.TermBuffer(), 0, termAtt.TermLength());
         if (!string.IsNullOrEmpty(currentTerm))
         {
             stemmer.Stem(termAtt.TermBuffer(), termAtt.TermLength(), out char[] newTerm, out var newLength);
             termAtt.SetTermBuffer(newTerm, 0, newLength);
             termAtt.SetTermLength(newLength);
         }
         return true;
     }
     else
     {
         return false;
     }
 }
Example No. 8
 public override bool IncrementToken()
 {
     if (input.IncrementToken())
     {
         int len = termAtt.TermLength();
         if (marker != NOMARKER)
         {
             len++;
             termAtt.ResizeTermBuffer(len);
             termAtt.TermBuffer()[len - 1] = marker;
         }
         Reverse(termAtt.TermBuffer(), len);
         termAtt.SetTermLength(len);
         return true;
     }
     else
     {
         return false;
     }
 }
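
The filter above appends an optional marker character and then calls a Reverse helper that is not shown. A plausible stand-in, reversing the first len characters of the buffer in place, might look like the following (an illustration only; the real helper may differ, for example in whether it keeps surrogate pairs intact):

 private static void Reverse(char[] buffer, int len)
 {
     // Swap characters from both ends toward the middle; only the first len chars are touched.
     for (int i = 0, j = len - 1; i < j; i++, j--)
     {
         char tmp = buffer[i];
         buffer[i] = buffer[j];
         buffer[j] = tmp;
     }
 }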
Example No. 9
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                // reached end of stream
                return false;
            }

            // TODO: Limit this check to Hebrew Tokens only

            char[] buffer = termAtt.TermBuffer();
            int length = termAtt.TermLength(), j = 0;

            for (int i = 0; i < length; i++)
            {
                if (buffer[i] < 1455 || buffer[i] > 1476) // current position is not a Niqqud character
                {
                    buffer[j++] = buffer[i];
                }
            }
            termAtt.SetTermLength(j);
            return true;
        }
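
The loop above uses a read index i and a write index j to compact the buffer in place, then passes the write index to SetTermLength as the new term length. The same idea as a standalone helper, purely as a sketch (the 1455-1476 range mirrors the check above, which covers the Hebrew niqqud and related combining marks):

        // In-place compaction: keep only characters outside the filtered range and return
        // the new length, i.e. exactly the value the filter above hands to SetTermLength.
        static int StripFilteredChars(char[] buffer, int length)
        {
            int j = 0;
            for (int i = 0; i < length; i++)
            {
                if (buffer[i] < 1455 || buffer[i] > 1476) // keep non-niqqud characters
                {
                    buffer[j++] = buffer[i];
                }
            }
            return j;
        }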
Example No. 10
        public override bool IncrementToken()
        {
            ClearAttributes();

            int length = 0;
            int start = bufferIndex;

            char[] ioBuffer = bufferPool.Allocate();
            try
            {
                char[] buffer = termAtt.TermBuffer();
                while (true)
                {
                    if (bufferIndex >= dataLen)
                    {
                        offset += dataLen;
                        dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                        if (dataLen <= 0)
                        {
                            dataLen = 0; // so next offset += dataLen won't decrement offset
                            if (length > 0)
                            {
                                break;
                            }
                            return false;
                        }
                        bufferIndex = 0;
                    }

                    char c = ioBuffer[bufferIndex++];

                    if (IsTokenChar(c))
                    {
                        // if it's a token char

                        if (length == 0)
                        {
                            // start of token
                            start = offset + bufferIndex - 1;
                        }
                        else if (length == buffer.Length)
                        {
                            buffer = termAtt.ResizeTermBuffer(1 + length);
                        }

                        buffer[length++] = Normalize(c); // buffer it, normalized
                    }
                    else if (length > 0)
                    {
                        // at non-Letter w/ chars
                        break; // return 'em
                    }
                }

                termAtt.SetTermLength(length);
                offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
            }
            finally
            {
                if (ioBuffer != null)
                {
                    bufferPool.Free(ioBuffer);
                }
            }

            return true;
        }
Example No. 11
        public override bool IncrementToken()
        {
            ClearAttributes();

            string nextToken;

            HebMorph.Tokenizer.TokenType tokenType;

            // Used to loop over certain noise cases
            while (true)
            {
                tokenType = hebMorphTokenizer.NextToken(out nextToken);
                if (tokenType == 0)
                {
                    return false; // EOS
                }
                // Ignore "words" which are actually only prefixes in a single word.
                // This first case is easy to spot, since the prefix and the following word will be
                // separated by a dash marked as a construct (סמיכות) by the Tokenizer
                if ((tokenType & HebMorph.Tokenizer.TokenType.Construct) > 0)
                {
                    if (IsLegalPrefix(nextToken))
                    {
                        continue;
                    }
                }

                // This second case is a bit more complex. We take a risk of splitting a valid acronym or
                // abbreviated word into two, so we send it to an external function to analyze the word, and
                // get a possibly corrected word. Examples for words we expect to simplify by this operation
                // are ה"שטיח", ש"המידע.
                if ((tokenType & HebMorph.Tokenizer.TokenType.Acronym) > 0)
                {
                    nextToken = TryStrippingPrefix(nextToken);

                    // Re-detect acronym, in case it was a false positive
                    if (nextToken.IndexOf('"') == -1)
                    {
                        tokenType &= ~HebMorph.Tokenizer.TokenType.Acronym; // clear the Acronym flag
                    }
                }

                break;
            }

            // Record the term string
            if (termAtt.TermLength() < nextToken.Length)
            {
                termAtt.SetTermBuffer(nextToken);
            }
            else // Perform a copy to save on memory operations
            {
                char[] buf = termAtt.TermBuffer();
                nextToken.CopyTo(0, buf, 0, nextToken.Length);
            }
            termAtt.SetTermLength(nextToken.Length);

            offsetAtt.SetOffset(CorrectOffset(hebMorphTokenizer.Offset), CorrectOffset(hebMorphTokenizer.Offset + hebMorphTokenizer.LengthInSource));

            if ((tokenType & HebMorph.Tokenizer.TokenType.Hebrew) > 0)
            {
                if ((tokenType & HebMorph.Tokenizer.TokenType.Acronym) > 0)
                {
                    typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.Acronym);
                }
                else if ((tokenType & HebMorph.Tokenizer.TokenType.Construct) > 0)
                {
                    typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.Construct);
                }
                else
                {
                    typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.Hebrew);
                }
            }
            else if ((tokenType & HebMorph.Tokenizer.TokenType.Numeric) > 0)
            {
                typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.Numeric);
            }
            else
            {
                typeAtt.Type = TokenTypeSignature(TOKEN_TYPES.NonHebrew);
            }

            return true;
        }
Example No. 12
        public override bool IncrementToken()
        {
            ClearAttributes();
            int length = 0;
            int start = _bufferIndex;

            char[] buffer = _termAtt.TermBuffer();
            while (true)
            {
                if (_bufferIndex >= _dataLen)
                {
                    _offset += _dataLen;
                    _dataLen = input.Read(_ioBuffer, 0, _ioBuffer.Length);
                    if (_dataLen <= 0)
                    {
                        _dataLen = 0; // so next offset += dataLen won't decrement offset
                        if (length > 0)
                        {
                            break;
                        }
                        else
                        {
                            return false;
                        }
                    }
                    _bufferIndex = 0;
                }

                char c = _ioBuffer[_bufferIndex++];

                if (IsTokenChar(c))
                {
                    // if it's a token char

                    if (length == 0)
                    {
                        // start of token
                        start = _offset + _bufferIndex - 1;
                    }
                    else if (length == buffer.Length)
                    {
                        buffer = _termAtt.ResizeTermBuffer(1 + length);
                    }

                    buffer[length++] = Normalize(c); // buffer it, normalized

                    if (length == MaxWordLen)
                    {
                        // buffer overflow!
                        break;
                    }
                }
                else if (length > 0)
                {
                    // at non-Letter w/ chars
                    break; // return 'em
                }
            }

            _termAtt.SetTermLength(length);
            _offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
            return true;
        }
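
Both character-reading tokenizers above delegate to two hooks, IsTokenChar and Normalize, that the concrete class supplies. In a CharTokenizer-style base class these are typically overridable; a letter-lowercasing variant might define them roughly as follows (a sketch under that assumption, not the actual subclasses):

        protected override bool IsTokenChar(char c)
        {
            // Token characters are letters; anything else terminates the current token.
            return char.IsLetter(c);
        }

        protected override char Normalize(char c)
        {
            // Store a normalized (lower-cased) form in the term buffer.
            return char.ToLowerInvariant(c);
        }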