Example #1
0
        /// <summary>
        /// Advances to the next word token. When the words of the current sentence are
        /// used up, pulls the next sentence from the wrapped input, segments it into
        /// words and starts emitting those.
        /// </summary>
        /// <returns>
        /// <c>true</c> if a word token was written to the attributes; <c>false</c> when
        /// the input has no more sentences, or when a sentence produced no words.
        /// </returns>
        public override bool IncrementToken()
        {
            if (tokenIter is null || !tokenIter.MoveNext())
            {
                // There are no remaining tokens from the current sentence... are there more sentences?
                if (!m_input.IncrementToken())
                {
                    return false; // no more sentences, end of stream!
                }

                tokStart = offsetAtt.StartOffset;
                tokEnd = offsetAtt.EndOffset;

                // If the length implied by the start + end offsets doesn't match the term text
                // then assume this is a synonym and don't adjust the offsets.
                hasIllegalOffsets = (tokStart + termAtt.Length) != tokEnd;

                // A new sentence is available: process it.
                tokenBuffer = wordSegmenter.SegmentSentence(termAtt.ToString(), tokStart);

                // Release the exhausted enumerator of the previous sentence before replacing it,
                // so it is not left undisposed each time a new sentence is started.
                tokenIter?.Dispose();
                tokenIter = tokenBuffer.GetEnumerator();

                // It should not be possible to have a sentence with 0 words; check just in case.
                // Returning end-of-stream isn't ideal either, but it is the behavior of the original code.
                if (!tokenIter.MoveNext())
                {
                    return false;
                }
            }

            // WordTokenFilter must clear attributes, as it is creating new tokens.
            ClearAttributes();

            // There are remaining tokens from the current sentence, return the next one.
            SegToken nextWord = tokenIter.Current;

            termAtt.CopyBuffer(nextWord.CharArray, 0, nextWord.CharArray.Length);
            if (hasIllegalOffsets)
            {
                // Keep the offsets of the whole incoming token rather than the word's own.
                offsetAtt.SetOffset(tokStart, tokEnd);
            }
            else
            {
                offsetAtt.SetOffset(nextWord.StartOffset, nextWord.EndOffset);
            }
            typeAtt.Type = "word";
            return true;
        }
        /// <summary>
        /// Segments the sentence held in <c>m_buffer</c> between the given indexes and
        /// stores an enumerator over the resulting segments in <c>tokens</c>.
        /// </summary>
        /// <param name="sentenceStart">Index in <c>m_buffer</c> of the sentence's first character.</param>
        /// <param name="sentenceEnd">Index in <c>m_buffer</c> just past the sentence's last character.</param>
        protected override void SetNextSentence(int sentenceStart, int sentenceEnd)
        {
            int sentenceLength = sentenceEnd - sentenceStart;
            var text = new string(m_buffer, sentenceStart, sentenceLength);

            // The segmenter is given the sentence start shifted by m_offset as its second argument.
            var segments = wordSegmenter.SegmentSentence(text, m_offset + sentenceStart);

            // NOTE(review): the enumerator previously held in tokens is overwritten without
            // being disposed; confirm whether it needs disposal.
            tokens = segments.GetEnumerator();
        }