/// <summary>
/// Advances to the next word token. Words come from segmenting the term text of
/// each token supplied by the wrapped input; when the words of the current
/// sentence run out, the next sentence is pulled from the input and segmented.
/// </summary>
/// <returns>
/// <c>true</c> if a word token was written to the attributes; <c>false</c> if the
/// input has no more sentences, or if a freshly segmented sentence produced no words.
/// </returns>
public override bool IncrementToken()
{
    bool sentenceExhausted = tokenIter is null || !tokenIter.MoveNext();

    if (sentenceExhausted)
    {
        // Nothing left from the current sentence, so ask the input for another one.
        if (!m_input.IncrementToken())
        {
            return false; // no more sentences, end of stream!
        }

        tokStart = offsetAtt.StartOffset;
        tokEnd = offsetAtt.EndOffset;

        // When the span given by the offsets does not equal the term length, the
        // incoming token is assumed to be a synonym; its offsets are then reused
        // unchanged for every word produced from it.
        hasIllegalOffsets = tokEnd != (tokStart + termAtt.Length);

        // Segment the new sentence and start iterating over its words.
        tokenBuffer = wordSegmenter.SegmentSentence(termAtt.ToString(), offsetAtt.StartOffset);
        tokenIter = tokenBuffer.GetEnumerator();

        // A sentence with zero words should not be possible; check just in case.
        // Reporting end-of-stream here is not ideal, but it is the behavior of
        // the original code.
        if (!tokenIter.MoveNext())
        {
            return false;
        }
    }

    // This filter creates new tokens, so the attributes must be cleared first.
    ClearAttributes();

    // Emit the word the enumerator is currently positioned on.
    SegToken word = tokenIter.Current;
    var chars = word.CharArray;
    termAtt.CopyBuffer(chars, 0, chars.Length);

    int start = hasIllegalOffsets ? tokStart : word.StartOffset;
    int end = hasIllegalOffsets ? tokEnd : word.EndOffset;
    offsetAtt.SetOffset(start, end);

    typeAtt.Type = "word";
    return true;
}
/// <summary>
/// Loads the sentence found in <c>m_buffer</c> between the given positions,
/// segments it into words, and stores an enumerator over those words in
/// <c>tokens</c>.
/// </summary>
/// <param name="sentenceStart">Index in <c>m_buffer</c> of the first character of the sentence.</param>
/// <param name="sentenceEnd">Index in <c>m_buffer</c> one past the last character of the sentence.</param>
protected override void SetNextSentence(int sentenceStart, int sentenceEnd)
{
    int sentenceLength = sentenceEnd - sentenceStart;
    string sentence = new string(m_buffer, sentenceStart, sentenceLength);

    // The segmenter is given m_offset + sentenceStart as the sentence's starting offset.
    var words = wordSegmenter.SegmentSentence(sentence, m_offset + sentenceStart);
    tokens = words.GetEnumerator();
}