/// <summary>
/// Produces the next word token. Words are pulled from the buffered segmentation of the
/// current sentence; when that buffer is exhausted, the next sentence token is read from
/// the upstream <c>m_input</c> stream and segmented.
/// </summary>
/// <returns><c>true</c> if a token was produced; <c>false</c> at end of stream.</returns>
public override bool IncrementToken()
{
    if (tokenIter is null || !tokenIter.MoveNext())
    {
        // No words left in the current sentence — try to pull the next sentence upstream.
        if (!m_input.IncrementToken())
        {
            return false; // no more sentences, end of stream!
        }

        tokStart = offsetAtt.StartOffset;
        tokEnd = offsetAtt.EndOffset;
        // When start + term length doesn't land on the end offset, treat the incoming
        // token as a synonym-style token and keep its original offsets for every word.
        hasIllegalOffsets = (tokStart + termAtt.Length) != tokEnd;

        // A new sentence is available: segment it into words.
        tokenBuffer = wordSegmenter.SegmentSentence(termAtt.ToString(), offsetAtt.StartOffset);
        tokenIter = tokenBuffer.GetEnumerator();

        // A sentence with zero words should be impossible; returning EOS here mirrors
        // the behavior of the original code just in case.
        if (!tokenIter.MoveNext())
        {
            return false;
        }
    }

    // This filter creates new tokens, so it must clear attributes itself.
    ClearAttributes();

    SegToken nextWord = tokenIter.Current;
    termAtt.CopyBuffer(nextWord.CharArray, 0, nextWord.CharArray.Length);
    if (hasIllegalOffsets)
    {
        offsetAtt.SetOffset(tokStart, tokEnd);
    }
    else
    {
        offsetAtt.SetOffset(nextWord.StartOffset, nextWord.EndOffset);
    }
    typeAtt.Type = "word";
    return true;
}
/// <summary>
/// Emits the next segmented word, if any, by populating the term, offset,
/// and type attributes from the current <see cref="SegToken"/>.
/// </summary>
/// <returns><c>true</c> if a word was emitted; <c>false</c> when the word buffer is exhausted.</returns>
protected override bool IncrementWord()
{
    // No word enumerator, or no words remaining: nothing to emit.
    if (tokens is null || !tokens.MoveNext())
    {
        return false;
    }

    SegToken word = tokens.Current;
    ClearAttributes();
    termAtt.CopyBuffer(word.CharArray, 0, word.CharArray.Length);
    // Offsets are corrected so they refer to positions in the original (pre-CharFilter) input.
    offsetAtt.SetOffset(CorrectOffset(word.StartOffset), CorrectOffset(word.EndOffset));
    typeAtt.Type = "word";
    return true;
}
/// <summary>
/// Process a <see cref="SegToken"/> so that it is ready for indexing.
/// </summary>
/// <param name="st">st input <see cref="SegToken"/></param>
/// <param name="sentence">associated Sentence</param>
/// <param name="sentenceStartOffset">offset into sentence</param>
/// <returns>Lucene <see cref="SegToken"/></returns>
public virtual SegToken ConvertSegToken(SegToken st, string sentence, int sentenceStartOffset)
{
    // String and number tokens (half- or full-width) carry no character data yet:
    // materialize their text from the sentence using the token's offsets.
    bool needsText = st.WordType == WordType.STRING
        || st.WordType == WordType.NUMBER
        || st.WordType == WordType.FULLWIDTH_NUMBER
        || st.WordType == WordType.FULLWIDTH_STRING;
    if (needsText)
    {
        st.CharArray = sentence
            .Substring(st.StartOffset, st.EndOffset - st.StartOffset)
            .ToCharArray();
    }

    // Normalize the token text, then shift its offsets from sentence-relative
    // to document-relative positions.
    st = tokenFilter.Filter(st);
    st.StartOffset += sentenceStartOffset;
    st.EndOffset += sentenceStartOffset;
    return st;
}
/// <summary>
/// Filter an input <see cref="SegToken"/>
/// <para>
/// Full-width latin will be converted to half-width, then all latin will be lowercased.
/// All punctuation is converted into <see cref="Utility.COMMON_DELIMITER"/>
/// </para>
/// </summary>
/// <param name="token">Input <see cref="SegToken"/>.</param>
/// <returns>Normalized <see cref="SegToken"/>.</returns>
public virtual SegToken Filter(SegToken token)
{
    if (token.WordType == WordType.FULLWIDTH_NUMBER || token.WordType == WordType.FULLWIDTH_STRING)
    {
        char[] chars = token.CharArray;
        for (int i = 0; i < chars.Length; i++)
        {
            char c = chars[i];
            // Full-width forms (U+FF10 and above) map to ASCII by subtracting 0xFEE0.
            if (c >= 0xFF10)
            {
                c = (char)(c - 0xFEE0);
            }
            // Lowercase ASCII latin (after full-width -> half-width folding).
            if (c >= 'A' && c <= 'Z')
            {
                c = (char)(c + 0x20);
            }
            chars[i] = c;
        }
    }
    else if (token.WordType == WordType.STRING)
    {
        char[] chars = token.CharArray;
        for (int i = 0; i < chars.Length; i++)
        {
            // Lowercase ASCII latin.
            if (chars[i] >= 'A' && chars[i] <= 'Z')
            {
                chars[i] = (char)(chars[i] + 0x20);
            }
        }
    }
    else if (token.WordType == WordType.DELIMITER)
    {
        // All punctuation collapses to the shared delimiter text.
        token.CharArray = Utility.COMMON_DELIMITER;
    }

    return token;
}