Example #1
        public override bool IncrementToken()
        {
            if (tokenIter is null || !tokenIter.MoveNext())
            {
                // there are no remaining tokens from the current sentence... are there more sentences?
                if (m_input.IncrementToken())
                {
                    tokStart = offsetAtt.StartOffset;
                    tokEnd   = offsetAtt.EndOffset;
                    // if length by start + end offsets doesn't match the term text then assume
                    // this is a synonym and don't adjust the offsets.
                    hasIllegalOffsets = (tokStart + termAtt.Length) != tokEnd;
                    // a new sentence is available: process it.
                    tokenBuffer = wordSegmenter.SegmentSentence(termAtt.ToString(), offsetAtt.StartOffset);
                    tokenIter   = tokenBuffer.GetEnumerator();

                    /*
                     * it should not be possible to have a sentence with 0 words, check just in case.
                     * returning EOS isn't the best either, but it's the behavior of the original code.
                     */
                    if (!tokenIter.MoveNext())
                    {
                        return false;
                    }
                }
                else
                {
                    return false; // no more sentences, end of stream!
                }
            }

            // WordTokenFilter must clear attributes, as it is creating new tokens.
            ClearAttributes();
            // There are remaining tokens from the current sentence, return the next one.
            SegToken nextWord = tokenIter.Current;

            termAtt.CopyBuffer(nextWord.CharArray, 0, nextWord.CharArray.Length);
            if (hasIllegalOffsets)
            {
                offsetAtt.SetOffset(tokStart, tokEnd);
            }
            else
            {
                offsetAtt.SetOffset(nextWord.StartOffset, nextWord.EndOffset);
            }
            typeAtt.Type = "word";
            return true;
        }
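
For context, here is a minimal sketch of consuming a SmartCn token stream through the standard Lucene.NET 4.8 analysis API; the field name and sample text are illustrative only, and whether the analyzer's chain runs this WordTokenFilter or the newer HMMChineseTokenizer depends on the SmartCn version.

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Cn.Smart;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;

// SmartChineseAnalyzer wires the sentence/word segmentation chain together.
Analyzer analyzer = new SmartChineseAnalyzer(LuceneVersion.LUCENE_48);
using (TokenStream ts = analyzer.GetTokenStream("body", "我购买了道具和服装。"))
{
    var term   = ts.AddAttribute<ICharTermAttribute>();
    var offset = ts.AddAttribute<IOffsetAttribute>();
    ts.Reset();
    while (ts.IncrementToken())   // each call drives an override like the one above
    {
        System.Console.WriteLine($"{term} [{offset.StartOffset}-{offset.EndOffset}]");
    }
    ts.End();
}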
Example #2
 protected override bool IncrementWord()
 {
     if (tokens == null || !tokens.MoveNext())
     {
         // no more segmented words left in the current sentence
         return false;
     }
     else
     {
         SegToken token = tokens.Current;
         // this tokenizer creates new tokens, so clear any stale attribute state first
         ClearAttributes();
         termAtt.CopyBuffer(token.CharArray, 0, token.CharArray.Length);
         // CorrectOffset maps offsets back through any CharFilters in front of the tokenizer
         offsetAtt.SetOffset(CorrectOffset(token.StartOffset), CorrectOffset(token.EndOffset));
         typeAtt.Type = "word";
         return true;
     }
 }
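
This override appears to come from HMMChineseTokenizer, a SegmentingTokenizerBase subclass in Lucene.NET's SmartCn module: the base class splits the input into sentences and then calls IncrementWord once per segmented word. A hedged sketch of driving such a tokenizer directly, assuming the 4.8-style constructor that takes a TextReader:

using System;
using System.IO;
using Lucene.Net.Analysis.Cn.Smart;
using Lucene.Net.Analysis.TokenAttributes;

var tokenizer = new HMMChineseTokenizer(new StringReader("我购买了道具和服装。"));
var term   = tokenizer.AddAttribute<ICharTermAttribute>();
var offset = tokenizer.AddAttribute<IOffsetAttribute>();
tokenizer.Reset();
while (tokenizer.IncrementToken())   // sentence loop + IncrementWord, as in the override above
{
    Console.WriteLine($"{term} [{offset.StartOffset}-{offset.EndOffset}]");
}
tokenizer.End();
tokenizer.Dispose();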
Example #3
        /// <summary>
        /// Process a <see cref="SegToken"/> so that it is ready for indexing.
        /// </summary>
        /// <param name="st">input <see cref="SegToken"/></param>
        /// <param name="sentence">associated sentence text</param>
        /// <param name="sentenceStartOffset">start offset of the sentence within the overall input</param>
        /// <returns><see cref="SegToken"/> with document-level offsets, ready for indexing</returns>
        public virtual SegToken ConvertSegToken(SegToken st, string sentence,
                                                int sentenceStartOffset)
        {
            switch (st.WordType)
            {
            case WordType.STRING:
            case WordType.NUMBER:
            case WordType.FULLWIDTH_NUMBER:
            case WordType.FULLWIDTH_STRING:
                st.CharArray = sentence.Substring(st.StartOffset, st.EndOffset - st.StartOffset)
                               .ToCharArray();
                break;

            default:
                break;
            }

            st              = tokenFilter.Filter(st);
            st.StartOffset += sentenceStartOffset;
            st.EndOffset   += sentenceStartOffset;
            return st;
        }
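
A toy walk-through of the bookkeeping above (all values hypothetical): the STRING/NUMBER branch re-extracts the token text from the sentence, and the final two assignments rebase the token's offsets from sentence-relative to input-relative by adding sentenceStartOffset.

string sentence = "price is 100 yuan";
int sentenceStartOffset = 42;                 // where this sentence begins in the full input

// suppose the segmenter produced a NUMBER token covering "100" at sentence offsets [9, 12)
int stStart = 9, stEnd = 12;
char[] charArray = sentence.Substring(stStart, stEnd - stStart).ToCharArray();   // "100"

// after filtering, the offsets are shifted so they point into the full input
int docStart = stStart + sentenceStartOffset; // 51
int docEnd   = stEnd   + sentenceStartOffset; // 54
System.Console.WriteLine($"{new string(charArray)} [{docStart}-{docEnd}]");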
Example #4
        /// <summary>
        /// Filter an input <see cref="SegToken"/>
        /// <para>
        /// Full-width latin will be converted to half-width, then all latin will be lowercased.
        /// All punctuation is converted into <see cref="Utility.COMMON_DELIMITER"/>
        /// </para>
        /// </summary>
        /// <param name="token">Input <see cref="SegToken"/>.</param>
        /// <returns>Normalized <see cref="SegToken"/>.</returns>
        public virtual SegToken Filter(SegToken token)
        {
            switch (token.WordType)
            {
            case WordType.FULLWIDTH_NUMBER:
            case WordType.FULLWIDTH_STRING:     /* first convert full-width -> half-width */
                for (int i = 0; i < token.CharArray.Length; i++)
                {
                    if (token.CharArray[i] >= 0xFF10)
                    {
                        token.CharArray[i] = (char)(token.CharArray[i] - 0xFEE0);
                    }

                    if (token.CharArray[i] >= 0x0041 && token.CharArray[i] <= 0x005A)     /* lowercase latin */
                    {
                        token.CharArray[i] = (char)(token.CharArray[i] + 0x0020);
                    }
                }
                break;

            case WordType.STRING:
                for (int i = 0; i < token.CharArray.Length; i++)
                {
                    if (token.CharArray[i] >= 0x0041 && token.CharArray[i] <= 0x005A)     /* lowercase latin */
                    {
                        token.CharArray[i] = (char)(token.CharArray[i] + 0x0020);
                    }
                }
                break;

            case WordType.DELIMITER:     /* convert all punctuation to Utility.COMMON_DELIMITER */
                token.CharArray = Utility.COMMON_DELIMITER;
                break;

            default:
                break;
            }
            return token;
        }
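
For reference, the magic numbers in the FULLWIDTH branches follow the Unicode Halfwidth and Fullwidth Forms block: full-width forms U+FF01..U+FF5E map to their ASCII counterparts by subtracting 0xFEE0, and uppercase ASCII letters map to lowercase by adding 0x20. A standalone demonstration:

char fullWidthZero = '\uFF10';                        // '０' FULLWIDTH DIGIT ZERO
char fullWidthA    = '\uFF21';                        // 'Ａ' FULLWIDTH LATIN CAPITAL LETTER A

char halfWidthZero = (char)(fullWidthZero - 0xFEE0);  // '0' (U+0030)
char halfWidthA    = (char)(fullWidthA    - 0xFEE0);  // 'A' (U+0041)
char lowerA        = (char)(halfWidthA    + 0x0020);  // 'a' (U+0061)

System.Console.WriteLine($"{halfWidthZero} {halfWidthA} {lowerA}");   // prints: 0 A a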