Beispiel #1
0
 /// <summary>
 /// Advances the wrapped input by one token and replaces the term text with
 /// the result of <c>CyrillicLatinConverter.cir2lat</c> applied to it.
 /// </summary>
 /// <returns>
 /// <c>true</c> if the wrapped input produced a token (whose term has been rewritten);
 /// <c>false</c> if the wrapped input had no further token.
 /// </returns>
 public sealed override bool IncrementToken()
 {
     // Nothing to convert when the upstream stream is finished.
     if (!m_input.IncrementToken())
     {
         return false;
     }

     // Snapshot the term before clearing it, then write back the converted text.
     string originalTerm = termAttribute.ToString();
     termAttribute.SetEmpty();
     string convertedTerm = CyrillicLatinConverter.cir2lat(originalTerm);
     termAttribute.Append(convertedTerm);
     return true;
 }
        /// <summary>
        /// Emits the next edge n-gram of the input text. The first call reads the
        /// input, trims it, and starts at <c>minGram</c>; each successful call
        /// produces a gram one character longer than the previous one.
        /// </summary>
        /// <returns>
        /// <c>true</c> when a gram was written to the term and offset attributes;
        /// <c>false</c> when the trimmed input is empty, or the gram size exceeds
        /// either <c>maxGram</c> or the trimmed input length.
        /// </returns>
        public override bool IncrementToken()
        {
            ClearAttributes();

            if (started)
            {
                // Every gram after the first gets a position increment of 0.
                posIncrAtt.PositionIncrement = 0;
            }
            else
            {
                started = true;
                gramSize = minGram;

                // Front grams need at most maxGram chars; otherwise keep up to 1024.
                int readLimit;
                if (side == Side.FRONT)
                {
                    readLimit = maxGram;
                }
                else
                {
                    readLimit = 1024;
                }

                char[] buffer = new char[Math.Min(1024, readLimit)];
                charsRead = 0;

                // TODO: refactor to a shared readFully somewhere:
                bool reachedEnd = false;
                while (!reachedEnd && charsRead < readLimit)
                {
                    int count = input.Read(buffer, charsRead, buffer.Length - charsRead);
                    if (count <= 0)
                    {
                        // A non-positive count means there is nothing more to read.
                        reachedEnd = true;
                    }
                    else
                    {
                        charsRead += count;

                        // Buffer is full but the limit allows more: enlarge it.
                        if (charsRead == buffer.Length && charsRead < readLimit)
                        {
                            buffer = ArrayUtil.Grow(buffer);
                        }
                    }
                }

                inStr = new string(buffer, 0, charsRead).Trim();

                if (!reachedEnd)
                {
                    // The limit was hit before the input ran out. Drain the rest,
                    // counting it in charsRead, so that on end() we report the
                    // correct offset.
                    var discard = new char[1024];
                    int skipped;
                    while ((skipped = input.Read(discard, 0, discard.Length)) > 0)
                    {
                        charsRead += skipped;
                    }
                }

                inLen = inStr.Length;
                if (inLen == 0)
                {
                    return false;
                }

                posIncrAtt.PositionIncrement = 1;
            }

            // Stop when the gram would be longer than the input or than the
            // configured maximum gram size.
            if (gramSize > inLen || gramSize > maxGram)
            {
                return false;
            }

            // Take gramSize chars from the front, or from the back otherwise.
            int gramStart = 0;
            if (side != Side.FRONT)
            {
                gramStart = inLen - gramSize;
            }
            int gramEnd = gramStart + gramSize;

            termAtt.SetEmpty().Append(inStr, gramStart, gramEnd);
            offsetAtt.SetOffset(CorrectOffset(gramStart), CorrectOffset(gramEnd));
            gramSize++;
            return true;
        }
Beispiel #3
0
        /// <summary>
        /// Emits the next n-gram of the input text. The first call reads up to
        /// 1024 characters of input and trims them; grams are then produced for
        /// each start position at the current gram size, after which the gram
        /// size is increased by one and the position restarts at 0.
        /// </summary>
        /// <returns>
        /// <c>true</c> when a gram was written to the term and offset attributes;
        /// <c>false</c> when the trimmed input is empty, or the gram size exceeds
        /// either <c>maxGram</c> or the trimmed input length.
        /// </returns>
        public override bool IncrementToken()
        {
            ClearAttributes();
            if (!started)
            {
                started  = true;
                gramSize = minGram;
                char[] chars = new char[1024];
                charsRead = 0;
                // TODO: refactor to a shared readFully somewhere:
                while (charsRead < chars.Length)
                {
                    int inc = input.Read(chars, charsRead, chars.Length - charsRead);
                    // .NET TextReader.Read reports end of input as 0 (not -1 as in
                    // Java); testing only for -1 would spin forever on input
                    // shorter than the buffer. Assumes input is a TextReader.
                    if (inc <= 0)
                    {
                        break;
                    }
                    charsRead += inc;
                }
                inStr = (new string(chars, 0, charsRead)).Trim(); // strip leading and trailing whitespace

                if (charsRead == chars.Length)
                {
                    // Read extra throwaway chars so that on end() we
                    // report the correct offset:
                    var throwaway = new char[1024];
                    while (true)
                    {
                        int inc = input.Read(throwaway, 0, throwaway.Length);
                        // Same end-of-input convention as above: stop on 0 or less.
                        if (inc <= 0)
                        {
                            break;
                        }
                        charsRead += inc;
                    }
                }

                inLen = inStr.Length;
                if (inLen == 0)
                {
                    return(false);
                }
            }

            if (pos + gramSize > inLen) // if we hit the end of the string
            {
                pos = 0;                // reset to beginning of string
                gramSize++;             // increase n-gram size
                if (gramSize > maxGram) // we are done
                {
                    return(false);
                }
                if (pos + gramSize > inLen) // the larger gram no longer fits in the input
                {
                    return(false);
                }
            }

            int oldPos = pos;

            pos++;
            termAtt.SetEmpty().Append(inStr, oldPos, oldPos + gramSize);
            offsetAtt.SetOffset(CorrectOffset(oldPos), CorrectOffset(oldPos + gramSize));
            return(true);
        }