Пример #1
0
 /// <summary>
 /// Checks a condition against the concatenation of two character slices.
 /// The pattern automaton stored at <c>dictionary.patterns[condition]</c> is stepped over
 /// the first slice and then over the second slice, continuing from the same state.
 /// </summary>
 /// <param name="condition">Index of the pattern to apply; <c>0</c> means "no condition" and always passes.</param>
 /// <param name="c1">Buffer holding the first slice.</param>
 /// <param name="c1off">Start index of the first slice inside <paramref name="c1"/>.</param>
 /// <param name="c1len">Number of characters in the first slice.</param>
 /// <param name="c2">Buffer holding the second slice.</param>
 /// <param name="c2off">Start index of the second slice inside <paramref name="c2"/>.</param>
 /// <param name="c2len">Number of characters in the second slice.</param>
 /// <returns>
 /// <c>true</c> if <paramref name="condition"/> is 0 or the automaton ends in an accepting state;
 /// <c>false</c> as soon as a step returns <c>-1</c>, or if the final state is not accepting.
 /// </returns>
 // Note: this is a naive approach. Ideally the strip would be subtracted from the condition
 // up front so that only the stem needs checking, but that is somewhat more complicated.
 private bool CheckCondition(int condition, char[] c1, int c1off, int c1len, char[] c2, int c2off, int c2len)
 {
     // Condition 0 carries no pattern: nothing to verify.
     if (condition == 0)
     {
         return true;
     }

     CharacterRunAutomaton automaton = dictionary.patterns[condition];
     int current = automaton.InitialState;

     // Feed the first slice.
     int firstEnd = c1off + c1len;
     for (int pos = c1off; pos < firstEnd; pos++)
     {
         current = automaton.Step(current, c1[pos]);
         if (current == -1)
         {
             return false;
         }
     }

     // Continue from the same state with the second slice.
     int secondEnd = c2off + c2len;
     for (int pos = c2off; pos < secondEnd; pos++)
     {
         current = automaton.Step(current, c2[pos]);
         if (current == -1)
         {
             return false;
         }
     }

     return automaton.IsAccept(current);
 }
Пример #2
0
        /// <summary>
        /// Produces the next token. Code points accepted by <c>IsTokenChar</c> are normalized and
        /// appended to <c>TermAtt</c> until a code point is rejected, <c>ReadCodePoint</c> returns a
        /// negative value, or the term length reaches <c>MaxTokenLength</c>. The corrected start/end
        /// offsets are then stored in <c>OffsetAtt</c>.
        /// </summary>
        /// <returns>
        /// <c>true</c> (with <c>StreamState</c> set to <c>State.INCREMENT</c>) when a token was produced;
        /// <c>false</c> (with <c>StreamState</c> set to <c>State.INCREMENT_FALSE</c>) once no more code points are available.
        /// </returns>
        public sealed override bool IncrementToken()
        {
            // State check left disabled:
            //Debug.Assert(!EnableChecks_Renamed || (StreamState == State.RESET || StreamState == State.INCREMENT), "IncrementToken() called while in wrong state: " + StreamState);
            ClearAttributes();

            while (true)
            {
                int tokenStart;
                int codePoint;
                if (BufferedCodePoint < 0)
                {
                    // Nothing pending: remember the offset *before* reading the next code point.
                    tokenStart = Off;
                    codePoint  = ReadCodePoint();
                }
                else
                {
                    // Resume with the code point left over from the previous token.
                    codePoint         = BufferedCodePoint;
                    tokenStart        = BufferedOff;
                    BufferedCodePoint = -1;
                }

                if (codePoint < 0)
                {
                    // No more code points.
                    StreamState = State.INCREMENT_FALSE;
                    return false;
                }

                if (!IsTokenChar(codePoint))
                {
                    // Not part of a token: skip it and try the next code point.
                    continue;
                }

                int tokenEnd;
                do
                {
                    foreach (char unit in Character.ToChars(Normalize(codePoint)))
                    {
                        TermAtt.Append(unit);
                    }
                    tokenEnd = Off;
                    if (TermAtt.Length >= MaxTokenLength)
                    {
                        break;
                    }
                    codePoint = ReadCodePoint();
                } while (codePoint >= 0 && IsTokenChar(codePoint));

                if (TermAtt.Length >= MaxTokenLength)
                {
                    // The loop stopped because the term limit was hit; nothing to carry over.
                    BufferedCodePoint = -1;
                }
                else
                {
                    // Keep the "rejected" code point around: it may start a new token of its own.
                    BufferedCodePoint = codePoint;
                    BufferedOff       = tokenEnd;
                }

                int fixedStart = CorrectOffset(tokenStart);
                int fixedEnd   = CorrectOffset(tokenEnd);
                Assert.True(fixedStart >= 0);
                Assert.True(fixedEnd >= 0);
                Assert.True(fixedStart >= LastOffset);
                LastOffset = fixedStart;
                Assert.True(fixedEnd >= fixedStart);
                OffsetAtt.SetOffset(fixedStart, fixedEnd);

                if (state != -1 && !RunAutomaton.IsAccept(state))
                {
                    // Neither a reject state nor an accept state: keep scanning.
                    continue;
                }

                // Either a reject state was hit (longest match), or end-of-text in an accept state.
                StreamState = State.INCREMENT;
                return true;
            }
        }
Пример #3
0
        /// <summary>
        /// Consumers use this method to advance the stream to the next token.
        /// Implementing classes must implement this method and update the appropriate
        /// attributes with the data of the next token.
        /// <para>
        /// The producer must make no assumptions about the attributes after the method
        /// has returned: the caller may arbitrarily change them. If the producer needs to
        /// preserve the state for subsequent calls, it should keep its own copy of the
        /// current attribute state.
        /// </para>
        /// <para>
        /// This method is called for every token of a document, so an efficient
        /// implementation is crucial for good performance. To avoid repeated attribute
        /// lookups, references to all attributes that this stream uses should be
        /// retrieved during instantiation.
        /// </para>
        /// <para>
        /// To ensure that filters and consumers know which attributes are available,
        /// the attributes must be added during instantiation. Filters and consumers
        /// are not required to check for availability of attributes.
        /// </para>
        /// <para>
        /// This implementation returns <c>false</c> immediately when there is no input reader,
        /// or when no code point is buffered from a previous call and the reader's
        /// <c>Peek()</c> reports that nothing more is available.
        /// </para>
        /// </summary>
        /// <returns> false for end of stream; true otherwise </returns>
        public override bool IncrementToken()
        {
            // Without a reader there is nothing to tokenize.
            if (base.input == null)
            {
                StreamState = ReadState.INCREMENT_FALSE;
                return false;
            }

            // A buffered code point was already obtained from ReadCodePoint() by the previous
            // call but has not been emitted yet. It must still be processed even when the reader
            // has nothing left, otherwise a trailing token would be silently dropped.
            // NOTE(review): TextReader.Peek() may also return -1 for readers that cannot peek;
            // the guard is kept for the unbuffered case to stay compatible with existing callers.
            if (BufferedCodePoint < 0 && base.input.Peek() <= -1)
            {
                StreamState = ReadState.INCREMENT_FALSE;
                return false;
            }

            ClearAttributes();
            for (; ;)
            {
                int startOffset;
                int cp;
                if (BufferedCodePoint >= 0)
                {
                    // Resume with the code point left over from the previous token.
                    cp                = BufferedCodePoint;
                    startOffset       = BufferedOff;
                    BufferedCodePoint = -1;
                }
                else
                {
                    // Capture the offset before reading advances it.
                    startOffset = Off;
                    cp          = ReadCodePoint();
                }
                if (cp < 0)
                {
                    // No more code points.
                    break;
                }
                else if (IsTokenChar(cp))
                {
                    int endOffset;
                    do
                    {
                        char[] chars = Character.ToChars(Normalize(cp));
                        for (int i = 0; i < chars.Length; i++)
                        {
                            TermAtt.Append(chars[i]);
                        }
                        endOffset = Off;
                        if (TermAtt.Length >= MaxTokenLength)
                        {
                            break;
                        }
                        cp = ReadCodePoint();
                    } while (cp >= 0 && IsTokenChar(cp));

                    if (TermAtt.Length < MaxTokenLength)
                    {
                        // buffer up, in case the "rejected" char can start a new word of its own
                        BufferedCodePoint = cp;
                        BufferedOff       = endOffset;
                    }
                    else
                    {
                        // otherwise, it's because we hit the term limit.
                        BufferedCodePoint = -1;
                    }
                    int correctedStartOffset = CorrectOffset(startOffset);
                    int correctedEndOffset   = CorrectOffset(endOffset);
                    LastOffset = correctedStartOffset;
                    OffsetAtt.SetOffset(correctedStartOffset, correctedEndOffset);
                    if (state == -1 || RunAutomaton.IsAccept(state))
                    {
                        // either we hit a reject state (longest match), or end-of-text, but in an accept state
                        StreamState = ReadState.INCREMENT;
                        return true;
                    }
                }
            }
            StreamState = ReadState.INCREMENT_FALSE;
            return false;
        }