Exemplo n.º 1
0
 /// <summary>
 /// Returns this stream to a clean state. Consumers invoke it before they start
 /// pulling tokens with <see cref="TokenStream.IncrementToken()"/>.
 /// <para/>
 /// Implementations that keep state must implement this method so that an instance
 /// can be reused exactly as if it had just been constructed.
 /// </summary>
 /// <remarks>
 /// <b>NOTE:</b>
 /// This default implementation simply forwards the call to the wrapped input
 /// <see cref="TokenStream"/>. Overrides must therefore always call <c>base.Reset()</c>;
 /// otherwise some internal state is left un-reset (e.g., <see cref="Tokenizer"/> will
 /// throw <see cref="InvalidOperationException"/> on further usage).
 /// </remarks>
 public override void Reset() => m_input.Reset();
        /// <summary>
        /// Consumes the token graph (including <see cref="IPositionLengthAttribute"/>)
        /// of the supplied <see cref="TokenStream"/> and builds the matching automaton.
        /// Every term contributes a chain of arcs labeled with its bytes, or with its
        /// Unicode code points when <c>unicodeArcs</c> is <c>true</c>.
        /// </summary>
        /// <param name="in">
        /// The stream to consume. It is reset, iterated until
        /// <see cref="TokenStream.IncrementToken()"/> returns <c>false</c>, and then ended
        /// by this method.
        /// </param>
        /// <returns>
        /// The automaton built from the stream. Its <c>IsDeterministic</c> flag is cleared
        /// as soon as any token with a non-positive position increment is seen.
        /// </returns>
        public virtual Automaton ToAutomaton(TokenStream @in)
        {
            var automaton = new Automaton();
            bool isDeterministic = true;

            var incrementAtt = @in.AddAttribute<IPositionIncrementAttribute>();
            var lengthAtt = @in.AddAttribute<IPositionLengthAttribute>();
            var offsets = @in.AddAttribute<IOffsetAttribute>();
            var bytesAtt = @in.AddAttribute<ITermToBytesRefAttribute>();

            BytesRef sharedTerm = bytesAtt.BytesRef;

            @in.Reset();

            // Sliding window over the positions at or ahead of the current one.
            RollingBuffer<Position> window = new Positions();

            int curPos = -1;
            Position current = null;
            int maxEndOffset = 0;

            while (@in.IncrementToken())
            {
                int increment = incrementAtt.PositionIncrement;
                if (!preservePositionIncrements)
                {
                    // Collapse any gap to a single step.
                    increment = Math.Min(increment, 1);
                }
                Debug.Assert(curPos > -1 || increment > 0);

                if (increment <= 0)
                {
                    // Stacked token. The result is not necessarily non-deterministic, but we
                    // can no longer guarantee determinism cheaply (e.g. we would have to
                    // buffer and sort the synonyms at this position), so stay conservative.
                    isDeterministic = false;
                }
                else
                {
                    // Advance to a new node.
                    curPos += increment;

                    current = window.Get(curPos);
                    Debug.Assert(current.leaving == null);

                    bool hasArrival = current.arriving != null;
                    if (!hasArrival && curPos == 0)
                    {
                        // Very first token: it leaves from the automaton's initial state.
                        current.leaving = automaton.GetInitialState();
                    }
                    else
                    {
                        current.leaving = new State();
                        if (hasArrival)
                        {
                            current.arriving.AddTransition(new Transition(POS_SEP, current.leaving));
                        }

                        // Holes are filled either when no token ever arrived at this
                        // position (e.g. StopFilter removed one), or when a token arrived
                        // here while the increment skipped positions "under" it.
                        if (!hasArrival || increment > 1)
                        {
                            AddHoles(automaton.GetInitialState(), window, curPos);
                        }
                    }

                    window.FreeBefore(curPos);
                }

                int targetPos = curPos + lengthAtt.PositionLength;

                bytesAtt.FillBytesRef();
                BytesRef termUTF8 = ChangeToken(sharedTerm);

                Position target = window.Get(targetPos);
                if (target.arriving == null)
                {
                    target.arriving = new State();
                }

                // Decide how many arcs the term needs, decoding code points if requested.
                int[] codePoints = null;
                int arcCount;
                if (unicodeArcs)
                {
                    string utf16 = termUTF8.Utf8ToString();
                    codePoints = new int[utf16.CodePointCount(0, utf16.Length)];

                    int charIdx = 0;
                    int cpIdx = 0;
                    while (charIdx < utf16.Length)
                    {
                        int codePoint = Character.CodePointAt(utf16, charIdx);
                        codePoints[cpIdx++] = codePoint;
                        charIdx += Character.CharCount(codePoint);
                    }

                    arcCount = codePoints.Length;
                }
                else
                {
                    arcCount = termUTF8.Length;
                }

                // Lay down one arc per label, finishing on the target position's arriving state.
                State tip = current.leaving;
                int lastIdx = arcCount - 1;
                for (int idx = 0; idx < arcCount; idx++)
                {
                    State next;
                    if (idx == lastIdx)
                    {
                        next = target.arriving;
                    }
                    else
                    {
                        next = new State();
                    }

                    int label = unicodeArcs
                        ? codePoints[idx]
                        : termUTF8.Bytes[termUTF8.Offset + idx] & 0xff;

                    tip.AddTransition(new Transition(label, next));
                    tip = next;
                }

                int tokenEndOffset = offsets.EndOffset;
                if (tokenEndOffset > maxEndOffset)
                {
                    maxEndOffset = tokenEndOffset;
                }
            }

            @in.End();

            // When the end offset reported after End() lies beyond every token's end offset,
            // route the remaining positions through one extra accepting state via POS_SEP.
            State endState = null;
            if (offsets.EndOffset > maxEndOffset)
            {
                endState = new State { Accept = true };
            }

            for (int p = curPos + 1; p <= window.MaxPos; p++)
            {
                Position tail = window.Get(p);
                if (tail.arriving == null)
                {
                    continue;
                }

                if (endState == null)
                {
                    tail.arriving.Accept = true;
                }
                else
                {
                    tail.arriving.AddTransition(new Transition(POS_SEP, endState));
                }
            }

            automaton.IsDeterministic = isDeterministic;
            return automaton;
        }