/// <summary>Adds <paramref name="state"/> to <paramref name="queue"/> unless it has already been recorded in <paramref name="seen"/>.</summary>
/// <param name="state">The candidate state.</param>
/// <param name="seen">Set of states that have already been enqueued at some point; updated by this call.</param>
/// <param name="queue">Work queue that receives the state the first time it is encountered.</param>
static void EnqueueIfUnseen(DfaMatchingState<TSet> state, HashSet<DfaMatchingState<TSet>> seen, Queue<DfaMatchingState<TSet>> queue)
{
    // HashSet.Add returns false when the element was already present.
    bool firstEncounter = seen.Add(state);
    if (!firstEncounter)
    {
        return;
    }

    queue.Enqueue(state);
}
Exemple #2
0
            /// <summary>
            /// Takes a transition from a state whose node is a disjunction by transitioning each alternative
            /// separately and combining the targets back into a single disjunction state.
            /// </summary>
            /// <param name="matcher">The matcher that owns the builder and creates missing transitions.</param>
            /// <param name="currentStates">The source state; when its node is not an Or, the Brzozowski transition is used instead.</param>
            /// <param name="mintermId">Identifier of the minterm, used as the low bits of the transition table offset.</param>
            /// <param name="minterm">The minterm passed along when a missing transition has to be created.</param>
            /// <returns>The state built from the union of the alternatives' target nodes.</returns>
            public DfaMatchingState<TSetType> TakeTransition(
                SymbolicRegexMatcher<TSetType> matcher, DfaMatchingState<TSetType> currentStates, int mintermId, TSetType minterm)
            {
                // A state that is not a disjunction is handled by the Brzozowski transition.
                if (currentStates.Node.Kind != SymbolicRegexKind.Or)
                {
                    return default(BrzozowskiTransition).TakeTransition(matcher, currentStates, mintermId, minterm);
                }

                SymbolicRegexBuilder<TSetType> builder = matcher._builder;
                Debug.Assert(builder._delta is not null);

                // Accumulators: the union of all target nodes, and the character kind of the targets.
                SymbolicRegexNode<TSetType> mergedTargets = builder._nothing;
                uint targetKind = 0;

                Debug.Assert(currentStates.Node._alts is not null);
                foreach (SymbolicRegexNode<TSetType> alternative in currentStates.Node._alts)
                {
                    // Each alternative is treated as its own state with the same previous character kind.
                    DfaMatchingState<TSetType> memberState = builder.MkState(alternative, currentStates.PrevCharKind);
                    int offset = (memberState.Id << builder._mintermsCount) | mintermId;

                    // Use the recorded transition when there is one, otherwise have the matcher create it.
                    DfaMatchingState<TSetType>? target = Volatile.Read(ref builder._delta[offset]);
                    target ??= matcher.CreateNewTransition(memberState, minterm, offset);

                    // If target.Node is itself an Or it gets flattened into the union.
                    mergedTargets = builder.MkOr2(mergedTargets, target.Node);

                    // The kind of the last target wins; it is just the kind of the partition.
                    targetKind = target.PrevCharKind;
                }

                return builder.MkState(mergedTargets, targetKind, true);
            }
Exemple #3
0
        /// <summary>
        /// Computes the transitions out of this state for the given minterm, together with their effects.
        /// </summary>
        /// <param name="minterm">minterm corresponding to some input character or False corresponding to last \n</param>
        /// <returns>the transitions as pairs of a target state and the effects to be applied; deadend targets are omitted</returns>
        internal List<(DfaMatchingState<TSet> State, DerivativeEffect[] Effects)> NfaNextWithEffects(TSet minterm)
        {
            // The kind of the next character becomes the PrevCharKind of every target state.
            uint nextCharKind = GetNextCharKind(ref minterm);

            // The derivative is taken in the combined context of the previous and next character kinds.
            uint context = CharKind.Context(PrevCharKind, nextCharKind);
            List<(SymbolicRegexNode<TSet>, DerivativeEffect[])> derivatives = Node.CreateNfaDerivativeWithEffects(minterm, context);

            var transitions = new List<(DfaMatchingState<TSet> State, DerivativeEffect[] Effects)>();
            foreach ((SymbolicRegexNode<TSet> targetNode, DerivativeEffect[]? targetEffects) in derivatives)
            {
                // The builder supplies the state for this node and character kind.
                DfaMatchingState<TSet> target = Node._builder.CreateState(targetNode, nextCharKind, capturing: true);

                // Transitions into a deadend are dropped.
                if (target.IsDeadend)
                {
                    continue;
                }

                transitions.Add((target, targetEffects));
            }

            return transitions;
        }
Exemple #4
0
            /// <summary>Looks up the transition for a single state, creating it through the matcher when it is not recorded yet.</summary>
            /// <param name="matcher">The matcher that owns the builder and creates missing transitions.</param>
            /// <param name="currentState">The source state.</param>
            /// <param name="mintermId">Identifier of the minterm, used as the low bits of the transition table offset.</param>
            /// <param name="minterm">The minterm passed along when a missing transition has to be created.</param>
            /// <returns>The target state of the transition.</returns>
            public DfaMatchingState<TSetType> TakeTransition(
                SymbolicRegexMatcher<TSetType> matcher, DfaMatchingState<TSetType> currentState, int mintermId, TSetType minterm)
            {
                SymbolicRegexBuilder<TSetType> builder = matcher._builder;
                Debug.Assert(builder._delta is not null);

                // The table is indexed by state id in the high bits and minterm id in the low bits.
                int offset = (currentState.Id << builder._mintermsCount) | mintermId;

                DfaMatchingState<TSetType>? recorded = Volatile.Read(ref builder._delta[offset]);
                if (recorded is not null)
                {
                    return recorded;
                }

                // Nothing recorded yet: let the matcher define the transition.
                return matcher.CreateNewTransition(currentState, minterm, offset);
            }
Exemple #5
0
        /// <summary>Takes the transition out of <paramref name="sourceState"/> for the character at position <paramref name="i"/>.</summary>
        /// <typeparam name="TTransition">The transition strategy whose TakeTransition is invoked.</typeparam>
        /// <param name="input">
        /// The input text. Accepted as a span so that span-based callers can pass their input directly;
        /// string arguments still bind through the implicit string-to-span conversion.
        /// </param>
        /// <param name="i">Index of the character to consume; must be a valid index into <paramref name="input"/>.</param>
        /// <param name="sourceState">The state to transition from.</param>
        /// <returns>The state returned by the transition strategy.</returns>
        private DfaMatchingState<TSetType> Delta<TTransition>(ReadOnlySpan<char> input, int i, DfaMatchingState<TSetType> sourceState) where TTransition : struct, ITransition
        {
            TSetType[]? minterms = _builder._minterms;
            Debug.Assert(minterms is not null);

            int c = input[i];

            // A '\n' that is the very last input character gets the special id minterms.Length,
            // but only when the source state starts with a line anchor.
            int mintermId = c == '\n' && i == input.Length - 1 && sourceState.StartsWithLineAnchor ?
                            minterms.Length : // mintermId = minterms.Length represents \Z (last \n)
                            _partitions.GetMintermID(c);

            // The unsigned comparison also rejects negative ids with a single check.
            TSetType minterm = (uint)mintermId < (uint)minterms.Length ?
                               minterms[mintermId] :
                               _builder._solver.False; // minterm=False represents \Z

            return default(TTransition).TakeTransition(this, sourceState, mintermId, minterm);
        }
Exemple #6
0
        /// <summary>Walk back in reverse using the reverse pattern to find the start position of match, start position is known to exist.</summary>
        /// <param name="input">the input span</param>
        /// <param name="i">position to start walking back from, i points at the last character of the match</param>
        /// <param name="match_start_boundary">do not pass this boundary when walking back</param>
        /// <returns>0 when <paramref name="i"/> is -1; otherwise the last start position recorded while walking back</returns>
        private int FindStartPosition(ReadOnlySpan<char> input, int i, int match_start_boundary)
        {
            // The reverse pattern's start state depends on the "previous" character, which when
            // walking backwards is the character at i + 1.
            DfaMatchingState<TSetType> state = _reverseInitialStates[GetCharKind(input, i + 1)];

            if (i == -1)
            {
                Debug.Assert(state.IsNullable(GetCharKind(input, i)), "we reached the beginning of the input, thus the state q must be accepting");
                return 0;
            }

            // If the start state is already accepting here, i + 1 is a start candidate. This happens
            // when the whole prefix of the reverse pattern was, in reverse, a prefix of the original
            // pattern, e.g. when the original pattern is a concrete word such as "abc".
            int earliestStart = state.IsNullable(GetCharKind(input, i)) ? i + 1 : -1;

            // Walk back in bounded chunks, choosing the transition strategy anew for each chunk.
            while (i >= match_start_boundary)
            {
                int chunkLimit = Math.Max(match_start_boundary, i - AntimirovThresholdLeeway);

                bool finished;
                if (_builder._antimirov)
                {
                    finished = FindStartPositionDeltas<AntimirovTransition>(input, ref i, chunkLimit, ref state, ref earliestStart);
                }
                else
                {
                    finished = FindStartPositionDeltas<BrzozowskiTransition>(input, ref i, chunkLimit, ref state, ref earliestStart);
                }

                if (finished)
                {
                    break;
                }
            }

            Debug.Assert(earliestStart != -1);
            return earliestStart;
        }
Exemple #7
0
        /// <summary>Critical region for defining a new transition</summary>
        /// <param name="state">The source state of the transition.</param>
        /// <param name="minterm">The minterm used to compute the target state.</param>
        /// <param name="offset">Index of the transition's slot in the builder's transition table.</param>
        /// <returns>The target state: the one already recorded at <paramref name="offset"/>, or the newly computed one.</returns>
        private DfaMatchingState<TSetType> CreateNewTransition(DfaMatchingState<TSetType> state, TSetType minterm, int offset)
        {
            Debug.Assert(_builder._delta is not null);

            // NOTE(review): locking on `this` lets code outside this class contend on the same monitor;
            // a private gate object would be safer but needs a new field on the matcher.
            lock (this)
            {
                // check if meanwhile delta[offset] has become defined possibly by another thread
                DfaMatchingState<TSetType>? p = _builder._delta[offset];
                if (p is null)
                {
                    // this is the only place in code where the Next method is called in the matcher
                    p = state.Next(minterm);

                    // The slot is read elsewhere without taking this lock (via Volatile.Read),
                    // so publish it with the matching volatile write.
                    Volatile.Write(ref _builder._delta[offset], p);

                    // switch to antimirov mode if the maximum bound has been reached
                    if (p.Id == AntimirovThreshold)
                    {
                        _builder._antimirov = true;
                    }
                }

                return p;
            }
        }
Exemple #8
0
        /// <summary>Find match end position using the original pattern, end position is known to exist.</summary>
        /// <param name="input">input span</param>
        /// <param name="exclusiveEnd">exclusive end position</param>
        /// <param name="i">inclusive start position</param>
        /// <returns>the last position recorded at which the state was accepting; i - 1 for an empty match at the start position</returns>
        private int FindEndPosition(ReadOnlySpan<char> input, int exclusiveEnd, int i)
        {
            // exclusiveEnd serves as the "nothing recorded yet" value; see the assert at the bottom.
            int matchEnd = exclusiveEnd;

            // The start state depends on the kind of the character just before the start position.
            DfaMatchingState<TSetType> current = _initialStates[GetCharKind(input, i - 1)];

            if (current.IsNullable(GetCharKind(input, i)))
            {
                // The initial state is accepting, so an empty match exists.
                matchEnd = i - 1;

                // A lazy state stops at the first accepting position.
                if (current.IsLazy)
                {
                    return matchEnd;
                }
            }

            // Advance in bounded chunks, choosing the transition strategy anew for each chunk.
            while (i < exclusiveEnd)
            {
                int chunkLimit = Math.Min(exclusiveEnd, i + AntimirovThresholdLeeway);

                bool finished;
                if (_builder._antimirov)
                {
                    finished = FindEndPositionDeltas<AntimirovTransition>(input, ref i, chunkLimit, ref current, ref matchEnd);
                }
                else
                {
                    finished = FindEndPositionDeltas<BrzozowskiTransition>(input, ref i, chunkLimit, ref current, ref matchEnd);
                }

                if (finished)
                {
                    break;
                }
            }

            Debug.Assert(matchEnd != exclusiveEnd);
            return matchEnd;
        }
        /// <summary>
        /// Explores the states reachable from the requested initial states in breadth-first order,
        /// creating the transitions it encounters along the way.
        /// </summary>
        /// <param name="includeDotStarred">Whether to start from the dot-starred initial states.</param>
        /// <param name="includeReverse">Whether to start from the reverse pattern's initial states.</param>
        /// <param name="includeOriginal">Whether to start from the original pattern's initial states.</param>
        /// <param name="exploreDfa">Whether to explore using DFA transitions; this phase stops as soon as a transition cannot be created.</param>
        /// <param name="exploreNfa">Whether to continue with NFA transitions from whatever is still queued after the DFA phase (or was never explored when <paramref name="exploreDfa"/> is false).</param>
        public override void Explore(bool includeDotStarred, bool includeReverse, bool includeOriginal, bool exploreDfa, bool exploreNfa)
        {
            Debug.Assert(_builder._minterms is not null);

            // Track seen states to avoid exploring twice
            HashSet <DfaMatchingState <TSet> > seen = new();
            // Use a queue for unexplored states
            // This results in a breadth-first exploration
            Queue <DfaMatchingState <TSet> > toExplore = new();

            // Explore all initial states as requested
            if (includeDotStarred)
            {
                EnqueueAll(_dotstarredInitialStates, seen, toExplore);
            }
            if (includeReverse)
            {
                EnqueueAll(_reverseInitialStates, seen, toExplore);
            }
            if (includeOriginal)
            {
                EnqueueAll(_initialStates, seen, toExplore);
            }

            if (exploreDfa)
            {
                while (toExplore.Count > 0)
                {
                    // Don't dequeue yet, because a transition might fail
                    DfaMatchingState <TSet> state = toExplore.Peek();
                    // Include the special minterm for the last end-of-line if the state is sensitive to it
                    int maxMinterm = state.StartsWithLineAnchor ? _builder._minterms.Length : _builder._minterms.Length - 1;
                    // Explore successor states for each minterm
                    for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId)
                    {
                        // Transition slot: state id in the high bits, minterm id in the low bits
                        int offset = (state.Id << _builder._mintermsLog) | mintermId;
                        if (!_builder.TryCreateNewTransition(state, mintermId, offset, true, out DfaMatchingState <TSet>?nextState))
                        {
                            // The transition could not be created; leave the DFA phase with the
                            // current state (and everything behind it) still in the queue.
                            goto DfaLimitReached;
                        }
                        EnqueueIfUnseen(nextState, seen, toExplore);
                    }
                    // Safe to dequeue now that the state has been completely handled
                    toExplore.Dequeue();
                }
            }

DfaLimitReached:
            if (exploreNfa && toExplore.Count > 0)
            {
                // DFA states are broken up into NFA states when they are alternations
                DfaMatchingState <TSet>[] toBreakUp = toExplore.ToArray();
                toExplore.Clear();
                foreach (DfaMatchingState <TSet> dfaState in toBreakUp)
                {
                    // Remove state from seen so that it can be added back in if necessary
                    seen.Remove(dfaState);
                    // Enqueue all elements of a top level alternation or the state itself
                    foreach (var element in dfaState.Node.EnumerateAlternationBranches())
                    {
                        int nfaState = _builder.CreateNfaState(element, dfaState.PrevCharKind);
                        EnqueueIfUnseen(_builder.GetCoreState(nfaState), seen, toExplore);
                    }
                }

                while (toExplore.Count > 0)
                {
                    // NFA transitions can't fail, so it's safe to dequeue here
                    DfaMatchingState <TSet> state = toExplore.Dequeue();
                    // Include the special minterm for the last end-of-line if the state is sensitive to it
                    int maxMinterm = state.StartsWithLineAnchor ? _builder._minterms.Length : _builder._minterms.Length - 1;
                    // Explore successor states for each minterm
                    for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId)
                    {
                        // NFA transitions are keyed by the NFA state id that _nfaStateArrayInverse maps this state's id to
                        int   nfaOffset     = (_builder._nfaStateArrayInverse[state.Id] << _builder._mintermsLog) | mintermId;
                        int[] nextNfaStates = _builder.CreateNewNfaTransition(_builder._nfaStateArrayInverse[state.Id], mintermId, nfaOffset);
                        foreach (int nextNfaState in nextNfaStates)
                        {
                            EnqueueIfUnseen(_builder.GetCoreState(nextNfaState), seen, toExplore);
                        }
                    }
                }
            }
Exemple #10
0
        /// <summary>
        /// Advances the state over the input starting at <paramref name="i"/> until an accepting or dead state is
        /// reached, position <paramref name="j"/> is reached, or the state is an initial state again.
        /// </summary>
        /// <typeparam name="TTransition">The transition strategy used for each step.</typeparam>
        /// <param name="input">the input span</param>
        /// <param name="j">exclusive upper bound for this run of steps</param>
        /// <param name="i">current position; advanced past each consumed character that did not end the search</param>
        /// <param name="q">current state; updated after every step</param>
        /// <param name="watchdog">set to the state's WatchDog value when an accepting state is reached</param>
        /// <param name="result">the accepting position, NoMatchExists for a deadend, or 0 when the search is not finished</param>
        /// <returns>true when the search is finished and <paramref name="result"/> is meaningful; otherwise false</returns>
        private bool FindFinalStatePositionDeltas<TTransition>(ReadOnlySpan<char> input, int j, ref int i, ref DfaMatchingState<TSetType> q, ref int watchdog, out int result) where TTransition : struct, ITransition
        {
            while (true)
            {
                // Consume input[i].
                q = Delta<TTransition>(input, i, q);

                // Accepting in the context of the following character: report this position.
                if (q.IsNullable(GetCharKind(input, i + 1)))
                {
                    watchdog = q.WatchDog;
                    result = i;
                    return true;
                }

                // A deadend state makes any further search meaningless.
                if (q.IsNothing)
                {
                    result = NoMatchExists;
                    return true;
                }

                // Move on to the next character; hand control back to the caller at the chunk
                // boundary or when the state is an initial state again.
                i++;
                if (i >= j || q.IsInitialState)
                {
                    result = 0;
                    return false;
                }
            }
        }
Exemple #11
0
        /// <summary>Returns NoMatchExists if no match exists. Returns -1 when i=0 and the initial state is nullable.</summary>
        /// <param name="input">given input span</param>
        /// <param name="k">input length or bounded input length</param>
        /// <param name="i">start position</param>
        /// <param name="timeoutOccursAt">The time at which timeout occurs, if timeouts are being checked.</param>
        /// <param name="initialStateIndex">last position the initial state of <see cref="_dotStarredPattern"/> was visited</param>
        /// <param name="watchdog">length of match when positive</param>
        private int FindFinalStatePosition(ReadOnlySpan<char> input, int k, int i, int timeoutOccursAt, out int initialStateIndex, out int watchdog)
        {
            // The dot-star start state in general depends on the kind of the previous input character.
            DfaMatchingState<TSetType> state = _dotstarredInitialStates[GetCharKind(input, i - 1)];

            initialStateIndex = i;
            watchdog = -1;

            // A start state that is "nothing" is a deadend from the beginning, e.g. when the regex
            // begins with a start anchor and the previous character kind is not Start.
            if (state.IsNothing)
            {
                return NoMatchExists;
            }

            // A nullable start state means at least an empty match exists; it ends at i - 1,
            // which is -1 when i == 0.
            if (state.IsNullable(GetCharKind(input, i)))
            {
                return i - 1;
            }

            // Search for a match end position within input[i..k-1].
            while (i < k)
            {
                if (state.IsInitialState)
                {
                    // Remember the most recent position at which the dot-star pattern was in an initial state.
                    initialStateIndex = i;

                    if (_findOpts is RegexFindOptimizations findOpts)
                    {
                        // Skip ahead to the next position the find optimizations accept as a starting position.
                        if (!findOpts.TryFindNextStartingPosition(input, ref i, 0, 0, k))
                        {
                            return NoMatchExists;
                        }

                        initialStateIndex = i;

                        // Re-pick the start state for the new previous character; without anchors
                        // this is the same state as before.
                        state = _dotstarredInitialStates[GetCharKind(input, i - 1)];
                        if (state.IsNothing)
                        {
                            return NoMatchExists;
                        }
                    }
                }

                // Run a bounded chunk of transitions with the strategy currently in effect.
                int chunkEnd = Math.Min(k, i + AntimirovThresholdLeeway);
                int position;
                bool finished;
                if (_builder._antimirov)
                {
                    finished = FindFinalStatePositionDeltas<AntimirovTransition>(input, chunkEnd, ref i, ref state, ref watchdog, out position);
                }
                else
                {
                    finished = FindFinalStatePositionDeltas<BrzozowskiTransition>(input, chunkEnd, ref i, ref state, ref watchdog, out position);
                }

                if (finished)
                {
                    return position;
                }

                if (_checkTimeout)
                {
                    DoCheckTimeout(timeoutOccursAt);
                }
            }

            // The input was exhausted without reaching an accepting state.
            return NoMatchExists;
        }
Exemple #12
0
        /// <summary>
        /// Walks the state backwards over the input from <paramref name="i"/> down towards <paramref name="j"/>,
        /// recording in <paramref name="last_start"/> each position at which the state is accepting.
        /// </summary>
        /// <typeparam name="TTransition">The transition strategy used for each step.</typeparam>
        /// <param name="input">the input span</param>
        /// <param name="i">current position; decremented after each step that does not hit a deadend</param>
        /// <param name="j">position at which this run of steps stops</param>
        /// <param name="q">current state; updated after every step</param>
        /// <param name="last_start">updated to the current position whenever the state is accepting there</param>
        /// <returns>true when a deadend state was reached; false when the run stopped at <paramref name="j"/></returns>
        private bool FindStartPositionDeltas<TTransition>(ReadOnlySpan<char> input, ref int i, int j, ref DfaMatchingState<TSetType> q, ref int last_start) where TTransition : struct, ITransition
        {
            while (true)
            {
                q = Delta<TTransition>(input, i, q);

                // A deadend means the earliest match start point must have occurred already.
                if (q.IsNothing)
                {
                    return true;
                }

                // Accepting here makes i the earliest start point so far. This must happen at some point,
                // or else the dot-star pattern would not have reached a final state after match_start_boundary.
                if (q.IsNullable(GetCharKind(input, i - 1)))
                {
                    last_start = i;
                }

                i -= 1;
                if (i <= j)
                {
                    return false;
                }
            }
        }
Exemple #13
0
        /// <summary>
        /// Advances the state over the input from <paramref name="i"/> up towards <paramref name="j"/>,
        /// recording in <paramref name="i_end"/> each position at which the state is accepting.
        /// </summary>
        /// <typeparam name="TTransition">The transition strategy used for each step.</typeparam>
        /// <param name="input">the input span</param>
        /// <param name="i">current position; advanced after each step that does not end the search</param>
        /// <param name="j">exclusive upper bound for this run of steps</param>
        /// <param name="q">current state; updated after every step</param>
        /// <param name="i_end">updated to the current position whenever the state is accepting there</param>
        /// <returns>true when the search is finished (lazy accepting state or deadend); false when the run stopped at <paramref name="j"/></returns>
        private bool FindEndPositionDeltas<TTransition>(ReadOnlySpan<char> input, ref int i, int j, ref DfaMatchingState<TSetType> q, ref int i_end) where TTransition : struct, ITransition
        {
            while (true)
            {
                q = Delta<TTransition>(input, i, q);

                if (q.IsNullable(GetCharKind(input, i + 1)))
                {
                    // An accepting state has been reached; remember where.
                    i_end = i;

                    // A lazy state stops at the first accepting position.
                    if (q.IsLazy)
                    {
                        return true;
                    }
                }
                else if (q.IsDeadend)
                {
                    // A non-accepting sink state has been reached in the original pattern,
                    // so the match ended where i_end was last updated.
                    return true;
                }

                i++;
                if (i >= j)
                {
                    return false;
                }
            }
        }
Exemple #14
0
        /// <summary>Constructs matcher for given symbolic regex.</summary>
        /// <param name="sr">The root node of the pattern; its builder becomes the matcher's builder.</param>
        /// <param name="code">Supplies the find optimizations that may be used to skip ahead in the input.</param>
        /// <param name="css">Not read by this constructor.</param>
        /// <param name="minterms">The minterms used to build a classifier when the solver does not provide one.</param>
        /// <param name="matchTimeout">The match timeout; timeouts are checked unless this equals <see cref="Regex.InfiniteMatchTimeout"/>.</param>
        /// <param name="culture">Not read by this constructor.</param>
        internal SymbolicRegexMatcher(SymbolicRegexNode<TSetType> sr, RegexCode code, CharSetSolver css, BDD[] minterms, TimeSpan matchTimeout, CultureInfo culture)
        {
            Debug.Assert(sr._builder._solver is BV64Algebra or BVAlgebra or CharSetSolver, $"Unsupported algebra: {sr._builder._solver}");

            _pattern = sr;
            _builder = sr._builder;
            _checkTimeout = Regex.InfiniteMatchTimeout != matchTimeout;

            // Adding 0.5 before truncating rounds the timeout to the nearest millisecond.
            _timeout = (int)(matchTimeout.TotalMilliseconds + 0.5);

            // The bit-vector algebras carry their own classifier; otherwise build one from the minterms.
            _partitions = _builder._solver switch
            {
                BV64Algebra algebra64 => algebra64._classifier,
                BVAlgebra algebra => algebra._classifier,
                _ => new MintermClassifier((CharSetSolver)(object)_builder._solver, minterms),
            };

            // Only keep the find optimizations when they can actually search and there is no leading anchor.
            // If there are any anchors, we're better off letting the DFA quickly do its job of determining
            // whether there's a match.
            if (code.FindOptimizations.FindMode != FindNextStartingPositionMode.NoSearch &&
                code.FindOptimizations.LeadingAnchor == 0)
            {
                _findOpts = code.FindOptimizations;
            }

            // Without anchors only the default previous character kind 0 is ever used, so a single
            // initial state per pattern suffices.
            int statesCount = _pattern._info.ContainsSomeAnchor ? CharKind.CharKindCount : 1;

            // Creates one state per previous character kind for the given root node.
            DfaMatchingState<TSetType>[] CreateStatesPerCharKind(SymbolicRegexNode<TSetType> root)
            {
                var states = new DfaMatchingState<TSetType>[statesCount];
                for (uint kind = 0; kind < states.Length; kind++)
                {
                    states[kind] = _builder.MkState(root, kind);
                }

                return states;
            }

            // Initial states for the original pattern.
            _initialStates = CreateStatesPerCharKind(_pattern);

            // The dot-star pattern is any* followed by the original pattern.
            _dotStarredPattern = _builder.MkConcat(_builder._anyStar, _pattern);
            var dotStarStates = new DfaMatchingState<TSetType>[statesCount];
            for (uint kind = 0; kind < dotStarStates.Length; kind++)
            {
                // These states are flagged so that re-entering an initial state can be detected. Behavior
                // from such a state may still depend on the previous input character (e.g. nullability of
                // \b, \B or a start-of-line anchor), which is why there can be several "versions" of the
                // initial state, one per character kind.
                DfaMatchingState<TSetType> dotStarState = _builder.MkState(_dotStarredPattern, kind);
                dotStarState.IsInitialState = true;
                dotStarStates[kind] = dotStarState;
            }
            _dotstarredInitialStates = dotStarStates;

            // The reverse pattern is the original pattern in reverse order.
            _reversePattern = _pattern.Reverse();
            _reverseInitialStates = CreateStatesPerCharKind(_reversePattern);

            // A lookup table for the character kind of ASCII characters is only needed when the pattern
            // contains anchors; otherwise only a single kind is ever used.
            if (_pattern._info.ContainsSomeAnchor)
            {
                var kindTable = new uint[128];
                for (int c = 0; c < kindTable.Length; c++)
                {
                    TSetType kindPredicate;
                    uint kindIfMember;

                    if (c != '\n')
                    {
                        kindPredicate = _builder._wordLetterPredicateForAnchors;
                        kindIfMember = CharKind.WordLetter;
                    }
                    else
                    {
                        kindPredicate = _builder._newLinePredicate;
                        kindIfMember = CharKind.Newline;
                    }

                    // The character gets the kind only if its minterm overlaps the kind's predicate.
                    bool disjoint = _builder._solver.And(GetMinterm(c), kindPredicate).Equals(_builder._solver.False);
                    kindTable[c] = disjoint ? 0 : kindIfMember;
                }
                _asciiCharKinds = kindTable;
            }
        }
Exemple #15
0
#pragma warning disable CA2252 // This API requires opting into preview features
            /// <summary>Find the next state given the current state and next character.</summary>
            /// <param name="matcher">The matcher on whose behalf the transition is taken.</param>
            /// <param name="currentState">The state to transition from.</param>
            /// <param name="mintermId">Identifier of the minterm for the next character (presumably its index in the builder's minterm table; confirm against callers).</param>
            /// <param name="minterm">The minterm for the next character.</param>
            /// <returns>The state reached from <paramref name="currentState"/>.</returns>
            static abstract DfaMatchingState <TSetType> TakeTransition(SymbolicRegexMatcher <TSetType> matcher, DfaMatchingState <TSetType> currentState, int mintermId, TSetType minterm);