// Queues a state for exploration the first time it is encountered; repeat sightings are ignored.
static void EnqueueIfUnseen(DfaMatchingState<TSet> state, HashSet<DfaMatchingState<TSet>> seen, Queue<DfaMatchingState<TSet>> queue)
{
    // HashSet.Add reports false when the state has already been recorded.
    if (!seen.Add(state))
    {
        return;
    }

    queue.Enqueue(state);
}
/// <summary>
/// Takes a transition from a state whose node is a disjunction by transitioning each alternative
/// separately and combining the targets into a single disjunction state.
/// </summary>
/// <param name="matcher">matcher that owns the builder and creates missing transitions</param>
/// <param name="currentStates">source state; handled member-by-member when its node kind is Or</param>
/// <param name="mintermId">index of the minterm, used as the low bits of the transition table offset</param>
/// <param name="minterm">the minterm passed on when a transition has to be created</param>
/// <returns>the state built from the union of all member targets</returns>
public DfaMatchingState<TSetType> TakeTransition(
    SymbolicRegexMatcher<TSetType> matcher, DfaMatchingState<TSetType> currentStates, int mintermId, TSetType minterm)
{
    if (currentStates.Node.Kind != SymbolicRegexKind.Or)
    {
        // Not a disjunction: defer to the Brzozowski transition.
        BrzozowskiTransition fallback = default;
        return fallback.TakeTransition(matcher, currentStates, mintermId, minterm);
    }

    SymbolicRegexBuilder<TSetType> builder = matcher._builder;
    Debug.Assert(builder._delta is not null);

    SymbolicRegexNode<TSetType> merged = builder._nothing;
    uint targetKind = 0;

    // Transition every member of the disjunction on its own and accumulate the results.
    Debug.Assert(currentStates.Node._alts is not null);
    foreach (SymbolicRegexNode<TSetType> member in currentStates.Node._alts)
    {
        DfaMatchingState<TSetType> memberState = builder.MkState(member, currentStates.PrevCharKind);
        int offset = (memberState.Id << builder._mintermsCount) | mintermId;

        // Use the recorded transition when there is one, otherwise have the matcher create it.
        DfaMatchingState<TSetType>? target = Volatile.Read(ref builder._delta[offset]);
        target ??= matcher.CreateNewTransition(memberState, minterm, offset);

        // If target.Node is itself an Or it gets flattened into the union.
        merged = builder.MkOr2(merged, target.Node);

        // The kind of the last target processed is the one used for the combined state.
        targetKind = target.PrevCharKind;
    }

    return builder.MkState(merged, targetKind, true);
}
/// <summary>
/// Computes the non-deadend target states reachable on the given minterm, each paired with its effects.
/// </summary>
/// <param name="minterm">minterm corresponding to some input character or False corresponding to last \n</param>
/// <returns>the transitions as pairs of target state and the effects to be applied when taking it</returns>
internal List<(DfaMatchingState<TSet> State, DerivativeEffect[] Effects)> NfaNextWithEffects(TSet minterm)
{
    // The next character kind becomes the PrevCharKind of every target state.
    uint nextCharKind = GetNextCharKind(ref minterm);

    // Derivatives are computed in the context combining previous and next character kinds.
    uint context = CharKind.Context(PrevCharKind, nextCharKind);
    List<(SymbolicRegexNode<TSet>, DerivativeEffect[])> derivatives = Node.CreateNfaDerivativeWithEffects(minterm, context);

    var transitions = new List<(DfaMatchingState<TSet> State, DerivativeEffect[] Effects)>();
    foreach ((SymbolicRegexNode<TSet> derivative, DerivativeEffect[] effects) in derivatives)
    {
        // The builder hands back the state for this node and kind.
        DfaMatchingState<TSet> target = Node._builder.CreateState(derivative, nextCharKind, capturing: true);
        if (target.IsDeadend)
        {
            // Deadend targets are left out of the result.
            continue;
        }

        transitions.Add((target, effects));
    }

    return transitions;
}
/// <summary>
/// Looks up the transition for the state and minterm in the builder's transition table,
/// asking the matcher to create it when the table has no entry yet.
/// </summary>
/// <param name="matcher">matcher that owns the builder and creates missing transitions</param>
/// <param name="currentState">source state of the transition</param>
/// <param name="mintermId">index of the minterm, used as the low bits of the table offset</param>
/// <param name="minterm">the minterm passed on when a transition has to be created</param>
/// <returns>the target state</returns>
public DfaMatchingState<TSetType> TakeTransition(
    SymbolicRegexMatcher<TSetType> matcher, DfaMatchingState<TSetType> currentState, int mintermId, TSetType minterm)
{
    SymbolicRegexBuilder<TSetType> builder = matcher._builder;
    Debug.Assert(builder._delta is not null);

    // Table slot: state id in the high bits, minterm id in the low bits.
    int offset = (currentState.Id << builder._mintermsCount) | mintermId;

    DfaMatchingState<TSetType>? known = Volatile.Read(ref builder._delta[offset]);
    if (known is not null)
    {
        return known;
    }

    return matcher.CreateNewTransition(currentState, minterm, offset);
}
/// <summary>Computes the state reached from <paramref name="sourceState"/> on the character at position <paramref name="i"/>.</summary>
/// <param name="input">
/// the input; taken as a span so that the span-based search loops can call this directly
/// (a <see cref="string"/> argument still converts implicitly)
/// </param>
/// <param name="i">index of the character to transition on</param>
/// <param name="sourceState">state to transition from</param>
/// <typeparam name="TTransition">transition strategy whose TakeTransition performs the actual step</typeparam>
/// <returns>the target state</returns>
private DfaMatchingState<TSetType> Delta<TTransition>(ReadOnlySpan<char> input, int i, DfaMatchingState<TSetType> sourceState) where TTransition : struct, ITransition
{
    TSetType[]? minterms = _builder._minterms;
    Debug.Assert(minterms is not null);

    int c = input[i];

    // A '\n' in the very last position is given the extra id minterms.Length (representing \Z)
    // when the source state starts with a line anchor; otherwise the character is classified normally.
    int mintermId = c == '\n' && i == input.Length - 1 && sourceState.StartsWithLineAnchor ?
        minterms.Length :
        _partitions.GetMintermID(c);

    // The extra id has no entry in the minterms array; False stands in for \Z.
    TSetType minterm = (uint)mintermId < (uint)minterms.Length ?
        minterms[mintermId] :
        _builder._solver.False;

    return default(TTransition).TakeTransition(this, sourceState, mintermId, minterm);
}
/// <summary>
/// Walks backwards over the input with the reverse pattern to locate where the match begins.
/// A start position is known to exist.
/// </summary>
/// <param name="input">the input span</param>
/// <param name="i">position to start walking back from; it points at the last character of the match</param>
/// <param name="match_start_boundary">lowest position the backward walk may visit</param>
/// <returns>the start position of the match</returns>
private int FindStartPosition(ReadOnlySpan<char> input, int i, int match_start_boundary)
{
    // Going backwards, the "previous" character is the one at i + 1; it selects the reverse start state.
    DfaMatchingState<TSetType> state = _reverseInitialStates[GetCharKind(input, i + 1)];

    if (i == -1)
    {
        Debug.Assert(state.IsNullable(GetCharKind(input, i)), "we reached the beginning of the input, thus the state q must be accepting");
        return 0;
    }

    // When the reverse start state is already nullable here (for example when the original
    // pattern is a concrete word such as "abc"), i + 1 is a candidate start right away.
    int last_start = state.IsNullable(GetCharKind(input, i)) ? i + 1 : -1;

    // Walk back in bounded chunks, choosing the transition strategy afresh for each chunk.
    bool finished = false;
    while (!finished && i >= match_start_boundary)
    {
        int chunkLimit = Math.Max(match_start_boundary, i - AntimirovThresholdLeeway);
        if (_builder._antimirov)
        {
            finished = FindStartPositionDeltas<AntimirovTransition>(input, ref i, chunkLimit, ref state, ref last_start);
        }
        else
        {
            finished = FindStartPositionDeltas<BrzozowskiTransition>(input, ref i, chunkLimit, ref state, ref last_start);
        }
    }

    Debug.Assert(last_start != -1);
    return last_start;
}
/// <summary>Critical region for defining a new transition.</summary>
/// <param name="state">source state of the transition</param>
/// <param name="minterm">minterm the transition is taken on</param>
/// <param name="offset">index of the transition's slot in the builder's transition table</param>
/// <returns>the target state, either the one already recorded in the slot or the newly computed one</returns>
private DfaMatchingState<TSetType> CreateNewTransition(DfaMatchingState<TSetType> state, TSetType minterm, int offset)
{
    Debug.Assert(_builder._delta is not null);

    // NOTE(review): locking on `this` is kept because a dedicated lock object would need a new field
    // outside this method.
    lock (this)
    {
        // Check whether delta[offset] has meanwhile been defined, possibly by another thread.
        DfaMatchingState<TSetType>? p = _builder._delta[offset];
        if (p is null)
        {
            // This is the only place in the matcher where the Next method is called.
            // Compute the successor BEFORE indexing the table: in `array[i] = expr` C# evaluates the
            // array reference first, so a combined `_delta[offset] = p = state.Next(...)` would store
            // into whichever array _delta referenced before Next ran.
            // NOTE(review): presumably Next can create states and thereby make the builder
            // reallocate _delta - confirm against the builder.
            p = state.Next(minterm);
            _builder._delta[offset] = p;

            // Switch to antimirov mode if the maximum bound has been reached.
            if (p.Id == AntimirovThreshold)
            {
                _builder._antimirov = true;
            }
        }

        return p;
    }
}
/// <summary>Finds the end position of the match using the original pattern. An end position is known to exist.</summary>
/// <param name="input">input span</param>
/// <param name="exclusiveEnd">exclusive end position of the search</param>
/// <param name="i">inclusive start position</param>
/// <returns>the position of the last character of the match, or i - 1 when the match is empty</returns>
private int FindEndPosition(ReadOnlySpan<char> input, int exclusiveEnd, int i)
{
    int i_end = exclusiveEnd;

    // The start state depends on the kind of the character preceding the start position.
    DfaMatchingState<TSetType> state = _initialStates[GetCharKind(input, i - 1)];

    if (state.IsNullable(GetCharKind(input, i)))
    {
        // The initial state accepts here, so an empty match exists.
        i_end = i - 1;

        // A lazy state is satisfied with the first accepting position.
        if (state.IsLazy)
        {
            return i_end;
        }
    }

    // Advance in bounded chunks, choosing the transition strategy afresh for each chunk.
    bool finished = false;
    while (!finished && i < exclusiveEnd)
    {
        int chunkEnd = Math.Min(exclusiveEnd, i + AntimirovThresholdLeeway);
        if (_builder._antimirov)
        {
            finished = FindEndPositionDeltas<AntimirovTransition>(input, ref i, chunkEnd, ref state, ref i_end);
        }
        else
        {
            finished = FindEndPositionDeltas<BrzozowskiTransition>(input, ref i, chunkEnd, ref state, ref i_end);
        }
    }

    Debug.Assert(i_end != exclusiveEnd);
    return i_end;
}
/// <summary>
/// Explores, breadth first, the states reachable from the selected initial states. A DFA phase runs
/// first when requested; an NFA phase then processes whatever is still queued when requested.
/// </summary>
/// <param name="includeDotStarred">seed the exploration with the dot-starred initial states</param>
/// <param name="includeReverse">seed the exploration with the reverse initial states</param>
/// <param name="includeOriginal">seed the exploration with the initial states of the original pattern</param>
/// <param name="exploreDfa">expand DFA transitions until the queue is empty or a transition cannot be created</param>
/// <param name="exploreNfa">expand NFA transitions for the states still queued after the DFA phase</param>
public override void Explore(bool includeDotStarred, bool includeReverse, bool includeOriginal, bool exploreDfa, bool exploreNfa)
{
    Debug.Assert(_builder._minterms is not null);

    // Track seen states to avoid exploring twice
    HashSet <DfaMatchingState <TSet> > seen = new();

    // Use a queue for unexplored states
    // This results in a breadth-first exploration
    Queue <DfaMatchingState <TSet> > toExplore = new();

    // Explore all initial states as requested
    if (includeDotStarred)
    {
        EnqueueAll(_dotstarredInitialStates, seen, toExplore);
    }
    if (includeReverse)
    {
        EnqueueAll(_reverseInitialStates, seen, toExplore);
    }
    if (includeOriginal)
    {
        EnqueueAll(_initialStates, seen, toExplore);
    }

    if (exploreDfa)
    {
        while (toExplore.Count > 0)
        {
            // Don't dequeue yet, because a transition might fail; a state whose transition fails
            // stays at the front of the queue and is picked up by the NFA phase below.
            DfaMatchingState <TSet> state = toExplore.Peek();

            // Include the special minterm for the last end-of-line if the state is sensitive to it
            // (its id is _minterms.Length, one past the last regular minterm id)
            int maxMinterm = state.StartsWithLineAnchor ? _builder._minterms.Length : _builder._minterms.Length - 1;

            // Explore successor states for each minterm
            for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId)
            {
                // Transition slot: state id in the high bits, minterm id in the low bits
                int offset = (state.Id << _builder._mintermsLog) | mintermId;
                if (!_builder.TryCreateNewTransition(state, mintermId, offset, true, out DfaMatchingState <TSet>?nextState))
                {
                    // The builder declined to create the transition: abandon the DFA phase
                    goto DfaLimitReached;
                }
                EnqueueIfUnseen(nextState, seen, toExplore);
            }

            // Safe to dequeue now that the state has been completely handled
            toExplore.Dequeue();
        }
    }

    DfaLimitReached:
    if (exploreNfa && toExplore.Count > 0)
    {
        // DFA states are broken up into NFA states when they are alternations
        DfaMatchingState <TSet>[] toBreakUp = toExplore.ToArray();
        toExplore.Clear();
        foreach (DfaMatchingState <TSet> dfaState in toBreakUp)
        {
            // Remove state from seen so that it can be added back in if necessary
            seen.Remove(dfaState);

            // Enqueue all elements of a top level alternation or the state itself
            foreach (var element in dfaState.Node.EnumerateAlternationBranches())
            {
                int nfaState = 
                _builder.CreateNfaState(element, dfaState.PrevCharKind);
                EnqueueIfUnseen(_builder.GetCoreState(nfaState), seen, toExplore);
            }
        }

        while (toExplore.Count > 0)
        {
            // NFA transitions can't fail, so it's safe to dequeue here
            DfaMatchingState <TSet> state = toExplore.Dequeue();

            // Include the special minterm for the last end-of-line if the state is sensitive to it
            int maxMinterm = state.StartsWithLineAnchor ? _builder._minterms.Length : _builder._minterms.Length - 1;

            // Explore successor states for each minterm
            for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId)
            {
                // The NFA transition slot is keyed by the NFA state id looked up from the core state id
                int nfaOffset = (_builder._nfaStateArrayInverse[state.Id] << _builder._mintermsLog) | mintermId;
                int[] nextNfaStates = _builder.CreateNewNfaTransition(_builder._nfaStateArrayInverse[state.Id], mintermId, nfaOffset);
                foreach (int nextNfaState in nextNfaStates)
                {
                    EnqueueIfUnseen(_builder.GetCoreState(nextNfaState), seen, toExplore);
                }
            }
        }
    }
/// <summary>
/// Advances the state over the input from position <paramref name="i"/> until a nullable state or a
/// nothing-state is reached, the chunk limit <paramref name="j"/> is hit, or an initial state is reentered.
/// </summary>
/// <param name="input">the input span</param>
/// <param name="j">exclusive upper bound for this chunk</param>
/// <param name="i">current position; advanced as characters are consumed</param>
/// <param name="q">current state; updated after every transition</param>
/// <param name="watchdog">set to the WatchDog of the state when a nullable state is reached</param>
/// <param name="result">
/// the position of the nullable state or NoMatchExists when the method returns true; 0 otherwise
/// </param>
/// <returns>true when the search is decided; false when the caller should continue with another chunk</returns>
private bool FindFinalStatePositionDeltas<TTransition>(ReadOnlySpan<char> input, int j, ref int i, ref DfaMatchingState<TSetType> q, ref int watchdog, out int result) where TTransition : struct, ITransition
{
    while (true)
    {
        // Transition on input[i].
        q = Delta<TTransition>(input, i, q);

        if (q.IsNullable(GetCharKind(input, i + 1)))
        {
            // A final state was reached at position i.
            watchdog = q.WatchDog;
            result = i;
            return true;
        }

        if (q.IsNothing)
        {
            // Deadend: continuing the search is pointless.
            result = NoMatchExists;
            return true;
        }

        // Move on to the next character.
        i++;

        if (i >= j || q.IsInitialState)
        {
            // Chunk exhausted or initial state reentered; hand control back to the caller.
            result = 0;
            return false;
        }
    }
}
/// <summary>
/// Searches forward with the dot-starred pattern for the first position at which a match ends.
/// Returns NoMatchExists if no match exists. Returns -1 when i=0 and the initial state is nullable.
/// </summary>
/// <param name="input">given input span</param>
/// <param name="k">input length or bounded input length</param>
/// <param name="i">start position</param>
/// <param name="timeoutOccursAt">The time at which timeout occurs, if timeouts are being checked.</param>
/// <param name="initialStateIndex">last position the initial state of <see cref="_dotStarredPattern"/> was visited</param>
/// <param name="watchdog">length of match when positive</param>
private int FindFinalStatePosition(ReadOnlySpan<char> input, int k, int i, int timeoutOccursAt, out int initialStateIndex, out int watchdog)
{
    // The start state of the dot-star pattern generally depends on the previous character kind.
    DfaMatchingState<TSetType> state = _dotstarredInitialStates[GetCharKind(input, i - 1)];
    initialStateIndex = i;
    watchdog = -1;

    if (state.IsNothing)
    {
        // Deadend from the very beginning, for example when the original regex started with a
        // start anchor and the previous character kind is not Start.
        return NoMatchExists;
    }

    if (state.IsNullable(GetCharKind(input, i)))
    {
        // At least an empty match exists; its last position is i - 1 (which is -1 when i == 0).
        return i - 1;
    }

    // Search for a match end position within input[i..k-1].
    while (i < k)
    {
        if (state.IsInitialState)
        {
            // Remember the most recent position at which the dot-star pattern was in its initial state.
            initialStateIndex = i;

            if (_findOpts is RegexFindOptimizations findOpts)
            {
                // Let the find optimizations pick the next candidate position.
                if (!findOpts.TryFindNextStartingPosition(input, ref i, 0, 0, k))
                {
                    // No candidate position, hence no match.
                    return NoMatchExists;
                }

                initialStateIndex = i;

                // Refresh the start state for the character now preceding i; without anchors
                // this is the same state as before.
                state = _dotstarredInitialStates[GetCharKind(input, i - 1)];
                if (state.IsNothing)
                {
                    return NoMatchExists;
                }
            }
        }

        // Process one bounded chunk, choosing the transition strategy afresh each time.
        int chunkEnd = Math.Min(k, i + AntimirovThresholdLeeway);
        int result;
        bool decided;
        if (_builder._antimirov)
        {
            decided = FindFinalStatePositionDeltas<AntimirovTransition>(input, chunkEnd, ref i, ref state, ref watchdog, out result);
        }
        else
        {
            decided = FindFinalStatePositionDeltas<BrzozowskiTransition>(input, chunkEnd, ref i, ref state, ref watchdog, out result);
        }

        if (decided)
        {
            return result;
        }

        if (_checkTimeout)
        {
            DoCheckTimeout(timeoutOccursAt);
        }
    }

    // The input was exhausted without reaching a final state.
    return NoMatchExists;
}
/// <summary>
/// Walks the state backwards over the input from position <paramref name="i"/> down towards
/// <paramref name="j"/>, recording in <paramref name="last_start"/> each position at which the state is nullable.
/// </summary>
/// <param name="input">the input span</param>
/// <param name="i">current position; decremented as characters are consumed</param>
/// <param name="j">lower limit for this chunk; the walk stops once i is no longer greater than it</param>
/// <param name="q">current state; updated after every transition</param>
/// <param name="last_start">earliest start position found so far</param>
/// <returns>true when a nothing-state was reached and the walk is over; false when the chunk was exhausted</returns>
private bool FindStartPositionDeltas<TTransition>(ReadOnlySpan<char> input, ref int i, int j, ref DfaMatchingState<TSetType> q, ref int last_start) where TTransition : struct, ITransition
{
    while (true)
    {
        q = Delta<TTransition>(input, i, q);

        if (q.IsNothing)
        {
            // Deadend: the earliest start point must already have been seen.
            return true;
        }

        if (q.IsNullable(GetCharKind(input, i - 1)))
        {
            // Earliest start point so far. This must happen at some point, or else the dot-star
            // pattern would not have reached a final state after match_start_boundary.
            last_start = i;
        }

        i--;

        if (i <= j)
        {
            return false;
        }
    }
}
/// <summary>
/// Advances the state over the input from position <paramref name="i"/> up to <paramref name="j"/>,
/// recording in <paramref name="i_end"/> each position at which the state is nullable.
/// </summary>
/// <param name="input">the input span</param>
/// <param name="i">current position; advanced as characters are consumed</param>
/// <param name="j">exclusive upper bound for this chunk</param>
/// <param name="q">current state; updated after every transition</param>
/// <param name="i_end">most recent accepting position</param>
/// <returns>true when the end position is final; false when the chunk was exhausted</returns>
private bool FindEndPositionDeltas<TTransition>(ReadOnlySpan<char> input, ref int i, int j, ref DfaMatchingState<TSetType> q, ref int i_end) where TTransition : struct, ITransition
{
    do
    {
        q = Delta<TTransition>(input, i, q);

        if (!q.IsNullable(GetCharKind(input, i + 1)))
        {
            if (q.IsDeadend)
            {
                // Non-accepting deadend in the original pattern: the match ended at the
                // position last stored in i_end.
                return true;
            }
        }
        else
        {
            // Accepting state: record the position.
            i_end = i;

            // A lazy state stops at the first accepting position.
            if (q.IsLazy)
            {
                return true;
            }
        }

        i++;
    }
    while (i < j);

    return false;
}
/// <summary>
/// Constructs a matcher for the given symbolic regex: records timeout settings, selects the minterm
/// classifier, and creates the initial states of the original, dot-starred and reverse patterns.
/// </summary>
/// <param name="sr">the symbolic regex; its builder becomes the matcher's builder</param>
/// <param name="code">supplies the find optimizations</param>
/// <param name="css">character set solver (not referenced in the constructor body)</param>
/// <param name="minterms">minterms handed to the classifier when the solver is not a bit-vector algebra</param>
/// <param name="matchTimeout">match timeout; timeout checks are enabled unless it equals Regex.InfiniteMatchTimeout</param>
/// <param name="culture">culture (not referenced in the constructor body)</param>
internal SymbolicRegexMatcher(SymbolicRegexNode<TSetType> sr, RegexCode code, CharSetSolver css, BDD[] minterms, TimeSpan matchTimeout, CultureInfo culture)
{
    Debug.Assert(sr._builder._solver is BV64Algebra or BVAlgebra or CharSetSolver, $"Unsupported algebra: {sr._builder._solver}");

    _pattern = sr;
    _builder = sr._builder;
    _checkTimeout = Regex.InfiniteMatchTimeout != matchTimeout;
    _timeout = (int)(matchTimeout.TotalMilliseconds + 0.5); // Round up, so it will be at least 1ms

    // The bit-vector algebras carry their own classifier; any other solver gets a fresh one.
    if (_builder._solver is BV64Algebra bv64)
    {
        _partitions = bv64._classifier;
    }
    else if (_builder._solver is BVAlgebra bv)
    {
        _partitions = bv._classifier;
    }
    else
    {
        _partitions = new MintermClassifier((CharSetSolver)(object)_builder._solver, minterms);
    }

    // If there are any anchors, we're better off letting the DFA quickly do its job of
    // determining whether there's a match, so the find optimizations are only kept otherwise.
    bool searchAvailable = code.FindOptimizations.FindMode != FindNextStartingPositionMode.NoSearch;
    if (searchAvailable && code.FindOptimizations.LeadingAnchor == 0)
    {
        _findOpts = code.FindOptimizations;
    }

    // Without anchors only the default previous character kind 0 is ever used,
    // so a single initial state per pattern suffices.
    int statesCount = _pattern._info.ContainsSomeAnchor ? CharKind.CharKindCount : 1;

    // Creates one state per previous-character kind for the given pattern.
    DfaMatchingState<TSetType>[] BuildInitialStates(SymbolicRegexNode<TSetType> node, bool markAsInitial)
    {
        var states = new DfaMatchingState<TSetType>[statesCount];
        for (uint kind = 0; kind < states.Length; kind++)
        {
            DfaMatchingState<TSetType> created = _builder.MkState(node, kind);
            if (markAsInitial)
            {
                created.IsInitialState = true;
            }

            states[kind] = created;
        }

        return states;
    }

    // Initial states of the original pattern.
    _initialStates = BuildInitialStates(_pattern, markAsInitial: false);

    // The dot-star pattern is any* followed by the original pattern. Its states are flagged so
    // that reentering the initial state can be detected. Behavior from such a state may still
    // depend on the previous input char (e.g. nullability of \b, \B or a start-of-line anchor),
    // so there can be several "versions" of the initial state, one per character kind.
    _dotStarredPattern = _builder.MkConcat(_builder._anyStar, _pattern);
    _dotstarredInitialStates = BuildInitialStates(_dotStarredPattern, markAsInitial: true);

    // The reverse pattern is the original pattern in reverse order.
    _reversePattern = _pattern.Reverse();
    _reverseInitialStates = BuildInitialStates(_reversePattern, markAsInitial: false);

    // Fast lookup for the character kind of ASCII characters. Only needed when the pattern
    // contains anchors, as otherwise there's only ever a single kind in use.
    if (_pattern._info.ContainsSomeAnchor)
    {
        var asciiCharKinds = new uint[128];
        for (int i = 0; i < asciiCharKinds.Length; i++)
        {
            bool isNewline = i == '\n';
            TSetType kindPredicate = isNewline ? _builder._newLinePredicate : _builder._wordLetterPredicateForAnchors;

            uint kindIfMember;
            if (isNewline)
            {
                kindIfMember = CharKind.Newline;
            }
            else
            {
                kindIfMember = CharKind.WordLetter;
            }

            // The character gets the kind only when its minterm overlaps the predicate.
            bool disjoint = _builder._solver.And(GetMinterm(i), kindPredicate).Equals(_builder._solver.False);
            asciiCharKinds[i] = disjoint ? 0 : kindIfMember;
        }

        _asciiCharKinds = asciiCharKinds;
    }
}
#pragma warning disable CA2252 // This API requires opting into preview features
/// <summary>Find the next state given the current state and next character.</summary>
/// <param name="matcher">the matcher on whose behalf the transition is taken</param>
/// <param name="currentState">the state to transition from</param>
/// <param name="mintermId">id of the minterm for the next character</param>
/// <param name="minterm">the minterm for the next character</param>
/// <returns>the state reached by the transition</returns>
static abstract DfaMatchingState <TSetType> TakeTransition(SymbolicRegexMatcher <TSetType> matcher, DfaMatchingState <TSetType> currentState, int mintermId, TSetType minterm);