/// <summary>
/// Takes a transition from a state whose node is a disjunction by transitioning every alternative
/// separately and combining the target nodes into a single state.
/// </summary>
/// <param name="matcher">Matcher that supplies the builder and creates transitions missing from the table.</param>
/// <param name="currentStates">State to transition from; the alternatives of its node are handled one by one.</param>
/// <param name="mintermId">Minterm index, combined with the shifted state id to form the table offset.</param>
/// <param name="minterm">Minterm handed to the matcher when a transition has to be created.</param>
/// <returns>The state built from the union of the per-alternative target nodes.</returns>
public DfaMatchingState<TSetType> TakeTransition(
    SymbolicRegexMatcher<TSetType> matcher, DfaMatchingState<TSetType> currentStates, int mintermId, TSetType minterm)
{
    // A state that is not a disjunction is delegated to the Brzozowski transition.
    if (currentStates.Node.Kind != SymbolicRegexKind.Or)
    {
        return default(BrzozowskiTransition).TakeTransition(matcher, currentStates, mintermId, minterm);
    }

    SymbolicRegexBuilder<TSetType> builder = matcher._builder;
    Debug.Assert(builder._delta is not null);

    // Start from the builder's "nothing" node and fold every target node into it.
    SymbolicRegexNode<TSetType> combined = builder._nothing;
    uint targetKind = 0;

    Debug.Assert(currentStates.Node._alts is not null);
    foreach (SymbolicRegexNode<TSetType> alternative in currentStates.Node._alts)
    {
        DfaMatchingState<TSetType> source = builder.MkState(alternative, currentStates.PrevCharKind);
        int offset = (source.Id << builder._mintermsCount) | mintermId;

        // Use the recorded transition when there is one, otherwise have the matcher create it.
        DfaMatchingState<TSetType>? target = Volatile.Read(ref builder._delta[offset]);
        target ??= matcher.CreateNewTransition(source, minterm, offset);

        combined = builder.MkOr2(combined, target.Node);

        // Overwritten on every iteration, so the last alternative's target decides the final kind.
        targetKind = target.PrevCharKind;
    }

    return builder.MkState(combined, targetKind, true);
}
// Lazily yields the description of each label, in the order the labels are enumerated.
static IEnumerable<string> DescribeLabels(IEnumerable<TSet> labels, SymbolicRegexBuilder<TSet> builder)
{
    foreach (TSet set in labels)
    {
        string description = DescribeLabel(set, builder);
        yield return description;
    }
}
/// <summary>Initializes the factory.</summary>
/// <param name="regexTree">Parsed regex; provides the root node, capture information and find optimizations.</param>
/// <param name="options">Regex options; asserted not to contain RightToLeft or ECMAScript.</param>
/// <param name="matchTimeout">Timeout forwarded to the created matcher.</param>
/// <exception cref="NotSupportedException">
/// The estimated NFA size of the pattern exceeds the configured safe-size threshold.
/// </exception>
public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, TimeSpan matchTimeout)
{
    const RegexOptions UnsupportedOptions = RegexOptions.RightToLeft | RegexOptions.ECMAScript;
    Debug.Assert((options & UnsupportedOptions) == 0);

    var charSetSolver = new CharSetSolver();
    var bddBuilder = new SymbolicRegexBuilder<BDD>(charSetSolver, charSetSolver);
    var converter = new RegexNodeConverter(bddBuilder, regexTree.CaptureNumberSparseMapping);
    SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(regexTree.Root);

    // Reject patterns whose estimated size is above the threshold; int.MaxValue disables the check.
    int threshold = SymbolicRegexThresholds.GetSymbolicRegexSafeSizeThreshold();
    Debug.Assert(threshold > 0);
    bool enforceThreshold = threshold != int.MaxValue;
    if (enforceThreshold)
    {
        int estimatedSize = rootNode.EstimateNfaSize();
        if (estimatedSize > threshold)
        {
            throw new NotSupportedException(SR.Format(SR.NotSupported_NonBacktrackingUnsafeSize, estimatedSize, threshold));
        }
    }

    rootNode = rootNode.AddFixedLengthMarkers();
    BDD[] minterms = rootNode.ComputeMinterms();

    // More than 64 minterms selects the BitVector-based matcher; otherwise the ulong-based one is used.
    if (minterms.Length > 64)
    {
        _matcher = SymbolicRegexMatcher<BitVector>.Create(
            regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new BitVectorSolver(minterms, charSetSolver), matchTimeout);
    }
    else
    {
        _matcher = SymbolicRegexMatcher<ulong>.Create(
            regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new UInt64Solver(minterms, charSetSolver), matchTimeout);
    }
}
/// <summary>
/// Compute the target state for the given input minterm.
/// If <paramref name="minterm"/> is False this means that this is \n and it is the last character of the input.
/// </summary>
/// <param name="builder">the builder that owns <see cref="Node"/></param>
/// <param name="minterm">minterm corresponding to some input character or False corresponding to last \n</param>
/// <param name="nextCharKind">character kind combined with <see cref="PrevCharKind"/> to form the derivative context</param>
/// <returns>the derivative of <see cref="Node"/>, computed without effects, for the minterm and combined context</returns>
internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet minterm, uint nextCharKind) =>
    // The context merges the previous character kind of this state with the kind of the next character.
    Node.CreateDerivativeWithoutEffects(builder, minterm, CharKind.Context(PrevCharKind, nextCharKind));
// Builds a one-line summary: number of cached states, the given transition count,
// and the number of minterms followed by their comma-separated descriptions.
static string FormatInfo(SymbolicRegexBuilder<TSet> builder, int transitionCount)
{
    var summary = new StringBuilder();
    summary
        .Append($"States = {builder._stateCache.Count} ")
        .Append($"Transitions = {transitionCount} ")
        .Append($"Min Terms ({builder._solver.GetMinterms()!.Length}) = ")
        .AppendJoin(',', DescribeLabels(builder._solver.GetMinterms()!, builder));
    return summary.ToString();
}
/// <summary>
/// Takes a single transition from <paramref name="currentState"/>: returns the entry recorded in the
/// builder's transition table, or asks the matcher to create the transition when the entry is null.
/// </summary>
/// <param name="matcher">Matcher that supplies the builder and creates missing transitions.</param>
/// <param name="currentState">State to transition from.</param>
/// <param name="mintermId">Minterm index, combined with the shifted state id to form the table offset.</param>
/// <param name="minterm">Minterm handed to the matcher when the transition has to be created.</param>
/// <returns>The target state of the transition.</returns>
public DfaMatchingState<TSetType> TakeTransition(
    SymbolicRegexMatcher<TSetType> matcher, DfaMatchingState<TSetType> currentState, int mintermId, TSetType minterm)
{
    SymbolicRegexBuilder<TSetType> builder = matcher._builder;
    Debug.Assert(builder._delta is not null);

    int offset = (currentState.Id << builder._mintermsCount) | mintermId;

    DfaMatchingState<TSetType>? recorded = Volatile.Read(ref builder._delta[offset]);
    if (recorded is not null)
    {
        return recorded;
    }

    return matcher.CreateNewTransition(currentState, minterm, offset);
}
/// <summary>Initializes the factory.</summary>
/// <param name="code">Compiled regex; its tree provides the root node and the minimum required length.</param>
/// <param name="options">Regex options; RightToLeft and ECMAScript are rejected.</param>
/// <param name="matchTimeout">Timeout forwarded to the created matcher.</param>
/// <param name="culture">Culture forwarded to the converter and the matcher.</param>
/// <exception cref="NotSupportedException">
/// <paramref name="options"/> contains RightToLeft or ECMAScript.
/// </exception>
public SymbolicRegexRunnerFactory(RegexCode code, RegexOptions options, TimeSpan matchTimeout, CultureInfo culture)
{
    // RightToLeft and ECMAScript cannot currently be combined with NonBacktracking.
    RegexOptions conflicting = options & (RegexOptions.RightToLeft | RegexOptions.ECMAScript);
    if (conflicting != 0)
    {
        string optionName = (options & RegexOptions.RightToLeft) != 0
            ? nameof(RegexOptions.RightToLeft)
            : nameof(RegexOptions.ECMAScript);
        throw new NotSupportedException(SR.Format(SR.NotSupported_NonBacktrackingConflictingOption, optionName));
    }

    var converter = new RegexNodeToSymbolicConverter(s_unicode, culture);
    var solver = (CharSetSolver)s_unicode._solver;
    SymbolicRegexNode<BDD> bddRoot = converter.Convert(code.Tree.Root, topLevel: true);

    _minRequiredLength = code.Tree.MinRequiredLength;

    BDD[] minterms = bddRoot.ComputeMinterms();
    if (minterms.Length > 64)
    {
        // More than 64 minterms: predicates are represented as BV.
        var bvAlgebra = new BVAlgebra(solver, minterms);
        var bvBuilder = new SymbolicRegexBuilder<BV>(bvAlgebra)
        {
            // The anchor-related predicates are copied over from the BDD builder after construction.
            _wordLetterPredicateForAnchors = bvAlgebra.ConvertFromCharSet(solver, converter._builder._wordLetterPredicateForAnchors),
            _newLinePredicate = bvAlgebra.ConvertFromCharSet(solver, converter._builder._newLinePredicate)
        };

        // Re-express the BDD-based tree over BV predicates.
        SymbolicRegexNode<BV> bvRoot = converter._builder.Transform(bddRoot, bvBuilder, bdd => bvBuilder._solver.ConvertFromCharSet(solver, bdd));
        _matcher = new SymbolicRegexMatcher<BV>(bvRoot, solver, minterms, matchTimeout, culture);
    }
    else
    {
        // At most 64 minterms: predicates are represented as ulong.
        var ulongAlgebra = new BV64Algebra(solver, minterms);
        var ulongBuilder = new SymbolicRegexBuilder<ulong>(ulongAlgebra)
        {
            // The anchor-related predicates are copied over from the BDD builder after construction.
            _wordLetterPredicateForAnchors = ulongAlgebra.ConvertFromCharSet(solver, converter._builder._wordLetterPredicateForAnchors),
            _newLinePredicate = ulongAlgebra.ConvertFromCharSet(solver, converter._builder._newLinePredicate)
        };

        // Re-express the BDD-based tree over ulong predicates.
        SymbolicRegexNode<ulong> ulongRoot = converter._builder.Transform(bddRoot, ulongBuilder, bdd => ulongBuilder._solver.ConvertFromCharSet(solver, bdd));
        _matcher = new SymbolicRegexMatcher<ulong>(ulongRoot, solver, minterms, matchTimeout, culture);
    }
}
/// <summary>Initializes the factory.</summary>
/// <param name="regexTree">Parsed regex; provides the root node, capture information and find optimizations.</param>
/// <param name="options">Regex options; asserted not to contain RightToLeft or ECMAScript.</param>
/// <param name="matchTimeout">Timeout forwarded to the created matcher.</param>
/// <param name="culture">Culture forwarded to the node converter.</param>
public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, TimeSpan matchTimeout, CultureInfo culture)
{
    const RegexOptions UnsupportedOptions = RegexOptions.RightToLeft | RegexOptions.ECMAScript;
    Debug.Assert((options & UnsupportedOptions) == 0);

    var bddBuilder = new SymbolicRegexBuilder<BDD>(CharSetSolver.Instance);
    var converter = new RegexNodeConverter(bddBuilder, culture, regexTree.CaptureNumberSparseMapping);

    SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(regexTree.Root, tryCreateFixedLengthMarker: true);
    BDD[] minterms = rootNode.ComputeMinterms();

    // More than 64 minterms selects the BitVector-based matcher; otherwise the ulong-based one is used.
    if (minterms.Length > 64)
    {
        _matcher = SymbolicRegexMatcher<BitVector>.Create(
            regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new BitVectorAlgebra(minterms), matchTimeout);
    }
    else
    {
        _matcher = SymbolicRegexMatcher<ulong>.Create(
            regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new UInt64Algebra(minterms), matchTimeout);
    }
}
/// <summary>Initializes the factory.</summary>
/// <param name="regexTree">Parsed regex; provides the root node and is handed to the created matcher.</param>
/// <param name="options">Regex options; RightToLeft and ECMAScript are rejected.</param>
/// <param name="matchTimeout">Timeout forwarded to the created matcher.</param>
/// <param name="culture">Culture forwarded to the node converter.</param>
/// <exception cref="NotSupportedException">
/// <paramref name="options"/> contains RightToLeft or ECMAScript.
/// </exception>
public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, TimeSpan matchTimeout, CultureInfo culture)
{
    // RightToLeft and ECMAScript cannot currently be combined with NonBacktracking.
    RegexOptions conflicting = options & (RegexOptions.RightToLeft | RegexOptions.ECMAScript);
    if (conflicting != 0)
    {
        string optionName = (options & RegexOptions.RightToLeft) != 0
            ? nameof(RegexOptions.RightToLeft)
            : nameof(RegexOptions.ECMAScript);
        throw new NotSupportedException(SR.Format(SR.NotSupported_NonBacktrackingConflictingOption, optionName));
    }

    var converter = new RegexNodeConverter(culture, regexTree.CaptureNumberSparseMapping);
    CharSetSolver solver = CharSetSolver.Instance;
    SymbolicRegexNode<BDD> bddRoot = converter.ConvertToSymbolicRegexNode(regexTree.Root, tryCreateFixedLengthMarker: true);

    BDD[] minterms = bddRoot.ComputeMinterms();
    if (minterms.Length <= 64)
    {
        // At most 64 minterms: predicates are represented as ulong.
        var ulongAlgebra = new BitVector64Algebra(solver, minterms);
        var ulongBuilder = new SymbolicRegexBuilder<ulong>(ulongAlgebra);

        // The anchor-related predicates are copied over from the BDD builder after construction.
        ulongBuilder._wordLetterPredicateForAnchors = ulongAlgebra.ConvertFromCharSet(solver, converter._builder._wordLetterPredicateForAnchors);
        ulongBuilder._newLinePredicate = ulongAlgebra.ConvertFromCharSet(solver, converter._builder._newLinePredicate);

        // Re-express the BDD-based tree over ulong predicates.
        SymbolicRegexNode<ulong> ulongRoot = converter._builder.Transform(bddRoot, ulongBuilder, bdd => ulongBuilder._solver.ConvertFromCharSet(solver, bdd));
        _matcher = new SymbolicRegexMatcher<ulong>(ulongRoot, regexTree, minterms, matchTimeout);
    }
    else
    {
        // More than 64 minterms: predicates are represented as BitVector.
        var bitVectorAlgebra = new BitVectorAlgebra(solver, minterms);
        var bitVectorBuilder = new SymbolicRegexBuilder<BitVector>(bitVectorAlgebra);

        // The anchor-related predicates are copied over from the BDD builder after construction.
        bitVectorBuilder._wordLetterPredicateForAnchors = bitVectorAlgebra.ConvertFromCharSet(solver, converter._builder._wordLetterPredicateForAnchors);
        bitVectorBuilder._newLinePredicate = bitVectorAlgebra.ConvertFromCharSet(solver, converter._builder._newLinePredicate);

        // Re-express the BDD-based tree over BitVector predicates.
        SymbolicRegexNode<BitVector> bitVectorRoot = converter._builder.Transform(bddRoot, bitVectorBuilder, bdd => bitVectorBuilder._solver.ConvertFromCharSet(solver, bdd));
        _matcher = new SymbolicRegexMatcher<BitVector>(bitVectorRoot, regexTree, minterms, matchTimeout);
    }
}
/// <summary>Initializes the factory.</summary>
/// <param name="regexTree">Parsed regex; provides the root node, capture information and find optimizations.</param>
/// <param name="options">Regex options; asserted not to contain RightToLeft or ECMAScript.</param>
/// <param name="matchTimeout">Timeout forwarded to the created matcher.</param>
public SymbolicRegexRunnerFactory(RegexTree regexTree, RegexOptions options, TimeSpan matchTimeout)
{
    const RegexOptions UnsupportedOptions = RegexOptions.RightToLeft | RegexOptions.ECMAScript;
    Debug.Assert((options & UnsupportedOptions) == 0);

    // The same solver instance is passed for both builder constructor arguments.
    var charSetSolver = new CharSetSolver();
    var bddBuilder = new SymbolicRegexBuilder<BDD>(charSetSolver, charSetSolver);
    var converter = new RegexNodeConverter(bddBuilder, regexTree.CaptureNumberSparseMapping);

    SymbolicRegexNode<BDD> rootNode = converter.ConvertToSymbolicRegexNode(regexTree.Root);
    BDD[] minterms = rootNode.ComputeMinterms();

    // More than 64 minterms selects the BitVector-based matcher; otherwise the ulong-based one is used.
    if (minterms.Length > 64)
    {
        _matcher = SymbolicRegexMatcher<BitVector>.Create(
            regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new BitVectorSolver(minterms, charSetSolver), matchTimeout);
    }
    else
    {
        _matcher = SymbolicRegexMatcher<ulong>.Create(
            regexTree.CaptureCount, regexTree.FindOptimizations, bddBuilder, rootNode, new UInt64Solver(minterms, charSetSolver), matchTimeout);
    }
}
/// <summary>
/// Stores the given components after asserting that their combination matches what
/// <paramref name="kind"/> requires.
/// </summary>
/// <param name="builder">Builder stored with this instance; asserted to be non-null.</param>
/// <param name="kind">Kind of transition regex, which determines the components that must be present.</param>
/// <param name="test">Non-null for Conditional; asserted equal to the default value for every other kind.</param>
/// <param name="first">Asserted null for Leaf and non-null for every other kind.</param>
/// <param name="second">Asserted non-null for Conditional, Union and Lookaround, null otherwise.</param>
/// <param name="node">Asserted non-null for Leaf and Lookaround, null otherwise.</param>
/// <param name="effect">Asserted non-null for Effect, null otherwise.</param>
private TransitionRegex(SymbolicRegexBuilder<TSet> builder, TransitionRegexKind kind, TSet? test, TransitionRegex<TSet>? first, TransitionRegex<TSet>? second, SymbolicRegexNode<TSet>? node, DerivativeEffect? effect)
{
    Debug.Assert(builder is not null);

    // Each kind admits exactly one shape of populated components; an unlisted kind fails the assert.
    Debug.Assert(kind switch
    {
        TransitionRegexKind.Leaf =>
            node is not null && Equals(test, default(TSet)) && first is null && second is null && effect is null,
        TransitionRegexKind.Conditional =>
            test is not null && first is not null && second is not null && node is null && effect is null,
        TransitionRegexKind.Union =>
            Equals(test, default(TSet)) && first is not null && second is not null && node is null && effect is null,
        TransitionRegexKind.Lookaround =>
            Equals(test, default(TSet)) && first is not null && second is not null && node is not null && effect is null,
        TransitionRegexKind.Effect =>
            Equals(test, default(TSet)) && first is not null && second is null && node is null && effect is not null,
        _ => false,
    });

    _builder = builder;
    _kind = kind;
    _test = test;
    _first = first;
    _second = second;
    _node = node;
    _effect = effect;
}
/// <summary>
/// Converts the regex tree into a symbolic regex and creates a matcher whose predicate
/// representation (BV or ulong) is chosen by the number of minterms.
/// </summary>
/// <param name="code">Compiled regex; its tree provides the root node and the minimum required length.</param>
/// <param name="matchTimeout">Timeout forwarded to the created matcher.</param>
/// <param name="culture">Culture forwarded to the converter and the matcher.</param>
private SymbolicRegexRunner(RegexCode code, TimeSpan matchTimeout, CultureInfo culture)
{
    var converter = new RegexNodeToSymbolicConverter(s_unicode, culture);
    var solver = (CharSetSolver)s_unicode._solver;
    SymbolicRegexNode<BDD> bddRoot = converter.Convert(code.Tree.Root, topLevel: true);

    _minRequiredLength = code.Tree.MinRequiredLength;

    BDD[] minterms = bddRoot.ComputeMinterms();
    if (minterms.Length > 64)
    {
        // More than 64 minterms: predicates are represented as BV.
        var bvAlgebra = new BVAlgebra(solver, minterms);
        var bvBuilder = new SymbolicRegexBuilder<BV>(bvAlgebra);

        // The anchor-related predicates are copied over from the BDD builder after construction.
        bvBuilder._wordLetterPredicateForAnchors = bvAlgebra.ConvertFromCharSet(solver, converter._builder._wordLetterPredicateForAnchors);
        bvBuilder._newLinePredicate = bvAlgebra.ConvertFromCharSet(solver, converter._builder._newLinePredicate);

        // Re-express the BDD-based tree over BV predicates.
        SymbolicRegexNode<BV> bvRoot = converter._builder.Transform(bddRoot, bvBuilder, bdd => bvBuilder._solver.ConvertFromCharSet(solver, bdd));
        _matcher = new SymbolicRegexMatcher<BV>(bvRoot, solver, minterms, matchTimeout, culture);
    }
    else
    {
        // At most 64 minterms: predicates are represented as ulong.
        var ulongAlgebra = new BV64Algebra(solver, minterms);
        var ulongBuilder = new SymbolicRegexBuilder<ulong>(ulongAlgebra);

        // The anchor-related predicates are copied over from the BDD builder after construction.
        ulongBuilder._wordLetterPredicateForAnchors = ulongAlgebra.ConvertFromCharSet(solver, converter._builder._wordLetterPredicateForAnchors);
        ulongBuilder._newLinePredicate = ulongAlgebra.ConvertFromCharSet(solver, converter._builder._newLinePredicate);

        // Re-express the BDD-based tree over ulong predicates.
        SymbolicRegexNode<ulong> ulongRoot = converter._builder.Transform(bddRoot, ulongBuilder, bdd => ulongBuilder._solver.ConvertFromCharSet(solver, bdd));
        _matcher = new SymbolicRegexMatcher<ulong>(ulongRoot, solver, minterms, matchTimeout, culture);
    }
}
// Collects every explored transition of the builder, grouped by (source, target) state id.
// Each group carries the union of the minterms that take the transition and the NFA targets
// recorded for those minterms.
static Dictionary<(int Source, int Target), (TSet Rule, List<int> NfaTargets)> GatherTransitions(SymbolicRegexBuilder<TSet> builder)
{
    Debug.Assert(builder._delta is not null);
    Debug.Assert(builder._minterms is not null);

    var grouped = new Dictionary<(int Source, int Target), (TSet Rule, List<int> NfaTargets)>();

    foreach (DfaMatchingState<TSet> source in builder._stateCache)
    {
        // One slot per minterm for the transitions leaving this state.
        Span<DfaMatchingState<TSet>?> deltas = builder.GetDeltasFor(source);
        Span<int[]?> nfaDeltas = builder.GetNfaDeltasFor(source);
        Debug.Assert(deltas.Length == builder._minterms.Length);

        for (int mintermIndex = 0; mintermIndex < deltas.Length; ++mintermIndex)
        {
            // A null slot is a transition that has not been explored yet.
            if (deltas[mintermIndex] is not DfaMatchingState<TSet> target)
            {
                continue;
            }

            // Fetch the group for this state pair, starting an empty one on first sight.
            (int Source, int Target) key = (source.Id, target.Id);
            if (!grouped.TryGetValue(key, out (TSet Rule, List<int> NfaTargets) entry))
            {
                entry = (builder._solver.Empty, new List<int>());
            }

            // Attach the NFA targets recorded for the same minterm, if there are any.
            if (nfaDeltas.Length > 0 && nfaDeltas[mintermIndex] is int[] nfaTargets)
            {
                foreach (int nfaTarget in nfaTargets)
                {
                    entry.NfaTargets.Add(builder._nfaStateArray[nfaTarget]);
                }
            }

            // Widen the group's rule with this minterm and store the group back.
            grouped[key] = (builder._solver.Or(entry.Rule, builder._minterms[mintermIndex]), entry.NfaTargets);
        }
    }

    return grouped;
}
/// <summary>
/// Compute a set of transitions for the given minterm.
/// </summary>
/// <param name="builder">the builder that owns <see cref="Node"/></param>
/// <param name="minterm">minterm corresponding to some input character or False corresponding to last \n</param>
/// <param name="nextCharKind">character kind combined with <see cref="PrevCharKind"/> to form the derivative context</param>
/// <returns>a list of the transitions as pairs of the target node and an array of effects to be applied</returns>
internal List<(SymbolicRegexNode<TSet> Node, DerivativeEffect[] Effects)> NfaNextWithEffects(SymbolicRegexBuilder<TSet> builder, TSet minterm, uint nextCharKind) =>
    // The context merges the previous character kind of this state with the kind of the next character.
    Node.CreateNfaDerivativeWithEffects(builder, minterm, CharKind.Context(PrevCharKind, nextCharKind));
/// <summary>Constructs matcher for given symbolic regex.</summary>
/// <param name="sr">Root node of the pattern; its builder becomes the builder of this matcher.</param>
/// <param name="code">Compiled regex whose find optimizations may be adopted by the matcher.</param>
/// <param name="css">Character set solver; not read by this constructor.</param>
/// <param name="minterms">Minterms used to create a classifier when the algebra does not supply one.</param>
/// <param name="matchTimeout">Match timeout; <see cref="Regex.InfiniteMatchTimeout"/> turns timeout checking off.</param>
/// <param name="culture">Culture; not read by this constructor.</param>
internal SymbolicRegexMatcher(SymbolicRegexNode<TSetType> sr, RegexCode code, CharSetSolver css, BDD[] minterms, TimeSpan matchTimeout, CultureInfo culture)
{
    Debug.Assert(sr._builder._solver is BV64Algebra or BVAlgebra or CharSetSolver, $"Unsupported algebra: {sr._builder._solver}");

    _pattern = sr;
    _builder = sr._builder;

    _checkTimeout = matchTimeout != Regex.InfiniteMatchTimeout;

    // Adding 0.5 before the truncating cast rounds the timeout to the nearest millisecond.
    _timeout = (int)(matchTimeout.TotalMilliseconds + 0.5);

    // Both bit-vector algebras carry their own classifier; for any other solver one is built from the minterms.
    if (_builder._solver is BV64Algebra algebra64)
    {
        _partitions = algebra64._classifier;
    }
    else if (_builder._solver is BVAlgebra algebraBV)
    {
        _partitions = algebraBV._classifier;
    }
    else
    {
        _partitions = new MintermClassifier((CharSetSolver)(object)_builder._solver, minterms);
    }

    // Find optimizations are adopted only when there is a search mode and no leading anchor;
    // with anchors it is better to let the DFA determine quickly whether there is a match.
    var findOptimizations = code.FindOptimizations;
    if (findOptimizations.FindMode != FindNextStartingPositionMode.NoSearch &&
        findOptimizations.LeadingAnchor == 0)
    {
        _findOpts = findOptimizations;
    }

    // Without anchors only the default previous character kind 0 is ever used for initial states.
    int statesCount = _pattern._info.ContainsSomeAnchor ? CharKind.CharKindCount : 1;

    // Initial states of the original pattern.
    _initialStates = CreateStatesPerCharKind(_builder, _pattern, statesCount, markAsInitial: false);

    // Dot-star pattern (any* followed by the original pattern) and its initial states. These are
    // flagged as initial; since behavior may depend on the previous input character (e.g. for \b, \B
    // or a start-of-line anchor) there is one version of the initial state per character kind.
    _dotStarredPattern = _builder.MkConcat(_builder._anyStar, _pattern);
    _dotstarredInitialStates = CreateStatesPerCharKind(_builder, _dotStarredPattern, statesCount, markAsInitial: true);

    // Reverse pattern (the original pattern in reverse order) and its initial states.
    _reversePattern = _pattern.Reverse();
    _reverseInitialStates = CreateStatesPerCharKind(_builder, _reversePattern, statesCount, markAsInitial: false);

    // Fast lookup of the character kind of ASCII characters. Only built when the pattern
    // contains anchors, as otherwise a single kind is used.
    if (_pattern._info.ContainsSomeAnchor)
    {
        var kindTable = new uint[128];
        for (int c = 0; c < kindTable.Length; c++)
        {
            // '\n' is tested against the newline predicate, everything else against the word-letter predicate.
            TSetType kindPredicate = _builder._wordLetterPredicateForAnchors;
            uint kindWhenMatched = CharKind.WordLetter;
            if (c == '\n')
            {
                kindPredicate = _builder._newLinePredicate;
                kindWhenMatched = CharKind.Newline;
            }

            // The character gets the kind only if its minterm overlaps the predicate; otherwise 0.
            TSetType overlap = _builder._solver.And(GetMinterm(c), kindPredicate);
            kindTable[c] = overlap.Equals(_builder._solver.False) ? 0 : kindWhenMatched;
        }

        _asciiCharKinds = kindTable;
    }

    // Creates one state of the given node per previous character kind in [0, count),
    // optionally flagging every created state as an initial state.
    static DfaMatchingState<TSetType>[] CreateStatesPerCharKind(
        SymbolicRegexBuilder<TSetType> builder, SymbolicRegexNode<TSetType> node, int count, bool markAsInitial)
    {
        var states = new DfaMatchingState<TSetType>[count];
        for (uint charKind = 0; charKind < states.Length; charKind++)
        {
            DfaMatchingState<TSetType> state = builder.MkState(node, charKind);
            if (markAsInitial)
            {
                state.IsInitialState = true;
            }

            states[charKind] = state;
        }

        return states;
    }
}
private static TransitionRegex <TSet> GetOrCreate(SymbolicRegexBuilder <TSet> builder, TransitionRegexKind kind, TSet?test, TransitionRegex <TSet>?one, TransitionRegex <TSet>?two, SymbolicRegexNode <TSet>?node, DerivativeEffect?effect = null) { // Keep transition regexes internalized using the builder ref TransitionRegex <TSet>?tr = ref CollectionsMarshal.GetValueRefOrAddDefault(builder._trCache, (kind, test, one, two, node, effect), out _);
// Pretty-prints the label with the builder's solver and HTML-encodes the resulting text.
static string DescribeLabel(TSet label, SymbolicRegexBuilder<TSet> builder)
{
    string printed = builder._solver.PrettyPrint(label, builder._charSetSolver);
    return WebUtility.HtmlEncode(printed);
}
/// <summary>Constructs a regex to symbolic finite automata converter</summary>
/// <param name="builder">Builder stored for use by this converter.</param>
/// <param name="captureSparseMapping">Capture mapping stored for use by this converter; may be null.</param>
public RegexNodeConverter(SymbolicRegexBuilder<BDD> builder, Hashtable? captureSparseMapping)
{
    (_builder, _captureSparseMapping) = (builder, captureSparseMapping);
}