/// <summary>
/// Initializes the factory: validates the options, converts the regex tree of <paramref name="code"/>
/// into a BDD-based symbolic regex, re-encodes it over the minterms of the pattern and creates the matcher.
/// </summary>
/// <param name="code">Regex code whose tree supplies the root node and the minimum required length.</param>
/// <param name="options">Options to validate; RightToLeft and ECMAScript are rejected.</param>
/// <param name="matchTimeout">Timeout that is passed on to the matcher.</param>
/// <param name="culture">Culture that is passed on to the converter and to the matcher.</param>
/// <exception cref="NotSupportedException">
/// <paramref name="options"/> contains <see cref="RegexOptions.RightToLeft"/> or <see cref="RegexOptions.ECMAScript"/>.
/// </exception>
public SymbolicRegexRunnerFactory(RegexCode code, RegexOptions options, TimeSpan matchTimeout, CultureInfo culture)
{
    // Neither RightToLeft nor ECMAScript can currently be combined with NonBacktracking.
    const RegexOptions UnsupportedOptions = RegexOptions.RightToLeft | RegexOptions.ECMAScript;
    if ((options & UnsupportedOptions) != 0)
    {
        // If both flags are present, RightToLeft is the one reported in the message.
        string offendingOption = (options & RegexOptions.RightToLeft) != 0
            ? nameof(RegexOptions.RightToLeft)
            : nameof(RegexOptions.ECMAScript);

        throw new NotSupportedException(SR.Format(SR.NotSupported_NonBacktrackingConflictingOption, offendingOption));
    }

    RegexNodeToSymbolicConverter bddConverter = new RegexNodeToSymbolicConverter(s_unicode, culture);
    CharSetSolver charSetSolver = (CharSetSolver)s_unicode._solver;
    SymbolicRegexNode<BDD> bddRoot = bddConverter.Convert(code.Tree.Root, topLevel: true);

    _minRequiredLength = code.Tree.MinRequiredLength;

    // NOTE(review): both matcher constructions below pass five arguments. The SymbolicRegexMatcher
    // constructor visible next to this code additionally takes a RegexCode; confirm the intended overload.
    BDD[] mintermSets = bddRoot.ComputeMinterms();
    if (mintermSets.Length <= 64)
    {
        // At most 64 minterms: a ulong is used to represent a predicate.
        BV64Algebra algebra64 = new BV64Algebra(charSetSolver, mintermSets);
        SymbolicRegexBuilder<ulong> builder64 = new SymbolicRegexBuilder<ulong>(algebra64);

        // A freshly constructed builder has these two predicates set to False, so they are filled in
        // afterwards. Whether they end up different from False depends on whether the regex used anchors.
        builder64._wordLetterPredicateForAnchors = algebra64.ConvertFromCharSet(charSetSolver, bddConverter._builder._wordLetterPredicateForAnchors);
        builder64._newLinePredicate = algebra64.ConvertFromCharSet(charSetSolver, bddConverter._builder._newLinePredicate);

        // Re-encode the BDD-based AST as a ulong-based AST.
        SymbolicRegexNode<ulong> root64 = bddConverter._builder.Transform(
            bddRoot,
            builder64,
            set => builder64._solver.ConvertFromCharSet(charSetSolver, set));

        _matcher = new SymbolicRegexMatcher<ulong>(root64, charSetSolver, mintermSets, matchTimeout, culture);
    }
    else
    {
        // More than 64 minterms: a BV is used to represent a predicate.
        BVAlgebra algebraBV = new BVAlgebra(charSetSolver, mintermSets);
        SymbolicRegexBuilder<BV> builderBV = new SymbolicRegexBuilder<BV>(algebraBV)
        {
            // A freshly constructed builder has these two predicates set to False, so they are filled in
            // here. Whether they end up different from False depends on whether the regex used anchors.
            _wordLetterPredicateForAnchors = algebraBV.ConvertFromCharSet(charSetSolver, bddConverter._builder._wordLetterPredicateForAnchors),
            _newLinePredicate = algebraBV.ConvertFromCharSet(charSetSolver, bddConverter._builder._newLinePredicate)
        };

        // Re-encode the BDD-based AST as a BV-based AST.
        SymbolicRegexNode<BV> rootBV = bddConverter._builder.Transform(
            bddRoot,
            builderBV,
            set => builderBV._solver.ConvertFromCharSet(charSetSolver, set));

        _matcher = new SymbolicRegexMatcher<BV>(rootBV, charSetSolver, mintermSets, matchTimeout, culture);
    }
}
/// <summary>
/// Creates the runner: converts the regex tree of <paramref name="code"/> into a BDD-based symbolic regex,
/// re-encodes it over the minterms of the pattern and creates the matcher.
/// </summary>
/// <param name="code">Regex code whose tree supplies the root node and the minimum required length.</param>
/// <param name="matchTimeout">Timeout that is passed on to the matcher.</param>
/// <param name="culture">Culture that is passed on to the converter and to the matcher.</param>
private SymbolicRegexRunner(RegexCode code, TimeSpan matchTimeout, CultureInfo culture)
{
    RegexNodeToSymbolicConverter bddConverter = new RegexNodeToSymbolicConverter(s_unicode, culture);
    CharSetSolver charSetSolver = (CharSetSolver)s_unicode._solver;
    SymbolicRegexNode<BDD> bddRoot = bddConverter.Convert(code.Tree.Root, topLevel: true);

    _minRequiredLength = code.Tree.MinRequiredLength;

    // NOTE(review): both matcher constructions below pass five arguments. The SymbolicRegexMatcher
    // constructor visible next to this code additionally takes a RegexCode; confirm the intended overload.
    BDD[] mintermSets = bddRoot.ComputeMinterms();
    if (mintermSets.Length <= 64)
    {
        // At most 64 minterms: a ulong is used to represent a predicate.
        BV64Algebra algebra64 = new BV64Algebra(charSetSolver, mintermSets);
        SymbolicRegexBuilder<ulong> builder64 = new SymbolicRegexBuilder<ulong>(algebra64);

        // A freshly constructed builder has these two predicates set to False, so they are filled in
        // afterwards. Whether they end up different from False depends on whether the regex used anchors.
        builder64._wordLetterPredicateForAnchors = algebra64.ConvertFromCharSet(charSetSolver, bddConverter._builder._wordLetterPredicateForAnchors);
        builder64._newLinePredicate = algebra64.ConvertFromCharSet(charSetSolver, bddConverter._builder._newLinePredicate);

        // Re-encode the BDD-based AST as a ulong-based AST.
        SymbolicRegexNode<ulong> root64 = bddConverter._builder.Transform(
            bddRoot,
            builder64,
            set => builder64._solver.ConvertFromCharSet(charSetSolver, set));

        _matcher = new SymbolicRegexMatcher<ulong>(root64, charSetSolver, mintermSets, matchTimeout, culture);
    }
    else
    {
        // More than 64 minterms: a BV is used to represent a predicate.
        BVAlgebra algebraBV = new BVAlgebra(charSetSolver, mintermSets);
        SymbolicRegexBuilder<BV> builderBV = new SymbolicRegexBuilder<BV>(algebraBV)
        {
            // A freshly constructed builder has these two predicates set to False, so they are filled in
            // here. Whether they end up different from False depends on whether the regex used anchors.
            _wordLetterPredicateForAnchors = algebraBV.ConvertFromCharSet(charSetSolver, bddConverter._builder._wordLetterPredicateForAnchors),
            _newLinePredicate = algebraBV.ConvertFromCharSet(charSetSolver, bddConverter._builder._newLinePredicate)
        };

        // Re-encode the BDD-based AST as a BV-based AST.
        SymbolicRegexNode<BV> rootBV = bddConverter._builder.Transform(
            bddRoot,
            builderBV,
            set => builderBV._solver.ConvertFromCharSet(charSetSolver, set));

        _matcher = new SymbolicRegexMatcher<BV>(rootBV, charSetSolver, mintermSets, matchTimeout, culture);
    }
}
/// <summary>
/// Constructs a matcher for the given symbolic regex and precomputes the initial states of the
/// pattern, of its dot-starred form and of its reversed form.
/// </summary>
/// <param name="sr">Root of the symbolic regex; its builder becomes the builder of this matcher.</param>
/// <param name="code">Regex code whose find optimizations may be adopted by the matcher.</param>
/// <param name="css">Character set solver (not referenced by this constructor body).</param>
/// <param name="minterms">Minterms used to build a classifier when the algebra does not provide one.</param>
/// <param name="matchTimeout">Match timeout; <see cref="Regex.InfiniteMatchTimeout"/> disables timeout checks.</param>
/// <param name="culture">Culture (not referenced by this constructor body).</param>
internal SymbolicRegexMatcher(SymbolicRegexNode<TSetType> sr, RegexCode code, CharSetSolver css, BDD[] minterms, TimeSpan matchTimeout, CultureInfo culture)
{
    Debug.Assert(sr._builder._solver is BV64Algebra or BVAlgebra or CharSetSolver, $"Unsupported algebra: {sr._builder._solver}");

    _pattern = sr;
    _builder = sr._builder;

    // Timeout checks are enabled for every timeout other than the infinite one. Adding 0.5 before the
    // truncating cast rounds a positive timeout to the nearest whole millisecond.
    _checkTimeout = Regex.InfiniteMatchTimeout != matchTimeout;
    _timeout = (int)(matchTimeout.TotalMilliseconds + 0.5);

    // The bit-vector algebras carry their own classifier; for any other algebra one is built
    // from the supplied minterms.
    switch (_builder._solver)
    {
        case BV64Algebra algebra64:
            _partitions = algebra64._classifier;
            break;

        case BVAlgebra algebraBV:
            _partitions = algebraBV._classifier;
            break;

        default:
            _partitions = new MintermClassifier((CharSetSolver)(object)_builder._solver, minterms);
            break;
    }

    // Adopt the find optimizations only when there is something to search for and no leading anchor.
    // With anchors present it is better to let the DFA quickly determine whether there is a match.
    if (code.FindOptimizations.FindMode != FindNextStartingPositionMode.NoSearch &&
        code.FindOptimizations.LeadingAnchor == 0)
    {
        _findOpts = code.FindOptimizations;
    }

    // Without anchors only the default previous character kind 0 is ever used for an initial state,
    // so a single state per pattern suffices; otherwise one state per character kind is created.
    int statesCount = _pattern._info.ContainsSomeAnchor ? CharKind.CharKindCount : 1;

    // Initial states of the original pattern.
    var patternStates = new DfaMatchingState<TSetType>[statesCount];
    for (uint kind = 0; kind < patternStates.Length; kind++)
    {
        patternStates[kind] = _builder.MkState(_pattern, kind);
    }

    _initialStates = patternStates;

    // Initial states of the dot-starred pattern, i.e. any* concatenated with the original pattern.
    _dotStarredPattern = _builder.MkConcat(_builder._anyStar, _pattern);
    var dotStarredStates = new DfaMatchingState<TSetType>[statesCount];
    for (uint kind = 0; kind < dotStarredStates.Length; kind++)
    {
        // The flag is used to detect that an initial state was reentered. Behavior from the state may
        // still depend on the previous input character (for example through the nullability of \b, \B
        // or a start-of-line anchor), which is why there may be several "versions" of the initial
        // state, though never more than statesCount of them.
        DfaMatchingState<TSetType> dotStarredState = _builder.MkState(_dotStarredPattern, kind);
        dotStarredState.IsInitialState = true;
        dotStarredStates[kind] = dotStarredState;
    }

    _dotstarredInitialStates = dotStarredStates;

    // Initial states of the reverse pattern, i.e. the original pattern in reverse order.
    _reversePattern = _pattern.Reverse();
    var reverseStates = new DfaMatchingState<TSetType>[statesCount];
    for (uint kind = 0; kind < reverseStates.Length; kind++)
    {
        reverseStates[kind] = _builder.MkState(_reversePattern, kind);
    }

    _reverseInitialStates = reverseStates;

    // The ASCII fast-lookup table of character kinds is only needed when the pattern contains
    // anchors; otherwise only a single kind is ever used.
    if (_pattern._info.ContainsSomeAnchor)
    {
        var kindTable = new uint[128];
        for (int c = 0; c < kindTable.Length; c++)
        {
            // Every character is tested against the word-letter predicate, except '\n',
            // which is tested against the new-line predicate instead.
            TSetType kindPredicate = _builder._wordLetterPredicateForAnchors;
            uint kindIfMember = CharKind.WordLetter;
            if (c == '\n')
            {
                kindPredicate = _builder._newLinePredicate;
                kindIfMember = CharKind.Newline;
            }

            // An empty intersection of the character's minterm with the predicate yields kind 0.
            var overlap = _builder._solver.And(GetMinterm(c), kindPredicate);
            kindTable[c] = overlap.Equals(_builder._solver.False) ? 0 : kindIfMember;
        }

        _asciiCharKinds = kindTable;
    }
}