/// <summary>Create a classifier that maps a character to the ID of its associated minterm.</summary>
/// <param name="minterms">
/// The minterm BDDs. The index of each BDD in the array is used as the ID of that minterm.
/// Must contain at least one element.
/// </param>
public MintermClassifier(BDD[] minterms)
{
    Debug.Assert(minterms.Length > 0, "Requires at least one minterm");

    CharSetSolver solver = CharSetSolver.Instance;

    if (minterms.Length == 1)
    {
        // With only a single minterm, the mapping is trivial: everything maps to it (ID 0).
        // For ASCII, use an array containing all zeros. For non-ASCII, use a BDD that maps everything to 0.
        _ascii = AllAsciiIsZeroMintermArray;
        _nonAscii = solver.ReplaceTrue(BDD.True, 0);
        return;
    }

    // Create a multi-terminal BDD for mapping any character to its associated minterm.
    BDD anyCharacterToMintermId = BDD.False;
    for (int i = 0; i < minterms.Length; i++)
    {
        // Each supplied minterm BDD decides whether a given character maps to it or not.
        // We need to combine all of those into a multi-terminal BDD that decides which
        // minterm a character maps to. To do that, we take each minterm BDD and replace
        // its True result with the ID of the minterm, such that a character that would
        // have returned True for that BDD now returns the minterm ID.
        BDD charToTargetMintermId = solver.ReplaceTrue(minterms[i], i);

        // Now union this BDD with the multi-terminal BDD we've built up thus far. Unioning
        // is valid because every character belongs to exactly one minterm and thus will
        // only map to an ID instead of False in exactly one of the input BDDs.
        anyCharacterToMintermId = solver.Or(anyCharacterToMintermId, charToTargetMintermId);
    }

    // Now that we have our mapping that supports any input character, we want to optimize for
    // ASCII inputs. Rather than forcing every input ASCII character to consult the BDD at match
    // time, we precompute a lookup table, where each ASCII character can be used to index into the
    // array to determine the ID for its corresponding minterm.
    var ascii = new int[128];
    for (int i = 0; i < ascii.Length; i++)
    {
        ascii[i] = anyCharacterToMintermId.Find(i);
    }
    _ascii = ascii;

    // We can also further optimize the BDD in two ways:
    // 1. We can now remove the ASCII characters from it, as we'll always consult the lookup table first
    //    for ASCII inputs and thus will never use the BDD for them. While optional (skipping this step will not
    //    affect correctness), removing the ASCII values from the BDD reduces the size of the multi-terminal BDD.
    // 2. We can check if every character now maps to the same minterm ID (the same terminal in the
    //    multi-terminal BDD). This can be relatively common after (1) above is applied, as many
    //    patterns don't distinguish between any non-ASCII characters (e.g. "[0-9]*"). If every character
    //    in the BDD now maps to the same minterm, we can replace the BDD with a much simpler/faster/smaller one.
    BDD nonAsciiBDD = solver.And(anyCharacterToMintermId, solver._nonAscii);
    nonAsciiBDD = nonAsciiBDD.IsEssentiallyBoolean(out BDD? singleTerminalBDD) ? singleTerminalBDD : nonAsciiBDD;
    _nonAscii = nonAsciiBDD;
}
/// <summary>Gets a <see cref="BDD"/> that represents the \w character class.</summary>
/// <remarks>
/// \w is the union of the 8 Unicode categories with values 0,1,2,3,4,5,8,18. The result is cached in a
/// static field after the first computation; if several threads race, the first published value wins.
/// </remarks>
public static BDD WordLetter(CharSetSolver solver)
{
    // Fast path: the set has already been built and published.
    if (s_wordLetter is BDD cached)
    {
        return cached;
    }

    // Slow path: build the union of all categories that make up \w.
    var categorySets = new[]
    {
        GetCategory(UnicodeCategory.UppercaseLetter),
        GetCategory(UnicodeCategory.LowercaseLetter),
        GetCategory(UnicodeCategory.TitlecaseLetter),
        GetCategory(UnicodeCategory.ModifierLetter),
        GetCategory(UnicodeCategory.OtherLetter),
        GetCategory(UnicodeCategory.NonSpacingMark),
        GetCategory(UnicodeCategory.DecimalDigitNumber),
        GetCategory(UnicodeCategory.ConnectorPunctuation),
    };
    BDD union = solver.Or(categorySets);

    // Publish only if nobody else has yet. CompareExchange returns the previous field value:
    // non-null means another thread won, null means our value was stored and is read back from the field.
    return Interlocked.CompareExchange(ref s_wordLetter, union, null) ?? s_wordLetter;
}
/// <summary>Creates a sampler for the given pattern root.</summary>
/// <param name="root">The root node of the pattern to sample from.</param>
/// <param name="randomseed">Seed for the random number generator; 0 means "no seed", in which case one is picked at random.</param>
/// <param name="negative">When true, the sampler works on the negation of <paramref name="root"/> instead of the root itself.</param>
public SymbolicRegexSampler(SymbolicRegexNode<TSet> root, int randomseed, bool negative)
{
    if (negative)
    {
        _root = root._builder.Not(root);
    }
    else
    {
        _root = root;
    }

    // A seed of 0 is treated as "no seed given": draw a fresh one so that it can still be reported via RandomSeed.
    if (randomseed == 0)
    {
        RandomSeed = new Random().Next();
    }
    else
    {
        RandomSeed = randomseed;
    }
    _random = new Random(RandomSeed);

    _solver = root._builder._solver;
    _charSetSolver = new CharSetSolver();

    // ASCII word characters: [A-Za-z_0-9].
    BDD upperCase = _charSetSolver.CreateSetFromRange('A', 'Z');
    BDD lowerCase = _charSetSolver.CreateSetFromRange('a', 'z');
    BDD underscore = _charSetSolver.CreateFromChar('_');
    BDD digits = _charSetSolver.CreateSetFromRange('0', '9');
    _asciiWordCharacters = _charSetSolver.Or(new BDD[] { upperCase, lowerCase, underscore, digits });

    // Visible ASCII range for input character generation
    _ascii = _charSetSolver.CreateSetFromRange('\x20', '\x7E');

    // Visible ASCII characters that are not word characters.
    BDD notWordCharacters = _charSetSolver.Not(_asciiWordCharacters);
    _asciiNonWordCharacters = _charSetSolver.And(_ascii, notWordCharacters);
}
/// <summary>Creates a sampler for the given pattern root.</summary>
/// <param name="root">The root node of the pattern to sample from.</param>
/// <param name="randomseed">Seed for the random number generator; 0 means "no seed", in which case one is picked at random.</param>
/// <param name="negative">When true, the sampler works on the negation of <paramref name="root"/> instead of the root itself.</param>
public SymbolicRegexSampler(SymbolicRegexNode<S> root, int randomseed, bool negative)
{
    if (negative)
    {
        _root = root._builder.Not(root);
    }
    else
    {
        _root = root;
    }

    // A seed of 0 is treated as "no seed given": draw a fresh one so that it can still be reported via RandomSeed.
    if (randomseed == 0)
    {
        RandomSeed = new Random().Next();
    }
    else
    {
        RandomSeed = randomseed;
    }
    _random = new Random(RandomSeed);

    _solver = root._builder._solver;

    CharSetSolver charSolver = CharSetSolver.Instance;

    // ASCII word characters: [A-Za-z_0-9].
    BDD upperCase = charSolver.RangeConstraint('A', 'Z');
    BDD lowerCase = charSolver.RangeConstraint('a', 'z');
    BDD underscore = charSolver.CharConstraint('_');
    BDD digits = charSolver.RangeConstraint('0', '9');
    _asciiWordCharacters = charSolver.Or(new BDD[] { upperCase, lowerCase, underscore, digits });

    // Visible ASCII range for input character generation
    _ascii = charSolver.RangeConstraint('\x20', '\x7E');

    // Visible ASCII characters that are not word characters.
    BDD notWordCharacters = charSolver.Not(_asciiWordCharacters);
    _asciiNonWordCharacters = charSolver.And(_ascii, notWordCharacters);
}
/// <summary>
/// Converts a bit-set of minterm indices into the <see cref="BDD"/> that is the union of the selected minterms.
/// </summary>
/// <param name="set">Bit i selects the i'th minterm.</param>
/// <param name="solver">The solver used to union the selected minterm BDDs.</param>
/// <returns><see cref="BDD.False"/> when no bit is set; otherwise the union of the selected minterms.</returns>
public BDD ConvertToBDD(ulong set, CharSetSolver solver)
{
    BDD[] minterms = _minterms;
    BDD union = BDD.False;

    // Nothing selected: the union is empty.
    if (set == 0)
    {
        return union;
    }

    for (int bit = 0; bit < minterms.Length; bit++)
    {
        // NOTE: C# masks the shift count of a ulong to its low 6 bits, so this test is only
        // meaningful for the first 64 minterms.
        ulong mask = 1UL << bit;
        if ((set & mask) == 0)
        {
            continue;
        }

        // The bit for this minterm is set, so include it in the union.
        union = solver.Or(union, minterms[bit]);
    }

    return union;
}
/// <summary>
/// Converts a bit vector of minterm indices into the <see cref="BDD"/> that is the union of the selected minterms.
/// </summary>
/// <param name="set">Bit i selects the i'th minterm.</param>
/// <param name="solver">The solver that supplies the empty set and is used to union the selected minterm BDDs.</param>
/// <returns>The solver's empty set when <paramref name="set"/> equals Empty; otherwise the union of the selected minterms.</returns>
public BDD ConvertToBDD(BitVector set, CharSetSolver solver)
{
    BDD[] minterms = _minterms;
    BDD union = solver.Empty;

    // Nothing selected: the union is empty.
    if (set.Equals(Empty))
    {
        return union;
    }

    for (int bit = 0; bit < minterms.Length; bit++)
    {
        if (!set[bit])
        {
            continue;
        }

        // The bit for this minterm is set, so include it in the union.
        union = solver.Or(union, minterms[bit]);
    }

    return union;
}
/// <summary>
/// Lazily generates random inputs by walking the NFA state sets of the pattern, one randomly chosen
/// minterm at a time, and remembering the most recent prefix (plus a suitable ending) that was nullable.
/// </summary>
/// <param name="k">The number of generation attempts. Each attempt yields at most one string.</param>
/// <param name="randomseed">Seed for the random number generator; 0 means use a system-provided seed.</param>
/// <returns>The generated samples, produced on demand as the sequence is enumerated.</returns>
public override IEnumerable<string> SampleMatches(int k, int randomseed)
{
    // Zero is treated as no seed, instead using a system provided one
    Random rng;
    if (randomseed != 0)
    {
        rng = new Random(randomseed);
    }
    else
    {
        rng = new Random();
    }

    ISolver<TSet> solver = _builder._solver;
    CharSetSolver charSetSolver = _builder._charSetSolver;

    // Helper BDDs for handling anchors and for preferring ASCII characters in the generated inputs.
    BDD upperCase = charSetSolver.CreateBDDFromRange('A', 'Z');
    BDD lowerCase = charSetSolver.CreateBDDFromRange('a', 'z');
    BDD underscore = charSetSolver.CreateBDDFromChar('_');
    BDD digits = charSetSolver.CreateBDDFromRange('0', '9');
    BDD asciiWord = charSetSolver.Or(new BDD[] { upperCase, lowerCase, underscore, digits });

    // Visible ASCII range for input character generation
    BDD asciiVisible = charSetSolver.CreateBDDFromRange('\x20', '\x7E');
    BDD asciiNonWord = charSetSolver.And(asciiVisible, charSetSolver.Not(asciiWord));

    // Two identity lists of minterm IDs: one covering just the minterms, and one with a single extra ID
    // (equal to the minterm count) standing for the special last end-of-line minterm.
    Debug.Assert(_builder._minterms is not null);
    int mintermCount = _builder._minterms.Length;
    int[] idsExcludingZ = new int[mintermCount];
    for (int id = 0; id < idsExcludingZ.Length; id++)
    {
        idsExcludingZ[id] = id;
    }
    int[] idsIncludingZ = new int[mintermCount + 1];
    for (int id = 0; id < idsIncludingZ.Length; id++)
    {
        idsIncludingZ[id] = id;
    }

    for (int attempt = 0; attempt < k; attempt++)
    {
        // Holds the generated input so far
        StringBuilder inputSoFar = new StringBuilder();

        // The most recent input (prefix plus ending) that was found to be acceptable, if any.
        StringBuilder? latestCandidate = null;

        // The current set of reached states, initialized from the initial state for the starting context.
        // Here one could also consider previous characters, for example for \b, \B, and ^ anchors,
        // and initialize inputSoFar accordingly.
        NfaMatchingState states = new NfaMatchingState(_builder);
        states.InitializeFrom(_initialStates[GetCharKind(ReadOnlySpan<char>.Empty, -1)]);
        CurrentState cursor = new CurrentState(states);

        // Candidate suffixes that would make the current prefix acceptable.
        List<string> endings = new List<string>();

        while (true)
        {
            Debug.Assert(states.NfaStateSet.Count > 0);

            // Gather the possible endings for satisfying nullability
            endings.Clear();
            if (NfaStateHandler.CanBeNullable(ref cursor))
            {
                // Unconditionally final state or end of the input due to \Z anchor for example
                bool nullableAtEnd =
                    NfaStateHandler.IsNullable(ref cursor) ||
                    NfaStateHandler.IsNullableFor(_builder, ref cursor, CharKind.BeginningEnd);
                if (nullableAtEnd)
                {
                    endings.Add("");
                }

                // End of line due to end-of-line anchor
                if (NfaStateHandler.IsNullableFor(_builder, ref cursor, CharKind.Newline))
                {
                    endings.Add("\n");
                }

                // Word border (\b or \B) where the next character is a word character
                if (NfaStateHandler.IsNullableFor(_builder, ref cursor, CharKind.WordLetter))
                {
                    endings.Add(ChooseChar(rng, asciiWord, asciiVisible, charSetSolver).ToString());
                }

                // Word border (\b or \B) where the next character is a non-word character
                if (NfaStateHandler.IsNullableFor(_builder, ref cursor, CharKind.General))
                {
                    endings.Add(ChooseChar(rng, asciiNonWord, asciiVisible, charSetSolver).ToString());
                }
            }

            // If we have a possible ending, then store a candidate input
            if (endings.Count > 0)
            {
                latestCandidate ??= new StringBuilder();
                latestCandidate.Clear();
                latestCandidate.Append(inputSoFar);

                // Choose some suffix that allows some anchor (if any) to be nullable
                latestCandidate.Append(Choose(rng, endings));

                // Choose to stop here based on a coin-toss
                if (FlipBiasedCoin(rng, SampleMatchesStoppingProbability))
                {
                    yield return latestCandidate.ToString();
                    break;
                }
            }

            // Shuffle the minterms, including the last end-of-line marker if appropriate
            int[] shuffledIds;
            if (NfaStateHandler.StartsWithLineAnchor(_builder, ref cursor))
            {
                shuffledIds = Shuffle(rng, idsIncludingZ);
            }
            else
            {
                shuffledIds = Shuffle(rng, idsExcludingZ);
            }

            // Try the minterms in shuffled order until one leads to a non-empty set of states.
            for (int pos = 0; pos < shuffledIds.Length; pos++)
            {
                int candidateId = shuffledIds[pos];
                bool taken = NfaStateHandler.TakeTransition(_builder, ref cursor, candidateId);
                Debug.Assert(taken);

                if (states.NfaStateSet.Count == 0)
                {
                    // The transition was a dead end, undo and continue to try another minterm
                    NfaStateHandler.UndoTransition(ref cursor);
                    continue;
                }

                // Append a random member of the minterm
                TSet minterm = _builder.GetMinterm(candidateId);
                inputSoFar.Append(ChooseChar(rng, ToBDD(minterm, solver, charSetSolver), asciiVisible, charSetSolver));
                break;
            }

            // In the case that there are no next states or input has become too large: stop here
            bool noNextStates = states.NfaStateSet.Count == 0;
            if (noNextStates || inputSoFar.Length > SampleMatchesMaxInputLength)
            {
                // Ending up here without an ending is unlikely but possible, for example for infeasible patterns
                // such as @"no\bway", or due to a poor choice of character -- no anchor is enabled -- so this is
                // a dead end and nothing is produced for this attempt.
                if (latestCandidate != null)
                {
                    yield return latestCandidate.ToString();
                }
                break;
            }
        }
    }
}
/// <summary>
/// Gets a <see cref="BDD"/> that represents <see cref="WordLetter"/> together with the characters
/// \u200C (zero width non joiner) and \u200D (zero width joiner) that are treated as if they were
/// word characters in the context of the anchors \b and \B.
/// </summary>
/// <remarks>
/// The result is cached in a static field after the first computation; if several threads race,
/// the first published value wins.
/// </remarks>
public static BDD WordLetterForAnchors(CharSetSolver solver)
{
    // Fast path: the set has already been built and published.
    if (s_wordLetterForAnchors is BDD cached)
    {
        return cached;
    }

    // Slow path: \w plus the two joiner characters.
    BDD wordLetters = WordLetter(solver);
    BDD joiners = solver.CreateBDDFromRange('\u200C', '\u200D');
    BDD union = solver.Or(wordLetters, joiners);

    // Publish only if nobody else has yet. CompareExchange returns the previous field value:
    // non-null means another thread won, null means our value was stored and is read back from the field.
    return Interlocked.CompareExchange(ref s_wordLetterForAnchors, union, null) ?? s_wordLetterForAnchors;
}