Beispiel #1
0
        /// <summary>Create a classifier that maps a character to the ID of its associated minterm.</summary>
        /// <param name="minterms">
        /// The minterm BDDs used to classify all characters (ASCII and non-ASCII). The index of each BDD
        /// in the array is used as the ID of that minterm. Must contain at least one element.
        /// </param>
        public MintermClassifier(BDD[] minterms)
        {
            Debug.Assert(minterms.Length > 0, "Requires at least one minterm");

            CharSetSolver solver = CharSetSolver.Instance;

            if (minterms.Length == 1)
            {
                // With only a single minterm, the mapping is trivial: everything maps to it (ID 0).
                // For ASCII, use an array containing all zeros.  For non-ASCII, use a BDD that maps everything to 0.
                _ascii    = AllAsciiIsZeroMintermArray;
                _nonAscii = solver.ReplaceTrue(BDD.True, 0);
                return;
            }

            // Create a multi-terminal BDD for mapping any character to its associated minterm.
            BDD anyCharacterToMintermId = BDD.False;

            for (int i = 0; i < minterms.Length; i++)
            {
                // Each supplied minterm BDD decides whether a given character maps to it or not.
                // We need to combine all of those into a multi-terminal BDD that decides which
                // minterm a character maps to.  To do that, we take each minterm BDD and replace
                // its True result with the ID of the minterm, such that a character that would
                // have returned True for that BDD now returns the minterm ID.
                BDD charToTargetMintermId = solver.ReplaceTrue(minterms[i], i);

                // Now union this BDD with the multi-terminal BDD we've built up thus far. Unioning
                // is valid because every character belongs to exactly one minterm and thus will
                // only map to an ID instead of False in exactly one of the input BDDs.
                anyCharacterToMintermId = solver.Or(anyCharacterToMintermId, charToTargetMintermId);
            }

            // Now that we have our mapping that supports any input character, we want to optimize for
            // ASCII inputs.  Rather than forcing every input ASCII character to consult the BDD at match
            // time, we precompute a lookup table, where each ASCII character can be used to index into the
            // array to determine the ID for its corresponding minterm.
            var ascii = new int[128];

            for (int i = 0; i < ascii.Length; i++)
            {
                ascii[i] = anyCharacterToMintermId.Find(i);
            }
            _ascii = ascii;

            // We can also further optimize the BDD in two ways:
            // 1. We can now remove the ASCII characters from it, as we'll always consult the lookup table first
            //    for ASCII inputs and thus will never use the BDD for them.  While optional (skipping this step will not
            //    affect correctness), removing the ASCII values from the BDD reduces the size of the multi-terminal BDD.
            // 2. We can check if every character now maps to the same minterm ID (the same terminal in the
            //    multi-terminal BDD).  This can be relatively common after (1) above is applied, as many
            //    patterns don't distinguish between any non-ASCII characters (e.g. "[0-9]*").  If every character
            //    in the BDD now maps to the same minterm, we can replace the BDD with a much simpler/faster/smaller one.
            BDD nonAsciiBDD = solver.And(anyCharacterToMintermId, solver._nonAscii);

            // When the restricted BDD is essentially boolean, keep the simpler single-terminal BDD it reports instead.
            nonAsciiBDD = nonAsciiBDD.IsEssentiallyBoolean(out BDD? singleTerminalBDD) ? singleTerminalBDD : nonAsciiBDD;
            _nonAscii   = nonAsciiBDD;
        }
 /// <summary>
 /// Returns the cached <see cref="BDD"/> that represents the \w character class, building it with
 /// <paramref name="solver"/> if it has not been published yet.
 /// </summary>
 /// <remarks>\w is the union of the 8 categories: 0,1,2,3,4,5,8,18</remarks>
 public static BDD WordLetter(CharSetSolver solver)
 {
     // Fast path: the set has already been built and stored.
     if (s_wordLetter is BDD cached)
     {
         return cached;
     }

     var categories = new[]
     {
         GetCategory(UnicodeCategory.UppercaseLetter),
         GetCategory(UnicodeCategory.LowercaseLetter),
         GetCategory(UnicodeCategory.TitlecaseLetter),
         GetCategory(UnicodeCategory.ModifierLetter),
         GetCategory(UnicodeCategory.OtherLetter),
         GetCategory(UnicodeCategory.NonSpacingMark),
         GetCategory(UnicodeCategory.DecimalDigitNumber),
         GetCategory(UnicodeCategory.ConnectorPunctuation),
     };
     BDD union = solver.Or(categories);

     // Store our result only if the field is still null; if another caller stored one first,
     // CompareExchange hands back that instance and it is returned instead.
     return Interlocked.CompareExchange(ref s_wordLetter, union, null) ?? s_wordLetter;
 }
Beispiel #3
0
 /// <summary>Initializes a sampler for <paramref name="root"/> and prepares the helper ASCII character sets.</summary>
 /// <param name="root">The node to sample from.</param>
 /// <param name="randomseed">Seed for the random number generator; 0 means "pick a seed at random".</param>
 /// <param name="negative">When true, the sampler works on the result of the builder's Not applied to <paramref name="root"/>.</param>
 public SymbolicRegexSampler(SymbolicRegexNode <TSet> root, int randomseed, bool negative)
 {
     if (negative)
     {
         _root = root._builder.Not(root);
     }
     else
     {
         _root = root;
     }

     // A seed of 0 means no seed was supplied, so draw one from a default-constructed Random.
     int seed = randomseed;
     if (seed == 0)
     {
         seed = new Random().Next();
     }
     RandomSeed = seed;
     _random = new Random(RandomSeed);

     _solver = root._builder._solver;
     _charSetSolver = new CharSetSolver();

     // ASCII word characters: letters, underscore and digits.
     BDD upperCase = _charSetSolver.CreateSetFromRange('A', 'Z');
     BDD lowerCase = _charSetSolver.CreateSetFromRange('a', 'z');
     BDD underscore = _charSetSolver.CreateFromChar('_');
     BDD digits = _charSetSolver.CreateSetFromRange('0', '9');
     _asciiWordCharacters = _charSetSolver.Or(new BDD[] { upperCase, lowerCase, underscore, digits });

     // Visible ASCII range (0x20 through 0x7E) used for input character generation.
     _ascii = _charSetSolver.CreateSetFromRange('\x20', '\x7E');

     // Visible ASCII characters that are not word characters.
     BDD notWordCharacters = _charSetSolver.Not(_asciiWordCharacters);
     _asciiNonWordCharacters = _charSetSolver.And(_ascii, notWordCharacters);
 }
        /// <summary>Initializes a sampler for <paramref name="root"/> using the shared <see cref="CharSetSolver"/> instance.</summary>
        /// <param name="root">The node to sample from.</param>
        /// <param name="randomseed">Seed for the random number generator; 0 means "pick a seed at random".</param>
        /// <param name="negative">When true, the sampler works on the result of the builder's Not applied to <paramref name="root"/>.</param>
        public SymbolicRegexSampler(SymbolicRegexNode <S> root, int randomseed, bool negative)
        {
            if (negative)
            {
                _root = root._builder.Not(root);
            }
            else
            {
                _root = root;
            }

            // Zero stands for "no seed given"; in that case a seed is drawn from a default-constructed Random.
            bool seedSupplied = randomseed != 0;
            RandomSeed = seedSupplied ? randomseed : new Random().Next();
            _random = new Random(RandomSeed);
            _solver = root._builder._solver;

            CharSetSolver charSolver = CharSetSolver.Instance;

            // Word characters restricted to ASCII: A-Z, a-z, '_' and 0-9.
            var wordParts = new BDD[]
            {
                charSolver.RangeConstraint('A', 'Z'),
                charSolver.RangeConstraint('a', 'z'),
                charSolver.CharConstraint('_'),
                charSolver.RangeConstraint('0', '9'),
            };
            _asciiWordCharacters = charSolver.Or(wordParts);

            // Visible ASCII range (0x20 through 0x7E) used for input character generation.
            _ascii = charSolver.RangeConstraint('\x20', '\x7E');

            // Everything in the visible ASCII range that is not a word character.
            BDD nonWord = charSolver.Not(_asciiWordCharacters);
            _asciiNonWordCharacters = charSolver.And(_ascii, nonWord);
        }
Beispiel #5
0
        /// <summary>Builds the BDD that is the union of the minterms selected by the bits of <paramref name="set"/>.</summary>
        /// <param name="set">Bit mask in which a set i'th bit selects the i'th minterm.</param>
        /// <param name="solver">Solver used to union the selected minterm BDDs.</param>
        /// <returns>The accumulated union, which starts from <see cref="BDD.False"/>.</returns>
        public BDD ConvertToBDD(ulong set, CharSetSolver solver)
        {
            BDD[] minterms = _minterms;
            BDD union = BDD.False;

            // Nothing is selected: the union stays at its starting value.
            if (set == 0)
            {
                return union;
            }

            int bit = 0;
            foreach (BDD minterm in minterms)
            {
                // Test the bit that corresponds to this minterm's position.
                if (((set >> bit) & 1UL) != 0)
                {
                    union = solver.Or(union, minterm);
                }

                bit++;
            }

            return union;
        }
Beispiel #6
0
        /// <summary>Builds the BDD that is the union of the minterms selected by <paramref name="set"/>.</summary>
        /// <param name="set">Bit vector in which a set i'th bit selects the i'th minterm.</param>
        /// <param name="solver">Solver that supplies the starting empty BDD and unions the selected minterm BDDs.</param>
        /// <returns>The accumulated union, which starts from the solver's Empty BDD.</returns>
        public BDD ConvertToBDD(BitVector set, CharSetSolver solver)
        {
            BDD[] minterms = _minterms;
            BDD union = solver.Empty;

            // An empty bit vector selects no minterms, so the starting value is the answer.
            if (set.Equals(Empty))
            {
                return union;
            }

            int index = 0;
            while (index < minterms.Length)
            {
                // Fold in the minterm at this position when its bit is set.
                if (set[index])
                {
                    union = solver.Or(union, minterms[index]);
                }

                index++;
            }

            return union;
        }
Beispiel #7
0
        /// <summary>
        /// Lazily generates sample input strings by repeatedly taking transitions on randomly ordered minterms
        /// and appending a random character of the chosen minterm. Each of the <paramref name="k"/> attempts
        /// yields at most one string; an attempt that never found a possible ending yields nothing.
        /// </summary>
        /// <param name="k">The number of sampling attempts to make.</param>
        /// <param name="randomseed">Seed for the random number generator; 0 means a default-constructed <see cref="Random"/> is used.</param>
        /// <returns>The generated sample strings, produced one at a time as the sequence is enumerated.</returns>
        public override IEnumerable <string> SampleMatches(int k, int randomseed)
        {
            // Zero is treated as no seed, instead using a system provided one
            Random random = randomseed != 0 ? new Random(randomseed) : new Random();

            ISolver <TSet> solver        = _builder._solver;
            CharSetSolver  charSetSolver = _builder._charSetSolver;

            // Create helper BDDs for handling anchors and preferentially generating ASCII inputs
            BDD asciiWordCharacters = charSetSolver.Or(new BDD[] {
                charSetSolver.CreateBDDFromRange('A', 'Z'),
                charSetSolver.CreateBDDFromRange('a', 'z'),
                charSetSolver.CreateBDDFromChar('_'),
                charSetSolver.CreateBDDFromRange('0', '9')
            });
            // Visible ASCII range for input character generation
            BDD ascii = charSetSolver.CreateBDDFromRange('\x20', '\x7E');
            BDD asciiNonWordCharacters = charSetSolver.And(ascii, charSetSolver.Not(asciiWordCharacters));

            // Set up two sets of minterms, one with the additional special minterm for the last end-of-line
            Debug.Assert(_builder._minterms is not null);
            int[] mintermIdsWithoutZ = new int[_builder._minterms.Length];
            int[] mintermIdsWithZ    = new int[_builder._minterms.Length + 1];
            for (int i = 0; i < _builder._minterms.Length; ++i)
            {
                mintermIdsWithoutZ[i] = i;
                mintermIdsWithZ[i]    = i;
            }
            // The extra trailing ID (one past the regular minterm IDs) is the special end-of-line minterm
            mintermIdsWithZ[_builder._minterms.Length] = _builder._minterms.Length;

            // Each iteration is one independent sampling attempt that yields at most one string
            for (int i = 0; i < k; i++)
            {
                // Holds the generated input so far
                StringBuilder inputSoFar      = new();
                // Most recent input-plus-ending that could be returned; stays null until an ending is found
                StringBuilder?latestCandidate = null;

                // Current set of states reached initially contains just the root
                NfaMatchingState states = new(_builder);
                // Here one could also consider previous characters for example for \b, \B, and ^ anchors
                // and initialize inputSoFar accordingly
                states.InitializeFrom(_initialStates[GetCharKind(ReadOnlySpan <char> .Empty, -1)]);
                CurrentState statesWrapper = new(states);

                // Used for end suffixes
                List <string> possibleEndings = new();

                while (true)
                {
                    Debug.Assert(states.NfaStateSet.Count > 0);

                    // Gather the possible endings for satisfying nullability
                    possibleEndings.Clear();
                    if (NfaStateHandler.CanBeNullable(ref statesWrapper))
                    {
                        // Unconditionally final state or end of the input due to \Z anchor for example
                        if (NfaStateHandler.IsNullable(ref statesWrapper) ||
                            NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.BeginningEnd))
                        {
                            possibleEndings.Add("");
                        }

                        // End of line due to end-of-line anchor
                        if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.Newline))
                        {
                            possibleEndings.Add("\n");
                        }

                        // Related to wordborder due to \b or \B
                        if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.WordLetter))
                        {
                            possibleEndings.Add(ChooseChar(random, asciiWordCharacters, ascii, charSetSolver).ToString());
                        }

                        // Related to wordborder due to \b or \B
                        if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.General))
                        {
                            possibleEndings.Add(ChooseChar(random, asciiNonWordCharacters, ascii, charSetSolver).ToString());
                        }
                    }

                    // If we have a possible ending, then store a candidate input
                    if (possibleEndings.Count > 0)
                    {
                        latestCandidate ??= new();
                        latestCandidate.Clear();
                        latestCandidate.Append(inputSoFar);
                        // Choose some suffix that allows some anchor (if any) to be nullable
                        latestCandidate.Append(Choose(random, possibleEndings));

                        // Choose to stop here based on a coin-toss
                        if (FlipBiasedCoin(random, SampleMatchesStoppingProbability))
                        {
                            yield return(latestCandidate.ToString());

                            break;
                        }
                    }

                    // Shuffle the minterms, including the last end-of-line marker if appropriate
                    int[] mintermIds = NfaStateHandler.StartsWithLineAnchor(_builder, ref statesWrapper) ?
                                       Shuffle(random, mintermIdsWithZ) :
                                       Shuffle(random, mintermIdsWithoutZ);
                    // Try the minterms in shuffled order and keep the first one that leaves a non-empty state set
                    foreach (int mintermId in mintermIds)
                    {
                        bool success = NfaStateHandler.TakeTransition(_builder, ref statesWrapper, mintermId);
                        Debug.Assert(success);
                        if (states.NfaStateSet.Count > 0)
                        {
                            TSet minterm = _builder.GetMinterm(mintermId);
                            // Append a random member of the minterm
                            inputSoFar.Append(ChooseChar(random, ToBDD(minterm, solver, charSetSolver), ascii, charSetSolver));
                            break;
                        }
                        else
                        {
                            // The transition was a dead end, undo and continue to try another minterm
                            NfaStateHandler.UndoTransition(ref statesWrapper);
                        }
                    }

                    // In the case that there are no next states or input has become too large: stop here
                    if (states.NfaStateSet.Count == 0 || inputSoFar.Length > SampleMatchesMaxInputLength)
                    {
                        // Ending up here without an ending is unlikely but possible for example for infeasible patterns
                        // such as @"no\bway" or due to poor choice of c -- no anchor is enabled -- so this is a deadend.
                        if (latestCandidate != null)
                        {
                            yield return(latestCandidate.ToString());
                        }
                        break;
                    }
                }
            }
 /// <summary>
 /// Returns the cached <see cref="BDD"/> that represents <see cref="WordLetter"/> together with the characters
 /// \u200C (zero width non joiner) and \u200D (zero width joiner), which are treated as if they were
 /// word characters in the context of the anchors \b and \B. The set is built on first use.
 /// </summary>
 public static BDD WordLetterForAnchors(CharSetSolver solver)
 {
     // Fast path: the set has already been built and stored.
     if (s_wordLetterForAnchors is BDD cached)
     {
         return cached;
     }

     BDD wordLetters = WordLetter(solver);
     BDD joiners = solver.CreateBDDFromRange('\u200C', '\u200D');
     BDD combined = solver.Or(wordLetters, joiners);

     // Store our result only if the field is still null; if another caller stored one first,
     // CompareExchange hands back that instance and it is returned instead.
     return Interlocked.CompareExchange(ref s_wordLetterForAnchors, combined, null) ?? s_wordLetterForAnchors;
 }