Example #1
0
        /// <summary>Builds a Boolean classifier from the set of characters that classify as true.</summary>
        /// <param name="solver">Character algebra used during construction only; it is not retained by the classifier.</param>
        /// <param name="bdd">The elements that map to true.</param>
        public BooleanClassifier(CharSetSolver solver, BDD bdd)
        {
            // ASCII is the case we optimize for: evaluate the BDD once per ASCII value up front
            // so that classifying those characters at match time is a plain table lookup.
            bool[] asciiLookup = new bool[128];

            int value = 0;
            while (value < asciiLookup.Length)
            {
                asciiLookup[value] = bdd.Contains(value);
                value++;
            }

            // The table now answers every ASCII query, so the BDD is only needed for non-ASCII
            // input.  Intersecting it with the non-ASCII set usually shrinks it and speeds up
            // those queries, but not always: stripping characters can make the branching more
            // complicated.  The extreme case is a full BDD (everything maps to true), which is
            // already as simple as a BDD gets, so it is kept untouched.  Other trivial inputs,
            // such as an empty BDD, are left for And to deal with.
            BDD remainder = bdd.IsFull ? bdd : solver.And(solver._nonAscii, bdd);

            _ascii    = asciiLookup;
            _nonAscii = remainder;
        }
Example #2
0
        /// <summary>Create a classifier that maps a character to the ID of its associated minterm.</summary>
        /// <param name="minterms">
        /// The minterm BDDs, one per minterm. The index of each BDD in the array is used as that minterm's ID.
        /// Must contain at least one element.
        /// </param>
        public MintermClassifier(BDD[] minterms)
        {
            Debug.Assert(minterms.Length > 0, "Requires at least one minterm");

            CharSetSolver solver = CharSetSolver.Instance;

            if (minterms.Length == 1)
            {
                // With only a single minterm, the mapping is trivial: everything maps to it (ID 0).
                // For ASCII, use an array containing all zeros.  For non-ASCII, use a BDD that maps everything to 0.
                _ascii    = AllAsciiIsZeroMintermArray;
                _nonAscii = solver.ReplaceTrue(BDD.True, 0);
                return;
            }

            // Create a multi-terminal BDD for mapping any character to its associated minterm.
            BDD anyCharacterToMintermId = BDD.False;

            for (int i = 0; i < minterms.Length; i++)
            {
                // Each supplied minterm BDD decides whether a given character maps to it or not.
                // We need to combine all of those into a multi-terminal BDD that decides which
                // minterm a character maps to.  To do that, we take each minterm BDD and replace
                // its True result with the ID of the minterm, such that a character that would
                // have returned True for that BDD now returns the minterm ID.
                BDD charToTargetMintermId = solver.ReplaceTrue(minterms[i], i);

                // Now union this BDD with the multi-terminal BDD we've built up thus far. Unioning
                // is valid because every character belongs to exactly one minterm and thus will
                // only map to an ID instead of False in exactly one of the input BDDs.
                anyCharacterToMintermId = solver.Or(anyCharacterToMintermId, charToTargetMintermId);
            }

            // Now that we have our mapping that supports any input character, we want to optimize for
            // ASCII inputs.  Rather than forcing every input ASCII character to consult the BDD at match
            // time, we precompute a lookup table, where each ASCII character can be used to index into the
            // array to determine the ID for its corresponding minterm.
            var ascii = new int[128];

            for (int i = 0; i < ascii.Length; i++)
            {
                ascii[i] = anyCharacterToMintermId.Find(i);
            }
            _ascii = ascii;

            // We can also further optimize the BDD in two ways:
            // 1. We can now remove the ASCII characters from it, as we'll always consult the lookup table first
            //    for ASCII inputs and thus will never use the BDD for them.  While optional (skipping this step will not
            //    affect correctness), removing the ASCII values from the BDD reduces the size of the multi-terminal BDD.
            // 2. We can check if every character now maps to the same minterm ID (the same terminal in the
            //    multi-terminal BDD).  This can be relatively common after (1) above is applied, as many
            //    patterns don't distinguish between any non-ASCII characters (e.g. "[0-9]*").  If every character
            //    in the BDD now maps to the same minterm, we can replace the BDD with a much simpler/faster/smaller one.
            BDD nonAsciiBDD = solver.And(anyCharacterToMintermId, solver._nonAscii);

            nonAsciiBDD = nonAsciiBDD.IsEssentiallyBoolean(out BDD? singleTerminalBDD) ? singleTerminalBDD : nonAsciiBDD;
            _nonAscii   = nonAsciiBDD;
        }
Example #3
0
            // Returns a randomly chosen character drawn from bdd, favouring members that are
            // also in the supplied ascii set.  bdd must not be empty.
            static char ChooseChar(Random random, BDD bdd, BDD ascii, CharSetSolver charSetSolver)
            {
                Debug.Assert(!bdd.IsEmpty);

                // Restrict to the supplied ASCII set when that leaves anything to choose from;
                // otherwise fall back to the unrestricted set.
                BDD asciiOnly = charSetSolver.And(bdd, ascii);
                BDD candidates = asciiOnly.IsEmpty ? bdd : asciiOnly;

                // Choose hands back one (lower, upper) pair from the ranges computed for the set
                // (presumably picked at random).  Random.Next excludes its upper argument, hence
                // the + 1 to keep the upper bound of the range selectable.
                (uint lower, uint upper) = Choose(random, BDDRangeConverter.ToRanges(candidates));
                return (char)random.Next((int)lower, (int)upper + 1);
            }
Example #4
0
        /// <summary>
        /// Converts a BDD into a bit vector by OR-ing together the vectors of every minterm
        /// whose intersection with the BDD is non-empty.
        /// Assumes that set is a union of some minterms (or empty); when no minterm
        /// intersects it, Empty is returned.
        /// </summary>
        public BitVector ConvertFromBDD(BDD set, CharSetSolver solver)
        {
            BDD[] minterms = _minterms;

            BitVector accumulated = Empty;

            int index = 0;
            while (index < minterms.Length)
            {
                // A non-empty intersection means this minterm is part of the set,
                // so its vector is folded into the accumulated result.
                BDD overlap = solver.And(minterms[index], set);
                bool mintermInSet = !solver.IsEmpty(overlap);
                if (mintermInSet)
                {
                    accumulated = BitVector.Or(accumulated, _mintermVectors[index]);
                }

                index++;
            }

            return accumulated;
        }
Example #5
0
        /// <summary>
        /// Converts a BDD into a 64-bit mask in which bit i is set when the i'th minterm
        /// has a non-empty intersection with the BDD.
        /// Assumes that set is a union of some minterms (or empty); when no minterm
        /// intersects it, 0 is returned.
        /// </summary>
        public ulong ConvertFromBDD(BDD set, CharSetSolver solver)
        {
            BDD[] minterms = _minterms;

            ulong mask = 0;

            for (int bit = 0; bit < minterms.Length; bit++)
            {
                // Minterms that do not overlap the set contribute nothing.
                BDD overlap = solver.And(minterms[bit], set);
                if (solver.IsEmpty(overlap))
                {
                    continue;
                }

                // NOTE(review): a ulong shift only uses the low six bits of the count, so this
                // presumably relies on there being at most 64 minterms — confirm with callers.
                mask |= 1UL << bit;
            }

            return mask;
        }
Example #6
0
 /// <summary>Sets up a sampler for the given pattern along with the helper character sets it uses.</summary>
 /// <param name="root">The pattern to sample from.</param>
 /// <param name="randomseed">Seed for the random number generator; 0 means "pick a seed for me".</param>
 /// <param name="negative">When true, the sampler works from root._builder.Not(root) instead of root.</param>
 public SymbolicRegexSampler(SymbolicRegexNode <TSet> root, int randomseed, bool negative)
 {
     if (negative)
     {
         _root = root._builder.Not(root);
     }
     else
     {
         _root = root;
     }

     // A seed of 0 is not used as-is; a fresh seed is drawn instead and recorded in RandomSeed.
     int seed = randomseed;
     if (seed == 0)
     {
         seed = new Random().Next();
     }
     RandomSeed = seed;
     _random    = new Random(RandomSeed);

     _solver        = root._builder._solver;
     _charSetSolver = new CharSetSolver();

     // ASCII word characters: letters, underscore and digits.
     BDD upperCase  = _charSetSolver.CreateSetFromRange('A', 'Z');
     BDD lowerCase  = _charSetSolver.CreateSetFromRange('a', 'z');
     BDD underscore = _charSetSolver.CreateFromChar('_');
     BDD digits     = _charSetSolver.CreateSetFromRange('0', '9');
     _asciiWordCharacters = _charSetSolver.Or(new BDD[] { upperCase, lowerCase, underscore, digits });

     // Visible ASCII range (space through tilde), used for input character generation.
     _ascii = _charSetSolver.CreateSetFromRange('\x20', '\x7E');

     // Visible ASCII characters that are not word characters.
     BDD notWord = _charSetSolver.Not(_asciiWordCharacters);
     _asciiNonWordCharacters = _charSetSolver.And(_ascii, notWord);
 }
Example #7
0
        /// <summary>Sets up a sampler for the given pattern along with the helper character sets it uses.</summary>
        /// <param name="root">The pattern to sample from.</param>
        /// <param name="randomseed">Seed for the random number generator; 0 means "pick a seed for me".</param>
        /// <param name="negative">When true, the sampler works from root._builder.Not(root) instead of root.</param>
        public SymbolicRegexSampler(SymbolicRegexNode <S> root, int randomseed, bool negative)
        {
            _root = negative
                ? root._builder.Not(root)
                : root;

            // A seed of 0 is not used as-is; a fresh seed is drawn instead and recorded in RandomSeed.
            if (randomseed == 0)
            {
                RandomSeed = new Random().Next();
            }
            else
            {
                RandomSeed = randomseed;
            }
            _random = new Random(RandomSeed);
            _solver = root._builder._solver;

            CharSetSolver charSets = CharSetSolver.Instance;

            // ASCII word characters: letters, underscore and digits.
            BDD[] wordParts =
            {
                charSets.RangeConstraint('A', 'Z'),
                charSets.RangeConstraint('a', 'z'),
                charSets.CharConstraint('_'),
                charSets.RangeConstraint('0', '9')
            };
            _asciiWordCharacters = charSets.Or(wordParts);

            // Visible ASCII range (space through tilde), used for input character generation.
            _ascii = charSets.RangeConstraint('\x20', '\x7E');

            // Visible ASCII characters that are not word characters.
            BDD notWord = charSets.Not(_asciiWordCharacters);
            _asciiNonWordCharacters = charSets.And(_ascii, notWord);
        }
Example #8
0
        public override IEnumerable <string> SampleMatches(int k, int randomseed)
        {
            // Zero is treated as no seed, instead using a system provided one
            Random random = randomseed != 0 ? new Random(randomseed) : new Random();

            ISolver <TSet> solver        = _builder._solver;
            CharSetSolver  charSetSolver = _builder._charSetSolver;

            // Create helper BDDs for handling anchors and preferentially generating ASCII inputs
            BDD asciiWordCharacters = charSetSolver.Or(new BDD[] {
                charSetSolver.CreateBDDFromRange('A', 'Z'),
                charSetSolver.CreateBDDFromRange('a', 'z'),
                charSetSolver.CreateBDDFromChar('_'),
                charSetSolver.CreateBDDFromRange('0', '9')
            });
            // Visible ASCII range for input character generation
            BDD ascii = charSetSolver.CreateBDDFromRange('\x20', '\x7E');
            BDD asciiNonWordCharacters = charSetSolver.And(ascii, charSetSolver.Not(asciiWordCharacters));

            // Set up two sets of minterms, one with the additional special minterm for the last end-of-line
            Debug.Assert(_builder._minterms is not null);
            int[] mintermIdsWithoutZ = new int[_builder._minterms.Length];
            int[] mintermIdsWithZ    = new int[_builder._minterms.Length + 1];
            for (int i = 0; i < _builder._minterms.Length; ++i)
            {
                mintermIdsWithoutZ[i] = i;
                mintermIdsWithZ[i]    = i;
            }
            mintermIdsWithZ[_builder._minterms.Length] = _builder._minterms.Length;

            for (int i = 0; i < k; i++)
            {
                // Holds the generated input so far
                StringBuilder inputSoFar      = new();
                StringBuilder?latestCandidate = null;

                // Current set of states reached initially contains just the root
                NfaMatchingState states = new(_builder);
                // Here one could also consider previous characters for example for \b, \B, and ^ anchors
                // and initialize inputSoFar accordingly
                states.InitializeFrom(_initialStates[GetCharKind(ReadOnlySpan <char> .Empty, -1)]);
                CurrentState statesWrapper = new(states);

                // Used for end suffixes
                List <string> possibleEndings = new();

                while (true)
                {
                    Debug.Assert(states.NfaStateSet.Count > 0);

                    // Gather the possible endings for satisfying nullability
                    possibleEndings.Clear();
                    if (NfaStateHandler.CanBeNullable(ref statesWrapper))
                    {
                        // Unconditionally final state or end of the input due to \Z anchor for example
                        if (NfaStateHandler.IsNullable(ref statesWrapper) ||
                            NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.BeginningEnd))
                        {
                            possibleEndings.Add("");
                        }

                        // End of line due to end-of-line anchor
                        if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.Newline))
                        {
                            possibleEndings.Add("\n");
                        }

                        // Related to wordborder due to \b or \B
                        if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.WordLetter))
                        {
                            possibleEndings.Add(ChooseChar(random, asciiWordCharacters, ascii, charSetSolver).ToString());
                        }

                        // Related to wordborder due to \b or \B
                        if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.General))
                        {
                            possibleEndings.Add(ChooseChar(random, asciiNonWordCharacters, ascii, charSetSolver).ToString());
                        }
                    }

                    // If we have a possible ending, then store a candidate input
                    if (possibleEndings.Count > 0)
                    {
                        latestCandidate ??= new();
                        latestCandidate.Clear();
                        latestCandidate.Append(inputSoFar);
                        //Choose some suffix that allows some anchor (if any) to be nullable
                        latestCandidate.Append(Choose(random, possibleEndings));

                        // Choose to stop here based on a coin-toss
                        if (FlipBiasedCoin(random, SampleMatchesStoppingProbability))
                        {
                            yield return(latestCandidate.ToString());

                            break;
                        }
                    }

                    // Shuffle the minterms, including the last end-of-line marker if appropriate
                    int[] mintermIds = NfaStateHandler.StartsWithLineAnchor(_builder, ref statesWrapper) ?
                                       Shuffle(random, mintermIdsWithZ) :
                                       Shuffle(random, mintermIdsWithoutZ);
                    foreach (int mintermId in mintermIds)
                    {
                        bool success = NfaStateHandler.TakeTransition(_builder, ref statesWrapper, mintermId);
                        Debug.Assert(success);
                        if (states.NfaStateSet.Count > 0)
                        {
                            TSet minterm = _builder.GetMinterm(mintermId);
                            // Append a random member of the minterm
                            inputSoFar.Append(ChooseChar(random, ToBDD(minterm, solver, charSetSolver), ascii, charSetSolver));
                            break;
                        }
                        else
                        {
                            // The transition was a dead end, undo and continue to try another minterm
                            NfaStateHandler.UndoTransition(ref statesWrapper);
                        }
                    }

                    // In the case that there are no next states or input has become too large: stop here
                    if (states.NfaStateSet.Count == 0 || inputSoFar.Length > SampleMatchesMaxInputLength)
                    {
                        // Ending up here without an ending is unlikely but possible for example for infeasible patterns
                        // such as @"no\bway" or due to poor choice of c -- no anchor is enabled -- so this is a deadend.
                        if (latestCandidate != null)
                        {
                            yield return(latestCandidate.ToString());
                        }
                        break;
                    }
                }
            }