Example #1
0
        /// <summary>
        /// Initializes a sampler for <paramref name="root"/> and precomputes the ASCII helper character sets.
        /// </summary>
        /// <param name="root">Root node of the symbolic regex to sample from.</param>
        /// <param name="randomseed">Seed for the random number generator; 0 means "no seed", in which case a seed is chosen at random.</param>
        /// <param name="negative">When true, the node produced by the builder's MkNot for <paramref name="root"/> is sampled instead of <paramref name="root"/> itself.</param>
        public SymbolicRegexSampler(SymbolicRegexNode <S> root, int randomseed, bool negative)
        {
            _root = negative ? root._builder.MkNot(root) : root;

            // Treat 0 as "no seed" and pick a seed at random instead. Random.Next() may itself return 0,
            // which would make the recorded RandomSeed indistinguishable from "no seed" if it were passed
            // back in later, so the seed is drawn from [1, int.MaxValue).
            RandomSeed = randomseed == 0 ? new Random().Next(1, int.MaxValue) : randomseed;
            _random    = new Random(RandomSeed);
            _solver    = root._builder._solver;
            ICharAlgebra <BDD> bddSolver = SymbolicRegexRunner.s_unicode._solver;

            // ASCII letters, digits and underscore
            _asciiWordCharacters = bddSolver.Or(new BDD[] {
                bddSolver.RangeConstraint('A', 'Z'),
                bddSolver.RangeConstraint('a', 'z'),
                bddSolver.CharConstraint('_'),
                bddSolver.RangeConstraint('0', '9')
            });
            // Visible ASCII range for input character generation
            _ascii = bddSolver.RangeConstraint('\x20', '\x7E');
            // Visible ASCII characters that are not in the word-character set above
            _asciiNonWordCharacters = bddSolver.And(_ascii, bddSolver.Not(_asciiWordCharacters));
        }
Example #2
0
        /// <summary>Translates a bit-vector predicate into the character set it denotes.</summary>
        /// <param name="solver">Algebra that supplies the empty set and performs the unions.</param>
        /// <param name="pred">Predicate whose i'th bit selects the i'th element of the partition.</param>
        /// <returns>The union of every selected partition element; the solver's False set when nothing is selected.</returns>
        public BDD ConvertToCharSet(ICharAlgebra <BDD> solver, BV pred)
        {
            Debug.Assert(_partition is not null);

            BDD union = solver.False;

            // Nothing to accumulate for the False predicate.
            if (pred.Equals(False))
            {
                return union;
            }

            for (int bit = 0; bit < _bits; bit++)
            {
                if (!pred[bit])
                {
                    continue;
                }

                // The bit is set, so the matching partition element belongs to the result.
                union = solver.Or(union, _partition[bit]);
            }

            return union;
        }
Example #3
0
        /// <summary>Builds the character set described by a bit-vector over the minterms.</summary>
        /// <param name="pred">Bit-vector whose i'th bit selects the i'th minterm.</param>
        /// <returns>The union of the selected minterms, or the solver's False set if none is selected.</returns>
        public BDD ConvertToCharSet(BitVector pred)
        {
            BDD[] minterms = _minterms;
            BDD charSet = CharSetSolver.Instance.False;

            // The False predicate selects no minterm at all.
            if (pred.Equals(False))
            {
                return charSet;
            }

            int index = 0;
            while (index < minterms.Length)
            {
                // Fold the minterm into the union when its bit is set.
                if (pred[index])
                {
                    charSet = CharSetSolver.Instance.Or(charSet, minterms[index]);
                }

                index++;
            }

            return charSet;
        }
Example #4
0
        /// <summary>Builds the character set described by a 64-bit mask over the minterms.</summary>
        /// <param name="pred">Mask whose i'th bit selects the i'th minterm.</param>
        /// <returns>The union of the selected minterms, or <c>BDD.False</c> if no bit selects one.</returns>
        public BDD ConvertToCharSet(ulong pred)
        {
            BDD[] minterms = _minterms;
            BDD union = BDD.False;

            // An all-zero mask selects nothing.
            if (pred == 0)
            {
                return union;
            }

            for (int bit = 0; bit < minterms.Length; bit++)
            {
                ulong mask = (ulong)1 << bit;
                if ((pred & mask) == 0)
                {
                    continue;
                }

                // Bit is set: add the corresponding minterm to the union.
                union = CharSetSolver.Instance.Or(union, minterms[bit]);
            }

            return union;
        }
Example #5
0
        /// <summary>Translates a 64-bit predicate into the character set it denotes.</summary>
        /// <param name="solver">Algebra used to union the selected partition elements.</param>
        /// <param name="pred">Mask whose i'th bit selects the i'th element of the partition.</param>
        /// <returns>The union of the selected partition elements, or <c>BDD.False</c> if none is selected.</returns>
        public BDD ConvertToCharSet(ICharAlgebra <BDD> solver, ulong pred)
        {
            Debug.Assert(_partition is not null);

            BDD charSet = BDD.False;

            // When the predicate is the false value the loop below is skipped entirely.
            bool selectsAnything = pred != _false;

            for (int position = 0; selectsAnything && position < _bits; position++)
            {
                ulong selected = pred & ((ulong)1 << position);
                if (selected != _false)
                {
                    // Bit is set: add the corresponding partition element to the union.
                    charSet = solver.Or(charSet, _partition[position]);
                }
            }

            return charSet;
        }
Example #6
0
        /// <summary>Converts a 64-bit minterm mask into the equivalent <see cref="BDD"/>.</summary>
        /// <param name="set">Mask whose i'th bit selects the i'th minterm.</param>
        /// <param name="solver">Solver used to union the selected minterms.</param>
        /// <returns>The union of the selected minterms, or <c>BDD.False</c> if no bit selects one.</returns>
        public BDD ConvertToBDD(ulong set, CharSetSolver solver)
        {
            BDD[] minterms = _minterms;
            BDD union = BDD.False;

            // The zero mask denotes the empty set.
            if (set == 0)
            {
                return union;
            }

            int count = minterms.Length;
            int index = 0;
            while (index < count)
            {
                bool isSelected = (set & ((ulong)1 << index)) != 0;
                if (isSelected)
                {
                    union = solver.Or(union, minterms[index]);
                }

                index++;
            }

            return union;
        }
Example #7
0
        /// <summary>Converts a bit-vector over the minterms into the equivalent <see cref="BDD"/>.</summary>
        /// <param name="set">Bit-vector whose i'th bit selects the i'th minterm.</param>
        /// <param name="solver">Solver that supplies the empty set and performs the unions.</param>
        /// <returns>The union of the selected minterms, or the solver's Empty set if none is selected.</returns>
        public BDD ConvertToBDD(BitVector set, CharSetSolver solver)
        {
            BDD[] minterms = _minterms;
            BDD union = solver.Empty;

            // The Empty bit-vector selects no minterm.
            if (set.Equals(Empty))
            {
                return union;
            }

            for (int bit = 0; bit < minterms.Length; bit++)
            {
                if (!set[bit])
                {
                    continue;
                }

                // Bit is set: add the corresponding minterm to the union.
                union = solver.Or(union, minterms[bit]);
            }

            return union;
        }
Example #8
0
        /// <summary>Computes the bit-vector of partition elements that overlap a character set.</summary>
        /// <param name="alg">Algebra used to intersect the set with each partition element and test satisfiability.</param>
        /// <param name="set">Character set to convert; a null set yields null.</param>
        /// <returns>The accumulated bit-vector, or null when <paramref name="set"/> is null.</returns>
        public BV? ConvertFromCharSet(BDDAlgebra alg, BDD set)
        {
            if (set == null)
            {
                return null;
            }

            Debug.Assert(_partition is not null);

            BV selected = False;

            int index = 0;
            while (index < _bits)
            {
                // A partition element contributes when its intersection with the set is satisfiable.
                if (alg.IsSatisfiable(alg.And(_partition[index], set)))
                {
                    selected |= _minterms[index];
                }

                index++;
            }

            return selected;
        }
Example #9
0
        /// <summary>Generator for BDD Unicode category definitions.</summary>
        /// <remarks>
        /// Writes a C# source file declaring an internal static class that exposes the serialized BDD of the
        /// whitespace characters and one serialized BDD per <see cref="UnicodeCategory"/>.
        /// </remarks>
        /// <param name="namespacename">namespace for the class</param>
        /// <param name="classname">name of the class</param>
        /// <param name="path">path where the file classname.cs is written</param>
        public static void Generate(string namespacename, string classname, string path)
        {
            Debug.Assert(namespacename != null);
            Debug.Assert(classname != null);
            Debug.Assert(path != null);

            // The output file is "<path>/<classname>.cs".
            using StreamWriter sw = new StreamWriter($"{Path.Combine(path, classname)}.cs");
            // Emit the file header and open the namespace and class; doubled braces are literal braces in the output.
            sw.WriteLine(
                $@"// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

// This is a programmatically generated file from Regex.GenerateUnicodeTables.
// It provides serialized BDD Unicode category definitions for System.Environment.Version = {Environment.Version}

using System.Globalization;

namespace {namespacename}
{{
    internal static class {classname}
    {{");
            // One initially empty range collection per UnicodeCategory value.
            var catMap = new Dictionary <UnicodeCategory, Ranges>();

            foreach (UnicodeCategory c in Enum.GetValues <UnicodeCategory>())
            {
                catMap[c] = new Ranges();
            }

            Ranges whitespace      = new Ranges();
            Regex  whitespaceRegex = new(@"\s");

            // Classify every char value (0..char.MaxValue) by its Unicode category and
            // separately collect the characters matched by \s.
            for (int i = 0; i <= char.MaxValue; i++)
            {
                char ch = (char)i;
                catMap[char.GetUnicodeCategory(ch)].Add(ch);
                if (whitespaceRegex.IsMatch(ch.ToString()))
                {
                    whitespace.Add(ch);
                }
            }

            var charSetSolver = new CharSetSolver();

            sw.WriteLine("        /// <summary>Serialized BDD representation of the set of all whitespace characters.</summary>");
            sw.Write($"        public static ReadOnlySpan<byte> SerializedWhitespaceBDD => ");
            WriteByteArrayInitSyntax(sw, charSetSolver.CreateSetFromRanges(whitespace.ranges).SerializeToBytes());
            sw.WriteLine(";");

            // Generate a BDD representation of each UnicodeCategory.
            // NOTE(review): indexing by (UnicodeCategory)c assumes the enum values are exactly 0..catMap.Count-1 — confirm.
            BDD[] catBDDs = new BDD[catMap.Count];
            for (int c = 0; c < catBDDs.Length; c++)
            {
                catBDDs[c] = charSetSolver.CreateSetFromRanges(catMap[(UnicodeCategory)c].ranges);
            }

            // Emit the lookup method: a switch expression mapping each category's integer value to its property.
            sw.WriteLine();
            sw.WriteLine("        /// <summary>Gets the serialized BDD representations of any defined UnicodeCategory.</summary>");
            sw.WriteLine("        public static ReadOnlySpan<byte> GetSerializedCategory(UnicodeCategory category) =>");
            sw.WriteLine("            (int)category switch");
            sw.WriteLine("            {");
            for (int i = 0; i < catBDDs.Length; i++)
            {
                sw.WriteLine($"                {i} => SerializedCategory{i}_{(UnicodeCategory)i},");
            }
            sw.WriteLine($"                _ => default,");
            sw.WriteLine("            };");

            // Emit one private property per category holding its serialized bytes.
            for (int i = 0; i < catBDDs.Length; i++)
            {
                sw.WriteLine();
                sw.WriteLine($"        /// <summary>Serialized BDD representation of the set of all characters in UnicodeCategory.{(UnicodeCategory)i}.</summary>");
                sw.Write($"        private static ReadOnlySpan<byte> SerializedCategory{i}_{(UnicodeCategory)i} => ");
                WriteByteArrayInitSyntax(sw, catBDDs[i].SerializeToBytes());
                sw.WriteLine(";");
            }

            // Close the generated class and namespace.
            sw.WriteLine($@"    }}
}}");
Example #10
0
        /// <summary>Lazily generates sample input strings by randomly walking the NFA state sets of this regex.</summary>
        /// <param name="k">Number of generation attempts; each attempt yields at most one string.</param>
        /// <param name="randomseed">Seed for the random number generator; 0 means an unseeded <see cref="Random"/> is used.</param>
        /// <returns>
        /// The generated strings. An attempt that hits a dead end or the length cap before any candidate
        /// was recorded yields nothing, so fewer than <paramref name="k"/> strings may be produced.
        /// </returns>
        public override IEnumerable <string> SampleMatches(int k, int randomseed)
        {
            // Zero is treated as no seed, instead using a system provided one
            Random random = randomseed != 0 ? new Random(randomseed) : new Random();

            ISolver <TSet> solver        = _builder._solver;
            CharSetSolver  charSetSolver = _builder._charSetSolver;

            // Create helper BDDs for handling anchors and preferentially generating ASCII inputs
            BDD asciiWordCharacters = charSetSolver.Or(new BDD[] {
                charSetSolver.CreateBDDFromRange('A', 'Z'),
                charSetSolver.CreateBDDFromRange('a', 'z'),
                charSetSolver.CreateBDDFromChar('_'),
                charSetSolver.CreateBDDFromRange('0', '9')
            });
            // Visible ASCII range for input character generation
            BDD ascii = charSetSolver.CreateBDDFromRange('\x20', '\x7E');
            BDD asciiNonWordCharacters = charSetSolver.And(ascii, charSetSolver.Not(asciiWordCharacters));

            // Set up two sets of minterms, one with the additional special minterm for the last end-of-line
            Debug.Assert(_builder._minterms is not null);
            int[] mintermIdsWithoutZ = new int[_builder._minterms.Length];
            int[] mintermIdsWithZ    = new int[_builder._minterms.Length + 1];
            for (int i = 0; i < _builder._minterms.Length; ++i)
            {
                mintermIdsWithoutZ[i] = i;
                mintermIdsWithZ[i]    = i;
            }
            // The extra id (equal to the minterm count) is the special end-of-line entry
            mintermIdsWithZ[_builder._minterms.Length] = _builder._minterms.Length;

            // Each iteration of this loop is one independent generation attempt
            for (int i = 0; i < k; i++)
            {
                // Holds the generated input so far
                StringBuilder inputSoFar      = new();
                StringBuilder?latestCandidate = null;

                // Current set of states reached initially contains just the root
                NfaMatchingState states = new(_builder);
                // Here one could also consider previous characters for example for \b, \B, and ^ anchors
                // and initialize inputSoFar accordingly
                states.InitializeFrom(_initialStates[GetCharKind(ReadOnlySpan <char> .Empty, -1)]);
                CurrentState statesWrapper = new(states);

                // Used for end suffixes
                List <string> possibleEndings = new();

                while (true)
                {
                    Debug.Assert(states.NfaStateSet.Count > 0);

                    // Gather the possible endings for satisfying nullability
                    possibleEndings.Clear();
                    if (NfaStateHandler.CanBeNullable(ref statesWrapper))
                    {
                        // Unconditionally final state or end of the input due to \Z anchor for example
                        if (NfaStateHandler.IsNullable(ref statesWrapper) ||
                            NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.BeginningEnd))
                        {
                            possibleEndings.Add("");
                        }

                        // End of line due to end-of-line anchor
                        if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.Newline))
                        {
                            possibleEndings.Add("\n");
                        }

                        // Related to wordborder due to \b or \B
                        if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.WordLetter))
                        {
                            possibleEndings.Add(ChooseChar(random, asciiWordCharacters, ascii, charSetSolver).ToString());
                        }

                        // Related to wordborder due to \b or \B
                        if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.General))
                        {
                            possibleEndings.Add(ChooseChar(random, asciiNonWordCharacters, ascii, charSetSolver).ToString());
                        }
                    }

                    // If we have a possible ending, then store a candidate input
                    if (possibleEndings.Count > 0)
                    {
                        // The candidate buffer is allocated on first use and overwritten with the newest candidate
                        latestCandidate ??= new();
                        latestCandidate.Clear();
                        latestCandidate.Append(inputSoFar);
                        //Choose some suffix that allows some anchor (if any) to be nullable
                        latestCandidate.Append(Choose(random, possibleEndings));

                        // Choose to stop here based on a coin-toss
                        if (FlipBiasedCoin(random, SampleMatchesStoppingProbability))
                        {
                            yield return(latestCandidate.ToString());

                            break;
                        }
                    }

                    // Shuffle the minterms, including the last end-of-line marker if appropriate
                    int[] mintermIds = NfaStateHandler.StartsWithLineAnchor(_builder, ref statesWrapper) ?
                                       Shuffle(random, mintermIdsWithZ) :
                                       Shuffle(random, mintermIdsWithoutZ);
                    // Try the minterms in shuffled order until one leads to a non-empty state set
                    foreach (int mintermId in mintermIds)
                    {
                        bool success = NfaStateHandler.TakeTransition(_builder, ref statesWrapper, mintermId);
                        Debug.Assert(success);
                        if (states.NfaStateSet.Count > 0)
                        {
                            TSet minterm = _builder.GetMinterm(mintermId);
                            // Append a random member of the minterm
                            inputSoFar.Append(ChooseChar(random, ToBDD(minterm, solver, charSetSolver), ascii, charSetSolver));
                            break;
                        }
                        else
                        {
                            // The transition was a dead end, undo and continue to try another minterm
                            NfaStateHandler.UndoTransition(ref statesWrapper);
                        }
                    }

                    // In the case that there are no next states or input has become too large: stop here
                    if (states.NfaStateSet.Count == 0 || inputSoFar.Length > SampleMatchesMaxInputLength)
                    {
                        // Ending up here without an ending is unlikely but possible for example for infeasible patterns
                        // such as @"no\bway" or due to poor choice of c -- no anchor is enabled -- so this is a deadend.
                        if (latestCandidate != null)
                        {
                            yield return(latestCandidate.ToString());
                        }
                        break;
                    }
                }
            }
Example #11
0
        /// <summary>Converts the root <see cref="RegexNode"/> into its corresponding <see cref="SymbolicRegexNode{S}"/>.</summary>
        /// <remarks>
        /// The conversion is iterative: work items are kept on an explicit stack, and node kinds that have
        /// children to convert are pushed back so they are revisited once their children's results are available.
        /// </remarks>
        /// <param name="root">The root node to convert.</param>
        /// <returns>The generated <see cref="SymbolicRegexNode{S}"/> that corresponds to the supplied <paramref name="root"/>.</returns>
        /// <exception cref="NotSupportedException">A node kind not handled by the conversion is encountered.</exception>
        internal SymbolicRegexNode <BDD> ConvertToSymbolicRegexNode(RegexNode root)
        {
            Debug.Assert(_builder is not null);

            // Create the root list that will store the built-up result.
            DoublyLinkedList <SymbolicRegexNode <BDD> > rootResult = new();

            // Create a stack to be processed in order to process iteratively rather than recursively, and push the root on.
            Stack <(RegexNode Node, bool TryToMarkFixedLength, DoublyLinkedList <SymbolicRegexNode <BDD> > Result, DoublyLinkedList <SymbolicRegexNode <BDD> >[]? ChildResults)> stack = new();

            // NOTE(review): CreateChildResultArray presumably returns null for a child count of zero (see the assert below) — confirm.
            stack.Push((root, true, rootResult, CreateChildResultArray(root.ChildCount())));

            // Continue to iterate until the stack is empty, popping the next item on each iteration.
            // Some popped items may be pushed back on as part of processing.
            while (stack.TryPop(out (RegexNode Node, bool TryToMarkFixedLength, DoublyLinkedList <SymbolicRegexNode <BDD> > Result, DoublyLinkedList <SymbolicRegexNode <BDD> >[]? ChildResults)popped))
            {
                RegexNode node = popped.Node;
                DoublyLinkedList <SymbolicRegexNode <BDD> > result = popped.Result;
                DoublyLinkedList <SymbolicRegexNode <BDD> >[]? childResults = popped.ChildResults;
                Debug.Assert(childResults is null || childResults.Length != 0);

                // First visit of a work item: it has no child-result array, or the array's first slot is still unfilled.
                if (childResults is null || childResults[0] is null)
                {
                    // Child nodes have not been converted yet
                    // Handle each node kind as-is appropriate.
                    switch (node.Kind)
                    {
                    // Singletons and multis

                    case RegexNodeKind.One:
                        result.AddLast(_builder.CreateSingleton(_builder._solver.CreateFromChar(node.Ch)));
                        break;

                    case RegexNodeKind.Notone:
                        result.AddLast(_builder.CreateSingleton(_builder._solver.Not(_builder._solver.CreateFromChar(node.Ch))));
                        break;

                    case RegexNodeKind.Set:
                        result.AddLast(ConvertSet(node));
                        break;

                    case RegexNodeKind.Multi:
                    {
                        // Create a BDD for each character in the string and concatenate them.
                        string?str = node.Str;
                        Debug.Assert(str is not null);
                        foreach (char c in str)
                        {
                            result.AddLast(_builder.CreateSingleton(_builder._solver.CreateFromChar(c)));
                        }
                        break;
                    }

                    // The following five cases are the only node kinds that are pushed twice:
                    // Joins, general loops, and supported captures

                    case RegexNodeKind.Concatenate:
                    case RegexNodeKind.Alternate:
                    case RegexNodeKind.Loop:
                    case RegexNodeKind.Lazyloop:
                    case RegexNodeKind.Capture when node.N == -1:     // N == -1 because balancing groups (which have N >= 0) aren't supported
                    {
                        Debug.Assert(childResults is not null && childResults.Length == node.ChildCount());

                        // Push back the temporarily popped item. Next time this work item is seen, its ChildResults list will be ready.
                        // Propagate the length mark check only in case of alternation.
                        stack.Push(popped);
                        bool mark = node.Kind == RegexNodeKind.Alternate && popped.TryToMarkFixedLength;

                        // Push all the children to be converted
                        for (int i = 0; i < node.ChildCount(); ++i)
                        {
                            // Filling the slot here is what marks this parent as already visited when it is popped again.
                            childResults[i] = new DoublyLinkedList <SymbolicRegexNode <BDD> >();
                            stack.Push((node.Child(i), mark, childResults[i], CreateChildResultArray(node.Child(i).ChildCount())));
                        }
                        break;
                    }

                    // Specialized loops

                    case RegexNodeKind.Oneloop:
                    case RegexNodeKind.Onelazy:
                    case RegexNodeKind.Notoneloop:
                    case RegexNodeKind.Notonelazy:
                    {
                        // Create a BDD that represents the character, then create a loop around it.
                        BDD bdd = _builder._solver.CreateFromChar(node.Ch);
                        if (node.IsNotoneFamily)
                        {
                            bdd = _builder._solver.Not(bdd);
                        }
                        result.AddLast(_builder.CreateLoop(_builder.CreateSingleton(bdd), node.Kind is RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy, node.M, node.N));
                        break;
                    }

                    case RegexNodeKind.Setloop:
                    case RegexNodeKind.Setlazy:
                    {
                        // Create a BDD that represents the set string, then create a loop around it.
                        string?set = node.Str;
                        Debug.Assert(set is not null);
                        BDD setBdd = CreateBDDFromSetString(set);
                        result.AddLast(_builder.CreateLoop(_builder.CreateSingleton(setBdd), node.Kind == RegexNodeKind.Setlazy, node.M, node.N));
                        break;
                    }

                    case RegexNodeKind.Empty:
                    case RegexNodeKind.UpdateBumpalong:     // UpdateBumpalong is a directive relevant only to backtracking and can be ignored just like Empty
                        break;

                    case RegexNodeKind.Nothing:
                        result.AddLast(_builder._nothing);
                        break;

                    // Anchors

                    case RegexNodeKind.Beginning:
                        result.AddLast(_builder.BeginningAnchor);
                        break;

                    case RegexNodeKind.Bol:
                        EnsureNewlinePredicateInitialized();
                        result.AddLast(_builder.BolAnchor);
                        break;

                    case RegexNodeKind.End:      // \z anchor
                        result.AddLast(_builder.EndAnchor);
                        break;

                    case RegexNodeKind.EndZ:     // \Z anchor
                        EnsureNewlinePredicateInitialized();
                        result.AddLast(_builder.EndAnchorZ);
                        break;

                    case RegexNodeKind.Eol:
                        EnsureNewlinePredicateInitialized();
                        result.AddLast(_builder.EolAnchor);
                        break;

                    case RegexNodeKind.Boundary:
                        EnsureWordLetterPredicateInitialized();
                        result.AddLast(_builder.BoundaryAnchor);
                        break;

                    case RegexNodeKind.NonBoundary:
                        EnsureWordLetterPredicateInitialized();
                        result.AddLast(_builder.NonBoundaryAnchor);
                        break;

                    // Unsupported

                    default:
                        // Pick the description resource that matches the unsupported node kind for the exception message.
                        throw new NotSupportedException(SR.Format(SR.NotSupported_NonBacktrackingConflictingExpression, node.Kind switch
                        {
                            RegexNodeKind.Atomic or RegexNodeKind.Setloopatomic or RegexNodeKind.Oneloopatomic or RegexNodeKind.Notoneloopatomic => SR.ExpressionDescription_AtomicSubexpressions,
                            RegexNodeKind.Backreference => SR.ExpressionDescription_Backreference,
                            RegexNodeKind.BackreferenceConditional => SR.ExpressionDescription_Conditional,
                            RegexNodeKind.Capture => SR.ExpressionDescription_BalancingGroup,
                            RegexNodeKind.ExpressionConditional => SR.ExpressionDescription_IfThenElse,
                            RegexNodeKind.NegativeLookaround => SR.ExpressionDescription_NegativeLookaround,
                            RegexNodeKind.PositiveLookaround => SR.ExpressionDescription_PositiveLookaround,
                            RegexNodeKind.Start => SR.ExpressionDescription_ContiguousMatches,
                            _ => UnexpectedNodeType(node)
                        }));
 /// <summary>Gets a <see cref="BDD"/> that represents the specified <see cref="UnicodeCategory"/>.</summary>
 /// <param name="category">The category whose BDD is requested; its integer value indexes the cache array.</param>
 /// <returns>The cached BDD for <paramref name="category"/>, deserializing and storing it first if the slot is still empty.</returns>
 public static BDD GetCategory(UnicodeCategory category)
 {
     int slot = (int)category;

     // Fast path: the slot has already been populated.
     BDD? cached = Volatile.Read(ref s_categories[slot]);
     if (cached is not null)
     {
         return cached;
     }

     // Deserialize and store the result only if the slot is still null.
     BDD? previous = Interlocked.CompareExchange(ref s_categories[slot], BDD.Deserialize(UnicodeCategoryRanges.GetSerializedCategory(category)), null);
     if (previous is not null)
     {
         // The slot was filled in the meantime; use the value that was already there.
         return previous;
     }

     // Our value was stored; read it back from the slot.
     return s_categories[slot]!;
 }