예제 #1
0
            static bool TryAnalyze(RegexNode node, AnalysisResults results, bool isAtomicByAncestor, bool isInLoop)
            {
                if (!StackHelper.TryEnsureSufficientExecutionStack())
                {
                    return(false);
                }

                // Track whether we've seen any nodes with various options set.
                results._hasIgnoreCase  |= (node.Options & RegexOptions.IgnoreCase) != 0;
                results._hasRightToLeft |= (node.Options & RegexOptions.RightToLeft) != 0;

                // Track whether this node is inside of a loop.
                if (isInLoop)
                {
                    (results._inLoops ??= new HashSet <RegexNode>()).Add(node);
                }

                if (isAtomicByAncestor)
                {
                    // We've been told by our parent that we should be considered atomic, so add ourselves
                    // to the atomic collection.
                    results._isAtomicByAncestor.Add(node);
                }
                else
                {
                    // Certain kinds of nodes incur backtracking logic themselves: add them to the backtracking collection.
                    // We may later find that a node contains another that has backtracking; we'll add nodes based on that
                    // after examining the children.
                    switch (node.Kind)
                    {
                    case RegexNodeKind.Alternate:
                    case RegexNodeKind.Loop or RegexNodeKind.Lazyloop when node.M != node.N:
                    case RegexNodeKind.Oneloop or RegexNodeKind.Notoneloop or RegexNodeKind.Setloop or RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy when node.M != node.N:
                        (results._mayBacktrack ??= new HashSet <RegexNode>()).Add(node);
                        break;
                    }
                }

                // Update state for certain node types.
                bool isAtomicBySelf = false;

                switch (node.Kind)
                {
                // Some node types add atomicity around what they wrap.  Set isAtomicBySelfOrParent to true for such nodes
                // even if it was false upon entering the method.
                case RegexNodeKind.Atomic:
                case RegexNodeKind.NegativeLookaround:
                case RegexNodeKind.PositiveLookaround:
                    isAtomicBySelf = true;
                    break;

                // Track any nodes that are themselves captures.
                case RegexNodeKind.Capture:
                    results._containsCapture.Add(node);
                    break;

                // Track whether we've recurred into a loop
                case RegexNodeKind.Loop:
                case RegexNodeKind.Lazyloop:
                    isInLoop = true;
                    break;
                }

                // Process each child.
                int childCount = node.ChildCount();

                for (int i = 0; i < childCount; i++)
                {
                    RegexNode child = node.Child(i);

                    // Determine whether the child should be treated as atomic (whether anything
                    // can backtrack into it), which is influenced by whether this node (the child's
                    // parent) is considered atomic by itself or by its parent.
                    bool treatChildAsAtomic = (isAtomicByAncestor | isAtomicBySelf) && node.Kind switch
                    {
                        // If the parent is atomic, so is the child.  That's the whole purpose
                        // of the Atomic node, and lookarounds are also implicitly atomic.
                        RegexNodeKind.Atomic or RegexNodeKind.NegativeLookaround or RegexNodeKind.PositiveLookaround => true,

                        // Each branch is considered independently, so any atomicity applied to the alternation also applies
                        // to each individual branch.  This is true as well for conditionals.
                         RegexNodeKind.Alternate or RegexNodeKind.BackreferenceConditional or RegexNodeKind.ExpressionConditional => true,

                        // Captures don't impact atomicity: if the parent of a capture is atomic, the capture is also atomic.
                         RegexNodeKind.Capture => true,

                        // If the parent is a concatenation and this is the last node, any atomicity
                        // applying to the concatenation applies to this node, too.
                         RegexNodeKind.Concatenate => i == childCount - 1,

                        // For loops with a max iteration count of 1, they themselves can be considered
                        // atomic as can whatever they wrap, as they won't ever iterate more than once
                        // and thus we don't need to worry about one iteration consuming input destined
                        // for a subsequent iteration.
                         RegexNodeKind.Loop or RegexNodeKind.Lazyloop when node.N == 1 => true,

                        // For any other parent type, give up on trying to prove atomicity.
                         _ => false,
                    };

                    // Now analyze the child.
                    if (!TryAnalyze(child, results, treatChildAsAtomic, isInLoop))
                    {
                        return(false);
                    }

                    // If the child contains captures, so too does this parent.
                    if (results._containsCapture.Contains(child))
                    {
                        results._containsCapture.Add(node);
                    }

                    // If the child might require backtracking into it, so too might the parent,
                    // unless the parent is itself considered atomic.  Here we don't consider parental
                    // atomicity, as we need to surface upwards to the parent whether any backtracking
                    // will be visible from this node to it.
                    if (!isAtomicBySelf && (results._mayBacktrack?.Contains(child) == true))
                    {
                        (results._mayBacktrack ??= new HashSet <RegexNode>()).Add(node);
                    }
                }

                // Successfully analyzed the node.
                return(true);
            }
예제 #2
0
            // Processes the node, adding any prefix text to the builder.
            // Returns whether processing should continue with subsequent nodes.
            static bool Process(RegexNode node, ref ValueStringBuilder vsb)
            {
                if (!StackHelper.TryEnsureSufficientExecutionStack())
                {
                    // If we're too deep on the stack, just give up finding any more prefix.
                    return(false);
                }

                // We don't bother to handle reversed input, so process at most one node
                // when handling RightToLeft.
                bool rtl = (node.Options & RegexOptions.RightToLeft) != 0;

                switch (node.Type)
                {
                // Concatenation
                case RegexNode.Concatenate:
                {
                    int childCount = node.ChildCount();
                    for (int i = 0; i < childCount; i++)
                    {
                        if (!Process(node.Child(i), ref vsb))
                        {
                            return(false);
                        }
                    }
                    return(!rtl);
                }

                // Alternation: find a string that's a shared prefix of all branches
                case RegexNode.Alternate:
                {
                    int childCount = node.ChildCount();

                    // Store the initial branch into the target builder
                    int  initialLength = vsb.Length;
                    bool keepExploring = Process(node.Child(0), ref vsb);
                    int  addedLength   = vsb.Length - initialLength;

                    // Then explore the rest of the branches, finding the length
                    // a prefix they all share in common with the initial branch.
                    if (addedLength != 0)
                    {
                        var alternateSb = new ValueStringBuilder(64);

                        // Process each branch.  If we reach a point where we've proven there's
                        // no overlap, we can bail early.
                        for (int i = 1; i < childCount && addedLength != 0; i++)
                        {
                            alternateSb.Length = 0;

                            // Process the branch.  We want to keep exploring after this alternation,
                            // but we can't if either this branch doesn't allow for it or if the prefix
                            // supplied by this branch doesn't entirely match all the previous ones.
                            keepExploring &= Process(node.Child(i), ref alternateSb);
                            keepExploring &= alternateSb.Length == addedLength;

                            addedLength = Math.Min(addedLength, alternateSb.Length);
                            for (int j = 0; j < addedLength; j++)
                            {
                                if (vsb[initialLength + j] != alternateSb[j])
                                {
                                    addedLength   = j;
                                    keepExploring = false;
                                    break;
                                }
                            }
                        }

                        alternateSb.Dispose();

                        // Then cull back on what was added based on the other branches.
                        vsb.Length = initialLength + addedLength;
                    }

                    return(!rtl && keepExploring);
                }

                // One character
                case RegexNode.One when(node.Options& RegexOptions.IgnoreCase) == 0:
                    vsb.Append(node.Ch);

                    return(!rtl);

                // Multiple characters
                case RegexNode.Multi when(node.Options& RegexOptions.IgnoreCase) == 0:
                    vsb.Append(node.Str);

                    return(!rtl);

                // Loop of one character
                case RegexNode.Oneloop or RegexNode.Oneloopatomic or RegexNode.Onelazy when node.M > 0 && (node.Options & RegexOptions.IgnoreCase) == 0:
                    const int SingleCharIterationLimit = 32;     // arbitrary cut-off to avoid creating super long strings unnecessarily
                    int       count = Math.Min(node.M, SingleCharIterationLimit);
                    vsb.Append(node.Ch, count);
                    return(count == node.N && !rtl);

                // Loop of a node
                case RegexNode.Loop or RegexNode.Lazyloop when node.M > 0:
                {
                    const int NodeIterationLimit = 4;         // arbitrary cut-off to avoid creating super long strings unnecessarily
                    int       limit = Math.Min(node.M, NodeIterationLimit);
                    for (int i = 0; i < limit; i++)
                    {
                        if (!Process(node.Child(0), ref vsb))
                        {
                            return(false);
                        }
                    }
                    return(limit == node.N && !rtl);
                }

                // Grouping nodes for which we only care about their single child
                case RegexNode.Atomic:
                case RegexNode.Capture:
                    return(Process(node.Child(0), ref vsb));

                // Zero-width anchors and assertions
                case RegexNode.Bol:
                case RegexNode.Eol:
                case RegexNode.Boundary:
                case RegexNode.ECMABoundary:
                case RegexNode.NonBoundary:
                case RegexNode.NonECMABoundary:
                case RegexNode.Beginning:
                case RegexNode.Start:
                case RegexNode.EndZ:
                case RegexNode.End:
                case RegexNode.Empty:
                case RegexNode.UpdateBumpalong:
                case RegexNode.Require:
                case RegexNode.Prevent:
                    return(true);

                // Give up for anything else
                default:
                    return(false);
                }
            }