// Recursively analyzes `node` and its descendants, recording facts about each node in `results`:
//   - whether any node has the IgnoreCase / RightToLeft option set,
//   - which nodes sit inside a Loop/Lazyloop,
//   - which nodes are atomic because of an ancestor,
//   - which nodes contain a capture,
//   - which nodes may be backtracked into.
// `isAtomicByAncestor` says whether the parent determined this node should be treated as atomic;
// `isInLoop` says whether some ancestor is a Loop/Lazyloop.
// Returns false if the analysis had to be abandoned because the execution stack is too deep,
// true once the node and all of its children were analyzed.
static bool TryAnalyze(RegexNode node, AnalysisResults results, bool isAtomicByAncestor, bool isInLoop)
{
    // Give up if the stack helper reports that we cannot safely recurse any further.
    if (!StackHelper.TryEnsureSufficientExecutionStack())
    {
        return false;
    }

    // Remember whether any node in the tree carries these options.
    RegexOptions nodeOptions = node.Options;
    if ((nodeOptions & RegexOptions.IgnoreCase) != 0)
    {
        results._hasIgnoreCase = true;
    }
    if ((nodeOptions & RegexOptions.RightToLeft) != 0)
    {
        results._hasRightToLeft = true;
    }

    // Remember that this node lives inside of a loop.
    if (isInLoop)
    {
        HashSet<RegexNode> inLoops = results._inLoops ??= new HashSet<RegexNode>();
        inLoops.Add(node);
    }

    if (isAtomicByAncestor)
    {
        // The parent decided that nothing can backtrack into this node.
        results._isAtomicByAncestor.Add(node);
    }
    else
    {
        // Some node kinds introduce backtracking on their own: alternations always, and
        // loops whenever their iteration count is variable. Nodes that merely *contain*
        // a backtracking node are added later, after the children have been examined.
        bool backtracksOnItsOwn = node.Kind switch
        {
            RegexNodeKind.Alternate => true,

            RegexNodeKind.Loop or RegexNodeKind.Lazyloop or
            RegexNodeKind.Oneloop or RegexNodeKind.Notoneloop or RegexNodeKind.Setloop or
            RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy => node.M != node.N,

            _ => false,
        };

        if (backtracksOnItsOwn)
        {
            HashSet<RegexNode> mayBacktrack = results._mayBacktrack ??= new HashSet<RegexNode>();
            mayBacktrack.Add(node);
        }
    }

    // Atomic groups and lookarounds make whatever they wrap atomic, regardless of
    // what the ancestors said.
    bool isAtomicBySelf = node.Kind is
        RegexNodeKind.Atomic or
        RegexNodeKind.NegativeLookaround or
        RegexNodeKind.PositiveLookaround;

    if (node.Kind is RegexNodeKind.Capture)
    {
        // A capture trivially contains a capture.
        results._containsCapture.Add(node);
    }
    else if (node.Kind is RegexNodeKind.Loop or RegexNodeKind.Lazyloop)
    {
        // Everything below this node is inside of a loop.
        isInLoop = true;
    }

    // Visit the children.
    bool atomicHere = isAtomicByAncestor || isAtomicBySelf;
    int childCount = node.ChildCount();
    for (int childIndex = 0; childIndex < childCount; childIndex++)
    {
        RegexNode child = node.Child(childIndex);

        // Decide whether atomicity in effect at this node carries over to the child,
        // i.e. whether anything could backtrack into the child.
        bool treatChildAsAtomic = false;
        if (atomicHere)
        {
            switch (node.Kind)
            {
                // Atomic groups and lookarounds exist to make their contents atomic.
                case RegexNodeKind.Atomic:
                case RegexNodeKind.NegativeLookaround:
                case RegexNodeKind.PositiveLookaround:
                // Branches of alternations and conditionals are considered independently,
                // so atomicity of the whole applies to every branch.
                case RegexNodeKind.Alternate:
                case RegexNodeKind.BackreferenceConditional:
                case RegexNodeKind.ExpressionConditional:
                // Captures are transparent with regard to atomicity.
                case RegexNodeKind.Capture:
                    treatChildAsAtomic = true;
                    break;

                // Only the last node of a concatenation inherits the concatenation's atomicity.
                case RegexNodeKind.Concatenate:
                    treatChildAsAtomic = childIndex == childCount - 1;
                    break;

                // A loop that iterates at most once can't have one iteration consume input
                // destined for a later iteration, so its body inherits the atomicity.
                case RegexNodeKind.Loop or RegexNodeKind.Lazyloop when node.N == 1:
                    treatChildAsAtomic = true;
                    break;

                // For every other parent kind, don't try to prove atomicity.
            }
        }

        if (!TryAnalyze(child, results, treatChildAsAtomic, isInLoop))
        {
            return false;
        }

        // Captures in the child are also captures in the parent.
        if (results._containsCapture.Contains(child))
        {
            results._containsCapture.Add(node);
        }

        // Backtracking in the child is visible through this node unless this node itself
        // is an atomic construct. Ancestor-imposed atomicity is deliberately ignored here,
        // since what matters is what this node exposes upwards to its parent.
        if (!isAtomicBySelf &&
            results._mayBacktrack is HashSet<RegexNode> backtrackingNodes &&
            backtrackingNodes.Contains(child))
        {
            backtrackingNodes.Add(node);
        }
    }

    // The node and everything beneath it were analyzed.
    return true;
}
// Processes the node, adding any prefix text to the builder.
// Returns whether processing should continue with subsequent nodes: true means that
// everything this node can match has been fully accounted for in the builder (or the
// node is zero-width), so text from the nodes that follow still belongs to the prefix.
static bool Process(RegexNode node, ref ValueStringBuilder vsb)
{
    if (!StackHelper.TryEnsureSufficientExecutionStack())
    {
        // If we're too deep on the stack, just give up finding any more prefix.
        return false;
    }

    // We don't bother to handle reversed input, so process at most one node
    // when handling RightToLeft.
    bool rtl = (node.Options & RegexOptions.RightToLeft) != 0;

    switch (node.Type)
    {
        // Concatenation
        case RegexNode.Concatenate:
            {
                int childCount = node.ChildCount();
                for (int i = 0; i < childCount; i++)
                {
                    if (!Process(node.Child(i), ref vsb))
                    {
                        return false;
                    }
                }
                return !rtl;
            }

        // Alternation: find a string that's a shared prefix of all branches
        case RegexNode.Alternate:
            {
                int childCount = node.ChildCount();

                // Store the initial branch into the target builder
                int initialLength = vsb.Length;
                bool keepExploring = Process(node.Child(0), ref vsb);
                int addedLength = vsb.Length - initialLength;

                // Then explore the rest of the branches, finding the length of
                // a prefix they all share in common with the initial branch.
                // The other branches must be examined not only when the first branch
                // added text, but also when it added nothing yet claimed that exploration
                // may continue (e.g. the first branch is just an anchor or is empty):
                // continuing past the alternation is only valid if *every* branch
                // contributes exactly the same text and also allows continuing.
                // Otherwise text that follows the alternation would be appended to the
                // prefix even though another branch consumes different input first.
                if (addedLength != 0 || keepExploring)
                {
                    var alternateSb = new ValueStringBuilder(64);

                    // Process each branch. If we reach a point where we've proven there's
                    // no overlap and that we can't continue past the alternation, we can
                    // bail early.
                    for (int i = 1; i < childCount && (addedLength != 0 || keepExploring); i++)
                    {
                        alternateSb.Length = 0;

                        // Process the branch. We want to keep exploring after this alternation,
                        // but we can't if either this branch doesn't allow for it or if the prefix
                        // supplied by this branch doesn't entirely match all the previous ones.
                        keepExploring &= Process(node.Child(i), ref alternateSb);
                        keepExploring &= alternateSb.Length == addedLength;

                        // Shrink the shared prefix down to what this branch agrees with.
                        addedLength = Math.Min(addedLength, alternateSb.Length);
                        for (int j = 0; j < addedLength; j++)
                        {
                            if (vsb[initialLength + j] != alternateSb[j])
                            {
                                addedLength = j;
                                keepExploring = false;
                                break;
                            }
                        }
                    }

                    alternateSb.Dispose();

                    // Then cull back on what was added based on the other branches.
                    vsb.Length = initialLength + addedLength;
                }

                return !rtl && keepExploring;
            }

        // One character
        case RegexNode.One when (node.Options & RegexOptions.IgnoreCase) == 0:
            vsb.Append(node.Ch);
            return !rtl;

        // Multiple characters
        case RegexNode.Multi when (node.Options & RegexOptions.IgnoreCase) == 0:
            vsb.Append(node.Str);
            return !rtl;

        // Loop of one character
        case RegexNode.Oneloop or RegexNode.Oneloopatomic or RegexNode.Onelazy when node.M > 0 && (node.Options & RegexOptions.IgnoreCase) == 0:
            {
                const int SingleCharIterationLimit = 32; // arbitrary cut-off to avoid creating super long strings unnecessarily
                int count = Math.Min(node.M, SingleCharIterationLimit);
                vsb.Append(node.Ch, count);

                // Only continue if the loop is a fixed repetition that was emitted in full.
                return count == node.N && !rtl;
            }

        // Loop of a node
        case RegexNode.Loop or RegexNode.Lazyloop when node.M > 0:
            {
                const int NodeIterationLimit = 4; // arbitrary cut-off to avoid creating super long strings unnecessarily
                int limit = Math.Min(node.M, NodeIterationLimit);
                for (int i = 0; i < limit; i++)
                {
                    if (!Process(node.Child(0), ref vsb))
                    {
                        return false;
                    }
                }

                // Only continue if the loop is a fixed repetition that was emitted in full.
                return limit == node.N && !rtl;
            }

        // Grouping nodes for which we only care about their single child
        case RegexNode.Atomic:
        case RegexNode.Capture:
            return Process(node.Child(0), ref vsb);

        // Zero-width anchors and assertions
        case RegexNode.Bol:
        case RegexNode.Eol:
        case RegexNode.Boundary:
        case RegexNode.ECMABoundary:
        case RegexNode.NonBoundary:
        case RegexNode.NonECMABoundary:
        case RegexNode.Beginning:
        case RegexNode.Start:
        case RegexNode.EndZ:
        case RegexNode.End:
        case RegexNode.Empty:
        case RegexNode.UpdateBumpalong:
        case RegexNode.Require:
        case RegexNode.Prevent:
            return true;

        // Give up for anything else
        default:
            return false;
    }
}