// Builds a replacement from the parsed replacement pattern. Runs of literal
// text are collected into _strings, and every step is recorded in order in
// _rules: a non-negative entry is an index into _strings, a negative entry
// is an encoded group reference (-5 - group).
internal RegexReplacement(string rep, RegexNode concat, System.Collections.Generic.Dictionary<object,object> _caps)
{
    this._rep = rep;

    // Only a node of type 0x19 is accepted as the root of a replacement.
    if (concat.Type() != 0x19)
    {
        throw new ArgumentException(RegExRes.GetString(0x25));
    }

    StringBuilder pendingText = new StringBuilder();
    ArrayList literals = new ArrayList();
    ArrayList ruleList = new ArrayList();

    for (int index = 0; index < concat.ChildCount(); index++)
    {
        RegexNode child = concat.Child(index);
        int childType = child.Type();

        if (childType == 9)
        {
            // Single character: extend the pending literal run.
            pendingText.Append(child._ch);
        }
        else if (childType == 12)
        {
            // String: extend the pending literal run.
            pendingText.Append(child._str);
        }
        else if (childType == 13)
        {
            // Group reference: flush the pending literal first so that the
            // rule order matches the order in the pattern.
            if (pendingText.Length > 0)
            {
                ruleList.Add(literals.Count);
                literals.Add(pendingText.ToString());
                pendingText.Length = 0;
            }

            int group = child._m;
            if (_caps != null && group >= 0)
            {
                // Translate the capture number through the supplied mapping.
                group = (int) _caps[group];
            }

            ruleList.Add(-5 - group);
        }
        else
        {
            throw new ArgumentException(RegExRes.GetString(0x25));
        }
    }

    // Flush whatever literal text trails the last group reference.
    if (pendingText.Length > 0)
    {
        ruleList.Add(literals.Count);
        literals.Add(pendingText.ToString());
    }

    this._strings = (string[]) literals.ToArray(typeof(string));

    this._rules = new int[ruleList.Count];
    int position = 0;
    foreach (object rule in ruleList)
    {
        this._rules[position++] = (int) rule;
    }
}
/*
 * The replacement pattern is parsed by the same parser as Regex, so the
 * constructor receives a RegexNode that must be a concatenation of
 * constant strings and backreferences.
 */
internal RegexReplacement(String rep, RegexNode concat, Dictionary<Int32, Int32> _caps)
{
    _rep = rep;

    if (concat.Type() != RegexNode.Concatenate)
    {
        throw new ArgumentException(SR.ReplacementError);
    }

    StringBuilder literal = new StringBuilder();
    List<String> literalTable = new List<String>();
    List<Int32> ruleTable = new List<Int32>();

    for (int index = 0; index < concat.ChildCount(); index++)
    {
        RegexNode part = concat.Child(index);
        int partType = part.Type();

        if (partType == RegexNode.Multi)
        {
            literal.Append(part._str);
        }
        else if (partType == RegexNode.One)
        {
            literal.Append(part._ch);
        }
        else if (partType == RegexNode.Ref)
        {
            // Emit the literal gathered so far before the backreference.
            if (literal.Length > 0)
            {
                ruleTable.Add(literalTable.Count);
                literalTable.Add(literal.ToString());
                literal.Length = 0;
            }

            int group = part._m;
            if (_caps != null && group >= 0)
            {
                group = (int)_caps[group];
            }

            // Backreferences are stored as negative rule values.
            ruleTable.Add(-Specials - 1 - group);
        }
        else
        {
            throw new ArgumentException(SR.ReplacementError);
        }
    }

    // Emit any literal text that follows the last backreference.
    if (literal.Length > 0)
    {
        ruleTable.Add(literalTable.Count);
        literalTable.Add(literal.ToString());
    }

    _strings = literalTable;
    _rules = ruleTable;
}
private readonly List<string> _strings; // table of string constants

#endregion Fields

#region Constructors

/// <summary>
/// Creates a replacement from a parsed replacement pattern. The pattern is
/// parsed by the same parser as Regex, so <paramref name="concat"/> must be a
/// concatenation whose children are constant strings and backreferences.
/// </summary>
internal RegexReplacement(string rep, RegexNode concat, Hashtable _caps)
{
    if (concat.Type() != RegexNode.Concatenate)
    {
        throw new ArgumentException(SR.ReplacementError);
    }

    StringBuilder literal = StringBuilderCache.Acquire();
    List<string> literalTable = new List<string>();
    List<int> ruleTable = new List<int>();

    for (int index = 0; index < concat.ChildCount(); index++)
    {
        RegexNode part = concat.Child(index);
        int partType = part.Type();

        if (partType == RegexNode.Multi)
        {
            literal.Append(part._str);
        }
        else if (partType == RegexNode.One)
        {
            literal.Append(part._ch);
        }
        else if (partType == RegexNode.Ref)
        {
            // Emit the literal gathered so far before the backreference.
            if (literal.Length > 0)
            {
                ruleTable.Add(literalTable.Count);
                literalTable.Add(literal.ToString());
                literal.Length = 0;
            }

            int group = part._m;
            if (_caps != null && group >= 0)
            {
                group = (int)_caps[group];
            }

            // Backreferences are stored as negative rule values.
            ruleTable.Add(-Specials - 1 - group);
        }
        else
        {
            throw new ArgumentException(SR.ReplacementError);
        }
    }

    // Emit any literal text that follows the last backreference.
    if (literal.Length > 0)
    {
        ruleTable.Add(literalTable.Count);
        literalTable.Add(literal.ToString());
    }

    StringBuilderCache.Release(literal);

    _rep = rep;
    _strings = literalTable;
    _rules = ruleTable;
}
// Builds a replacement from the parsed replacement pattern. Runs of literal
// text are collected into _strings, and every step is recorded in order in
// _rules: a non-negative entry is an index into _strings, a negative entry
// is an encoded group reference (-5 - group).
internal RegexReplacement(string rep, RegexNode concat, Hashtable _caps)
{
    this._rep = rep;

    // Only a node of type 0x19 is accepted as the root of a replacement.
    if (concat.Type() != 0x19)
    {
        throw new ArgumentException(SR.GetString("ReplacementError"));
    }

    StringBuilder pendingText = new StringBuilder();
    List<string> literals = new List<string>();
    List<int> ruleList = new List<int>();

    for (int index = 0; index < concat.ChildCount(); index++)
    {
        RegexNode child = concat.Child(index);
        int childType = child.Type();

        if (childType == 9)
        {
            // Single character: extend the pending literal run.
            pendingText.Append(child._ch);
        }
        else if (childType == 12)
        {
            // String: extend the pending literal run.
            pendingText.Append(child._str);
        }
        else if (childType == 13)
        {
            // Group reference: flush the pending literal first so that the
            // rule order matches the order in the pattern.
            if (pendingText.Length > 0)
            {
                ruleList.Add(literals.Count);
                literals.Add(pendingText.ToString());
                pendingText.Length = 0;
            }

            int group = child._m;
            if (_caps != null && group >= 0)
            {
                // Translate the capture number through the supplied mapping.
                group = (int) _caps[group];
            }

            ruleList.Add(-5 - group);
        }
        else
        {
            throw new ArgumentException(SR.GetString("ReplacementError"));
        }
    }

    // Flush whatever literal text trails the last group reference.
    if (pendingText.Length > 0)
    {
        ruleList.Add(literals.Count);
        literals.Add(pendingText.ToString());
    }

    this._strings = literals;
    this._rules = ruleList;
}
// Collapses a chain of nested repetition nodes into the innermost one by
// multiplying the min/max counts on the way down. Returns the innermost node
// reached, or a fresh node of type 0x16 when the combined minimum saturates
// at int.MaxValue.
internal RegexNode ReduceRep()
{
    int outerType = this.Type();
    int combinedMin = this._m;
    int combinedMax = this._n;
    RegexNode current = this;

    while (current.ChildCount() != 0)
    {
        RegexNode inner = current.Child(0);
        int innerType = inner.Type();

        // Only merge repetitions of the same type, or the compatible pairs
        // (inner types 3..5 under outer 0x1a, inner types 6..8 under outer 0x1b).
        if (innerType != outerType)
        {
            bool pairsWith1a = innerType >= 3 && innerType <= 5 && outerType == 0x1a;
            bool pairsWith1b = innerType >= 6 && innerType <= 8 && outerType == 0x1b;
            if (!pairsWith1a && !pairsWith1b)
            {
                break;
            }
        }

        // Stop when the inner counts cannot be folded into the outer ones.
        bool outerOptionalInnerRepeated = current._m == 0 && inner._m > 1;
        if (outerOptionalInnerRepeated || inner._n < inner._m * 2)
        {
            break;
        }

        current = inner;

        // Multiply the counts, saturating at int.MaxValue instead of overflowing.
        if (current._m > 0)
        {
            combinedMin = (0x7ffffffe / current._m) < combinedMin ? int.MaxValue : current._m * combinedMin;
            current._m = combinedMin;
        }

        if (current._n > 0)
        {
            combinedMax = (0x7ffffffe / current._n) < combinedMax ? int.MaxValue : current._n * combinedMax;
            current._n = combinedMax;
        }
    }

    if (combinedMin == int.MaxValue)
    {
        return new RegexNode(0x16, this._options);
    }

    return current;
}
/// <summary>
/// Produces the <see cref="RegexCode"/> for a parsed tree. The tree is walked
/// depth-first without recursion; EmitFragment is called before and after
/// each child of an interior node and once for each leaf.
/// </summary>
public RegexCode RegexCodeFromRegexTree(RegexTree tree, CultureInfo culture)
{
    // A sparse capture-number mapping is only built when some numbers are unused.
    int capsize;
    var capNumbers = tree.CapNumList;
    if (capNumbers != null && tree.CapTop != capNumbers.Length)
    {
        capsize = capNumbers.Length;
        _caps = tree.Caps;
        for (int slot = 0; slot < capNumbers.Length; slot++)
        {
            _caps[capNumbers[slot]] = slot;
        }
    }
    else
    {
        capsize = tree.CapTop;
        _caps = null;
    }

    // The code starts with a lazy branch whose target is patched below,
    // once the position of the final Stop is known.
    Emit(RegexOpcode.Lazybranch, 0);

    // Walk the tree; _intStack remembers the child index at each level.
    RegexNode current = tree.Root;
    int childIndex = 0;
    for (;;)
    {
        int childCount = current.ChildCount();

        if (childCount != 0 && childIndex < childCount)
        {
            // Descend into the next unvisited child.
            EmitFragment(current.Kind | BeforeChild, current, childIndex);
            current = current.Child(childIndex);
            _intStack.Append(childIndex);
            childIndex = 0;
            continue;
        }

        if (childCount == 0)
        {
            // Leaf node.
            EmitFragment(current.Kind, current, 0);
        }

        if (_intStack.Length == 0)
        {
            break;
        }

        // Return to the parent and move on to its next child.
        childIndex = _intStack.Pop();
        current = current.Parent!;
        EmitFragment(current.Kind | AfterChild, current, childIndex);
        childIndex++;
    }

    // Patch the leading Lazybranch, terminate the program and snapshot it.
    PatchJump(0, _emitted.Length);
    Emit(RegexOpcode.Stop);
    int[] emitted = _emitted.AsSpan().ToArray();

    // Lay the string table out as an array indexed by string code.
    var strings = new string[_stringTable.Count];
    foreach (var entry in _stringTable)
    {
        strings[entry.Value] = entry.Key;
    }

    return new RegexCode(tree, culture, emitted, strings, _trackCount, _caps, capsize);
}
/*
 * The replacement pattern is parsed by the same parser as Regex, so the
 * constructor receives a RegexNode that must be a concatenation of
 * constant strings and backreferences.
 */
internal RegexReplacement(String rep, RegexNode concat, Hashtable _caps)
{
    _rep = rep;

    if (concat.Type() != RegexNode.Concatenate)
    {
        throw new ArgumentException(SR.GetString(SR.ReplacementError));
    }

    StringBuilder literal = new StringBuilder();
    ArrayList literalTable = new ArrayList();
    ArrayList ruleTable = new ArrayList();

    for (int index = 0; index < concat.ChildCount(); index++)
    {
        RegexNode part = concat.Child(index);
        int partType = part.Type();

        if (partType == RegexNode.Multi)
        {
            literal.Append(part._str);
        }
        else if (partType == RegexNode.One)
        {
            literal.Append(part._ch);
        }
        else if (partType == RegexNode.Ref)
        {
            // Emit the literal gathered so far before the backreference.
            if (literal.Length > 0)
            {
                ruleTable.Add(literalTable.Count);
                literalTable.Add(literal.ToString());
                literal.Length = 0;
            }

            int group = part._m;
            if (_caps != null && group >= 0)
            {
                group = (int)_caps[group];
            }

            // Backreferences are stored as negative rule values.
            ruleTable.Add(-Specials - 1 - group);
        }
        else
        {
            throw new ArgumentException(SR.GetString(SR.ReplacementError));
        }
    }

    // Emit any literal text that follows the last backreference.
    if (literal.Length > 0)
    {
        ruleTable.Add(literalTable.Count);
        literalTable.Add(literal.ToString());
    }

    _strings = literalTable;
    _rules = ruleTable;
}
/*
 * Restores the state that was pushed for the group being closed
 * (called in response to a ')').
 */
internal void PopGroup()
{
    // The saved nodes are chained through _next, starting at the top of _stack.
    _concatenation = _stack;
    _alternation = _concatenation._next;
    _group = _alternation._next;
    _stack = _group._next;

    // Nothing more to do unless this is a Testgroup that has no child yet.
    if (_group.Type() != RegexNode.Testgroup || _group.ChildCount() != 0)
    {
        return;
    }

    // The first () inside a Testgroup becomes the group's first child.
    if (_unit == null)
    {
        throw MakeException(SR.IllegalCondition);
    }

    _group.AddChild(_unit);
    _unit = null;
}
private readonly int[] _rules; // negative -> group #, positive -> string #

/// <summary>
/// Creates a replacement from a parsed replacement pattern. The pattern is
/// parsed by the same parser as Regex, so <paramref name="concat"/> must be a
/// concatenation whose children are constant strings and backreferences.
/// </summary>
public RegexReplacement(string rep, RegexNode concat, Hashtable _caps)
{
    if (concat.Type != RegexNode.Concatenate)
    {
        throw ThrowHelper.CreateArgumentException(ExceptionResource.ReplacementError);
    }

    // Stack-backed scratch storage for the literal text, string table and rules.
    Span<char> literalBuffer = stackalloc char[256];
    var literal = new ValueStringBuilder(literalBuffer);
    FourStackStrings stackStrings = default;
    var literalTable = new ValueListBuilder<string>(MemoryMarshal.CreateSpan(ref stackStrings.Item1!, 4));
    var ruleTable = new ValueListBuilder<int>(stackalloc int[64]);

    int partCount = concat.ChildCount();
    for (int index = 0; index < partCount; index++)
    {
        RegexNode part = concat.Child(index);
        var partType = part.Type;

        if (partType == RegexNode.Multi)
        {
            literal.Append(part.Str!);
        }
        else if (partType == RegexNode.One)
        {
            literal.Append(part.Ch);
        }
        else if (partType == RegexNode.Ref)
        {
            // Emit the literal gathered so far, then start a new builder
            // over the same stack buffer.
            if (literal.Length > 0)
            {
                ruleTable.Append(literalTable.Length);
                literalTable.Append(literal.ToString());
                literal = new ValueStringBuilder(literalBuffer);
            }

            int group = part.M;
            if (_caps != null && group >= 0)
            {
                group = (int)_caps[group]!;
            }

            // Backreferences are stored as negative rule values.
            ruleTable.Append(-Specials - 1 - group);
        }
        else
        {
            throw ThrowHelper.CreateArgumentException(ExceptionResource.ReplacementError);
        }
    }

    // Emit any literal text that follows the last backreference.
    if (literal.Length > 0)
    {
        ruleTable.Append(literalTable.Length);
        literalTable.Append(literal.ToString());
    }

    Pattern = rep;
    _strings = literalTable.AsSpan().ToArray();
    _rules = ruleTable.AsSpan().ToArray();

    ruleTable.Dispose();
}
/// <summary>Finds the literal text that every match of <paramref name="tree"/> must start with.</summary>
/// <remarks>The analysis is deliberately simple; whenever it cannot decide, it returns an empty string.</remarks>
public static (string Prefix, bool CaseInsensitive) ComputeLeadingSubstring(RegexTree tree)
{
    // In release, cutoff at a length to which we can still reasonably construct a string and Boyer-Moore search.
    // In debug, use a smaller cutoff to exercise the cutoff path in tests
    const int Cutoff =
#if DEBUG
        50;
#else
        RegexBoyerMoore.MaxLimit;
#endif

    RegexNode current = tree.Root;
    RegexNode? sequence = null;
    int siblingIndex = 0;

    while (true)
    {
        switch (current.Type)
        {
            case RegexNode.Atomic:
            case RegexNode.Capture:
                // Look through the wrapper at its single child.
                current = current.Child(0);
                sequence = null;
                continue;

            case RegexNode.One:
                return (current.Ch.ToString(), (current.Options & RegexOptions.IgnoreCase) != 0);

            case RegexNode.Multi:
                return (current.Str!, (current.Options & RegexOptions.IgnoreCase) != 0);

            case RegexNode.Oneloop:
            case RegexNode.Oneloopatomic:
            case RegexNode.Onelazy:
                // The guaranteed iterations of a single-character loop form the prefix.
                if (current.M <= 0 || current.M >= Cutoff)
                {
                    return (string.Empty, false);
                }

                return (new string(current.Ch, current.M), (current.Options & RegexOptions.IgnoreCase) != 0);

            case RegexNode.Concatenate:
                // Start stepping through this concatenation's children.
                if (current.ChildCount() > 0)
                {
                    sequence = current;
                    siblingIndex = 0;
                }
                break;

            // These node types are skipped; the scan moves on to the next sibling.
            case RegexNode.Bol:
            case RegexNode.Eol:
            case RegexNode.Boundary:
            case RegexNode.ECMABoundary:
            case RegexNode.Beginning:
            case RegexNode.Start:
            case RegexNode.EndZ:
            case RegexNode.End:
            case RegexNode.Empty:
            case RegexNode.Require:
            case RegexNode.Prevent:
                break;

            default:
                return (string.Empty, false);
        }

        // Advance to the next child of the enclosing concatenation, if any.
        if (sequence == null || siblingIndex >= sequence.ChildCount())
        {
            return (string.Empty, false);
        }

        current = sequence.Child(siblingIndex++);
    }
}
/// <summary>
/// Emits the code for a single visit of <paramref name="node"/> during the
/// depth-first walk of the tree: once for a leaf, and once before and once
/// after each child of an interior node.
/// </summary>
/// <param name="nodetype">
/// The node type, combined with BeforeChild or AfterChild when the visit is
/// before or after a child of an interior node.
/// </param>
/// <param name="node">The node being visited.</param>
/// <param name="curIndex">The index of the child being visited (0 for a leaf).</param>
/// <exception cref="ArgumentException"><paramref name="nodetype"/> is not one of the handled values.</exception>
private void EmitFragment(int nodetype, RegexNode node, int curIndex)
{
    // Option bits that are OR'ed into the opcodes emitted for character-matching nodes.
    int bits = 0;
    if (node.UseOptionR())
    {
        bits |= RegexCode.Rtl;
    }
    if ((node.Options & RegexOptions.IgnoreCase) != 0)
    {
        bits |= RegexCode.Ci;
    }

    switch (nodetype)
    {
        // Nothing is emitted around the children of a concatenation or for an empty node.
        case RegexNode.Concatenate | BeforeChild:
        case RegexNode.Concatenate | AfterChild:
        case RegexNode.Empty:
            break;

        case RegexNode.Alternate | BeforeChild:
            // Every branch except the last starts with a Lazybranch whose target is patched later.
            if (curIndex < node.ChildCount() - 1)
            {
                _intStack.Append(_emitted.Length);
                Emit(RegexCode.Lazybranch, 0);
            }
            break;

        case RegexNode.Alternate | AfterChild:
            {
                if (curIndex < node.ChildCount() - 1)
                {
                    // Not the last branch: emit a Goto (patched after the last branch)
                    // and point this branch's Lazybranch at the next branch.
                    int LBPos = _intStack.Pop();
                    _intStack.Append(_emitted.Length);
                    Emit(RegexCode.Goto, 0);
                    PatchJump(LBPos, _emitted.Length);
                }
                else
                {
                    // Last branch: patch the Goto of every earlier branch to land here.
                    int I;
                    for (I = 0; I < curIndex; I++)
                    {
                        PatchJump(_intStack.Pop(), _emitted.Length);
                    }
                }
                break;
            }

        case RegexNode.Testref | BeforeChild:
            switch (curIndex)
            {
                case 0:
                    Emit(RegexCode.Setjump);
                    _intStack.Append(_emitted.Length);
                    Emit(RegexCode.Lazybranch, 0);
                    Emit(RegexCode.Testref, MapCapnum(node.M));
                    Emit(RegexCode.Forejump);
                    break;
            }
            break;

        case RegexNode.Testref | AfterChild:
            switch (curIndex)
            {
                case 0:
                    {
                        int Branchpos = _intStack.Pop();
                        _intStack.Append(_emitted.Length);
                        Emit(RegexCode.Goto, 0);
                        PatchJump(Branchpos, _emitted.Length);
                        Emit(RegexCode.Forejump);
                        if (node.ChildCount() > 1)
                        {
                            break;
                        }
                        // Only one child: there is no second branch, so patch the Goto right away.
                        goto case 1;
                    }
                case 1:
                    PatchJump(_intStack.Pop(), _emitted.Length);
                    break;
            }
            break;

        case RegexNode.Testgroup | BeforeChild:
            switch (curIndex)
            {
                case 0:
                    Emit(RegexCode.Setjump);
                    Emit(RegexCode.Setmark);
                    _intStack.Append(_emitted.Length);
                    Emit(RegexCode.Lazybranch, 0);
                    break;
            }
            break;

        case RegexNode.Testgroup | AfterChild:
            switch (curIndex)
            {
                case 0:
                    Emit(RegexCode.Getmark);
                    Emit(RegexCode.Forejump);
                    break;
                case 1:
                    int Branchpos = _intStack.Pop();
                    _intStack.Append(_emitted.Length);
                    Emit(RegexCode.Goto, 0);
                    PatchJump(Branchpos, _emitted.Length);
                    Emit(RegexCode.Getmark);
                    Emit(RegexCode.Forejump);
                    if (node.ChildCount() > 2)
                    {
                        break;
                    }
                    // No third child: patch the Goto right away.
                    goto case 2;
                case 2:
                    PatchJump(_intStack.Pop(), _emitted.Length);
                    break;
            }
            break;

        case RegexNode.Loop | BeforeChild:
        case RegexNode.Lazyloop | BeforeChild:
            // A counted form is used when the loop has a finite maximum or a minimum above one;
            // otherwise a mark-based form is used.
            if (node.N < int.MaxValue || node.M > 1)
            {
                Emit(node.M == 0 ? RegexCode.Nullcount : RegexCode.Setcount, node.M == 0 ? 0 : 1 - node.M);
            }
            else
            {
                Emit(node.M == 0 ? RegexCode.Nullmark : RegexCode.Setmark);
            }

            // With a minimum of zero, a Goto (patched in the AfterChild visit) is emitted before the body.
            if (node.M == 0)
            {
                _intStack.Append(_emitted.Length);
                Emit(RegexCode.Goto, 0);
            }

            // Remember where the loop body starts.
            _intStack.Append(_emitted.Length);
            break;

        case RegexNode.Loop | AfterChild:
        case RegexNode.Lazyloop | AfterChild:
            {
                int StartJumpPos = _emitted.Length;

                // Offset of this node type from Loop; it is added to the branch opcode.
                int Lazy = (nodetype - (RegexNode.Loop | AfterChild));

                if (node.N < int.MaxValue || node.M > 1)
                {
                    Emit(RegexCode.Branchcount + Lazy, _intStack.Pop(), node.N == int.MaxValue ? int.MaxValue : node.N - node.M);
                }
                else
                {
                    Emit(RegexCode.Branchmark + Lazy, _intStack.Pop());
                }

                // Point the Goto emitted in the BeforeChild visit at the branch instruction.
                if (node.M == 0)
                {
                    PatchJump(_intStack.Pop(), StartJumpPos);
                }
            }
            break;

        // Nothing is emitted around the child of a plain group.
        case RegexNode.Group | BeforeChild:
        case RegexNode.Group | AfterChild:
            break;

        case RegexNode.Capture | BeforeChild:
            Emit(RegexCode.Setmark);
            break;

        case RegexNode.Capture | AfterChild:
            Emit(RegexCode.Capturemark, MapCapnum(node.M), MapCapnum(node.N));
            break;

        case RegexNode.Require | BeforeChild:
            // NOTE: the following line causes lookahead/lookbehind to be
            // NON-BACKTRACKING. It can be commented out with (*)
            Emit(RegexCode.Setjump);
            Emit(RegexCode.Setmark);
            break;

        case RegexNode.Require | AfterChild:
            Emit(RegexCode.Getmark);
            // NOTE: the following line causes lookahead/lookbehind to be
            // NON-BACKTRACKING. It can be commented out with (*)
            Emit(RegexCode.Forejump);
            break;

        case RegexNode.Prevent | BeforeChild:
            Emit(RegexCode.Setjump);
            _intStack.Append(_emitted.Length);
            Emit(RegexCode.Lazybranch, 0);
            break;

        case RegexNode.Prevent | AfterChild:
            Emit(RegexCode.Backjump);
            PatchJump(_intStack.Pop(), _emitted.Length);
            Emit(RegexCode.Forejump);
            break;

        case RegexNode.Atomic | BeforeChild:
            Emit(RegexCode.Setjump);
            break;

        case RegexNode.Atomic | AfterChild:
            Emit(RegexCode.Forejump);
            break;

        // Single-character leaves: the node type is used directly as the opcode.
        case RegexNode.One:
        case RegexNode.Notone:
            Emit(node.Type | bits, node.Ch);
            break;

        case RegexNode.Notoneloop:
        case RegexNode.Notoneloopatomic:
        case RegexNode.Notonelazy:
        case RegexNode.Oneloop:
        case RegexNode.Oneloopatomic:
        case RegexNode.Onelazy:
            // The required M iterations are emitted as a fixed repetition ...
            if (node.M > 0)
            {
                Emit(((node.Type == RegexNode.Oneloop || node.Type == RegexNode.Oneloopatomic || node.Type == RegexNode.Onelazy) ? RegexCode.Onerep : RegexCode.Notonerep) | bits, node.Ch, node.M);
            }
            // ... followed by the loop itself for the remaining N - M optional iterations.
            if (node.N > node.M)
            {
                Emit(node.Type | bits, node.Ch, node.N == int.MaxValue ? int.MaxValue : node.N - node.M);
            }
            break;

        case RegexNode.Setloop:
        case RegexNode.Setloopatomic:
        case RegexNode.Setlazy:
            {
                // Same split as the single-character loops, with the set passed by string code.
                int stringCode = StringCode(node.Str!);
                if (node.M > 0)
                {
                    Emit(RegexCode.Setrep | bits, stringCode, node.M);
                }
                if (node.N > node.M)
                {
                    Emit(node.Type | bits, stringCode, (node.N == int.MaxValue) ? int.MaxValue : node.N - node.M);
                }
            }
            break;

        case RegexNode.Multi:
            Emit(node.Type | bits, StringCode(node.Str!));
            break;

        case RegexNode.Set:
            Emit(node.Type | bits, StringCode(node.Str!));
            break;

        case RegexNode.Ref:
            Emit(node.Type | bits, MapCapnum(node.M));
            break;

        // Leaves that take no operand: the node type is emitted as the opcode.
        case RegexNode.Nothing:
        case RegexNode.Bol:
        case RegexNode.Eol:
        case RegexNode.Boundary:
        case RegexNode.Nonboundary:
        case RegexNode.ECMABoundary:
        case RegexNode.NonECMABoundary:
        case RegexNode.Beginning:
        case RegexNode.Start:
        case RegexNode.EndZ:
        case RegexNode.End:
            Emit(node.Type);
            break;

        default:
            throw new ArgumentException(SR.Format(SR.UnexpectedOpcode, nodetype.ToString()));
    }
}
// Looks for a leading unbounded single-character, inverse-character or set
// loop in the tree and returns its character set as a RegexPrefix. Returns
// null when no such loop is found.
internal static RegexPrefix ScanChars(RegexTree tree)
{
    RegexNode current = tree._root;
    RegexNode sequence = null;
    int siblingIndex = 0;
    string foundSet = null;
    bool ignoreCase = false;

    while (true)
    {
        switch (current._type)
        {
            case 0x1c:
            case 0x20:
                // Look through the wrapper at its first child.
                current = current.Child(0);
                sequence = null;
                continue;

            case 0x19:
                // Start stepping through this node's children.
                if (current.ChildCount() > 0)
                {
                    sequence = current;
                    siblingIndex = 0;
                }
                break;

            case 3:
            case 6:
                // Only an unbounded loop qualifies.
                if (current._n != int.MaxValue)
                {
                    return null;
                }
                foundSet = RegexCharClass.SetFromChar(current._ch);
                ignoreCase = (current._options & RegexOptions.IgnoreCase) != RegexOptions.None;
                break;

            case 4:
            case 7:
                if (current._n != int.MaxValue)
                {
                    return null;
                }
                foundSet = RegexCharClass.SetInverseFromChar(current._ch);
                ignoreCase = (current._options & RegexOptions.IgnoreCase) != RegexOptions.None;
                break;

            case 5:
            case 8:
                // The loop must be unbounded and _str2 must be null or empty.
                if (current._n != int.MaxValue || (current._str2 != null && current._str2.Length != 0))
                {
                    return null;
                }
                foundSet = current._str;
                ignoreCase = (current._options & RegexOptions.IgnoreCase) != RegexOptions.None;
                break;

            // These node types are skipped; the scan moves on to the next sibling.
            case 14:
            case 15:
            case 0x10:
            case 0x12:
            case 0x13:
            case 20:
            case 0x15:
            case 0x17:
            case 30:
            case 0x1f:
            case 0x29:
                break;

            default:
                return null;
        }

        if (foundSet != null)
        {
            return new RegexPrefix(foundSet, ignoreCase);
        }

        // Advance to the next child of the enclosing sequence, if any.
        if (sequence == null || siblingIndex >= sequence.ChildCount())
        {
            return null;
        }

        current = sequence.Child(siblingIndex++);
    }
}
private bool _hasBackreferences; // true if the replacement has any backreferences; otherwise, false

/// <summary>
/// Creates a replacement from a parsed replacement pattern. The pattern is
/// parsed by the same parser as Regex, so <paramref name="concat"/> is
/// expected to be a concatenation whose children are constant strings and
/// backreferences.
/// </summary>
public RegexReplacement(string rep, RegexNode concat, Hashtable _caps)
{
    Debug.Assert(concat.Kind == RegexNodeKind.Concatenate, $"Expected Concatenate, got {concat.Kind}");

    // Stack-backed scratch storage for the literal text, string table and rules.
    var literal = new ValueStringBuilder(stackalloc char[256]);
    FourStackStrings stackStrings = default;
    var literalTable = new ValueListBuilder<string>(MemoryMarshal.CreateSpan(ref stackStrings.Item1!, 4));
    var ruleTable = new ValueListBuilder<int>(stackalloc int[64]);

    int partCount = concat.ChildCount();
    for (int index = 0; index < partCount; index++)
    {
        RegexNode part = concat.Child(index);
        var partKind = part.Kind;

        if (partKind == RegexNodeKind.Multi)
        {
            literal.Append(part.Str!);
        }
        else if (partKind == RegexNodeKind.One)
        {
            literal.Append(part.Ch);
        }
        else if (partKind == RegexNodeKind.Backreference)
        {
            // Emit the literal gathered so far and reuse the builder.
            if (literal.Length > 0)
            {
                ruleTable.Append(literalTable.Length);
                literalTable.Append(literal.AsSpan().ToString());
                literal.Length = 0;
            }

            int group = part.M;
            if (_caps != null && group >= 0)
            {
                group = (int)_caps[group]!;
            }

            // Backreferences are stored as negative rule values.
            ruleTable.Append(-Specials - 1 - group);
            _hasBackreferences = true;
        }
        else
        {
            Debug.Fail($"Unexpected child kind {part.Kind}");
        }
    }

    // Emit any literal text that follows the last backreference.
    if (literal.Length > 0)
    {
        ruleTable.Append(literalTable.Length);
        literalTable.Append(literal.ToString());
    }

    literal.Dispose();

    Pattern = rep;
    _strings = literalTable.AsSpan().ToArray();
    _rules = ruleTable.AsSpan().ToArray();

    ruleTable.Dispose();
}
/*
 * Finds the literal text that every match of the tree must start with.
 * The analysis is deliberately simple: whenever it cannot decide, it
 * returns RegexPrefix.Empty.
 */
internal static RegexPrefix Prefix(RegexTree tree)
{
    RegexNode current = tree._root;
    RegexNode sequence = null;
    int siblingIndex = 0;

    while (true)
    {
        switch (current._type)
        {
            case RegexNode.Greedy:
            case RegexNode.Capture:
                // Look through the wrapper at its single child.
                current = current.Child(0);
                sequence = null;
                continue;

            case RegexNode.One:
                return new RegexPrefix(current._ch.ToString(), (current._options & RegexOptions.IgnoreCase) != 0);

            case RegexNode.Multi:
                return new RegexPrefix(current._str, (current._options & RegexOptions.IgnoreCase) != 0);

            case RegexNode.Oneloop:
            case RegexNode.Onelazy:
                // The guaranteed iterations of a single-character loop form the prefix.
                if (current._m <= 0)
                {
                    return RegexPrefix.Empty;
                }

                return new RegexPrefix(new string(current._ch, current._m), (current._options & RegexOptions.IgnoreCase) != 0);

            case RegexNode.Concatenate:
                // Start stepping through this concatenation's children.
                if (current.ChildCount() > 0)
                {
                    sequence = current;
                    siblingIndex = 0;
                }
                break;

            // These node types are skipped; the scan moves on to the next sibling.
            case RegexNode.Bol:
            case RegexNode.Eol:
            case RegexNode.Boundary:
            case RegexNode.ECMABoundary:
            case RegexNode.Beginning:
            case RegexNode.Start:
            case RegexNode.EndZ:
            case RegexNode.End:
            case RegexNode.Empty:
            case RegexNode.Require:
            case RegexNode.Prevent:
                break;

            default:
                return RegexPrefix.Empty;
        }

        // Advance to the next child of the enclosing concatenation, if any.
        if (sequence == null || siblingIndex >= sequence.ChildCount())
        {
            return RegexPrefix.Empty;
        }

        current = sequence.Child(siblingIndex++);
    }
}
/// <summary>
/// Produces the <see cref="RegexCode"/> for a parsed tree. The tree is walked
/// depth-first without recursion; EmitFragment is called before and after
/// each child of an interior node and once for each leaf. Prefix and anchor
/// information used to optimize FindFirstChar is computed as well.
/// </summary>
public RegexCode RegexCodeFromRegexTree(RegexTree tree)
{
    // A sparse capture-number mapping is only built when some numbers are unused.
    int capsize;
    var capNumbers = tree.CapNumList;
    if (capNumbers != null && tree.CapTop != capNumbers.Length)
    {
        capsize = capNumbers.Length;
        _caps = tree.Caps;
        for (int slot = 0; slot < capNumbers.Length; slot++)
        {
            _caps[capNumbers[slot]] = slot;
        }
    }
    else
    {
        capsize = tree.CapTop;
        _caps = null;
    }

    // The code starts with a lazy branch whose target is patched below,
    // once the position of the final Stop is known.
    Emit(RegexCode.Lazybranch, 0);

    // Walk the tree; _intStack remembers the child index at each level.
    RegexNode current = tree.Root;
    int childIndex = 0;
    for (;;)
    {
        int childCount = current.ChildCount();

        if (childCount != 0 && childIndex < childCount)
        {
            // Descend into the next unvisited child.
            EmitFragment(current.Type | BeforeChild, current, childIndex);
            current = current.Child(childIndex);
            _intStack.Append(childIndex);
            childIndex = 0;
            continue;
        }

        if (childCount == 0)
        {
            // Leaf node.
            EmitFragment(current.Type, current, 0);
        }

        if (_intStack.Length == 0)
        {
            break;
        }

        // Return to the enclosing node and move on to its next child.
        childIndex = _intStack.Pop();
        current = current.Next!;
        EmitFragment(current.Type | AfterChild, current, childIndex);
        childIndex++;
    }

    // Patch the leading Lazybranch, terminate the program and snapshot it.
    PatchJump(0, _emitted.Length);
    Emit(RegexCode.Stop);
    int[] emitted = _emitted.AsSpan().ToArray();

    bool rtl = (tree.Options & RegexOptions.RightToLeft) != 0;

    // Compute prefixes to help optimize FindFirstChar.
    RegexBoyerMoore? bmPrefix = null;
    RegexPrefix? fcPrefix = null;
    RegexPrefix prefix = RegexFCD.Prefix(tree);
    int prefixLength = prefix.Prefix.Length;

    // For a prefix of length <= 1 or > MaxLimit, the first-character set performs better.
    if (prefixLength <= 1 || prefixLength > RegexBoyerMoore.MaxLimit)
    {
        // No suitable leading string: fall back to the set of characters that may start a match.
        fcPrefix = RegexFCD.FirstChars(tree);
    }
    else
    {
        // A single string of sufficient length always begins the expression: search for it with Boyer-Moore.
        CultureInfo culture = (tree.Options & RegexOptions.CultureInvariant) != 0
            ? CultureInfo.InvariantCulture
            : CultureInfo.CurrentCulture;
        bmPrefix = new RegexBoyerMoore(prefix.Prefix, prefix.CaseInsensitive, rtl, culture);
    }

    // Compute any anchors starting the expression.
    int anchors = RegexFCD.Anchors(tree);

    // Lay the string table out as an array indexed by string code.
    var strings = new string[_stringTable.Count];
    foreach (var entry in _stringTable)
    {
        strings[entry.Value] = entry.Key;
    }

    return new RegexCode(tree, emitted, strings, _trackCount, _caps, capsize, bmPrefix, fcPrefix, anchors, rtl);
}
/*
 * Finds the literal text that every match of the tree must start with.
 * The analysis is deliberately simple: whenever it cannot decide, it
 * returns RegexPrefix.Empty.
 */
internal static RegexPrefix Prefix(RegexTree tree)
{
    RegexNode current = tree._root;
    RegexNode sequence = null;
    int siblingIndex = 0;

    while (true)
    {
        switch (current._type)
        {
            case RegexNode.Greedy:
            case RegexNode.Capture:
                // Look through the wrapper at its single child.
                current = current.Child(0);
                sequence = null;
                continue;

            case RegexNode.Multi:
                return new RegexPrefix(current._str, (current._options & RegexOptions.IgnoreCase) != 0);

            case RegexNode.Oneloop:
            case RegexNode.Onelazy:
                // The guaranteed iterations of a single-character loop form the prefix.
                if (current._m <= 0)
                {
                    return RegexPrefix.Empty;
                }

                return new RegexPrefix(new string(current._ch, current._m), (current._options & RegexOptions.IgnoreCase) != 0);

            case RegexNode.Concatenate:
                // Start stepping through this concatenation's children.
                if (current.ChildCount() > 0)
                {
                    sequence = current;
                    siblingIndex = 0;
                }
                break;

            // These node types are skipped; the scan moves on to the next sibling.
            case RegexNode.Bol:
            case RegexNode.Eol:
            case RegexNode.Boundary:
#if ECMA
            case RegexNode.ECMABoundary:
#endif
            case RegexNode.Beginning:
            case RegexNode.Start:
            case RegexNode.EndZ:
            case RegexNode.End:
            case RegexNode.Empty:
            case RegexNode.Require:
            case RegexNode.Prevent:
                break;

            default:
                return RegexPrefix.Empty;
        }

        // Advance to the next child of the enclosing concatenation, if any.
        if (sequence == null || siblingIndex >= sequence.ChildCount())
        {
            return RegexPrefix.Empty;
        }

        current = sequence.Child(siblingIndex++);
    }
}
/*
 * Looks for a leading unbounded []* style construct (a single-character,
 * inverse-character or set loop) and returns its character set. The
 * analysis is deliberately simple: it returns null whenever it cannot decide.
 */
internal static RegexPrefix ScanChars(RegexTree tree)
{
    RegexNode current = tree._root;
    RegexNode sequence = null;
    int siblingIndex = 0;
    String leadingSet = null;
    bool ignoreCase = false;

    while (true)
    {
        switch (current._type)
        {
            case RegexNode.Greedy:
            case RegexNode.Capture:
                // Look through the wrapper at its single child.
                current = current.Child(0);
                sequence = null;
                continue;

            case RegexNode.Concatenate:
                // Start stepping through this concatenation's children.
                if (current.ChildCount() > 0)
                {
                    sequence = current;
                    siblingIndex = 0;
                }
                break;

            case RegexNode.Oneloop:
            case RegexNode.Onelazy:
                // Only an unbounded loop qualifies.
                if (current._n != infinite)
                {
                    return null;
                }
                leadingSet = RegexCharClass.SetFromChar(current._ch);
                ignoreCase = (current._options & RegexOptions.IgnoreCase) != 0;
                break;

            case RegexNode.Notoneloop:
            case RegexNode.Notonelazy:
                if (current._n != infinite)
                {
                    return null;
                }
                leadingSet = RegexCharClass.SetInverseFromChar(current._ch);
                ignoreCase = (current._options & RegexOptions.IgnoreCase) != 0;
                break;

            case RegexNode.Setloop:
            case RegexNode.Setlazy:
                // The loop must be unbounded and _str2 must be null or empty.
                bool hasSecondString = current._str2 != null && current._str2.Length != 0;
                if (current._n != infinite || hasSecondString)
                {
                    return null;
                }
                leadingSet = current._str;
                ignoreCase = (current._options & RegexOptions.IgnoreCase) != 0;
                break;

            // These node types are skipped; the scan moves on to the next sibling.
            case RegexNode.Bol:
            case RegexNode.Eol:
            case RegexNode.Boundary:
#if ECMA
            case RegexNode.ECMABoundary:
#endif
            case RegexNode.Beginning:
            case RegexNode.Start:
            case RegexNode.EndZ:
            case RegexNode.End:
            case RegexNode.Empty:
            case RegexNode.Require:
            case RegexNode.Prevent:
                break;

            default:
                return null;
        }

        if (leadingSet != null)
        {
            return new RegexPrefix(leadingSet, ignoreCase);
        }

        // Advance to the next child of the enclosing concatenation, if any.
        if (sequence == null || siblingIndex >= sequence.ChildCount())
        {
            return null;
        }

        current = sequence.Child(siblingIndex++);
    }
}
/// <summary>
/// Finds the literal text that every match of <paramref name="tree"/> must
/// start with. The analysis is deliberately simple: whenever it cannot
/// decide, it returns <see cref="RegexPrefix.Empty"/>.
/// </summary>
public static RegexPrefix Prefix(RegexTree tree)
{
    // In release, cutoff at a length to which we can still reasonably construct a string
    // In debug, use a smaller cutoff to exercise the cutoff path in tests
    const int Cutoff =
#if DEBUG
        50;
#else
        1_000_000;
#endif

    RegexNode current = tree.Root;
    RegexNode? sequence = null;
    int siblingIndex = 0;

    while (true)
    {
        switch (current.Type)
        {
            case RegexNode.Atomic:
            case RegexNode.Capture:
                // Look through the wrapper at its single child.
                current = current.Child(0);
                sequence = null;
                continue;

            case RegexNode.One:
                return new RegexPrefix(current.Ch.ToString(), (current.Options & RegexOptions.IgnoreCase) != 0);

            case RegexNode.Multi:
                return new RegexPrefix(current.Str!, (current.Options & RegexOptions.IgnoreCase) != 0);

            case RegexNode.Oneloop:
            case RegexNode.Oneloopatomic:
            case RegexNode.Onelazy:
                // The guaranteed iterations of a single-character loop form the prefix.
                if (current.M <= 0 || current.M >= Cutoff)
                {
                    return RegexPrefix.Empty;
                }

                string repeated = new string(current.Ch, current.M);
                return new RegexPrefix(repeated, (current.Options & RegexOptions.IgnoreCase) != 0);

            case RegexNode.Concatenate:
                // Start stepping through this concatenation's children.
                if (current.ChildCount() > 0)
                {
                    sequence = current;
                    siblingIndex = 0;
                }
                break;

            // These node types are skipped; the scan moves on to the next sibling.
            case RegexNode.Bol:
            case RegexNode.Eol:
            case RegexNode.Boundary:
            case RegexNode.ECMABoundary:
            case RegexNode.Beginning:
            case RegexNode.Start:
            case RegexNode.EndZ:
            case RegexNode.End:
            case RegexNode.Empty:
            case RegexNode.Require:
            case RegexNode.Prevent:
                break;

            default:
                return RegexPrefix.Empty;
        }

        // Advance to the next child of the enclosing concatenation, if any.
        if (sequence == null || siblingIndex >= sequence.ChildCount())
        {
            return RegexPrefix.Empty;
        }

        current = sequence.Child(siblingIndex++);
    }
}
// Processes the node, adding any prefix text to the builder.
// Returns whether processing should continue with subsequent nodes, i.e. whether
// the text of whatever follows this node may still be appended to the prefix.
static bool Process(RegexNode node, ref ValueStringBuilder vsb)
{
    if (!StackHelper.TryEnsureSufficientExecutionStack())
    {
        // If we're too deep on the stack, just give up finding any more prefix.
        return false;
    }

    // We don't bother to handle reversed input, so process at most one node
    // when handling RightToLeft.
    bool rtl = (node.Options & RegexOptions.RightToLeft) != 0;

    switch (node.Type)
    {
        // Concatenation
        case RegexNode.Concatenate:
            {
                int childCount = node.ChildCount();
                for (int i = 0; i < childCount; i++)
                {
                    if (!Process(node.Child(i), ref vsb))
                    {
                        return false;
                    }
                }
                return !rtl;
            }

        // Alternation: find a string that's a shared prefix of all branches
        case RegexNode.Alternate:
            {
                int childCount = node.ChildCount();

                // Store the initial branch into the target builder
                int initialLength = vsb.Length;
                bool keepExploring = Process(node.Child(0), ref vsb);
                int addedLength = vsb.Length - initialLength;

                // Then explore the rest of the branches, finding the length
                // a prefix they all share in common with the initial branch.
                // The other branches must also be examined when the initial branch
                // added nothing but allows exploration to continue (e.g. it is only
                // a zero-width anchor): otherwise text following the alternation
                // would be appended to the prefix even though another branch may
                // contribute different leading text, as in "(?:^|abc)def".
                if (addedLength != 0 || keepExploring)
                {
                    var alternateSb = new ValueStringBuilder(64);

                    // Process each branch. If we reach a point where we've proven there's
                    // no overlap and that exploration can't continue, we can bail early.
                    for (int i = 1; i < childCount && (addedLength != 0 || keepExploring); i++)
                    {
                        alternateSb.Length = 0;

                        // Process the branch. We want to keep exploring after this alternation,
                        // but we can't if either this branch doesn't allow for it or if the prefix
                        // supplied by this branch doesn't entirely match all the previous ones.
                        keepExploring &= Process(node.Child(i), ref alternateSb);
                        keepExploring &= alternateSb.Length == addedLength;

                        addedLength = Math.Min(addedLength, alternateSb.Length);
                        for (int j = 0; j < addedLength; j++)
                        {
                            if (vsb[initialLength + j] != alternateSb[j])
                            {
                                addedLength = j;
                                keepExploring = false;
                                break;
                            }
                        }
                    }

                    alternateSb.Dispose();

                    // Then cull back on what was added based on the other branches.
                    vsb.Length = initialLength + addedLength;
                }

                return !rtl && keepExploring;
            }

        // One character
        case RegexNode.One when (node.Options & RegexOptions.IgnoreCase) == 0:
            vsb.Append(node.Ch);
            return !rtl;

        // Multiple characters
        case RegexNode.Multi when (node.Options & RegexOptions.IgnoreCase) == 0:
            vsb.Append(node.Str);
            return !rtl;

        // Loop of one character
        case RegexNode.Oneloop or RegexNode.Oneloopatomic or RegexNode.Onelazy when node.M > 0 && (node.Options & RegexOptions.IgnoreCase) == 0:
            const int SingleCharIterationLimit = 32; // arbitrary cut-off to avoid creating super long strings unnecessarily
            int count = Math.Min(node.M, SingleCharIterationLimit);
            vsb.Append(node.Ch, count);

            // Only continue when the loop matches exactly the appended number of characters.
            return count == node.N && !rtl;

        // Loop of a node
        case RegexNode.Loop or RegexNode.Lazyloop when node.M > 0:
            {
                const int NodeIterationLimit = 4; // arbitrary cut-off to avoid creating super long strings unnecessarily
                int limit = Math.Min(node.M, NodeIterationLimit);
                for (int i = 0; i < limit; i++)
                {
                    if (!Process(node.Child(0), ref vsb))
                    {
                        return false;
                    }
                }

                // Only continue when the loop runs exactly the processed number of iterations.
                return limit == node.N && !rtl;
            }

        // Grouping nodes for which we only care about their single child
        case RegexNode.Atomic:
        case RegexNode.Capture:
            return Process(node.Child(0), ref vsb);

        // Zero-width anchors and assertions
        case RegexNode.Bol:
        case RegexNode.Eol:
        case RegexNode.Boundary:
        case RegexNode.ECMABoundary:
        case RegexNode.NonBoundary:
        case RegexNode.NonECMABoundary:
        case RegexNode.Beginning:
        case RegexNode.Start:
        case RegexNode.EndZ:
        case RegexNode.End:
        case RegexNode.Empty:
        case RegexNode.UpdateBumpalong:
        case RegexNode.Require:
        case RegexNode.Prevent:
            return true;

        // Give up for anything else
        default:
            return false;
    }
}
/// <summary> /// The top level RegexCode generator. It does a depth-first walk /// through the tree and calls EmitFragment to emit code before /// and after each child of an interior node and at each leaf. /// It also computes various information about the tree, such as /// prefix data to help with optimizations. /// </summary> public RegexCode RegexCodeFromRegexTree(RegexTree tree) { // Construct sparse capnum mapping if some numbers are unused. int capsize; if (tree.CapNumList == null || tree.CapTop == tree.CapNumList.Length) { capsize = tree.CapTop; _caps = null; } else { capsize = tree.CapNumList.Length; _caps = tree.Caps; for (int i = 0; i < tree.CapNumList.Length; i++) { _caps[tree.CapNumList[i]] = i; } } // Every written code begins with a lazy branch. This will be back-patched // to point to the ending Stop after the whole expression has been written. Emit(RegexCode.Lazybranch, 0); // Emit every node. RegexNode curNode = tree.Root; int curChild = 0; while (true) { int curNodeChildCount = curNode.ChildCount(); if (curNodeChildCount == 0) { EmitFragment(curNode.Type, curNode, 0); } else if (curChild < curNodeChildCount) { EmitFragment(curNode.Type | BeforeChild, curNode, curChild); curNode = curNode.Child(curChild); _intStack.Append(curChild); curChild = 0; continue; } if (_intStack.Length == 0) { break; } curChild = _intStack.Pop(); curNode = curNode.Next !; EmitFragment(curNode.Type | AfterChild, curNode, curChild); curChild++; } // Patch the starting Lazybranch, emit the final Stop, and get the resulting code array. PatchJump(0, _emitted.Length); Emit(RegexCode.Stop); int[] emitted = _emitted.AsSpan().ToArray(); bool rtl = (tree.Options & RegexOptions.RightToLeft) != 0; bool compiled = (tree.Options & RegexOptions.Compiled) != 0; // Compute prefixes to help optimize FindFirstChar. RegexBoyerMoore?boyerMoorePrefix = null; (string CharClass, bool CaseInsensitive)[]? leadingCharClasses = null;
/// <summary>
/// Scans a regex tree from its root for a leading literal and returns it as a prefix.
/// </summary>
/// <param name="tree">The tree whose <c>_root</c> node the scan starts from.</param>
/// <returns>
/// A <see cref="RegexPrefix"/> built from the first literal-bearing node reached
/// (its text plus whether the node has <see cref="RegexOptions.IgnoreCase"/> set),
/// or <see cref="RegexPrefix.Empty"/> when the scan hits any node type it does not
/// handle or runs out of nodes to visit.
/// </returns>
internal static RegexPrefix Prefix(RegexTree tree)
{
    RegexNode current = tree._root;
    RegexNode sequence = null;  // the type-0x19 (concatenation) node being stepped through, if any
    int nextIndex = 0;          // index of the next child of 'sequence' to visit

    for (;;)
    {
        switch (current._type)
        {
            // String-carrying node (type 12, Multi): its text is the prefix.
            case 12:
                return new RegexPrefix(
                    current._str,
                    (current._options & RegexOptions.IgnoreCase) != RegexOptions.None);

            // Repeated single character (types 3 and 6 -- presumably Oneloop/Onelazy,
            // TODO confirm): the prefix is the character repeated _m times, provided
            // _m is positive.
            case 3:
            case 6:
                if (current._m <= 0)
                {
                    return RegexPrefix.Empty;
                }

                return new RegexPrefix(
                    new string(current._ch, current._m),
                    (current._options & RegexOptions.IgnoreCase) != RegexOptions.None);

            // Wrapper nodes (types 0x1c and 0x20 -- presumably Capture/Greedy,
            // TODO confirm): descend into the only child examined and forget any
            // concatenation that was being walked.
            case 0x1c:
            case 0x20:
                current = current.Child(0);
                sequence = null;
                continue;

            // Concatenation: start walking its children from the first one.
            // An empty concatenation leaves the current walk state untouched.
            case 0x19:
                if (current.ChildCount() > 0)
                {
                    sequence = current;
                    nextIndex = 0;
                }
                break;

            // Node types that are skipped over (presumably zero-width anchors and
            // assertions -- TODO confirm against the RegexNode constants).
            case 14:
            case 15:
            case 0x10:
            case 0x12:
            case 0x13:
            case 20:
            case 0x15:
            case 0x17:
            case 30:
            case 0x1f:
            case 0x29:
                break;

            // Anything else ends the search with no prefix.
            default:
                return RegexPrefix.Empty;
        }

        // Advance to the next sibling in the concatenation being walked; with no
        // concatenation, or no children left, there is no prefix.
        if (sequence == null || nextIndex >= sequence.ChildCount())
        {
            return RegexPrefix.Empty;
        }

        current = sequence.Child(nextIndex);
        nextIndex++;
    }
}
// Recursively analyzes 'node' and its descendants, recording findings in 'results':
// which options were seen, which nodes are inside loops, which are atomic because of
// an ancestor, which contain captures, and which may backtrack.
//   node               - the node to analyze.
//   results            - the accumulator that is updated in place.
//   isAtomicByAncestor - true when the parent determined this node should be treated as atomic.
//   isInLoop           - true when an ancestor is a Loop or Lazyloop node.
// Returns false if the analysis had to be abandoned because the stack helper reported
// insufficient execution stack (here or in any descendant); otherwise true.
static bool TryAnalyze(RegexNode node, AnalysisResults results, bool isAtomicByAncestor, bool isInLoop)
{
    if (!StackHelper.TryEnsureSufficientExecutionStack())
    {
        return false;
    }

    // Remember whether any node in the tree carries these options.
    if ((node.Options & RegexOptions.IgnoreCase) != 0)
    {
        results._hasIgnoreCase = true;
    }

    if ((node.Options & RegexOptions.RightToLeft) != 0)
    {
        results._hasRightToLeft = true;
    }

    // Remember every node that lives inside a loop.
    if (isInLoop)
    {
        (results._inLoops ??= new HashSet<RegexNode>()).Add(node);
    }

    if (isAtomicByAncestor)
    {
        // The parent decided nothing can backtrack into this node.
        results._isAtomicByAncestor.Add(node);
    }
    else
    {
        // Some node kinds introduce backtracking on their own: alternations, and any
        // loop whose minimum and maximum iteration counts differ. Nodes that only
        // *contain* backtracking are added later, after the children are examined.
        bool backtracksBySelf = node.Kind switch
        {
            RegexNodeKind.Alternate => true,

            RegexNodeKind.Loop or RegexNodeKind.Lazyloop or
            RegexNodeKind.Oneloop or RegexNodeKind.Notoneloop or RegexNodeKind.Setloop or
            RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy => node.M != node.N,

            _ => false,
        };

        if (backtracksBySelf)
        {
            (results._mayBacktrack ??= new HashSet<RegexNode>()).Add(node);
        }
    }

    // Atomic groups and lookarounds make whatever they wrap atomic, regardless of
    // what the caller passed in.
    bool isAtomicBySelf = node.Kind is
        RegexNodeKind.Atomic or
        RegexNodeKind.NegativeLookaround or
        RegexNodeKind.PositiveLookaround;

    if (node.Kind == RegexNodeKind.Capture)
    {
        // A capture node trivially contains a capture.
        results._containsCapture.Add(node);
    }
    else if (node.Kind is RegexNodeKind.Loop or RegexNodeKind.Lazyloop)
    {
        // Everything beneath this node is inside a loop.
        isInLoop = true;
    }

    bool atomicContext = isAtomicByAncestor || isAtomicBySelf;
    int childCount = node.ChildCount();
    for (int i = 0; i < childCount; i++)
    {
        RegexNode child = node.Child(i);

        // Decide whether atomicity applying to this node carries over to the child.
        // It can only do so when this node is atomic in the first place.
        bool treatChildAsAtomic = false;
        if (atomicContext)
        {
            switch (node.Kind)
            {
                // Atomic groups and lookarounds pass atomicity straight down.
                case RegexNodeKind.Atomic:
                case RegexNodeKind.NegativeLookaround:
                case RegexNodeKind.PositiveLookaround:
                // Branches of alternations and conditionals are considered
                // independently, so atomicity applies to each branch.
                case RegexNodeKind.Alternate:
                case RegexNodeKind.BackreferenceConditional:
                case RegexNodeKind.ExpressionConditional:
                // Captures don't affect atomicity.
                case RegexNodeKind.Capture:
                    treatChildAsAtomic = true;
                    break;

                // Only the final node of a concatenation inherits its atomicity.
                case RegexNodeKind.Concatenate:
                    treatChildAsAtomic = i == childCount - 1;
                    break;

                // A loop that iterates at most once can't have one iteration consume
                // input destined for a later one, so its body inherits atomicity.
                case RegexNodeKind.Loop:
                case RegexNodeKind.Lazyloop:
                    treatChildAsAtomic = node.N == 1;
                    break;

                // Any other parent kind: don't try to prove atomicity.
            }
        }

        if (!TryAnalyze(child, results, treatChildAsAtomic, isInLoop))
        {
            return false;
        }

        // Capture containment propagates upward from child to parent.
        if (results._containsCapture.Contains(child))
        {
            results._containsCapture.Add(node);
        }

        // Possible backtracking also propagates upward, unless this node is itself an
        // atomic boundary. Atomicity inherited from ancestors is deliberately ignored
        // here: the parent needs to know whether backtracking is visible from this node.
        if (!isAtomicBySelf &&
            results._mayBacktrack is { } backtrackers &&
            backtrackers.Contains(child))
        {
            backtrackers.Add(node);
        }
    }

    // The whole subtree was analyzed.
    return true;
}
/// <summary>
/// Applies whole-tree optimizations to this node, which is expected to be the implicit
/// root capture with exactly one child, before the tree is used.
/// </summary>
/// <returns>The root node of the optimized tree (this node).</returns>
internal RegexNode FinalOptimize()
{
    RegexNode root = this;
    Debug.Assert(root.Type == Capture && root.ChildCount() == 1);

    // A backtracking construct at the very end of the expression can be made atomic,
    // because nothing follows it that could ever backtrack into it. This is limited to
    // left-to-right expressions (to avoid extra code for the rarer right-to-left case)
    // that are being compiled (the only time it makes a meaningful difference).
    bool isLeftToRight = (Options & RegexOptions.RightToLeft) == 0;
    if (isLeftToRight && (Options & RegexOptions.Compiled) != 0)
    {
        // Follow the trailing edge of the tree, starting below the implicit root capture.
        RegexNode current = root.Child(0);
        bool keepWalking = true;
        while (keepWalking)
        {
            switch (current.Type)
            {
                // Step through an existing atomic group to what it wraps.
                case Atomic:
                    current = current.Child(0);
                    break;

                // For captures and concatenations only the last child matters.
                case Capture:
                case Concatenate:
                {
                    int lastIndex = current.ChildCount() - 1;
                    RegexNode last = current.Child(lastIndex);
                    if (last.Type == Alternate || last.Type == Loop || last.Type == Lazyloop)
                    {
                        // Wrap the trailing backtracking construct in an atomic group.
                        // 'current' is left unchanged, so the next pass revisits it and
                        // sees the replacement child.
                        RegexNode wrapper = new RegexNode(Atomic, Options);
                        wrapper.AddChild(last);
                        current.ReplaceChild(lastIndex, wrapper);
                    }
                    else
                    {
                        current = last;
                    }
                    break;
                }

                // A trailing single-character, negated-character, or set loop is
                // converted in place to its atomic counterpart, ending the walk.
                case Oneloop:
                    current.Type = Oneloopatomic;
                    keepWalking = false;
                    break;

                case Notoneloop:
                    current.Type = Notoneloopatomic;
                    keepWalking = false;
                    break;

                case Setloop:
                    current.Type = Setloopatomic;
                    keepWalking = false;
                    break;

                // Nothing more can be done for any other node type.
                default:
                    keepWalking = false;
                    break;
            }
        }
    }

    // An Atomic directly beneath the implicit capture is pointless, since nothing can
    // backtrack into it; replace it with its own child.
    RegexNode top = root.Child(0);
    if (top.Type == Atomic)
    {
        root.ReplaceChild(0, top.Child(0));
    }

    return root;
}