/// <summary>
/// Folds a chain of directly nested repeaters into one by multiplying their
/// bounds. Folding stops at the first nested node that is not a compatible
/// repeater, or whose bounds are too "lumpy" to be merged.
/// </summary>
/// <returns>
/// The innermost repeater that was folded into (its M and N are updated in place),
/// or a new Nothing node when the combined minimum equals int.MaxValue.
/// </returns>
private RegexNode ReduceRep()
{
    int outerType = Type();
    int combinedMin = M;
    int combinedMax = N;
    RegexNode current = this;

    while (current.ChildCount() != 0)
    {
        RegexNode inner = current.Child(0);
        int innerType = inner.Type();

        // Only repeaters of the same flavour are multiplied: either the exact same
        // type, or a one/notone/set loop nested in a loop of matching greediness.
        if (innerType != outerType)
        {
            bool greedyPair = outerType == Loop && innerType >= Oneloop && innerType <= Setloop;
            bool lazyPair = outerType == Lazyloop && innerType >= Onelazy && innerType <= Setlazy;

            if (!greedyPair && !lazyPair)
            {
                break;
            }
        }

        // The child can be too lumpy to blur, e.g. (a{100,105}){3} or (a{2,})?
        // [but things like (a{2,})+ are not too lumpy...]
        bool optionalOverMulti = current.M == 0 && inner.M > 1;
        bool narrowRange = inner.N < inner.M * 2;
        if (optionalOverMulti || narrowRange)
        {
            break;
        }

        current = inner;

        // Multiply the bounds, saturating at int.MaxValue instead of overflowing.
        if (current.M > 0)
        {
            combinedMin = (int.MaxValue - 1) / current.M < combinedMin
                ? int.MaxValue
                : current.M * combinedMin;
            current.M = combinedMin;
        }

        if (current.N > 0)
        {
            combinedMax = (int.MaxValue - 1) / current.N < combinedMax
                ? int.MaxValue
                : current.N * combinedMax;
            current.N = combinedMax;
        }
    }

    if (combinedMin == int.MaxValue)
    {
        return new RegexNode(Nothing, Options);
    }

    return current;
}
/// <summary>
/// A related computation: inspects the start of a RegexTree and returns the
/// literal leading substring if one is found. The scan is intentionally
/// simple and gives up (returning <c>RegexPrefix.Empty</c>) at the first node
/// type it does not handle.
/// </summary>
/// <param name="tree">The tree whose root is the starting point of the scan.</param>
/// <returns>The prefix together with its case-insensitivity flag, or <c>RegexPrefix.Empty</c>.</returns>
public static RegexPrefix Prefix(RegexTree tree)
{
    // In release, cut off at a length for which a string can still reasonably be built.
    // In debug, use a smaller cutoff so that tests exercise the cutoff path.
    const int Cutoff =
#if DEBUG
        50;
#else
        1_000_000;
#endif

    bool IgnoresCase(RegexNode n) => (n.Options & RegexOptions.IgnoreCase) != 0;

    RegexNode node = tree.Root;
    RegexNode pendingConcat = null;
    int pendingChild = 0;

    while (true)
    {
        switch (node.NType)
        {
            case RegexNode.Greedy:
            case RegexNode.Capture:
                // Look through the wrapper at its first child and forget any
                // concatenation that was being walked.
                node = node.Child(0);
                pendingConcat = null;
                continue;

            case RegexNode.Concatenate:
                if (node.ChildCount() > 0)
                {
                    pendingConcat = node;
                    pendingChild = 0;
                }
                break;

            case RegexNode.One:
                return new RegexPrefix(node.Ch.ToString(), IgnoresCase(node));

            case RegexNode.Multi:
                return new RegexPrefix(node.Str, IgnoresCase(node));

            case RegexNode.Oneloop:
            case RegexNode.Onelazy:
                // Only the mandatory repetitions contribute to the prefix.
                if (node.M <= 0 || node.M >= Cutoff)
                {
                    return RegexPrefix.Empty;
                }

                return new RegexPrefix(new string(node.Ch, node.M), IgnoresCase(node));

            case RegexNode.Bol:
            case RegexNode.Eol:
            case RegexNode.Boundary:
            case RegexNode.ECMABoundary:
            case RegexNode.Beginning:
            case RegexNode.Start:
            case RegexNode.EndZ:
            case RegexNode.End:
            case RegexNode.Empty:
            case RegexNode.Require:
            case RegexNode.Prevent:
                // These nodes are stepped over; the scan moves on to the next sibling.
                break;

            default:
                return RegexPrefix.Empty;
        }

        // Advance to the next child of the concatenation being walked, if any is left.
        if (pendingConcat == null || pendingChild >= pendingConcat.ChildCount())
        {
            return RegexPrefix.Empty;
        }

        node = pendingConcat.Child(pendingChild++);
    }
}
/// <summary>
/// The top-level RegexCode generator. It walks the tree depth-first, calling
/// EmitFragment before and after each child of an interior node and once at
/// each leaf, then packages the emitted codes together with the prefix,
/// first-character and anchor information into a RegexCode.
/// </summary>
/// <param name="tree">The parsed regular expression tree to generate code for.</param>
/// <returns>The RegexCode built from the emitted instruction stream.</returns>
public RegexCode RegexCodeFromRegexTree(RegexTree tree)
{
    // Construct a sparse capnum mapping only if some capture numbers are unused.
    int capsize;
    bool needsSparseMap = tree.CapNumList != null && tree.CapTop != tree.CapNumList.Length;
    if (needsSparseMap)
    {
        capsize = tree.CapNumList.Length;
        _caps = tree.Caps;

        for (int slot = 0; slot < tree.CapNumList.Length; slot++)
        {
            _caps[tree.CapNumList[slot]] = slot;
        }
    }
    else
    {
        capsize = tree.CapTop;
        _caps = null;
    }

    RegexNode node = tree.Root;
    int childIndex = 0;

    // Placeholder branch at position 0; its target is patched once the walk is done.
    Emit(RegexCode.Lazybranch, 0);

    while (true)
    {
        bool isLeaf = node.Children == null;

        if (!isLeaf && childIndex < node.Children.Count)
        {
            // Descend. EmitFragment must run before the push below because it may
            // itself push onto _intStack.
            EmitFragment(node.NType | BeforeChild, node, childIndex);
            node = node.Children[childIndex];
            _intStack.Push(childIndex);
            childIndex = 0;
            continue;
        }

        if (isLeaf)
        {
            EmitFragment(node.NType, node, 0);
        }

        if (_intStack.Count == 0)
        {
            break;
        }

        // Step back via Next, restore the child index saved on descent, and emit
        // the after-child fragment before moving on to the following child.
        childIndex = _intStack.Pop();
        node = node.Next;
        EmitFragment(node.NType | AfterChild, node, childIndex);
        childIndex++;
    }

    PatchJump(0, _emitted.Count);
    Emit(RegexCode.Stop);

    RegexPrefix? fcPrefix = RegexFCD.FirstChars(tree);
    RegexPrefix prefix = RegexFCD.Prefix(tree);

    bool rtl = (tree.Options & RegexOptions.RightToLeft) != 0;
    bool invariant = (tree.Options & RegexOptions.CultureInvariant) != 0;
    CultureInfo culture = invariant ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture;

    // A Boyer-Moore matcher is only built when a non-empty literal prefix exists.
    RegexBoyerMoore bmPrefix = prefix.Prefix.Length > 0
        ? new RegexBoyerMoore(prefix.Prefix, prefix.CaseInsensitive, rtl, culture)
        : null;

    int anchors = RegexFCD.Anchors(tree);
    int[] codes = _emitted.ToArray();

    return new RegexCode(codes, _stringTable, _trackCount, _caps, capsize, bmPrefix, fcPrefix, anchors, rtl);
}
/// <summary>
/// FC computation and shortcut cases for each node type. Called for every visit
/// of a node (leaf, before-child or after-child); it pushes, merges or adjusts
/// RegexFC entries on the FC stack accordingly.
/// </summary>
/// <param name="NodeType">The node type, possibly combined with BeforeChild or AfterChild.</param>
/// <param name="node">The node being visited.</param>
/// <param name="CurIndex">Index of the child being entered or left; unused by leaf cases.</param>
/// <exception cref="ArgumentException">The node type is not one of the handled values.</exception>
private void CalculateFC(int NodeType, RegexNode node, int CurIndex)
{
    // Options are only read for node types not greater than RegexNode.Ref.
    bool readsOptions = NodeType <= RegexNode.Ref;
    bool ci = readsOptions && (node.Options & RegexOptions.IgnoreCase) != 0;
    bool rtl = readsOptions && (node.Options & RegexOptions.RightToLeft) != 0;

    // Pops the newest FC and adds it to the one now on top; a rejected add marks failure.
    void MergeIntoPrevious(bool concatenate)
    {
        RegexFC popped = PopFC();
        RegexFC accumulated = TopFC();
        _failed = !accumulated.AddFC(popped, concatenate);
    }

    switch (NodeType)
    {
        // Visits that need no FC work.
        case RegexNode.Concatenate | BeforeChild:
        case RegexNode.Alternate | BeforeChild:
        case RegexNode.Testref | BeforeChild:
        case RegexNode.Loop | BeforeChild:
        case RegexNode.Lazyloop | BeforeChild:
        case RegexNode.Group | BeforeChild:
        case RegexNode.Group | AfterChild:
        case RegexNode.Capture | BeforeChild:
        case RegexNode.Capture | AfterChild:
        case RegexNode.Greedy | BeforeChild:
        case RegexNode.Greedy | AfterChild:
        case RegexNode.Require | AfterChild:
        case RegexNode.Prevent | AfterChild:
            break;

        case RegexNode.Testgroup | BeforeChild:
            // The first child of a Testgroup is skipped.
            if (CurIndex == 0)
            {
                SkipChild();
            }
            break;

        case RegexNode.Require | BeforeChild:
        case RegexNode.Prevent | BeforeChild:
            // The child is skipped and a RegexFC(true) stands in for it.
            SkipChild();
            PushFC(new RegexFC(true));
            break;

        case RegexNode.Concatenate | AfterChild:
            if (CurIndex != 0)
            {
                MergeIntoPrevious(true);
            }

            // Once the accumulated FC is not nullable, remaining children are skipped.
            if (!TopFC()._nullable)
            {
                _skipAllChildren = true;
            }
            break;

        case RegexNode.Testgroup | AfterChild:
            if (CurIndex > 1)
            {
                MergeIntoPrevious(false);
            }
            break;

        case RegexNode.Alternate | AfterChild:
        case RegexNode.Testref | AfterChild:
            if (CurIndex != 0)
            {
                MergeIntoPrevious(false);
            }
            break;

        case RegexNode.Loop | AfterChild:
        case RegexNode.Lazyloop | AfterChild:
            // A loop with a zero minimum makes the FC on top nullable.
            if (node.M == 0)
            {
                TopFC()._nullable = true;
            }
            break;

        case RegexNode.One:
        case RegexNode.Notone:
            PushFC(new RegexFC(node.Ch, NodeType == RegexNode.Notone, false, ci));
            break;

        case RegexNode.Oneloop:
        case RegexNode.Onelazy:
        case RegexNode.Notoneloop:
        case RegexNode.Notonelazy:
            {
                bool negated = NodeType == RegexNode.Notoneloop || NodeType == RegexNode.Notonelazy;
                PushFC(new RegexFC(node.Ch, negated, node.M == 0, ci));
                break;
            }

        case RegexNode.Multi:
            if (node.Str.Length == 0)
            {
                PushFC(new RegexFC(true));
            }
            else
            {
                // Right-to-left uses the last character of the string, otherwise the first.
                char leading = rtl ? node.Str[node.Str.Length - 1] : node.Str[0];
                PushFC(new RegexFC(leading, false, false, ci));
            }
            break;

        case RegexNode.Set:
            PushFC(new RegexFC(node.Str, false, ci));
            break;

        case RegexNode.Setloop:
        case RegexNode.Setlazy:
            PushFC(new RegexFC(node.Str, node.M == 0, ci));
            break;

        case RegexNode.Ref:
            PushFC(new RegexFC(RegexCharClass.AnyClass, true, false));
            break;

        case RegexNode.Empty:
        case RegexNode.Nothing:
        case RegexNode.Bol:
        case RegexNode.Eol:
        case RegexNode.Boundary:
        case RegexNode.Nonboundary:
        case RegexNode.ECMABoundary:
        case RegexNode.NonECMABoundary:
        case RegexNode.Beginning:
        case RegexNode.Start:
        case RegexNode.EndZ:
        case RegexNode.End:
            PushFC(new RegexFC(true));
            break;

        default:
            throw new ArgumentException($"Unexpected opcode in regular expression generation: {NodeType.ToString(CultureInfo.CurrentCulture)}.");
    }
}
/// <summary>
/// Emits the RegexCode instructions for a single visit of a node. The tree
/// walker calls this once for each leaf, and before and after each child of an
/// interior node. Jump positions that must be patched later are kept on
/// _intStack between the matching before/after visits.
/// </summary>
/// <param name="nodetype">The node type, possibly combined with BeforeChild or AfterChild.</param>
/// <param name="node">The node being visited.</param>
/// <param name="curIndex">Index of the child being entered or left; unused by leaf cases.</param>
/// <exception cref="ArgumentException">The node type is not one of the handled values.</exception>
private void EmitFragment(int nodetype, RegexNode node, int curIndex)
{
    // Modifier bits are only computed for node types not greater than RegexNode.Ref.
    int bits = 0;
    if (nodetype <= RegexNode.Ref)
    {
        if (node.UseOptionR())
        {
            bits |= RegexCode.Rtl;
        }

        if ((node.Options & RegexOptions.IgnoreCase) != 0)
        {
            bits |= RegexCode.Ci;
        }
    }

    // Operand for the optional part of a repeater: N - M, or int.MaxValue when unbounded.
    int OptionalCount() => node.N == int.MaxValue ? int.MaxValue : node.N - node.M;

    switch (nodetype)
    {
        // Visits that emit nothing.
        case RegexNode.Concatenate | BeforeChild:
        case RegexNode.Concatenate | AfterChild:
        case RegexNode.Group | BeforeChild:
        case RegexNode.Group | AfterChild:
        case RegexNode.Empty:
            break;

        case RegexNode.Alternate | BeforeChild:
            // Every branch except the last starts with a Lazybranch to be patched later.
            if (curIndex < node.Children.Count - 1)
            {
                _intStack.Push(_emitted.Count);
                Emit(RegexCode.Lazybranch, 0);
            }
            break;

        case RegexNode.Alternate | AfterChild:
            if (curIndex < node.Children.Count - 1)
            {
                // End this branch with a Goto (patched after the last branch) and
                // point the branch's Lazybranch at the code that follows.
                int lazyBranchPos = _intStack.Pop();
                _intStack.Push(_emitted.Count);
                Emit(RegexCode.Goto, 0);
                PatchJump(lazyBranchPos, _emitted.Count);
            }
            else
            {
                // Last branch: patch the Goto left behind by each earlier branch.
                for (int remaining = curIndex; remaining > 0; remaining--)
                {
                    PatchJump(_intStack.Pop(), _emitted.Count);
                }
            }
            break;

        case RegexNode.Testref | BeforeChild:
            if (curIndex == 0)
            {
                Emit(RegexCode.Setjump);
                _intStack.Push(_emitted.Count);
                Emit(RegexCode.Lazybranch, 0);
                Emit(RegexCode.Testref, MapCapnum(node.M));
                Emit(RegexCode.Forejump);
            }
            break;

        case RegexNode.Testref | AfterChild:
            if (curIndex == 0)
            {
                int branchPos = _intStack.Pop();
                _intStack.Push(_emitted.Count);
                Emit(RegexCode.Goto, 0);
                PatchJump(branchPos, _emitted.Count);
                Emit(RegexCode.Forejump);

                // With no second child, the Goto is patched right away.
                if (node.Children.Count <= 1)
                {
                    PatchJump(_intStack.Pop(), _emitted.Count);
                }
            }
            else if (curIndex == 1)
            {
                PatchJump(_intStack.Pop(), _emitted.Count);
            }
            break;

        case RegexNode.Testgroup | BeforeChild:
            if (curIndex == 0)
            {
                Emit(RegexCode.Setjump);
                Emit(RegexCode.Setmark);
                _intStack.Push(_emitted.Count);
                Emit(RegexCode.Lazybranch, 0);
            }
            break;

        case RegexNode.Testgroup | AfterChild:
            if (curIndex == 0)
            {
                Emit(RegexCode.Getmark);
                Emit(RegexCode.Forejump);
            }
            else if (curIndex == 1)
            {
                int branchPos = _intStack.Pop();
                _intStack.Push(_emitted.Count);
                Emit(RegexCode.Goto, 0);
                PatchJump(branchPos, _emitted.Count);
                Emit(RegexCode.Getmark);
                Emit(RegexCode.Forejump);

                // With no third child, the Goto is patched right away.
                if (node.Children.Count <= 2)
                {
                    PatchJump(_intStack.Pop(), _emitted.Count);
                }
            }
            else if (curIndex == 2)
            {
                PatchJump(_intStack.Pop(), _emitted.Count);
            }
            break;

        case RegexNode.Loop | BeforeChild:
        case RegexNode.Lazyloop | BeforeChild:
            {
                bool counted = node.N < int.MaxValue || node.M > 1;
                bool optional = node.M == 0;

                if (!counted)
                {
                    Emit(optional ? RegexCode.Nullmark : RegexCode.Setmark);
                }
                else if (optional)
                {
                    Emit(RegexCode.Nullcount, 0);
                }
                else
                {
                    Emit(RegexCode.Setcount, 1 - node.M);
                }

                // A loop with a zero minimum first jumps to the loop test (patched in AfterChild).
                if (optional)
                {
                    _intStack.Push(_emitted.Count);
                    Emit(RegexCode.Goto, 0);
                }

                // Remember where the loop body starts.
                _intStack.Push(_emitted.Count);
                break;
            }

        case RegexNode.Loop | AfterChild:
        case RegexNode.Lazyloop | AfterChild:
            {
                int loopTestPos = _emitted.Count;
                int lazyOffset = nodetype - (RegexNode.Loop | AfterChild);
                int bodyStart = _intStack.Pop();

                if (node.N < int.MaxValue || node.M > 1)
                {
                    Emit(RegexCode.Branchcount + lazyOffset, bodyStart, OptionalCount());
                }
                else
                {
                    Emit(RegexCode.Branchmark + lazyOffset, bodyStart);
                }

                if (node.M == 0)
                {
                    PatchJump(_intStack.Pop(), loopTestPos);
                }
                break;
            }

        case RegexNode.Capture | BeforeChild:
            Emit(RegexCode.Setmark);
            break;

        case RegexNode.Capture | AfterChild:
            Emit(RegexCode.Capturemark, MapCapnum(node.M), MapCapnum(node.N));
            break;

        case RegexNode.Require | BeforeChild:
            // NOTE: the following line causes lookahead/lookbehind to be
            // NON-BACKTRACKING. It can be commented out with (*)
            Emit(RegexCode.Setjump);

            Emit(RegexCode.Setmark);
            break;

        case RegexNode.Require | AfterChild:
            Emit(RegexCode.Getmark);

            // NOTE: the following line causes lookahead/lookbehind to be
            // NON-BACKTRACKING. It can be commented out with (*)
            Emit(RegexCode.Forejump);
            break;

        case RegexNode.Prevent | BeforeChild:
            Emit(RegexCode.Setjump);
            _intStack.Push(_emitted.Count);
            Emit(RegexCode.Lazybranch, 0);
            break;

        case RegexNode.Prevent | AfterChild:
            Emit(RegexCode.Backjump);
            PatchJump(_intStack.Pop(), _emitted.Count);
            Emit(RegexCode.Forejump);
            break;

        case RegexNode.Greedy | BeforeChild:
            Emit(RegexCode.Setjump);
            break;

        case RegexNode.Greedy | AfterChild:
            Emit(RegexCode.Forejump);
            break;

        case RegexNode.One:
        case RegexNode.Notone:
            Emit(node.NType | bits, node.Ch);
            break;

        case RegexNode.Notoneloop:
        case RegexNode.Notonelazy:
        case RegexNode.Oneloop:
        case RegexNode.Onelazy:
            // Mandatory repetitions are emitted as a fixed rep, the rest as the loop itself.
            if (node.M > 0)
            {
                bool matchesChar = node.NType == RegexNode.Oneloop || node.NType == RegexNode.Onelazy;
                int repCode = matchesChar ? RegexCode.Onerep : RegexCode.Notonerep;
                Emit(repCode | bits, node.Ch, node.M);
            }

            if (node.N > node.M)
            {
                Emit(node.NType | bits, node.Ch, OptionalCount());
            }
            break;

        case RegexNode.Setloop:
        case RegexNode.Setlazy:
            // StringCode is invoked per emitted instruction, exactly where an operand is needed.
            if (node.M > 0)
            {
                Emit(RegexCode.Setrep | bits, StringCode(node.Str), node.M);
            }

            if (node.N > node.M)
            {
                Emit(node.NType | bits, StringCode(node.Str), OptionalCount());
            }
            break;

        case RegexNode.Multi:
        case RegexNode.Set:
            Emit(node.NType | bits, StringCode(node.Str));
            break;

        case RegexNode.Ref:
            Emit(node.NType | bits, MapCapnum(node.M));
            break;

        case RegexNode.Nothing:
        case RegexNode.Bol:
        case RegexNode.Eol:
        case RegexNode.Boundary:
        case RegexNode.Nonboundary:
        case RegexNode.ECMABoundary:
        case RegexNode.NonECMABoundary:
        case RegexNode.Beginning:
        case RegexNode.Start:
        case RegexNode.EndZ:
        case RegexNode.End:
            Emit(node.NType);
            break;

        default:
            throw new ArgumentException($"Unexpected opcode in regular expression generation: {nodetype.ToString()}.");
    }
}