/// <summary>
/// Builds the replacement node for a circumflex anchor: a <c>Require</c> node whose
/// options are forced to right-to-left, i.e. a lookbehind such as
/// <c>(?&lt;=(?&gt;\A|\r\n|[\r\n]))</c>.
/// </summary>
/// <returns>The <c>Require</c> node wrapping the newline check.</returns>
private RegexNode CreatePseudoCircumflexNode()
{
    // The check has to look backwards, so RightToLeft is switched on for every node built here.
    var rtlOptions = _options | RegexOptions.RightToLeft;

    // Accept the configured newline kinds and, additionally, the start of the document.
    var newlineCheck = CreateNewLineParseNode(GetNewlineTypes() | NewlineTypes.DocumentStart, rtlOptions);

    var lookbehindNode = new RegexNode(RegexNode.Require, rtlOptions);
    lookbehindNode.AddChild(newlineCheck);
    return lookbehindNode;
}
/// <summary>
/// Appends a child to this node. The child is first passed through
/// <c>Reduce()</c>; the reduced node is what gets stored, and its
/// <c>_next</c> link is pointed back at this node.
/// </summary>
/// <param name="newChild">The node to reduce and append.</param>
internal void AddChild(RegexNode newChild)
{
    // The child list is created lazily on first use.
    if (_children == null)
    {
        _children = new List<RegexNode>(4);
    }

    RegexNode reduced = newChild.Reduce();
    _children.Add(reduced);
    reduced._next = this;
}
/// <summary>
/// Builds the replacement node for a dollar anchor. With option M active this is a
/// left-to-right <c>Require</c> node (a lookahead such as <c>(?=(?&gt;\z|\r\n|[\r\n]))</c>);
/// otherwise the result of <c>CreatePseudoEndZNode()</c> is returned.
/// </summary>
/// <returns>The node standing in for the dollar anchor.</returns>
private RegexNode CreatePseudoDollarNode()
{
    if (UseOptionM())
    {
        // The lookahead must scan forwards, so RightToLeft is cleared.
        var ltrOptions = _options & ~RegexOptions.RightToLeft;

        // Accept the configured newline kinds and, additionally, the end of the document.
        var newlineCheck = CreateNewLineParseNode(GetNewlineTypes() | NewlineTypes.DocumentEnd, ltrOptions);

        var result = new RegexNode(RegexNode.Require, ltrOptions);
        result.AddChild(newlineCheck);
        return result;
    }

    // Could have been handled by a simple \z
    Debug.Assert(!UseOptionDollarEndOnly());
    return CreatePseudoEndZNode();
}
/// <summary>
/// Builds a left-to-right <c>Require</c> node (a lookahead such as
/// <c>(?=(?&gt;\r\n|[\r\n])?\z)</c>): an optional newline followed by <c>End</c>.
/// </summary>
/// <returns>The <c>Require</c> node wrapping the concatenation.</returns>
private RegexNode CreatePseudoEndZNode()
{
    // The lookahead must scan forwards, so RightToLeft is cleared.
    var ltrOptions = _options & ~RegexOptions.RightToLeft;

    // Optional newline: a Loop with min 0 and max 1 around the newline check.
    // AddChild reduces its argument at insertion time, so children are attached
    // bottom-up in the same order as before.
    var newlineCheck = CreateNewLineParseNode(GetNewlineTypes(), ltrOptions);
    var optionalNewline = new RegexNode(RegexNode.Loop, ltrOptions, 0, 1);
    optionalNewline.AddChild(newlineCheck);

    // optional-newline followed by End
    var sequence = new RegexNode(RegexNode.Concatenate, ltrOptions);
    sequence.AddChild(optionalNewline);
    var endAnchor = new RegexNode(RegexNode.End, ltrOptions);
    sequence.AddChild(endAnchor);

    var result = new RegexNode(RegexNode.Require, ltrOptions);
    result.AddChild(sequence);
    return result;
}
/// <summary>
/// The top level RegexCode generator. It walks the tree depth-first and calls
/// EmitFragment to emit code before and after each child of an interior node,
/// and once at each leaf, then packages the emitted code together with the
/// prefix, first-character and anchor information into a RegexCode.
/// </summary>
/// <param name="tree">The parsed regex tree to generate code for.</param>
/// <returns>The generated <see cref="RegexCode"/>.</returns>
public RegexCode RegexCodeFromRegexTree(RegexTree tree)
{
    // Construct a sparse capnum mapping if some capture numbers are unused.
    int capsize;
    var capNumList = tree.CapNumList;
    if (capNumList != null && tree.CapTop != capNumList.Length)
    {
        capsize = capNumList.Length;
        _caps = tree.Caps;
        for (int slot = 0; slot < capNumList.Length; slot++)
        {
            _caps[capNumList[slot]] = slot;
        }
    }
    else
    {
        capsize = tree.CapTop;
        _caps = null;
    }

    _capPositions = new int[capsize];

    RegexNode current = tree.Root;
    int childIndex = 0;

    // Placeholder branch at position 0; its target is patched once the walk is done.
    Emit(RegexCode.Lazybranch, 0);

    while (true)
    {
        var children = current.Children;

        if (children != null && childIndex < children.Count)
        {
            // Interior node with a child still to visit: emit the "before" code and descend.
            EmitFragment(current.NType | BeforeChild, current, childIndex);
            current = children[childIndex];
            _intStack.Append(childIndex);
            childIndex = 0;
            continue;
        }

        if (children == null)
        {
            // Leaf node.
            EmitFragment(current.NType, current, 0);
        }

        if (_intStack.Length == 0)
        {
            break;
        }

        // Climb back to the parent and emit the "after" code for the child just finished.
        childIndex = _intStack.Pop();
        current = current.Next;
        EmitFragment(current.NType | AfterChild, current, childIndex);
        childIndex++;
    }

    PatchJump(0, _emitted.Length);
    Emit(RegexCode.Stop);

    RegexPrefix fcPrefix = RegexFCD.FirstChars(tree);
    RegexPrefix prefix = RegexFCD.Prefix(tree);
    bool rtl = (tree.Options & RegexOptions.RightToLeft) != 0;

    CultureInfo culture;
    if ((tree.Options & RegexOptions.CultureInvariant) != 0)
    {
        culture = CultureInfo.InvariantCulture;
    }
    else
    {
        culture = CultureInfo.CurrentCulture;
    }

    // A Boyer-Moore matcher is only built when a non-empty literal prefix was found.
    RegexBoyerMoore bmPrefix = prefix.Prefix.Length > 0
        ? new RegexBoyerMoore(prefix.Prefix, prefix.CaseInsensitive, rtl, culture)
        : null;

    int anchors = RegexFCD.Anchors(tree);
    int[] emitted = _emitted.AsSpan().ToArray();

    return new RegexCode(
        emitted,
        _stringTable,
        _trackCount,
        _caps,
        capsize,
        bmPrefix,
        fcPrefix,
        anchors,
        rtl,
        _resetMatchStartFound,
        _capPositions);
}
/// <summary>
/// Emits the code for a single visit of <paramref name="node"/> during the
/// depth-first walk. For interior nodes <paramref name="nodetype"/> is the node
/// type combined with BeforeChild or AfterChild; for leaves it is the plain node type.
/// </summary>
/// <param name="nodetype">Node type, possibly OR'ed with BeforeChild / AfterChild.</param>
/// <param name="node">The node being visited.</param>
/// <param name="curIndex">Index of the child being entered or left (0 for leaves).</param>
/// <exception cref="ArgumentException">The node type is not handled.</exception>
private void EmitFragment(int nodetype, RegexNode node, int curIndex)
{
    // Rtl / Ci modifier bits are only collected for node types up to RegexNode.Ref.
    int bits = 0;
    if (nodetype <= RegexNode.Ref)
    {
        if (node.UseOptionR())
        {
            bits |= RegexCode.Rtl;
        }

        if ((node.Options & RegexOptions.IgnoreCase) != 0)
        {
            bits |= RegexCode.Ci;
        }
    }

    switch (nodetype)
    {
        case RegexNode.Concatenate | BeforeChild:
        case RegexNode.Concatenate | AfterChild:
        case RegexNode.Group | BeforeChild:
        case RegexNode.Group | AfterChild:
        case RegexNode.Empty:
            // Nothing to emit.
            break;

        case RegexNode.Alternate | BeforeChild:
            // Every branch except the last starts with a Lazybranch that is patched later.
            if (curIndex < node.Children.Count - 1)
            {
                _intStack.Append(_emitted.Length);
                Emit(RegexCode.Lazybranch, 0);
            }
            break;

        case RegexNode.Alternate | AfterChild:
            if (curIndex < node.Children.Count - 1)
            {
                int pendingLazyBranch = _intStack.Pop();
                _intStack.Append(_emitted.Length);
                Emit(RegexCode.Goto, 0);
                PatchJump(pendingLazyBranch, _emitted.Length);
            }
            else
            {
                // Last branch done: patch the Goto recorded after each earlier branch.
                for (int remaining = curIndex; remaining > 0; remaining--)
                {
                    PatchJump(_intStack.Pop(), _emitted.Length);
                }
            }
            break;

        case RegexNode.Testref | BeforeChild:
            if (curIndex == 0)
            {
                Emit(RegexCode.Setjump);
                _intStack.Append(_emitted.Length);
                Emit(RegexCode.Lazybranch, 0);
                Emit(RegexCode.Testref, MapCapnum(node.M));
                Emit(RegexCode.Forejump);
            }
            break;

        case RegexNode.Testref | AfterChild:
            if (curIndex == 0)
            {
                int branchPos = _intStack.Pop();
                _intStack.Append(_emitted.Length);
                Emit(RegexCode.Goto, 0);
                PatchJump(branchPos, _emitted.Length);
                Emit(RegexCode.Forejump);

                // With no second child, the Goto is patched right away.
                if (node.Children.Count <= 1)
                {
                    PatchJump(_intStack.Pop(), _emitted.Length);
                }
            }
            else if (curIndex == 1)
            {
                PatchJump(_intStack.Pop(), _emitted.Length);
            }
            break;

        case RegexNode.Testgroup | BeforeChild:
            if (curIndex == 0)
            {
                Emit(RegexCode.Setjump);
                Emit(RegexCode.Setmark);
                _intStack.Append(_emitted.Length);
                Emit(RegexCode.Lazybranch, 0);
            }
            break;

        case RegexNode.Testgroup | AfterChild:
            if (curIndex == 0)
            {
                Emit(RegexCode.Getmark);
                Emit(RegexCode.Forejump);
            }
            else if (curIndex == 1)
            {
                int branchPos = _intStack.Pop();
                _intStack.Append(_emitted.Length);
                Emit(RegexCode.Goto, 0);
                PatchJump(branchPos, _emitted.Length);
                Emit(RegexCode.Getmark);
                Emit(RegexCode.Forejump);

                // With no third child, the Goto is patched right away.
                if (node.Children.Count <= 2)
                {
                    PatchJump(_intStack.Pop(), _emitted.Length);
                }
            }
            else if (curIndex == 2)
            {
                PatchJump(_intStack.Pop(), _emitted.Length);
            }
            break;

        case RegexNode.Loop | BeforeChild:
        case RegexNode.Lazyloop | BeforeChild:
        {
            bool minIsZero = node.M == 0;

            if (node.N < int.MaxValue || node.M > 1)
            {
                // Counted form.
                if (minIsZero)
                {
                    Emit(RegexCode.Nullcount, 0);
                }
                else
                {
                    Emit(RegexCode.Setcount, 1 - node.M);
                }
            }
            else
            {
                // Mark form.
                Emit(minIsZero ? RegexCode.Nullmark : RegexCode.Setmark);
            }

            if (minIsZero)
            {
                // Goto that is patched to the loop's branch instruction after the child.
                _intStack.Append(_emitted.Length);
                Emit(RegexCode.Goto, 0);
            }

            _intStack.Append(_emitted.Length);
            break;
        }

        case RegexNode.Loop | AfterChild:
        case RegexNode.Lazyloop | AfterChild:
        {
            int branchInstructionPos = _emitted.Length;

            // 0 for Loop, and the Lazyloop - Loop type difference for Lazyloop.
            int lazyOffset = nodetype - (RegexNode.Loop | AfterChild);

            if (node.N < int.MaxValue || node.M > 1)
            {
                int bodyStart = _intStack.Pop();
                int extraIterations = node.N == int.MaxValue ? int.MaxValue : node.N - node.M;
                Emit(RegexCode.Branchcount + lazyOffset, bodyStart, extraIterations);
            }
            else
            {
                Emit(RegexCode.Branchmark + lazyOffset, _intStack.Pop());
            }

            if (node.M == 0)
            {
                PatchJump(_intStack.Pop(), branchInstructionPos);
            }
            break;
        }

        case RegexNode.Capture | BeforeChild:
        {
            int capSlot = MapCapnum(node.M);

            // Record where this capture group starts; only the first occurrence
            // is kept (relevant for a branch reset group).
            if (_capPositions[capSlot] == 0)
            {
                _capPositions[capSlot] = _emitted.Length;
            }

            Emit(RegexCode.Setmark);
            break;
        }

        case RegexNode.Capture | AfterChild:
            Emit(RegexCode.Capturemark, MapCapnum(node.M), MapCapnum(node.N));
            break;

        case RegexNode.Require | BeforeChild:
            // NOTE: the Setjump below is what makes lookahead/lookbehind
            // NON-BACKTRACKING. It can be commented out with (*)
            Emit(RegexCode.Setjump);
            Emit(RegexCode.Setmark);
            break;

        case RegexNode.Require | AfterChild:
            Emit(RegexCode.Getmark);
            // NOTE: the Forejump below is what makes lookahead/lookbehind
            // NON-BACKTRACKING. It can be commented out with (*)
            Emit(RegexCode.Forejump);
            break;

        case RegexNode.Prevent | BeforeChild:
            Emit(RegexCode.Setjump);
            _intStack.Append(_emitted.Length);
            Emit(RegexCode.Lazybranch, 0);
            break;

        case RegexNode.Prevent | AfterChild:
            Emit(RegexCode.Backjump);
            PatchJump(_intStack.Pop(), _emitted.Length);
            Emit(RegexCode.Forejump);
            break;

        case RegexNode.Greedy | BeforeChild:
            Emit(RegexCode.Setjump);
            break;

        case RegexNode.Greedy | AfterChild:
            Emit(RegexCode.Forejump);
            break;

        case RegexNode.One:
        case RegexNode.Notone:
            Emit(node.NType | bits, node.Ch);
            break;

        case RegexNode.Notoneloop:
        case RegexNode.Notonelazy:
        case RegexNode.Oneloop:
        case RegexNode.Onelazy:
            // The mandatory M repetitions go out as a fixed "rep" instruction...
            if (node.M > 0)
            {
                bool isOneVariant = node.NType == RegexNode.Oneloop || node.NType == RegexNode.Onelazy;
                int repOpcode = isOneVariant ? RegexCode.Onerep : RegexCode.Notonerep;
                Emit(repOpcode | bits, node.Ch, node.M);
            }

            // ...and the remaining N - M go out as the loop instruction itself.
            if (node.N > node.M)
            {
                Emit(node.NType | bits, node.Ch, node.N == int.MaxValue ? int.MaxValue : node.N - node.M);
            }
            break;

        case RegexNode.Setloop:
        case RegexNode.Setlazy:
            if (node.M > 0)
            {
                Emit(RegexCode.Setrep | bits, StringCode(node.Str), node.M);
            }

            if (node.N > node.M)
            {
                Emit(node.NType | bits, StringCode(node.Str), node.N == int.MaxValue ? int.MaxValue : node.N - node.M);
            }
            break;

        case RegexNode.Multi:
        case RegexNode.Set:
            Emit(node.NType | bits, StringCode(node.Str));
            break;

        case RegexNode.Ref:
            Emit(node.NType | bits, MapCapnum(node.M));
            break;

        case RegexNode.Nothing:
        case RegexNode.Bol:
        case RegexNode.Eol:
        case RegexNode.Boundary:
        case RegexNode.Nonboundary:
        case RegexNode.ECMABoundary:
        case RegexNode.NonECMABoundary:
        case RegexNode.Beginning:
        case RegexNode.Start:
        case RegexNode.EndZ:
        case RegexNode.End:
            // These node types are emitted as-is, with no operands.
            Emit(node.NType);
            break;

        case RegexNode.ResetMatchStart:
            _resetMatchStartFound = true;
            Emit(node.NType);
            break;

        case RegexNode.CallSubroutine:
            Emit(RegexCode.CallSubroutine, MapCapnum(node.M));
            break;

        case RegexNode.BacktrackingVerb:
            // The value of node.M is emitted directly as the instruction.
            Emit(node.M);
            break;

        default:
            throw new ArgumentException(string.Format(SR.UnexpectedOpcode, nodetype.ToString()));
    }
}
/// <summary>
/// Emits the code for a single visit of <paramref name="node"/>. The case labels
/// distinguish plain node types from node types combined with BeforeChild or
/// AfterChild, which carry the index of the child in <paramref name="curIndex"/>.
/// </summary>
/// <param name="nodetype">Node type, possibly OR'ed with BeforeChild / AfterChild.</param>
/// <param name="node">The node being visited.</param>
/// <param name="curIndex">Index of the child being entered or left.</param>
/// <exception cref="ArgumentException">The node type is not handled.</exception>
private void EmitFragment(int nodetype, RegexNode node, int curIndex)
{
    // Rtl / Ci modifier bits are only collected for node types up to RegexNode.Ref.
    int bits = 0;
    if (nodetype <= RegexNode.Ref)
    {
        if (node.UseOptionR())
        {
            bits |= RegexCode.Rtl;
        }

        if ((node._options & RegexOptions.IgnoreCase) != 0)
        {
            bits |= RegexCode.Ci;
        }
    }

    switch (nodetype)
    {
        case RegexNode.Concatenate | BeforeChild:
        case RegexNode.Concatenate | AfterChild:
        case RegexNode.Group | BeforeChild:
        case RegexNode.Group | AfterChild:
        case RegexNode.Empty:
            // Nothing to emit.
            break;

        case RegexNode.Alternate | BeforeChild:
            // Every branch except the last starts with a Lazybranch that is patched later.
            if (curIndex < node._children.Count - 1)
            {
                PushInt(CurPos());
                Emit(RegexCode.Lazybranch, 0);
            }
            break;

        case RegexNode.Alternate | AfterChild:
            if (curIndex < node._children.Count - 1)
            {
                int pendingLazyBranch = PopInt();
                PushInt(CurPos());
                Emit(RegexCode.Goto, 0);
                PatchJump(pendingLazyBranch, CurPos());
            }
            else
            {
                // Last branch done: patch the Goto recorded after each earlier branch.
                for (int remaining = curIndex; remaining > 0; remaining--)
                {
                    PatchJump(PopInt(), CurPos());
                }
            }
            break;

        case RegexNode.Testref | BeforeChild:
            if (curIndex == 0)
            {
                Emit(RegexCode.Setjump);
                PushInt(CurPos());
                Emit(RegexCode.Lazybranch, 0);
                Emit(RegexCode.Testref, MapCapnum(node._m));
                Emit(RegexCode.Forejump);
            }
            break;

        case RegexNode.Testref | AfterChild:
            if (curIndex == 0)
            {
                int branchPos = PopInt();
                PushInt(CurPos());
                Emit(RegexCode.Goto, 0);
                PatchJump(branchPos, CurPos());
                Emit(RegexCode.Forejump);

                // With no second child, the Goto is patched right away.
                if (node._children.Count <= 1)
                {
                    PatchJump(PopInt(), CurPos());
                }
            }
            else if (curIndex == 1)
            {
                PatchJump(PopInt(), CurPos());
            }
            break;

        case RegexNode.Testgroup | BeforeChild:
            if (curIndex == 0)
            {
                Emit(RegexCode.Setjump);
                Emit(RegexCode.Setmark);
                PushInt(CurPos());
                Emit(RegexCode.Lazybranch, 0);
            }
            break;

        case RegexNode.Testgroup | AfterChild:
            if (curIndex == 0)
            {
                Emit(RegexCode.Getmark);
                Emit(RegexCode.Forejump);
            }
            else if (curIndex == 1)
            {
                int branchPos = PopInt();
                PushInt(CurPos());
                Emit(RegexCode.Goto, 0);
                PatchJump(branchPos, CurPos());
                Emit(RegexCode.Getmark);
                Emit(RegexCode.Forejump);

                // With no third child, the Goto is patched right away.
                if (node._children.Count <= 2)
                {
                    PatchJump(PopInt(), CurPos());
                }
            }
            else if (curIndex == 2)
            {
                PatchJump(PopInt(), CurPos());
            }
            break;

        case RegexNode.Loop | BeforeChild:
        case RegexNode.Lazyloop | BeforeChild:
        {
            bool minIsZero = node._m == 0;

            if (node._n < int.MaxValue || node._m > 1)
            {
                // Counted form.
                if (minIsZero)
                {
                    Emit(RegexCode.Nullcount, 0);
                }
                else
                {
                    Emit(RegexCode.Setcount, 1 - node._m);
                }
            }
            else
            {
                // Mark form.
                Emit(minIsZero ? RegexCode.Nullmark : RegexCode.Setmark);
            }

            if (minIsZero)
            {
                // Goto that is patched to the loop's branch instruction after the child.
                PushInt(CurPos());
                Emit(RegexCode.Goto, 0);
            }

            PushInt(CurPos());
            break;
        }

        case RegexNode.Loop | AfterChild:
        case RegexNode.Lazyloop | AfterChild:
        {
            int branchInstructionPos = CurPos();

            // 0 for Loop, and the Lazyloop - Loop type difference for Lazyloop.
            int lazyOffset = nodetype - (RegexNode.Loop | AfterChild);

            if (node._n < int.MaxValue || node._m > 1)
            {
                int bodyStart = PopInt();
                int extraIterations = node._n == int.MaxValue ? int.MaxValue : node._n - node._m;
                Emit(RegexCode.Branchcount + lazyOffset, bodyStart, extraIterations);
            }
            else
            {
                Emit(RegexCode.Branchmark + lazyOffset, PopInt());
            }

            if (node._m == 0)
            {
                PatchJump(PopInt(), branchInstructionPos);
            }
            break;
        }

        case RegexNode.Capture | BeforeChild:
            // Note that this capture group starts here
            _capPositions[MapCapnum(node._m)] = _curpos;
            Emit(RegexCode.Setmark);
            break;

        case RegexNode.Capture | AfterChild:
            Emit(RegexCode.Capturemark, MapCapnum(node._m), MapCapnum(node._n));
            break;

        case RegexNode.Require | BeforeChild:
            // NOTE: the Setjump below is what makes lookahead/lookbehind
            // NON-BACKTRACKING. It can be commented out with (*)
            Emit(RegexCode.Setjump);
            Emit(RegexCode.Setmark);
            break;

        case RegexNode.Require | AfterChild:
            Emit(RegexCode.Getmark);
            // NOTE: the Forejump below is what makes lookahead/lookbehind
            // NON-BACKTRACKING. It can be commented out with (*)
            Emit(RegexCode.Forejump);
            break;

        case RegexNode.Prevent | BeforeChild:
            Emit(RegexCode.Setjump);
            PushInt(CurPos());
            Emit(RegexCode.Lazybranch, 0);
            break;

        case RegexNode.Prevent | AfterChild:
            Emit(RegexCode.Backjump);
            PatchJump(PopInt(), CurPos());
            Emit(RegexCode.Forejump);
            break;

        case RegexNode.Greedy | BeforeChild:
            Emit(RegexCode.Setjump);
            break;

        case RegexNode.Greedy | AfterChild:
            Emit(RegexCode.Forejump);
            break;

        case RegexNode.One:
        case RegexNode.Notone:
            Emit(node._type | bits, node._ch);
            break;

        case RegexNode.Notoneloop:
        case RegexNode.Notonelazy:
        case RegexNode.Oneloop:
        case RegexNode.Onelazy:
            // The mandatory _m repetitions go out as a fixed "rep" instruction...
            if (node._m > 0)
            {
                bool isOneVariant = node._type == RegexNode.Oneloop || node._type == RegexNode.Onelazy;
                int repOpcode = isOneVariant ? RegexCode.Onerep : RegexCode.Notonerep;
                Emit(repOpcode | bits, node._ch, node._m);
            }

            // ...and the remaining _n - _m go out as the loop instruction itself.
            if (node._n > node._m)
            {
                Emit(node._type | bits, node._ch, node._n == int.MaxValue ? int.MaxValue : node._n - node._m);
            }
            break;

        case RegexNode.Setloop:
        case RegexNode.Setlazy:
            if (node._m > 0)
            {
                Emit(RegexCode.Setrep | bits, StringCode(node._str), node._m);
            }

            if (node._n > node._m)
            {
                Emit(node._type | bits, StringCode(node._str), node._n == int.MaxValue ? int.MaxValue : node._n - node._m);
            }
            break;

        case RegexNode.Multi:
        case RegexNode.Set:
            Emit(node._type | bits, StringCode(node._str));
            break;

        case RegexNode.Ref:
            Emit(node._type | bits, MapCapnum(node._m));
            break;

        case RegexNode.Nothing:
        case RegexNode.Bol:
        case RegexNode.Eol:
        case RegexNode.Boundary:
        case RegexNode.Nonboundary:
        case RegexNode.ECMABoundary:
        case RegexNode.NonECMABoundary:
        case RegexNode.Beginning:
        case RegexNode.Start:
        case RegexNode.EndZ:
        case RegexNode.End:
            // These node types are emitted as-is, with no operands.
            Emit(node._type);
            break;

        case RegexNode.ResetMatchStart:
            _resetMatchStartFound = true;
            Emit(node._type);
            break;

        case RegexNode.CallSubroutine:
            Emit(RegexCode.CallSubroutine, MapCapnum(node._m));
            break;

        default:
            throw new ArgumentException(SR.Format(SR.UnexpectedOpcode, nodetype.ToString(CultureInfo.CurrentCulture)));
    }
}
/*
 * A related computation: takes a RegexTree and returns its leading literal
 * substring if it finds one. It is deliberately simple and gives up easily,
 * returning RegexPrefix.Empty.
 */
internal static RegexPrefix Prefix(RegexTree tree)
{
    RegexNode current = tree._root;
    RegexNode enclosingConcat = null;   // concatenation whose children are being stepped through
    int childCursor = 0;                // next child of enclosingConcat to look at

    while (true)
    {
        switch (current._type)
        {
            case RegexNode.Concatenate:
                // Start stepping through this concatenation's children (if it has any).
                if (current.ChildCount() > 0)
                {
                    enclosingConcat = current;
                    childCursor = 0;
                }
                break;

            case RegexNode.Greedy:
            case RegexNode.Capture:
                // Look inside the wrapper; any concatenation being stepped through is dropped.
                current = current.Child(0);
                enclosingConcat = null;
                continue;

            case RegexNode.Oneloop:
            case RegexNode.Onelazy:
                if (current._m <= 0)
                {
                    return RegexPrefix.Empty;
                }

                // The guaranteed _m repetitions of the character form the prefix.
                return new RegexPrefix(
                    new string(current._ch, current._m),
                    (current._options & RegexOptions.IgnoreCase) != 0);

            case RegexNode.One:
                return new RegexPrefix(
                    current._ch.ToString(),
                    (current._options & RegexOptions.IgnoreCase) != 0);

            case RegexNode.Multi:
                return new RegexPrefix(
                    current._str,
                    (current._options & RegexOptions.IgnoreCase) != 0);

            case RegexNode.Bol:
            case RegexNode.Eol:
            case RegexNode.Boundary:
            case RegexNode.ECMABoundary:
            case RegexNode.Beginning:
            case RegexNode.Start:
            case RegexNode.EndZ:
            case RegexNode.End:
            case RegexNode.Empty:
            case RegexNode.Require:
            case RegexNode.Prevent:
                // These node types are stepped over; continue with the next sibling below.
                break;

            default:
                return RegexPrefix.Empty;
        }

        // Advance to the next child of the concatenation being stepped through, if there is one.
        bool hasNextSibling = enclosingConcat != null && childCursor < enclosingConcat.ChildCount();
        if (!hasNextSibling)
        {
            return RegexPrefix.Empty;
        }

        current = enclosingConcat.Child(childCursor);
        childCursor++;
    }
}
/*
 * First-character (FC) computation and shortcut cases for each node type.
 * NodeType is a node type, possibly combined with BeforeChild / AfterChild,
 * and CurIndex is the index of the child being entered or left.
 * Throws ArgumentException for a node type that is not handled.
 */
private void CalculateFC(int NodeType, RegexNode node, int CurIndex)
{
    // The option flags are only consulted for node types up to RegexNode.Ref.
    bool readsOptions = NodeType <= RegexNode.Ref;
    bool ci = readsOptions && (node._options & RegexOptions.IgnoreCase) != 0;
    bool rtl = readsOptions && (node._options & RegexOptions.RightToLeft) != 0;

    switch (NodeType)
    {
        case RegexNode.Concatenate | BeforeChild:
        case RegexNode.Alternate | BeforeChild:
        case RegexNode.Testref | BeforeChild:
        case RegexNode.Loop | BeforeChild:
        case RegexNode.Lazyloop | BeforeChild:
        case RegexNode.Group | BeforeChild:
        case RegexNode.Group | AfterChild:
        case RegexNode.Capture | BeforeChild:
        case RegexNode.Capture | AfterChild:
        case RegexNode.Greedy | BeforeChild:
        case RegexNode.Greedy | AfterChild:
        case RegexNode.Require | AfterChild:
        case RegexNode.Prevent | AfterChild:
            // Nothing to do for these visits.
            break;

        case RegexNode.Testgroup | BeforeChild:
            // The first child is skipped.
            if (CurIndex == 0)
            {
                SkipChild();
            }
            break;

        case RegexNode.Concatenate | AfterChild:
            // Fold the child's FC into the accumulated FC below it on the stack.
            if (CurIndex != 0)
            {
                RegexFC latest = PopFC();
                RegexFC accumulated = TopFC();
                _failed = !accumulated.AddFC(latest, true);
            }

            // Once the accumulated FC is no longer nullable, the remaining children are skipped.
            if (!TopFC()._nullable)
            {
                _skipAllChildren = true;
            }
            break;

        case RegexNode.Testgroup | AfterChild:
            // Only children after the second one are merged.
            if (CurIndex > 1)
            {
                RegexFC latest = PopFC();
                RegexFC accumulated = TopFC();
                _failed = !accumulated.AddFC(latest, false);
            }
            break;

        case RegexNode.Alternate | AfterChild:
        case RegexNode.Testref | AfterChild:
            if (CurIndex != 0)
            {
                RegexFC latest = PopFC();
                RegexFC accumulated = TopFC();
                _failed = !accumulated.AddFC(latest, false);
            }
            break;

        case RegexNode.Loop | AfterChild:
        case RegexNode.Lazyloop | AfterChild:
            // A loop with a minimum of zero makes the FC on top of the stack nullable.
            if (node._m == 0)
            {
                TopFC()._nullable = true;
            }
            break;

        case RegexNode.Require | BeforeChild:
        case RegexNode.Prevent | BeforeChild:
            // The child is skipped and a nullable FC is pushed in its place.
            SkipChild();
            PushFC(new RegexFC(true));
            break;

        case RegexNode.One:
        case RegexNode.Notone:
            PushFC(new RegexFC(node._ch, NodeType == RegexNode.Notone, false, ci));
            break;

        case RegexNode.Oneloop:
        case RegexNode.Onelazy:
        case RegexNode.Notoneloop:
        case RegexNode.Notonelazy:
        {
            bool negated = NodeType == RegexNode.Notoneloop || NodeType == RegexNode.Notonelazy;
            PushFC(new RegexFC(node._ch, negated, node._m == 0, ci));
            break;
        }

        case RegexNode.Multi:
        {
            string literal = node._str;
            if (literal.Length == 0)
            {
                PushFC(new RegexFC(true));
            }
            else
            {
                // Right-to-left takes the last character, otherwise the first.
                char leading = rtl ? literal[literal.Length - 1] : literal[0];
                PushFC(new RegexFC(leading, false, false, ci));
            }
            break;
        }

        case RegexNode.Set:
            PushFC(new RegexFC(node._str, false, ci));
            break;

        case RegexNode.Setloop:
        case RegexNode.Setlazy:
            PushFC(new RegexFC(node._str, node._m == 0, ci));
            break;

        case RegexNode.Ref:
        case RegexNode.CallSubroutine:
            PushFC(new RegexFC(RegexCharClass.AnyClass, true, false));
            break;

        case RegexNode.Empty:
        case RegexNode.Nothing:
        case RegexNode.Bol:
        case RegexNode.Eol:
        case RegexNode.Boundary:
        case RegexNode.Nonboundary:
        case RegexNode.ECMABoundary:
        case RegexNode.NonECMABoundary:
        case RegexNode.Beginning:
        case RegexNode.Start:
        case RegexNode.EndZ:
        case RegexNode.End:
        case RegexNode.ResetMatchStart:
            PushFC(new RegexFC(true));
            break;

        default:
            throw new ArgumentException(SR.Format(SR.UnexpectedOpcode, NodeType.ToString(CultureInfo.CurrentCulture)));
    }
}