/*
 * One of the two functions in this class intended to be called from outside.
 * Given a RegexTree, it computes the set of characters that can start a match.
 *
 * Returns null when the first-char computation yields no result or when the
 * result is flagged as nullable; otherwise returns a RegexPrefix built from
 * the first-char set and its case-insensitivity flag.
 */
internal static RegexPrefix FirstChars(RegexTree t)
{
    RegexFC firstChars = new RegexFCD().RegexFCFromRegexTree(t);

    if (firstChars == null || firstChars._nullable)
    {
        return null;
    }

    // RegexOptions.CultureInvariant selects the invariant culture; otherwise
    // the current culture is used.
    bool useInvariant = (t._options & RegexOptions.CultureInvariant) != 0;
    CultureInfo culture = useInvariant ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture;

    return new RegexPrefix(firstChars.GetFirstChars(culture), firstChars.IsCaseInsensitive());
}
/// <summary>
/// The single entry point meant to be called from outside this class.
/// Converts a RegexTree into the corresponding RegexCode.
/// </summary>
/// <param name="t">The tree to generate code for.</param>
/// <returns>The RegexCode produced by a fresh RegexWriter.</returns>
internal static RegexCode Write(RegexTree t)
{
    RegexCode code = new RegexWriter().RegexCodeFromRegexTree(t);

#if DEBUG
    // In debug builds, dump both the tree and the generated code when the
    // tree has its Debug flag set.
    if (t.Debug)
    {
        t.Dump();
        code.Dump();
    }
#endif

    return code;
}
/// <summary>
/// Top-level RegexCode generator. Walks the tree depth-first and calls
/// EmitFragment before and after each child of an interior node, and once
/// at each leaf.
///
/// The walk is performed twice: the first pass runs with _counting set,
/// the second pass allocates _emitted (sized by _count) and runs again
/// with _counting cleared.
///
/// Open question kept from the original author: this should be timed
/// against simply generating the code once and growing the array as needed.
/// </summary>
/// <param name="tree">The parsed regex tree to generate code for.</param>
/// <returns>A RegexCode built from the emitted state plus prefix and anchor data.</returns>
private RegexCode RegexCodeFromRegexTree(RegexTree tree)
{
    // Build the sparse capture-number mapping only if some numbers are unused.
    int capsize;
    if (tree._capnumlist != null && tree._captop != tree._capnumlist.Length)
    {
        capsize = tree._capnumlist.Length;
        _caps = tree._caps;
        for (int slot = 0; slot < tree._capnumlist.Length; slot++)
        {
            _caps[tree._capnumlist[slot]] = slot;
        }
    }
    else
    {
        capsize = tree._captop;
        _caps = null;
    }

    _counting = true;
    bool wasCountingPass;
    do
    {
        // The second pass is the one that needs the output buffer.
        if (!_counting)
        {
            _emitted = new int[_count];
        }

        RegexNode node = tree._root;
        int childIndex = 0;

        // Operand 0 is a placeholder; it is patched once the walk is done.
        Emit(RegexCode.Lazybranch, 0);

        while (true)
        {
            bool isInterior = node._children != null;

            if (isInterior && childIndex < node._children.Count)
            {
                // Descend into the next child, remembering which one it was.
                EmitFragment(node._type | BeforeChild, node, childIndex);
                node = node._children[childIndex];
                PushInt(childIndex);
                childIndex = 0;
                continue;
            }

            if (!isInterior)
            {
                EmitFragment(node._type, node, 0);
            }

            if (EmptyStack())
            {
                break;
            }

            // Return to the parent and resume with its next child.
            childIndex = PopInt();
            node = node._next;
            EmitFragment(node._type | AfterChild, node, childIndex);
            childIndex++;
        }

        PatchJump(0, CurPos());
        Emit(RegexCode.Stop);

        // Run once more after the counting pass, then stop.
        wasCountingPass = _counting;
        _counting = false;
    }
    while (wasCountingPass);

    RegexPrefix fcPrefix = RegexFCD.FirstChars(tree);
    RegexPrefix prefix = RegexFCD.Prefix(tree);
    bool rtl = (tree._options & RegexOptions.RightToLeft) != 0;

    bool useInvariant = (tree._options & RegexOptions.CultureInvariant) != 0;
    CultureInfo culture = useInvariant ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture;

    // A Boyer-Moore prefix is only built for a non-empty literal prefix.
    RegexBoyerMoore bmPrefix = (prefix != null && prefix.Prefix.Length > 0)
        ? new RegexBoyerMoore(prefix.Prefix, prefix.CaseInsensitive, rtl, culture)
        : null;

    int anchors = RegexFCD.Anchors(tree);

    return new RegexCode(_emitted, _stringtable, _trackcount, _caps, capsize, bmPrefix, fcPrefix, anchors, rtl);
}
/*
 * A related computation: given a RegexTree, it returns the leading literal
 * substring if it finds one. It is deliberately simple and gives up easily,
 * returning RegexPrefix.Empty whenever it meets a node type it does not handle.
 */
internal static RegexPrefix Prefix(RegexTree tree)
{
    RegexNode node = tree._root;
    RegexNode concat = null;
    int next = 0;

    while (true)
    {
        bool ignoreCase = (node._options & RegexOptions.IgnoreCase) != 0;

        switch (node._type)
        {
            case RegexNode.Concatenate:
                // Remember a non-empty concatenation so its children can be
                // visited in order.
                if (node.ChildCount() > 0)
                {
                    concat = node;
                    next = 0;
                }
                break;

            case RegexNode.Greedy:
            case RegexNode.Capture:
                // Look through the wrapper at its first child.
                node = node.Child(0);
                concat = null;
                continue;

            case RegexNode.Oneloop:
            case RegexNode.Onelazy:
                if (node._m <= 0)
                {
                    return RegexPrefix.Empty;
                }

                // The prefix is the loop character repeated _m times.
                return new RegexPrefix(new string(node._ch, node._m), ignoreCase);

            case RegexNode.One:
                return new RegexPrefix(node._ch.ToString(), ignoreCase);

            case RegexNode.Multi:
                return new RegexPrefix(node._str, ignoreCase);

            case RegexNode.Bol:
            case RegexNode.Eol:
            case RegexNode.Boundary:
            case RegexNode.ECMABoundary:
            case RegexNode.Beginning:
            case RegexNode.Start:
            case RegexNode.EndZ:
            case RegexNode.End:
            case RegexNode.Empty:
            case RegexNode.Require:
            case RegexNode.Prevent:
                // These node types are skipped; move on to the next sibling.
                break;

            default:
                return RegexPrefix.Empty;
        }

        if (concat == null || next >= concat.ChildCount())
        {
            return RegexPrefix.Empty;
        }

        node = concat.Child(next++);
    }
}
/*
 * The main FC computation. Performs a short-cutting depth-first walk of the
 * tree, calling CalculateFC before and after each child of an interior node
 * and once at each leaf.
 *
 * Returns null if _failed is set after an after-child step or if the FC
 * stack is empty when the walk ends; otherwise returns the popped FC.
 */
private RegexFC RegexFCFromRegexTree(RegexTree tree)
{
    RegexNode node = tree._root;
    int childIndex = 0;

    while (true)
    {
        if (node._children == null)
        {
            // Leaf node.
            CalculateFC(node._type, node, 0);
        }
        else if (childIndex < node._children.Count && !_skipAllChildren)
        {
            // Interior node with more children left to analyze.
            CalculateFC(node._type | BeforeChild, node, childIndex);

            if (_skipchild)
            {
                // Skip this child and clear the flag for the next one.
                childIndex++;
                _skipchild = false;
            }
            else
            {
                // Descend; the int stack is what makes the walk depth-first.
                node = node._children[childIndex];
                PushInt(childIndex);
                childIndex = 0;
            }

            continue;
        }

        // Either an interior node whose children are all done, or a leaf
        // that has just been handled.
        _skipAllChildren = false;

        if (IntIsEmpty())
        {
            break;
        }

        childIndex = PopInt();
        node = node._next;
        CalculateFC(node._type | AfterChild, node, childIndex);

        if (_failed)
        {
            return null;
        }

        childIndex++;
    }

    return FCIsEmpty() ? null : PopFC();
}
/*
 * A related computation: takes a RegexTree and returns the leading literal
 * substring when it can find one. It is intentionally trivial and gives up
 * easily, yielding RegexPrefix.Empty for any node type it does not handle.
 */
internal static RegexPrefix Prefix(RegexTree tree)
{
    RegexNode current = tree._root;
    RegexNode enclosingConcat = null;
    int siblingIndex = 0;

    while (true)
    {
        bool caseInsensitive = (current._options & RegexOptions.IgnoreCase) != 0;

        switch (current._type)
        {
            case RegexNode.Concatenate:
                // Track a non-empty concatenation so its children are visited in order.
                if (current.ChildCount() > 0)
                {
                    enclosingConcat = current;
                    siblingIndex = 0;
                }
                break;

            case RegexNode.Greedy:
            case RegexNode.Capture:
                // Step into the wrapper's first child and forget any concatenation.
                current = current.Child(0);
                enclosingConcat = null;
                continue;

            case RegexNode.Oneloop:
            case RegexNode.Onelazy:
                if (current._m <= 0)
                {
                    return RegexPrefix.Empty;
                }

                // Prefix is the loop character repeated _m times.
                return new RegexPrefix(new string(current._ch, current._m), caseInsensitive);

            case RegexNode.One:
                return new RegexPrefix(current._ch.ToString(), caseInsensitive);

            case RegexNode.Multi:
                return new RegexPrefix(current._str, caseInsensitive);

            case RegexNode.Bol:
            case RegexNode.Eol:
            case RegexNode.Boundary:
            case RegexNode.ECMABoundary:
            case RegexNode.Beginning:
            case RegexNode.Start:
            case RegexNode.EndZ:
            case RegexNode.End:
            case RegexNode.Empty:
            case RegexNode.Require:
            case RegexNode.Prevent:
                // Skipped node types; continue with the next sibling below.
                break;

            default:
                return RegexPrefix.Empty;
        }

        if (enclosingConcat == null || siblingIndex >= enclosingConcat.ChildCount())
        {
            return RegexPrefix.Empty;
        }

        current = enclosingConcat.Child(siblingIndex++);
    }
}
/*
 * The main FC computation: a short-cutting depth-first walk of the tree that
 * calls CalculateFC before and after each child of an interior node, and
 * once at each leaf.
 *
 * Yields null when _failed is set after an after-child step, or when the FC
 * stack is empty once the walk completes; otherwise yields the popped FC.
 */
private RegexFC RegexFCFromRegexTree(RegexTree tree)
{
    RegexNode current = tree._root;
    int index = 0;

    while (true)
    {
        if (current._children == null)
        {
            // Leaf node.
            CalculateFC(current._type, current, 0);
        }
        else if (index < current._children.Count && !_skipAllChildren)
        {
            // Interior node that still has children to analyze.
            CalculateFC(current._type | BeforeChild, current, index);

            if (_skipchild)
            {
                // Pass over this child and reset the flag.
                index++;
                _skipchild = false;
            }
            else
            {
                // Descend into the child; the int stack records the way back up.
                current = current._children[index];
                PushInt(index);
                index = 0;
            }

            continue;
        }

        // Reached after a leaf, or after an interior node has no more
        // children to visit.
        _skipAllChildren = false;

        if (IntIsEmpty())
        {
            break;
        }

        index = PopInt();
        current = current._next;
        CalculateFC(current._type | AfterChild, current, index);

        if (_failed)
        {
            return null;
        }

        index++;
    }

    return FCIsEmpty() ? null : PopFC();
}
/*
 * Yet another related computation: given a RegexTree, it reports the leading
 * anchor it encounters, as the value of AnchorFromType for that node type.
 * The result is 0 if the walk ends without reaching an anchor node.
 */
internal static int Anchors(RegexTree tree)
{
    RegexNode node = tree._root;
    RegexNode concat = null;
    int next = 0;

    // Nothing below modifies this accumulator, so it is still 0 at each return.
    int result = 0;

    while (true)
    {
        switch (node._type)
        {
            case RegexNode.Concatenate:
                // Remember a non-empty concatenation so its children can be
                // visited in order.
                if (node.ChildCount() > 0)
                {
                    concat = node;
                    next = 0;
                }
                break;

            case RegexNode.Greedy:
            case RegexNode.Capture:
                // Look through the wrapper at its first child.
                node = node.Child(0);
                concat = null;
                continue;

            case RegexNode.Bol:
            case RegexNode.Eol:
            case RegexNode.Boundary:
            case RegexNode.ECMABoundary:
            case RegexNode.Beginning:
            case RegexNode.Start:
            case RegexNode.EndZ:
            case RegexNode.End:
                // The first anchor found ends the walk.
                return result | AnchorFromType(node._type);

            case RegexNode.Empty:
            case RegexNode.Require:
            case RegexNode.Prevent:
                // Skipped node types; move on to the next sibling.
                break;

            default:
                return result;
        }

        if (concat == null || next >= concat.ChildCount())
        {
            return result;
        }

        node = concat.Child(next++);
    }
}
/// <summary>
/// Top-level RegexCode generator. Performs a depth-first walk of the tree,
/// calling EmitFragment before and after each child of an interior node
/// and once at each leaf.
///
/// Two passes are made: the first runs with _counting set, the second
/// allocates _emitted (sized by _count) and repeats the walk with
/// _counting cleared.
///
/// Note from the original author: this should be timed against the
/// alternative of generating the code once and growing the array on demand.
/// </summary>
/// <param name="tree">The parsed regex tree to generate code for.</param>
/// <returns>A RegexCode built from the emitted state plus prefix and anchor data.</returns>
private RegexCode RegexCodeFromRegexTree(RegexTree tree)
{
    // Construct the sparse capnum mapping only if some numbers are unused.
    int capsize;
    bool needsSparseMap = tree._capnumlist != null && tree._captop != tree._capnumlist.Length;
    if (needsSparseMap)
    {
        capsize = tree._capnumlist.Length;
        _caps = tree._caps;
        for (int position = 0; position < tree._capnumlist.Length; position++)
        {
            _caps[tree._capnumlist[position]] = position;
        }
    }
    else
    {
        capsize = tree._captop;
        _caps = null;
    }

    _counting = true;
    bool repeat;
    do
    {
        // Only the second pass needs the output buffer.
        if (!_counting)
        {
            _emitted = new int[_count];
        }

        RegexNode current = tree._root;
        int index = 0;

        // Operand 0 is a placeholder that gets patched after the walk.
        Emit(RegexCode.Lazybranch, 0);

        while (true)
        {
            bool hasChildren = current._children != null;

            if (hasChildren && index < current._children.Count)
            {
                // Go down into the next child and remember its position.
                EmitFragment(current._type | BeforeChild, current, index);
                current = current._children[index];
                PushInt(index);
                index = 0;
                continue;
            }

            if (!hasChildren)
            {
                EmitFragment(current._type, current, 0);
            }

            if (EmptyStack())
            {
                break;
            }

            // Go back up to the parent and advance to its next child.
            index = PopInt();
            current = current._next;
            EmitFragment(current._type | AfterChild, current, index);
            index++;
        }

        PatchJump(0, CurPos());
        Emit(RegexCode.Stop);

        // Repeat exactly once after the counting pass.
        repeat = _counting;
        _counting = false;
    }
    while (repeat);

    RegexPrefix fcPrefix = RegexFCD.FirstChars(tree);
    RegexPrefix prefix = RegexFCD.Prefix(tree);
    bool rtl = (tree._options & RegexOptions.RightToLeft) != 0;

    bool invariant = (tree._options & RegexOptions.CultureInvariant) != 0;
    CultureInfo culture = invariant ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture;

    // Build a Boyer-Moore prefix only for a non-empty literal prefix.
    RegexBoyerMoore bmPrefix = (prefix != null && prefix.Prefix.Length > 0)
        ? new RegexBoyerMoore(prefix.Prefix, prefix.CaseInsensitive, rtl, culture)
        : null;

    int anchors = RegexFCD.Anchors(tree);

    return new RegexCode(_emitted, _stringtable, _trackcount, _caps, capsize, bmPrefix, fcPrefix, anchors, rtl);
}