/*
 * Scans the body of a character class (src[index] .. src[end - 1], i.e. the
 * text between '[' and ']') and stores in target.bmsize the largest
 * character code the class can refer to.
 *
 * An empty class yields bmsize 0.  The class escapes \D, \s, \S, \w and \W
 * force bmsize to 65535 and end the scan immediately.  When JSREG_FOLD is
 * set in state.flags, each code is replaced by the larger of its upcase and
 * downcase forms before it is compared with the running maximum.
 *
 * Returns false after reporting "msg.bad.range" when a range ends below its
 * start, or when a class escape is used as the upper end of a range;
 * returns true otherwise.
 */
private static bool calculateBitmapSize(CompilerState state, RENode target, char [] src, int index, int end)
{
    target.bmsize = 0;
    if (index == end)
        return true;

    /* a leading '^' is skipped; it contributes nothing to the size */
    if (src [index] == '^')
        ++index;

    char lowerBound = (char)(0);
    bool pendingRange = false;
    int highest = 0;

    while (index != end) {
        int code = 0;

        if (src [index] != '\\') {
            /* ordinary class member */
            code = src [index++];
        } else {
            ++index;
            char esc = src [index++];
            switch (esc) {
            case 'b':
                code = 0x8;
                break;
            case 'f':
                code = 0xC;
                break;
            case 'n':
                code = 0xA;
                break;
            case 'r':
                code = 0xD;
                break;
            case 't':
                code = 0x9;
                break;
            case 'v':
                code = 0xB;
                break;
            case 'c':
                if (((index + 1) < end) && char.IsLetter (src [index + 1]))
                    code = (char)(src [index++] & 0x1F);
                else
                    code = '\\';
                break;
            case 'u':
            case 'x': {
                /* \uXXXX takes four hex digits, \xXX takes two */
                int digitCount = (esc == 'u') ? 4 : 2;
                int value = 0;
                for (int k = 0; (k < digitCount) && (index < end); k++) {
                    char digit = src [index++];
                    value = ScriptConvert.XDigitToInt (digit, value);
                    if (value < 0) {
                        /* Back off to accepting the original '\' as a literal */
                        index -= (k + 1);
                        value = '\\';
                        break;
                    }
                }
                code = value;
                break;
            }
            case 'd':
                if (pendingRange) {
                    reportError ("msg.bad.range", "");
                    return false;
                }
                code = '9';
                break;
            case 'D':
            case 's':
            case 'S':
            case 'w':
            case 'W':
                if (pendingRange) {
                    reportError ("msg.bad.range", "");
                    return false;
                }
                target.bmsize = 65535;
                return true;
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7': {
                /*
                 * This is a non-ECMA extension - decimal escapes (in this
                 * case, octal!) are supposed to be an error inside class
                 * ranges, but supported here for backwards compatibility.
                 */
                int octal = esc - '0';
                char next = src [index];
                if ('0' <= next && next <= '7') {
                    index++;
                    octal = 8 * octal + (next - '0');
                    next = src [index];
                    if ('0' <= next && next <= '7') {
                        index++;
                        int widened = 8 * octal + (next - '0');
                        if (widened <= 255)
                            octal = widened;
                        else
                            index--;    /* third digit would exceed 255: leave it */
                    }
                }
                code = octal;
                break;
            }
            default:
                code = esc;
                break;
            }
        }

        if (pendingRange) {
            /* 'code' is the upper end of a range opened on the previous pass */
            if (lowerBound > code) {
                reportError ("msg.bad.range", "");
                return false;
            }
            pendingRange = false;
        } else if ((index < (end - 1)) && (src [index] == '-')) {
            /* a '-' that is not the last class character opens a range */
            ++index;
            pendingRange = true;
            lowerBound = (char)code;
            continue;
        }

        if ((state.flags & JSREG_FOLD) != 0) {
            char upper = upcase ((char)code);
            char lower = downcase ((char)code);
            if (upper >= lower)
                code = upper;
            else
                code = lower;
        }

        if (code > highest)
            highest = code;
    }

    target.bmsize = highest;
    return true;
}
/*
 * Parses one term at state.cp: an assertion, or an atom optionally followed
 * by a quantifier ('*', '+', '?', '{n}', '{n,}', '{n,m}') and an optional
 * '?' that makes the quantifier non-greedy.
 *
 * On success the node for the term is left in state.result, state.cp is
 * advanced past the consumed text and state.progLength is increased by the
 * space reserved for the term.  Returns false after an error has been
 * reported through reportError.
 */
private static bool parseTerm(CompilerState state)
{
    char [] src = state.cpbegin;
    char c = src [state.cp++];
    int nDigits = 2;
    int parenBaseCount = state.parenCount;
    int num;
    RENode term;
    int termStart;

    switch (c) {
    /* assertions and atoms */
    case '^':
        state.result = new RENode (REOP_BOL);
        state.progLength++;
        return true;
    case '$':
        state.result = new RENode (REOP_EOL);
        state.progLength++;
        return true;
    case '\\':
        if (state.cp < state.cpend) {
            c = src [state.cp++];
            switch (c) {
            /* assertion escapes */
            case 'b':
                state.result = new RENode (REOP_WBDRY);
                state.progLength++;
                return true;
            case 'B':
                state.result = new RENode (REOP_WNONBDRY);
                state.progLength++;
                return true;
            /* Decimal escape */
            case '0':
                // Under 'strict' ECMA 3, we interpret \0 as NUL and don't accept octal.
                // However (and since Rhino doesn't have a 'strict' mode) we'll just
                // behave the old way for compatibility reasons.
                // (see http://bugzilla.mozilla.org/show_bug.cgi?id=141078)
                // TODO: Use strict mode
                /* octal escape */
                doFlat (state, scanOctalEscape (state));
                break;
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                termStart = state.cp - 1;
                num = getDecimalValue (c, state, 0xFFFF, "msg.overlarge.backref");
                /*
                 * n > 9 and > count of parentheses,
                 * then treat as octal instead.
                 */
                if ((num > 9) && (num > state.parenCount)) {
                    state.cp = termStart;
                    doFlat (state, scanOctalEscape (state));
                    break;
                }
                /* otherwise, it's a back-reference */
                state.result = new RENode (REOP_BACKREF);
                state.result.parenIndex = num - 1;
                state.progLength += 3;
                break;
            /* Control escape */
            case 'f':
                c = (char)(0xC);
                doFlat (state, c);
                break;
            case 'n':
                c = (char)(0xA);
                doFlat (state, c);
                break;
            case 'r':
                c = (char)(0xD);
                doFlat (state, c);
                break;
            case 't':
                c = (char)(0x9);
                doFlat (state, c);
                break;
            case 'v':
                c = (char)(0xB);
                doFlat (state, c);
                break;
            /* Control letter */
            case 'c':
                if ((state.cp < state.cpend) && char.IsLetter (src [state.cp]))
                    c = (char)(src [state.cp++] & 0x1F);
                else {
                    /* back off to accepting the original '\' as a literal */
                    --state.cp;
                    c = '\\';
                }
                doFlat (state, c);
                break;
            /* UnicodeEscapeSequence */
            case 'u':
                nDigits += 2;
                // fall thru...
                /* HexEscapeSequence */
                goto case 'x';
            case 'x': {
                    int n = 0;
                    int i;
                    for (i = 0; (i < nDigits) && (state.cp < state.cpend); i++) {
                        c = src [state.cp++];
                        n = ScriptConvert.XDigitToInt (c, n);
                        if (n < 0) {
                            // Back off to accepting the original
                            // 'u' or 'x' as a literal
                            state.cp -= (i + 2);
                            n = src [state.cp++];
                            break;
                        }
                    }
                    c = (char)(n);
                }
                doFlat (state, c);
                break;
            /* Character class escapes */
            case 'd':
                state.result = new RENode (REOP_DIGIT);
                state.progLength++;
                break;
            case 'D':
                state.result = new RENode (REOP_NONDIGIT);
                state.progLength++;
                break;
            case 's':
                state.result = new RENode (REOP_SPACE);
                state.progLength++;
                break;
            case 'S':
                state.result = new RENode (REOP_NONSPACE);
                state.progLength++;
                break;
            case 'w':
                state.result = new RENode (REOP_ALNUM);
                state.progLength++;
                break;
            case 'W':
                state.result = new RENode (REOP_NONALNUM);
                state.progLength++;
                break;
            /* IdentityEscape */
            default:
                state.result = new RENode (REOP_FLAT);
                state.result.chr = c;
                state.result.length = 1;
                state.result.flatIndex = state.cp - 1;
                state.progLength += 3;
                break;
            }
            break;
        } else {
            /* a trailing '\' is an error */
            reportError ("msg.trail.backslash", "");
            return false;
        }
    case '(': {
            /* result stays null for a non-capturing group "(?:" */
            RENode result = null;
            if (state.cp + 1 < state.cpend && src [state.cp] == '?'
                && ((c = src [state.cp + 1]) == '=' || c == '!' || c == ':')) {
                state.cp += 2;
                if (c == '=') {
                    result = new RENode (REOP_ASSERT);
                    /* ASSERT, <next>, ... ASSERTTEST */
                    state.progLength += 4;
                } else if (c == '!') {
                    result = new RENode (REOP_ASSERT_NOT);
                    /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
                    state.progLength += 4;
                }
            } else {
                result = new RENode (REOP_LPAREN);
                /* LPAREN, <index>, ... RPAREN, <index> */
                state.progLength += 6;
                result.parenIndex = state.parenCount++;
            }
            ++state.parenNesting;
            if (!parseDisjunction (state))
                return false;
            if (state.cp == state.cpend || src [state.cp] != ')') {
                reportError ("msg.unterm.paren", "");
                return false;
            }
            ++state.cp;
            --state.parenNesting;
            if (result != null) {
                result.kid = state.result;
                state.result = result;
            }
            break;
        }
    case ')':
        reportError ("msg.re.unmatched.right.paren", "");
        return false;
    case '[':
        state.result = new RENode (REOP_CLASS);
        termStart = state.cp;
        state.result.startIndex = termStart;
        /* find the closing ']', stepping over backslash-escaped characters */
        while (true) {
            if (state.cp >= state.cpend) {
                reportError ("msg.unterm.class", "");
                return false;
            }
            if (src [state.cp] == '\\')
                state.cp++;
            else {
                if (src [state.cp] == ']') {
                    state.result.kidlen = state.cp - termStart;
                    break;
                }
            }
            state.cp++;
        }
        state.result.index = state.classCount++;
        /*
         * Call calculateBitmapSize now as we want any errors it finds
         * to be reported during the parse phase, not at execution.
         */
        if (!calculateBitmapSize (state, state.result, src, termStart, state.cp++))
            return false;
        state.progLength += 3; /* CLASS, <index> */
        break;
    case '.':
        state.result = new RENode (REOP_DOT);
        state.progLength++;
        break;
    case '*':
    case '+':
    case '?':
        reportError ("msg.bad.quant", Convert.ToString (src [state.cp - 1]));
        return false;
    default:
        state.result = new RENode (REOP_FLAT);
        state.result.chr = c;
        state.result.length = 1;
        state.result.flatIndex = state.cp - 1;
        state.progLength += 3;
        break;
    }

    term = state.result;
    if (state.cp == state.cpend) {
        return true;
    }

    bool hasQ = false;
    switch (src [state.cp]) {
    case '+':
        state.result = new RENode (REOP_QUANT);
        state.result.min = 1;
        state.result.max = -1;
        /* <PLUS>, <parencount>, <parenindex>, <next> ... <ENDCHILD> */
        state.progLength += 8;
        hasQ = true;
        break;
    case '*':
        state.result = new RENode (REOP_QUANT);
        state.result.min = 0;
        state.result.max = -1;
        /* <STAR>, <parencount>, <parenindex>, <next> ... <ENDCHILD> */
        state.progLength += 8;
        hasQ = true;
        break;
    case '?':
        state.result = new RENode (REOP_QUANT);
        state.result.min = 0;
        state.result.max = 1;
        /* <OPT>, <parencount>, <parenindex>, <next> ... <ENDCHILD> */
        state.progLength += 8;
        hasQ = true;
        break;
    case '{': /* balance '}' */ {
            int min = 0;
            int max = -1;
            int leftCurl = state.cp;

            /* For Perl etc. compatibility, if quantifier does not match
             * \{\d+(,\d*)?\} exactly back off from it
             * being a quantifier, and chew it up as a literal
             * atom next time instead.
             *
             * Every read below is checked against state.cpend.  When the
             * pattern ends inside the braces ("a{", "a{1", "a{1,") 'c' is
             * set to NUL, which is neither ',' nor '}', so the text is
             * treated as a literal instead of being read past the end of
             * the pattern.
             */
            ++state.cp;
            c = (state.cp < state.cpend) ? src [state.cp] : '\0';
            if ((state.cp < state.cpend) && isDigit (c)) {
                ++state.cp;
                min = getDecimalValue (c, state, 0xFFFF, "msg.overlarge.min");
                c = (state.cp < state.cpend) ? src [state.cp] : '\0';
                if (c == ',') {
                    ++state.cp;
                    c = (state.cp < state.cpend) ? src [state.cp] : '\0';
                    if ((state.cp < state.cpend) && isDigit (c)) {
                        ++state.cp;
                        max = getDecimalValue (c, state, 0xFFFF, "msg.overlarge.max");
                        bool atEnd = (state.cp >= state.cpend);
                        c = atEnd ? '\0' : src [state.cp];
                        if (min > max) {
                            reportError ("msg.max.lt.min", atEnd ? "" : Convert.ToString (src [state.cp]));
                            return false;
                        }
                    }
                } else {
                    max = min;
                }
                /* balance '{' */
                if (c == '}') {
                    state.result = new RENode (REOP_QUANT);
                    state.result.min = min;
                    state.result.max = max;
                    // QUANT, <min>, <max>, <parencount>,
                    // <parenindex>, <next> ... <ENDCHILD>
                    state.progLength += 12;
                    hasQ = true;
                    /* a second "{...}" directly after this one is reported as a bad quantifier */
                    if (state.cp + 1 != state.cpend) {
                        char nc = src [state.cp + 1];
                        if (nc == '{') {
                            string quant = string.Empty;
                            for (int i = 2; state.cp + i != state.cpend; i++) {
                                if (src [state.cp + i] == '}')
                                    break;
                                quant += src [state.cp + i];
                            }
                            reportError ("msg.bad.quant", quant);
                        }
                    }
                }
            }
            if (!hasQ) {
                state.cp = leftCurl;
            }
            break;
        }
    }

    if (!hasQ)
        return true;

    ++state.cp;
    state.result.kid = term;
    state.result.parenIndex = parenBaseCount;
    state.result.parenCount = state.parenCount - parenBaseCount;
    if ((state.cp < state.cpend) && (src [state.cp] == '?')) {
        ++state.cp;
        state.result.greedy = false;
    } else
        state.result.greedy = true;
    return true;
}

/*
 * Reads octal digits starting at state.cp for as long as the accumulated
 * value stays at or below 255 and returns that value as a character.
 * A digit that would push the value above 255 is NOT consumed; state.cp is
 * left on it so that it is parsed as part of the next term.
 */
private static char scanOctalEscape(CompilerState state)
{
    char [] src = state.cpbegin;
    int num = 0;
    while (state.cp < state.cpend) {
        char c = src [state.cp];
        if ((c < '0') || (c > '7'))
            break;
        int tmp = 8 * num + (c - '0');
        if (tmp > 255)
            break;
        state.cp++;
        num = tmp;
    }
    return (char)(num);
}
/*
 * Compiles the pattern 'str' into an RECompiled program.
 *
 * 'global' is the flag string: 'g', 'i' and 'm' set JSREG_GLOB, JSREG_FOLD
 * and JSREG_MULTILINE; any other character is passed to reportError as
 * "msg.invalid.re.flag".  It may be null.  When 'flat' is true and the
 * pattern is not empty, the whole pattern becomes a single literal FLAT
 * node and is not parsed.
 *
 * Returns the RECompiled instance, or null when parseDisjunction fails.
 */
internal static object compileRE(string str, string global, bool flat)
{
    RECompiled regexp = new RECompiled ();
    regexp.source = str.ToCharArray ();
    int length = str.Length;

    int flags = 0;
    if (global != null) {
        foreach (char flag in global) {
            switch (flag) {
            case 'g':
                flags |= JSREG_GLOB;
                break;
            case 'i':
                flags |= JSREG_FOLD;
                break;
            case 'm':
                flags |= JSREG_MULTILINE;
                break;
            default:
                reportError ("msg.invalid.re.flag", Convert.ToString (flag));
                break;
            }
        }
    }
    regexp.flags = flags;

    CompilerState state = new CompilerState (regexp.source, length, flags);
    bool literalPattern = flat && length > 0;
    if (literalPattern) {
        if (debug) {
            System.Console.Out.WriteLine ("flat = \"" + str + "\"");
        }
        /* one FLAT node covering the entire source text */
        RENode literal = new RENode (REOP_FLAT);
        literal.chr = state.cpbegin [0];
        literal.length = length;
        literal.flatIndex = 0;
        state.result = literal;
        state.progLength += 5;
    } else if (!parseDisjunction (state)) {
        return null;
    }

    /* one extra slot for the trailing REOP_END */
    regexp.program = new sbyte [state.progLength + 1];
    if (state.classCount != 0) {
        regexp.classList = new RECharSet [state.classCount];
        regexp.classCount = state.classCount;
    }

    int endPC = emitREBytecode (state, regexp, 0, state.result);
    regexp.program [endPC++] = REOP_END;

    if (debug) {
        System.Console.Out.WriteLine ("Prog. length = " + endPC);
        int lastOp = endPC - 1;
        for (int i = 0; i < endPC; i++) {
            System.Console.Out.Write (DebugNameOp ((sbyte)regexp.program [i]));
            if (i < lastOp)
                System.Console.Out.Write (", ");
        }
        System.Console.Out.WriteLine ();
    }

    regexp.parenCount = state.parenCount;

    // If re starts with literal, init anchorCh accordingly
    switch (regexp.program [0]) {
    case REOP_UCFLAT1:
    case REOP_UCFLAT1i:
        regexp.anchorCh = (char)getIndex (regexp.program, 1);
        break;
    case REOP_FLAT1:
    case REOP_FLAT1i:
        regexp.anchorCh = (char)(regexp.program [1] & 0xFF);
        break;
    case REOP_FLAT:
    case REOP_FLATi:
        int k = getIndex (regexp.program, 1);
        regexp.anchorCh = regexp.source [k];
        break;
    }

    if (debug && regexp.anchorCh >= 0) {
        System.Console.Out.WriteLine ("Anchor ch = '" + (char)regexp.anchorCh + "'");
    }
    return regexp;
}
/*
 *  Top-down regular expression grammar, based closely on Perl4.
 *
 *  regexp:     altern                  A regular expression is one or more
 *              altern '|' regexp       alternatives separated by vertical bar.
 *
 *  Parses one alternative and, when it is followed by '|', recursively
 *  parses the rest of the disjunction and joins both sides under an
 *  REOP_ALT node (kid = left side, kid2 = right side) stored in
 *  state.result.  Returns false as soon as a nested parse fails.
 */
private static bool parseDisjunction(CompilerState state)
{
    using (Helpers.StackOverflowVerifier sov = new Helpers.StackOverflowVerifier (1024)) {
        if (!parseAlternative (state))
            return false;

        char [] source = state.cpbegin;
        int index = state.cp;
        bool atBar = (index != source.Length) && (source [index] == '|');
        if (!atBar)
            return true;

        /* step over the '|' and keep the left-hand side as the first kid */
        ++state.cp;
        RENode alternation = new RENode (REOP_ALT);
        alternation.kid = state.result;

        if (!parseDisjunction (state))
            return false;

        alternation.kid2 = state.result;
        state.result = alternation;
        /* ALT, <next>, ..., JUMP, <end> ... JUMP <end> */
        state.progLength += 9;
        return true;
    }
}
/*
 *  altern:     item                    An alternative is one or more items,
 *              item altern             concatenated together.
 *
 *  Parses terms until the end of the pattern, a '|', or (inside a group) a
 *  ')' is reached, chaining them through their 'next' links.  state.result
 *  receives the first term, or an REOP_EMPTY node when nothing was parsed.
 *  Returns false as soon as parseTerm fails.
 */
private static bool parseAlternative(CompilerState state)
{
    RENode headTerm = null;
    RENode tailTerm = null;
    char [] source = state.cpbegin;

    for (;;) {
        bool finished = (state.cp == state.cpend);
        if (!finished) {
            char upcoming = source [state.cp];
            finished = (upcoming == '|')
                || (state.parenNesting != 0 && upcoming == ')');
        }
        if (finished) {
            state.result = (headTerm == null) ? new RENode (REOP_EMPTY) : headTerm;
            return true;
        }

        if (!parseTerm (state))
            return false;

        RENode parsed = state.result;
        if (headTerm == null) {
            headTerm = parsed;
            continue;
        }

        /* append after the current tail (the head itself for the second term) */
        RENode attachTo = (tailTerm != null) ? tailTerm : headTerm;
        attachTo.next = parsed;

        /* the new term may carry its own chain; advance to its last node */
        tailTerm = parsed;
        while (tailTerm.next != null)
            tailTerm = tailTerm.next;
    }
}
/*
 * Reads a decimal number whose first digit 'c' has already been consumed;
 * the remaining digits are taken from state.cpbegin starting at state.cp,
 * and state.cp is left on the first non-digit (or at state.cpend).
 *
 * A further digit is accepted only while value < (maxValue - digit) / 10.
 * Once that test fails the number is treated as overflowing: the remaining
 * digits are still consumed, overflowMessageId is reported through
 * reportError together with the complete digit string, and maxValue is
 * returned.
 */
private static int getDecimalValue(char c, CompilerState state, int maxValue, string overflowMessageId)
{
    bool overflow = false;
    /* 'c' is overwritten by the loop; keep the first digit for the error text */
    char firstDigit = c;
    int start = state.cp;
    char [] src = state.cpbegin;
    int value = c - '0';
    for (; state.cp != state.cpend; ++state.cp) {
        c = src [state.cp];
        if (!isDigit (c)) {
            break;
        }
        if (!overflow) {
            int digit = c - '0';
            if (value < (maxValue - digit) / 10) {
                value = value * 10 + digit;
            } else {
                overflow = true;
                value = maxValue;
            }
        }
    }
    if (overflow) {
        /* src[start ..] holds only the digits after the first one, so prepend it */
        reportError (overflowMessageId, firstDigit + new string (src, start, state.cp - start));
    }
    return value;
}
/*
 * Writes the bytecode for the node chain starting at 't' (linked through
 * 'next') into re.program beginning at offset 'pc', recursing into kid
 * nodes, and returns the offset just past the last byte written.
 *
 * Each node's opcode is written first; several cases then overwrite that
 * byte (program[pc - 1]) with a more specific opcode.  REOP_CLASS nodes
 * also fill their slot in re.classList.
 */
private static int emitREBytecode(CompilerState state, RECompiled re, int pc, RENode t)
{
    sbyte [] program = re.program;
    bool foldCase = (state.flags & JSREG_FOLD) != 0;
    int altFixup, jumpFixup;

    for (RENode node = t; node != null; node = node.next) {
        program [pc++] = node.op;

        switch (node.op) {
        case REOP_EMPTY:
            /* step back over the opcode just written */
            --pc;
            break;

        case REOP_ALT: {
                RENode secondBranch = node.kid2;
                altFixup = pc; /* address of next alternate */
                pc += OFFSET_LEN;
                pc = emitREBytecode (state, re, pc, node.kid);

                program [pc++] = REOP_JUMP;
                jumpFixup = pc; /* address of following term */
                pc += OFFSET_LEN;
                resolveForwardJump (program, altFixup, pc);
                pc = emitREBytecode (state, re, pc, secondBranch);

                program [pc++] = REOP_JUMP;
                altFixup = pc;
                pc += OFFSET_LEN;

                /* both JUMPs resolve to the code after the alternation */
                resolveForwardJump (program, jumpFixup, pc);
                resolveForwardJump (program, altFixup, pc);
                break;
            }

        case REOP_FLAT: {
                bool hasSourceIndex = (node.flatIndex != -1);
                /*
                 * Consecutize FLAT's if possible.
                 */
                if (hasSourceIndex) {
                    while ((node.next != null)
                           && (node.next.op == REOP_FLAT)
                           && ((node.flatIndex + node.length) == node.next.flatIndex)) {
                        node.length += node.next.length;
                        node.next = node.next.next;
                    }
                }

                if (hasSourceIndex && (node.length > 1)) {
                    /* multi-character literal: <index>, <length> */
                    program [pc - 1] = foldCase ? REOP_FLATi : REOP_FLAT;
                    pc = addIndex (program, pc, node.flatIndex);
                    pc = addIndex (program, pc, node.length);
                } else if (node.chr < 256) {
                    /* single character that fits in one byte */
                    program [pc - 1] = foldCase ? REOP_FLAT1i : REOP_FLAT1;
                    program [pc++] = (sbyte)(node.chr);
                } else {
                    /* single character stored through addIndex */
                    program [pc - 1] = foldCase ? REOP_UCFLAT1i : REOP_UCFLAT1;
                    pc = addIndex (program, pc, node.chr);
                }
                break;
            }

        case REOP_LPAREN:
            pc = addIndex (program, pc, node.parenIndex);
            pc = emitREBytecode (state, re, pc, node.kid);
            program [pc++] = REOP_RPAREN;
            pc = addIndex (program, pc, node.parenIndex);
            break;

        case REOP_BACKREF:
            pc = addIndex (program, pc, node.parenIndex);
            break;

        case REOP_ASSERT:
        case REOP_ASSERT_NOT:
            jumpFixup = pc;
            pc += OFFSET_LEN;
            pc = emitREBytecode (state, re, pc, node.kid);
            program [pc++] = (node.op == REOP_ASSERT) ? REOP_ASSERTTEST : REOP_ASSERTNOTTEST;
            resolveForwardJump (program, jumpFixup, pc);
            break;

        case REOP_QUANT: {
                bool unbounded = (node.max == -1);
                if ((node.min == 0) && unbounded) {
                    program [pc - 1] = node.greedy ? REOP_STAR : REOP_MINIMALSTAR;
                } else if ((node.min == 0) && (node.max == 1)) {
                    program [pc - 1] = node.greedy ? REOP_OPT : REOP_MINIMALOPT;
                } else if ((node.min == 1) && unbounded) {
                    program [pc - 1] = node.greedy ? REOP_PLUS : REOP_MINIMALPLUS;
                } else {
                    if (!node.greedy)
                        program [pc - 1] = REOP_MINIMALQUANT;
                    pc = addIndex (program, pc, node.min);
                    // max can be -1 which addIndex does not accept
                    pc = addIndex (program, pc, node.max + 1);
                }
                pc = addIndex (program, pc, node.parenCount);
                pc = addIndex (program, pc, node.parenIndex);
                jumpFixup = pc;
                pc += OFFSET_LEN;
                pc = emitREBytecode (state, re, pc, node.kid);
                program [pc++] = REOP_ENDCHILD;
                resolveForwardJump (program, jumpFixup, pc);
                break;
            }

        case REOP_CLASS:
            pc = addIndex (program, pc, node.index);
            re.classList [node.index] = new RECharSet (node.bmsize, node.startIndex, node.kidlen);
            break;

        default:
            /* every other opcode has no operands */
            break;
        }
    }
    return pc;
}
/*
 *  item:       assertion               An item is either an assertion or
 *              quantatom               a quantified atom.
 *
 *  assertion:  '^'                     Assertions match beginning of string
 *                                      (or line if the class static property
 *                                      RegExp.multiline is true).
 *              '$'                     End of string (or line if the class
 *                                      static property RegExp.multiline is
 *                                      true).
 *              '\b'                    Word boundary (between \w and \W).
 *              '\B'                    Word non-boundary.
 *
 *  quantatom:  atom                    An unquantified atom.
 *              quantatom '{' n ',' m '}'
 *                                      Atom must occur between n and m times.
 *              quantatom '{' n ',' '}' Atom must occur at least n times.
 *              quantatom '{' n '}'     Atom must occur exactly n times.
 *              quantatom '*'           Zero or more times (same as {0,}).
 *              quantatom '+'           One or more times (same as {1,}).
 *              quantatom '?'           Zero or one time (same as {0,1}).
 *
 *              any of which can be optionally followed by '?' for ungreedy
 *
 *  atom:       '(' regexp ')'          A parenthesized regexp (what matched
 *                                      can be addressed using a backreference,
 *                                      see '\' n below).
 *              '.'                     Matches any char except '\n'.
 *              '[' classlist ']'       A character class.
 *              '[' '^' classlist ']'   A negated character class.
 *              '\f'                    Form Feed.
 *              '\n'                    Newline (Line Feed).
 *              '\r'                    Carriage Return.
 *              '\t'                    Horizontal Tab.
 *              '\v'                    Vertical Tab.
 *              '\d'                    A digit (same as [0-9]).
 *              '\D'                    A non-digit.
 *              '\w'                    A word character, [0-9a-z_A-Z].
 *              '\W'                    A non-word character.
 *              '\s'                    A whitespace character, [ \b\f\n\r\t\v].
 *              '\S'                    A non-whitespace character.
 *              '\' n                   A backreference to the nth (n decimal
 *                                      and positive) parenthesized expression.
 *              '\' octal               An octal escape sequence (octal must be
 *                                      two or three digits long, unless it is
 *                                      0 for the null character).
 *              '\x' hex                A hex escape (hex must be two digits).
 *              '\c' ctrl               A control character, ctrl is a letter.
 *              '\' literalatomchar     Any character except one of the above
 *                                      that follow '\' in an atom.
 *              otheratomchar           Any character not first among the other
 *                                      atom right-hand sides.
 */

/*
 * Stores in state.result a one-character REOP_FLAT node for 'c' whose
 * flatIndex is -1, and adds 3 to state.progLength.
 */
private static void doFlat(CompilerState state, char c)
{
    RENode literal = new RENode (REOP_FLAT);
    literal.chr = c;
    literal.length = 1;
    literal.flatIndex = -1;

    state.result = literal;
    state.progLength += 3;
}