/*
 * Work out how large a bitmap the class expression src[index .. end) needs.
 *
 * The highest character code the class can contain is stored in
 * target.bmsize. Returns false after reporting "msg.bad.range" when a range
 * is malformed (its start exceeds its end, or a class escape such as \d is
 * used as the end of a range); returns true otherwise.
 */
private static bool calculateBitmapSize(CompilerState state, RENode target, char [] src, int index, int end)
{
    int highest = 0;
    bool pendingRange = false;
    char lowBound = (char)(0);

    target.bmsize = 0;
    if (index == end)
        return true;

    /* A leading '^' only negates the class; it does not affect the size. */
    if (src [index] == '^')
        ++index;

    while (index != end) {
        int code = 0;

        if (src [index] != '\\') {
            /* Plain character. */
            code = src [index++];
        } else {
            ++index;
            char esc = src [index++];
            switch (esc) {
                case 'b':
                    code = 0x8;
                    break;
                case 'f':
                    code = 0xC;
                    break;
                case 'n':
                    code = 0xA;
                    break;
                case 'r':
                    code = 0xD;
                    break;
                case 't':
                    code = 0x9;
                    break;
                case 'v':
                    code = 0xB;
                    break;

                case 'c':
                    /*
                     * NOTE(review): the guard inspects src[index + 1] while the
                     * character consumed is src[index]. Kept exactly as found,
                     * because the size computed here presumably has to agree
                     * with the code that later fills the bitmap - confirm
                     * before changing.
                     */
                    if (((index + 1) < end) && char.IsLetter (src [index + 1]))
                        code = (char)(src [index++] & 0x1F);
                    else
                        code = '\\';
                    break;

                case 'u':
                case 'x': {
                    /* \uXXXX takes four hex digits, \xXX takes two. */
                    int wanted = (esc == 'u') ? 4 : 2;
                    int hex = 0;
                    for (int k = 0; (k < wanted) && (index < end); k++) {
                        hex = ScriptConvert.XDigitToInt (src [index++], hex);
                        if (hex < 0) {
                            // Back off to accepting the original
                            // '\' as a literal
                            index -= (k + 1);
                            hex = '\\';
                            break;
                        }
                    }
                    code = hex;
                    break;
                }

                case 'd':
                    if (pendingRange) {
                        reportError ("msg.bad.range", "");
                        return false;
                    }
                    code = '9';
                    break;

                case 'D':
                case 's':
                case 'S':
                case 'w':
                case 'W':
                    if (pendingRange) {
                        reportError ("msg.bad.range", "");
                        return false;
                    }
                    /* These escapes get the maximum size; no need to scan further. */
                    target.bmsize = 65535;
                    return true;

                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7': {
                    /*
                     * This is a non-ECMA extension - decimal escapes (in this
                     * case, octal!) are supposed to be an error inside class
                     * ranges, but supported here for backwards compatibility.
                     */
                    int oct = esc - '0';
                    char digit = src [index];
                    if ('0' <= digit && digit <= '7') {
                        index++;
                        oct = 8 * oct + (digit - '0');
                        digit = src [index];
                        if ('0' <= digit && digit <= '7') {
                            /* Take a third digit only if the value stays within 255. */
                            int widened = 8 * oct + (digit - '0');
                            if (widened <= 255) {
                                index++;
                                oct = widened;
                            }
                        }
                    }
                    code = oct;
                    break;
                }

                default:
                    code = esc;
                    break;
            }
        }

        if (pendingRange) {
            /* 'code' closes a range opened on the previous iteration. */
            if (lowBound > code) {
                reportError ("msg.bad.range", "");
                return false;
            }
            pendingRange = false;
        } else if ((index < (end - 1)) && (src [index] == '-')) {
            /* A '-' that is not the last class character opens a range. */
            ++index;
            pendingRange = true;
            lowBound = (char)code;
            continue;
        }

        if ((state.flags & JSREG_FOLD) != 0) {
            /* When folding case, size for the larger of the two case variants. */
            char upper = upcase ((char)code);
            char lower = downcase ((char)code);
            code = upper;
            if (lower > upper)
                code = lower;
        }

        if (code > highest)
            highest = code;
    }

    target.bmsize = highest;
    return true;
}
/*
 * Top-down regular expression grammar, based closely on Perl4.
 *
 *  regexp:     altern                  A regular expression is one or more
 *              altern '|' regexp       alternatives separated by vertical bar.
 *
 * Parses one alternative and, if a '|' follows, recurses for the rest and
 * joins both sides under a REOP_ALT node left in state.result.
 * Returns false as soon as a nested parse step fails.
 */
private static bool parseDisjunction(CompilerState state)
{
    /* Presumably guards the recursion depth - see Helpers.StackOverflowVerifier. */
    using (Helpers.StackOverflowVerifier sov = new Helpers.StackOverflowVerifier (1024)) {
        if (!parseAlternative (state))
            return false;

        char [] pattern = state.cpbegin;
        int pos = state.cp;
        bool atBar = (pos != pattern.Length) && (pattern [pos] == '|');
        if (!atBar)
            return true;

        /* Step over the '|'; what has been parsed so far is the left branch. */
        ++state.cp;
        RENode alternation = new RENode (REOP_ALT);
        alternation.kid = state.result;

        if (!parseDisjunction (state))
            return false;

        alternation.kid2 = state.result;
        state.result = alternation;
        /* ALT, <next>, ..., JUMP, <end> ... JUMP <end> */
        state.progLength += 9;
        return true;
    }
}
/*
 * Reads a legacy octal escape value starting at state.cp.
 *
 * Octal digits are consumed only while the accumulated value stays within
 * 255; the first digit that would overflow is left in the pattern so that it
 * is parsed as an ordinary atom afterwards (it used to be consumed and then
 * silently dropped).
 */
private static int parseLegacyOctalValue(CompilerState state, char [] src)
{
    int value = 0;
    while (state.cp < state.cpend) {
        char digit = src [state.cp];
        if ((digit < '0') || (digit > '7'))
            break;
        int widened = 8 * value + (digit - '0');
        if (widened > 255)
            break;
        state.cp++;
        value = widened;
    }
    return value;
}

/*
 * Parses a single term at state.cp: an assertion or an atom, optionally
 * followed by a quantifier ('+', '*', '?', '{n}', '{n,}', '{n,m}') and an
 * optional '?' that makes the quantifier non-greedy.
 *
 * On success the resulting node is left in state.result, state.progLength is
 * increased by the space the term needs, and true is returned. On a syntax
 * error the problem is passed to reportError and false is returned.
 *
 * The caller must ensure state.cp is inside the pattern on entry; the first
 * character is read without a bounds check.
 */
private static bool parseTerm(CompilerState state)
{
    char [] src = state.cpbegin;
    char c = src [state.cp++];
    int nDigits = 2;
    int parenBaseCount = state.parenCount;
    int num;
    RENode term;
    int termStart;

    switch (c) {
        /* assertions and atoms */
        case '^':
            state.result = new RENode (REOP_BOL);
            state.progLength++;
            return true;

        case '$':
            state.result = new RENode (REOP_EOL);
            state.progLength++;
            return true;

        case '\\':
            if (state.cp >= state.cpend) {
                /* a trailing '\' is an error */
                reportError ("msg.trail.backslash", "");
                return false;
            }
            c = src [state.cp++];
            switch (c) {
                /* assertion escapes */
                case 'b':
                    state.result = new RENode (REOP_WBDRY);
                    state.progLength++;
                    return true;

                case 'B':
                    state.result = new RENode (REOP_WNONBDRY);
                    state.progLength++;
                    return true;

                /* Decimal escape */
                case '0':
                    // Under 'strict' ECMA 3, we interpret \0 as NUL and don't accept octal.
                    // However (and since Rhino doesn't have a 'strict' mode) we'll just
                    // behave the old way for compatibility reasons.
                    // (see http://bugzilla.mozilla.org/show_bug.cgi?id=141078)
                    // TODO: Use strict mode
                    /* octal escape */
                    c = (char)(parseLegacyOctalValue (state, src));
                    doFlat (state, c);
                    break;

                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                case '8':
                case '9':
                    termStart = state.cp - 1;
                    num = getDecimalValue (c, state, 0xFFFF, "msg.overlarge.backref");
                    /*
                     * n > 9 and > count of parentheses,
                     * then treat as octal instead.
                     */
                    if ((num > 9) && (num > state.parenCount)) {
                        /* Rewind to the first digit and rescan it as octal. */
                        state.cp = termStart;
                        c = (char)(parseLegacyOctalValue (state, src));
                        doFlat (state, c);
                        break;
                    }
                    /* otherwise, it's a back-reference */
                    state.result = new RENode (REOP_BACKREF);
                    state.result.parenIndex = num - 1;
                    state.progLength += 3;
                    break;

                /* Control escape */
                case 'f':
                    c = (char)(0xC);
                    doFlat (state, c);
                    break;

                case 'n':
                    c = (char)(0xA);
                    doFlat (state, c);
                    break;

                case 'r':
                    c = (char)(0xD);
                    doFlat (state, c);
                    break;

                case 't':
                    c = (char)(0x9);
                    doFlat (state, c);
                    break;

                case 'v':
                    c = (char)(0xB);
                    doFlat (state, c);
                    break;

                /* Control letter */
                case 'c':
                    if ((state.cp < state.cpend) && char.IsLetter (src [state.cp]))
                        c = (char)(src [state.cp++] & 0x1F);
                    else {
                        /* back off to accepting the original '\' as a literal */
                        --state.cp;
                        c = '\\';
                    }
                    doFlat (state, c);
                    break;

                /* UnicodeEscapeSequence */
                case 'u':
                    nDigits += 2;
                    // fall thru...
                    /* HexEscapeSequence */
                    goto case 'x';

                case 'x': {
                    int n = 0;
                    int i;
                    for (i = 0; (i < nDigits) && (state.cp < state.cpend); i++) {
                        c = src [state.cp++];
                        n = ScriptConvert.XDigitToInt (c, n);
                        if (n < 0) {
                            // Back off to accepting the original
                            // 'u' or 'x' as a literal
                            state.cp -= (i + 2);
                            n = src [state.cp++];
                            break;
                        }
                    }
                    c = (char)(n);
                }
                    doFlat (state, c);
                    break;

                /* Character class escapes */
                case 'd':
                    state.result = new RENode (REOP_DIGIT);
                    state.progLength++;
                    break;

                case 'D':
                    state.result = new RENode (REOP_NONDIGIT);
                    state.progLength++;
                    break;

                case 's':
                    state.result = new RENode (REOP_SPACE);
                    state.progLength++;
                    break;

                case 'S':
                    state.result = new RENode (REOP_NONSPACE);
                    state.progLength++;
                    break;

                case 'w':
                    state.result = new RENode (REOP_ALNUM);
                    state.progLength++;
                    break;

                case 'W':
                    state.result = new RENode (REOP_NONALNUM);
                    state.progLength++;
                    break;

                /* IdentityEscape */
                default:
                    state.result = new RENode (REOP_FLAT);
                    state.result.chr = c;
                    state.result.length = 1;
                    state.result.flatIndex = state.cp - 1;
                    state.progLength += 3;
                    break;
            }
            break;

        case '(': {
            /* Stays null for a non-capturing group "(?:": no wrapper node is built. */
            RENode result = null;
            termStart = state.cp;
            if (state.cp + 1 < state.cpend && src [state.cp] == '?'
                && ((c = src [state.cp + 1]) == '=' || c == '!' || c == ':')) {
                state.cp += 2;
                if (c == '=') {
                    result = new RENode (REOP_ASSERT);
                    /* ASSERT, <next>, ... ASSERTTEST */
                    state.progLength += 4;
                } else if (c == '!') {
                    result = new RENode (REOP_ASSERT_NOT);
                    /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
                    state.progLength += 4;
                }
            } else {
                result = new RENode (REOP_LPAREN);
                /* LPAREN, <index>, ... RPAREN, <index> */
                state.progLength += 6;
                result.parenIndex = state.parenCount++;
            }
            ++state.parenNesting;
            if (!parseDisjunction (state))
                return false;
            if (state.cp == state.cpend || src [state.cp] != ')') {
                reportError ("msg.unterm.paren", "");
                return false;
            }
            ++state.cp;
            --state.parenNesting;
            if (result != null) {
                result.kid = state.result;
                state.result = result;
            }
            break;
        }

        case ')':
            reportError ("msg.re.unmatched.right.paren", "");
            return false;

        case '[':
            state.result = new RENode (REOP_CLASS);
            termStart = state.cp;
            state.result.startIndex = termStart;
            /* Find the closing ']', stepping over backslash-escaped characters. */
            while (true) {
                if (state.cp >= state.cpend) {
                    reportError ("msg.unterm.class", "");
                    return false;
                }
                if (src [state.cp] == '\\')
                    state.cp++;
                else {
                    if (src [state.cp] == ']') {
                        state.result.kidlen = state.cp - termStart;
                        break;
                    }
                }
                state.cp++;
            }
            state.result.index = state.classCount++;
            /*
             * Call calculateBitmapSize now as we want any errors it finds
             * to be reported during the parse phase, not at execution.
             */
            if (!calculateBitmapSize (state, state.result, src, termStart, state.cp++))
                return false;
            state.progLength += 3; /* CLASS, <index> */
            break;

        case '.':
            state.result = new RENode (REOP_DOT);
            state.progLength++;
            break;

        case '*':
        case '+':
        case '?':
            reportError ("msg.bad.quant", Convert.ToString (src [state.cp - 1]));
            return false;

        default:
            state.result = new RENode (REOP_FLAT);
            state.result.chr = c;
            state.result.length = 1;
            state.result.flatIndex = state.cp - 1;
            state.progLength += 3;
            break;
    }

    term = state.result;
    if (state.cp == state.cpend) {
        return true;
    }

    bool hasQ = false;
    switch (src [state.cp]) {
        case '+':
            state.result = new RENode (REOP_QUANT);
            state.result.min = 1;
            state.result.max = -1;
            /* <PLUS>, <parencount>, <parenindex>, <next> ... <ENDCHILD> */
            state.progLength += 8;
            hasQ = true;
            break;

        case '*':
            state.result = new RENode (REOP_QUANT);
            state.result.min = 0;
            state.result.max = -1;
            /* <STAR>, <parencount>, <parenindex>, <next> ... <ENDCHILD> */
            state.progLength += 8;
            hasQ = true;
            break;

        case '?':
            state.result = new RENode (REOP_QUANT);
            state.result.min = 0;
            state.result.max = 1;
            /* <OPT>, <parencount>, <parenindex>, <next> ... <ENDCHILD> */
            state.progLength += 8;
            hasQ = true;
            break;

        case '{': /* balance '}' */
        {
            int min = 0;
            int max = -1;
            int leftCurl = state.cp;

            /* For Perl etc. compatibility, if quantifier does not match
             * \{\d+(,\d*)?\} exactly back off from it
             * being a quantifier, and chew it up as a literal
             * atom next time instead.
             *
             * Every read below is checked against state.cpend, so a pattern
             * that ends inside the braces ("a{", "a{1", "a{1,", "a{1,2")
             * backs off to a literal '{' instead of indexing past the
             * end of the pattern.
             */
            if ((++state.cp < state.cpend) && isDigit (c = src [state.cp])) {
                ++state.cp;
                min = getDecimalValue (c, state, 0xFFFF, "msg.overlarge.min");
                if (state.cp < state.cpend) {
                    c = src [state.cp];
                    if (c == ',') {
                        if (++state.cp < state.cpend) {
                            c = src [state.cp];
                            if (isDigit (c)) {
                                ++state.cp;
                                max = getDecimalValue (c, state, 0xFFFF, "msg.overlarge.max");
                                if (state.cp < state.cpend) {
                                    c = src [state.cp];
                                    if (min > max) {
                                        reportError ("msg.max.lt.min", Convert.ToString (src [state.cp]));
                                        return false;
                                    }
                                }
                                /* else: c is still a digit, so the '}' test below fails */
                            }
                        }
                        /* else: c is still ',', so the '}' test below fails */
                    } else {
                        max = min;
                    }
                    /* balance '{' */
                    if (c == '}') {
                        state.result = new RENode (REOP_QUANT);
                        state.result.min = min;
                        state.result.max = max;
                        // QUANT, <min>, <max>, <parencount>,
                        // <parenindex>, <next> ... <ENDCHILD>
                        state.progLength += 12;
                        hasQ = true;
                        /* A second '{' right after the quantifier is reported as a bad quantifier. */
                        if (state.cp + 1 != state.cpend) {
                            char nc = src [state.cp + 1];
                            if (nc == '{') {
                                string quant = string.Empty;
                                for (int i = 2; state.cp + i != state.cpend; i++) {
                                    if (src [state.cp + i] == '}')
                                        break;
                                    quant += src [state.cp + i];
                                }
                                reportError ("msg.bad.quant", quant);
                            }
                        }
                    }
                }
            }
            if (!hasQ) {
                /* Not a well-formed quantifier: rewind so '{' is parsed as a literal. */
                state.cp = leftCurl;
            }
            break;
        }
    }

    if (!hasQ)
        return true;

    /* Step over the last quantifier character and wrap the term. */
    ++state.cp;
    state.result.kid = term;
    state.result.parenIndex = parenBaseCount;
    state.result.parenCount = state.parenCount - parenBaseCount;
    if ((state.cp < state.cpend) && (src [state.cp] == '?')) {
        ++state.cp;
        state.result.greedy = false;
    } else
        state.result.greedy = true;
    return true;
}
/*
 * Writes the bytecode for the node chain that starts at t into re.program,
 * beginning at offset pc, following the 'next' links and recursing into
 * child nodes. Returns the offset just past the last element written.
 *
 * For REOP_CLASS nodes a new RECharSet is also stored in re.classList.
 * Adjacent REOP_FLAT nodes that cover contiguous source positions are merged
 * in place, which modifies the node list.
 */
private static int emitREBytecode(CompilerState state, RECompiled re, int pc, RENode t)
{
    sbyte [] code = re.program;

    for (RENode node = t; node != null; node = node.next) {
        /* Emit the node's opcode first; several cases below overwrite it at pc - 1. */
        code [pc++] = node.op;

        switch (node.op) {
            case REOP_EMPTY:
                /* Nothing to emit: take back the slot just written. */
                --pc;
                break;

            case REOP_ALT: {
                RENode secondBranch = node.kid2;
                int altPatch = pc; /* address of next alternate */
                pc += OFFSET_LEN;
                pc = emitREBytecode (state, re, pc, node.kid);
                code [pc++] = REOP_JUMP;
                int joinPatch = pc; /* address of following term */
                pc += OFFSET_LEN;
                resolveForwardJump (code, altPatch, pc);
                pc = emitREBytecode (state, re, pc, secondBranch);
                code [pc++] = REOP_JUMP;
                altPatch = pc;
                pc += OFFSET_LEN;
                resolveForwardJump (code, joinPatch, pc);
                resolveForwardJump (code, altPatch, pc);
                break;
            }

            case REOP_FLAT: {
                bool indexed = (node.flatIndex != -1);

                /*
                 * Consecutize FLAT's if possible.
                 */
                if (indexed) {
                    while ((node.next != null) && (node.next.op == REOP_FLAT)
                           && ((node.flatIndex + node.length) == node.next.flatIndex)) {
                        node.length += node.next.length;
                        node.next = node.next.next;
                    }
                }

                bool fold = ((state.flags & JSREG_FOLD) != 0);
                if (indexed && (node.length > 1)) {
                    /* Multi-character run: refer to the source by index and length. */
                    if (fold)
                        code [pc - 1] = REOP_FLATi;
                    else
                        code [pc - 1] = REOP_FLAT;
                    pc = addIndex (code, pc, node.flatIndex);
                    pc = addIndex (code, pc, node.length);
                } else if (node.chr < 256) {
                    /* Single character that fits in one program element. */
                    if (fold)
                        code [pc - 1] = REOP_FLAT1i;
                    else
                        code [pc - 1] = REOP_FLAT1;
                    code [pc++] = (sbyte)(node.chr);
                } else {
                    /* Single character of 256 or above: stored via addIndex. */
                    if (fold)
                        code [pc - 1] = REOP_UCFLAT1i;
                    else
                        code [pc - 1] = REOP_UCFLAT1;
                    pc = addIndex (code, pc, node.chr);
                }
                break;
            }

            case REOP_LPAREN:
                pc = addIndex (code, pc, node.parenIndex);
                pc = emitREBytecode (state, re, pc, node.kid);
                code [pc++] = REOP_RPAREN;
                pc = addIndex (code, pc, node.parenIndex);
                break;

            case REOP_BACKREF:
                pc = addIndex (code, pc, node.parenIndex);
                break;

            case REOP_ASSERT: {
                int afterPatch = pc;
                pc += OFFSET_LEN;
                pc = emitREBytecode (state, re, pc, node.kid);
                code [pc++] = REOP_ASSERTTEST;
                resolveForwardJump (code, afterPatch, pc);
                break;
            }

            case REOP_ASSERT_NOT: {
                int afterPatch = pc;
                pc += OFFSET_LEN;
                pc = emitREBytecode (state, re, pc, node.kid);
                code [pc++] = REOP_ASSERTNOTTEST;
                resolveForwardJump (code, afterPatch, pc);
                break;
            }

            case REOP_QUANT: {
                /* Common min/max pairs get dedicated opcodes; others carry min and max. */
                if ((node.min == 0) && (node.max == -1)) {
                    if (node.greedy)
                        code [pc - 1] = REOP_STAR;
                    else
                        code [pc - 1] = REOP_MINIMALSTAR;
                } else if ((node.min == 0) && (node.max == 1)) {
                    if (node.greedy)
                        code [pc - 1] = REOP_OPT;
                    else
                        code [pc - 1] = REOP_MINIMALOPT;
                } else if ((node.min == 1) && (node.max == -1)) {
                    if (node.greedy)
                        code [pc - 1] = REOP_PLUS;
                    else
                        code [pc - 1] = REOP_MINIMALPLUS;
                } else {
                    if (!node.greedy)
                        code [pc - 1] = REOP_MINIMALQUANT;
                    pc = addIndex (code, pc, node.min);
                    // max can be -1 which addIndex does not accept
                    pc = addIndex (code, pc, node.max + 1);
                }
                pc = addIndex (code, pc, node.parenCount);
                pc = addIndex (code, pc, node.parenIndex);
                int afterPatch = pc;
                pc += OFFSET_LEN;
                pc = emitREBytecode (state, re, pc, node.kid);
                code [pc++] = REOP_ENDCHILD;
                resolveForwardJump (code, afterPatch, pc);
                break;
            }

            case REOP_CLASS:
                pc = addIndex (code, pc, node.index);
                re.classList [node.index] = new RECharSet (node.bmsize, node.startIndex, node.kidlen);
                break;

            default:
                /* All other opcodes consist of the opcode alone. */
                break;
        }
    }
    return pc;
}