/*
 * Tests a single character against a character class.
 *
 * The class bitmap is built on the first call (when charSet.converted is
 * still false). A character counts as "absent" when the class is empty,
 * when it lies beyond charSet.length, or when its bit is clear. For a
 * normal class (sense == true) the character must be present; when the
 * ^ flag was specified (sense == false) non-inclusion is the success.
 */
private static bool classMatcher(REGlobalData gData, RECharSet charSet, char ch)
{
    if (!charSet.converted) {
        processCharSet (gData, charSet);
    }

    // Short-circuit order matters: the bitmap is only indexed once the
    // emptiness and upper-bound checks have passed.
    bool absent = (charSet.length == 0)
        || (ch > charSet.length)
        || ((charSet.bits [ch / 8] & (1 << (ch & 0x7))) == 0);

    return charSet.sense ? !absent : absent;
}
/*
 * Runs the compiled expression (the member `re`) against str, beginning at
 * indexp [0], and records the outcome in res.
 *
 * indexp is assumed to be an array of length 1; on a successful match its
 * single element is overwritten with the position just past the match.
 *
 * Returns:
 *   - on failure: Undefined.Value when matchType is PREFIX, null otherwise;
 *   - on success with matchType TEST: a boxed true (no array is allocated);
 *   - on success otherwise: a new "Array" object holding the matched text,
 *     the paren captures, and the "index" and "input" properties.
 */
internal virtual object executeRegExp(Context cx, IScriptable scopeObj, RegExpImpl res, string str, int [] indexp, int matchType)
{
    REGlobalData gData = new REGlobalData ();

    int start = indexp [0];
    char [] input = str.ToCharArray ();
    int end = input.Length;
    if (start > end)
        start = end;

    // The recursive matcher does the real work.
    if (!matchRegExp (gData, re, input, start, end, res.multiline)) {
        if (matchType == PREFIX)
            return Undefined.Value;
        return null;
    }

    int matchEnd = gData.cp;
    indexp [0] = matchEnd;
    int matchStart = start + gData.skipped;
    int matchLength = matchEnd - matchStart;
    bool testOnly = (matchType == TEST);

    object result;
    IScriptable obj;
    if (testOnly) {
        /*
         * Testing for a match and updating cx.regExpImpl: don't allocate
         * an array object, do return true.
         */
        result = true;
        obj = null;
    } else {
        /*
         * The array returned on match has element 0 bound to the matched
         * string, elements 1 through re.parenCount bound to the paren
         * matches, an index property telling the length of the left context,
         * and an input property referring to the input string.
         */
        IScriptable scope = GetTopLevelScope (scopeObj);
        result = ScriptRuntime.NewObject (cx, scope, "Array", null);
        obj = (IScriptable)result;
        obj.Put (0, obj, new string (input, matchStart, matchLength));
    }

    if (re.parenCount == 0) {
        res.parens = null;
        res.lastParen = SubString.EmptySubString;
    } else {
        // Stays null when no group participated in the match.
        SubString lastCapture = null;
        res.parens = new SubString [re.parenCount];
        for (int group = 0; group < re.parenCount; group++) {
            int captureStart = gData.parens_index (group);
            if (captureStart == -1) {
                // Unmatched group: slot in res.parens stays null.
                if (!testOnly)
                    obj.Put (group + 1, obj, Undefined.Value);
                continue;
            }
            lastCapture = new SubString (input, captureStart, gData.parens_length (group));
            res.parens [group] = lastCapture;
            if (!testOnly)
                obj.Put (group + 1, obj, lastCapture.ToString ());
        }
        res.lastParen = lastCapture;
    }

    if (!testOnly) {
        /*
         * Define the index and input properties last for better for/in loop
         * order (so they come after the elements).
         */
        obj.Put ("index", obj, (object)matchStart);
        obj.Put ("input", obj, str);
    }

    // The three context substrings are allocated together on first use.
    if (res.lastMatch == null) {
        res.lastMatch = new SubString ();
        res.leftContext = new SubString ();
        res.rightContext = new SubString ();
    }

    res.lastMatch.charArray = input;
    res.lastMatch.index = matchStart;
    res.lastMatch.length = matchLength;

    res.leftContext.charArray = input;
    if (cx.Version == Context.Versions.JS1_2) {
        /*
         * JS1.2 emulated Perl4.0.1.8 (patch level 36) for global regexps used
         * in scalar contexts, and unintentionally for the string.match "list"
         * pseudo-context.  On "hi there bye", the following would result:
         *
         * Language     while(/ /g){print("$`");}   s/ /$`/g
         * perl4.036    "hi", "there"               "hihitherehi therebye"
         * perl5        "hi", "hi there"            "hihitherehi therebye"
         * js1.2        "hi", "there"               "hihitheretherebye"
         *
         * Insofar as JS1.2 always defined $` as "left context from the last
         * match" for global regexps, it was more consistent than perl4.
         */
        res.leftContext.index = start;
        res.leftContext.length = gData.skipped;
    } else {
        /*
         * For JS1.3 and ECMAv2, emulate Perl5 exactly:
         *
         * js1.3        "hi", "hi there"            "hihitherehi therebye"
         */
        res.leftContext.index = 0;
        res.leftContext.length = start + gData.skipped;
    }

    res.rightContext.charArray = input;
    res.rightContext.index = matchEnd;
    res.rightContext.length = end - matchEnd;

    return result;
}
/*
 * Matches a back-reference to capture group parenIndex at gData.cp.
 *
 * Specification text this matcher follows:
 *
 * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
 * 2. If E is not a character then go to step 6.
 * 3. Let ch be E's character.
 * 4. Let A be a one-element RECharSet containing the character ch.
 * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
 * 6. E must be an integer. Let n be that integer.
 * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
 * 8. Return an internal Matcher closure that takes two arguments, a State x
 *    and a Continuation c, and performs the following:
 *     1. Let cap be x's captures internal array.
 *     2. Let s be cap[n].
 *     3. If s is undefined, then call c(x) and return its result.
 *     4. Let e be x's endIndex.
 *     5. Let len be s's length.
 *     6. Let f be e+len.
 *     7. If f>InputLength, return failure.
 *     8. If there exists an integer i between 0 (inclusive) and len
 *        (exclusive) such that Canonicalize(s[i]) is not the same character
 *        as Canonicalize(Input [e+i]), then return failure.
 *     9. Let y be the State (f, cap).
 *    10. Call c(y) and return its result.
 *
 * Returns true (advancing gData.cp past the repeated text) on success, and
 * also true without consuming input when the group has no capture.
 */
private static bool backrefMatcher(REGlobalData gData, int parenIndex, char [] chars, int end)
{
    int captureStart = gData.parens_index (parenIndex);
    if (captureStart == -1)
        return true;

    int captureLength = gData.parens_length (parenIndex);
    if ((gData.cp + captureLength) > end)
        return false;

    bool foldCase = (gData.regexp.flags & JSREG_FOLD) != 0;
    for (int k = 0; k < captureLength; k++) {
        char captured = chars [captureStart + k];
        char candidate = chars [gData.cp + k];
        bool same = foldCase
            ? (upcase (captured) == upcase (candidate))
            : (captured == candidate);
        if (!same)
            return false;
    }

    gData.cp += captureLength;
    return true;
}
/*
 * Pushes a new REProgState onto gData's state stack. The new entry links to
 * the current top and records min, max, the current gData.cp, the supplied
 * backtrack entry and the supplied continuation.
 */
private static void pushProgState(REGlobalData gData, int min, int max, REBackTrackData backTrackLastToSave, int continuation_pc, int continuation_op)
{
    REProgState enclosing = gData.stateStackTop;
    REProgState pushed = new REProgState (enclosing, min, max, gData.cp, backTrackLastToSave, continuation_pc, continuation_op);
    gData.stateStackTop = pushed;
}
internal REProgState stateStackTop; /* state of op that backtracked */

#endregion Fields

#region Constructors

/*
 * Captures a backtrack point: links to the current top of gData's backtrack
 * stack, stores the op/pc to resume at, and snapshots lastParen, cp, the
 * state stack top and (when present) a private copy of the paren array.
 */
internal REBackTrackData(REGlobalData gData, int op, int pc)
{
    previous = gData.backTrackStackTop;
    continuation_op = op;
    continuation_pc = pc;
    lastParen = gData.lastParen;
    cp = gData.cp;
    stateStackTop = gData.stateStackTop;

    // Copy rather than alias, so later capture updates in gData do not
    // alter this snapshot.
    if (gData.parens != null) {
        parens = (long [])gData.parens.Clone ();
    }
}
/*
 * Fills charSet.bits from the class text found in gData.regexp.source at
 * [charSet.startIndex, charSet.startIndex + charSet.strlength).
 *
 * Sets charSet.sense to false when the text starts with '^'. Handles
 * single-character escapes, \c, \x / \u hex escapes, octal escapes, the
 * class escapes \d \D \s \S \w \W, and 'a-b' ranges. With JSREG_FOLD set,
 * both the upcase and the downcase form of each character/range are added.
 */
private static void processCharSetImpl(REGlobalData gData, RECharSet charSet)
{
    int src = charSet.startIndex;
    int end = src + charSet.strlength;

    charSet.sense = true;
    charSet.bits = new sbyte [(charSet.length / 8) + 1];

    if (src == end)
        return;

    if (gData.regexp.source [src] == '^') {
        charSet.sense = false;
        ++src;
    }

    bool foldCase = (gData.regexp.flags & JSREG_FOLD) != 0;
    bool pendingRange = false;
    char rangeStart = (char)0;

    while (src != end) {
        char thisCh;

        if (gData.regexp.source [src] != '\\') {
            thisCh = gData.regexp.source [src++];
        } else {
            ++src;
            char esc = gData.regexp.source [src++];
            switch (esc) {
                case 'b':
                    thisCh = '\b';
                    break;
                case 'f':
                    thisCh = '\f';
                    break;
                case 'n':
                    thisCh = '\n';
                    break;
                case 'r':
                    thisCh = '\r';
                    break;
                case 't':
                    thisCh = '\t';
                    break;
                case 'v':
                    thisCh = '\v';
                    break;

                case 'c':
                    // NOTE(review): the look-ahead tests source [src + 1] but
                    // the character consumed is source [src]; this looks like
                    // an off-by-one. Left as-is - confirm against the pattern
                    // parser's bitmap sizing before changing.
                    if (((src + 1) < end) && isWord (gData.regexp.source [src + 1])) {
                        thisCh = (char)(gData.regexp.source [src++] & 0x1F);
                    } else {
                        // Re-scan the 'c' as an ordinary character.
                        --src;
                        thisCh = '\\';
                    }
                    break;

                case 'u':
                case 'x': {
                    // \x takes up to 2 hex digits, \u up to 4.
                    int wanted = (esc == 'u') ? 4 : 2;
                    int value = 0;
                    for (int read = 0; (read < wanted) && (src < end); read++) {
                        char hexCh = gData.regexp.source [src++];
                        int digit = toASCIIHexDigit (hexCh);
                        if (digit < 0) {
                            /* back off to accepting the original '\'
                             * as a literal */
                            src -= (read + 1);
                            value = '\\';
                            break;
                        }
                        value = (value << 4) | digit;
                    }
                    thisCh = (char)value;
                    break;
                }

                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7': {
                    /*
                     * This is a non-ECMA extension - decimal escapes (in this
                     * case, octal!) are supposed to be an error inside class
                     * ranges, but supported here for backwards compatibility.
                     */
                    int value = esc - '0';
                    char next = gData.regexp.source [src];
                    if ('0' <= next && next <= '7') {
                        src++;
                        value = 8 * value + (next - '0');
                        next = gData.regexp.source [src];
                        if ('0' <= next && next <= '7') {
                            src++;
                            // A third digit is kept only if the result fits in 255.
                            int widened = 8 * value + (next - '0');
                            if (widened <= 255)
                                value = widened;
                            else
                                src--;
                        }
                    }
                    thisCh = (char)value;
                    break;
                }

                case 'd':
                    addCharacterRangeToCharSet (charSet, '0', '9');
                    continue; /* don't need range processing */

                case 'D':
                    addCharacterRangeToCharSet (charSet, (char)0, (char)('0' - 1));
                    addCharacterRangeToCharSet (charSet, (char)('9' + 1), (char)(charSet.length));
                    continue;

                case 's':
                    for (int code = (int)(charSet.length); code >= 0; code--) {
                        if (isREWhiteSpace (code))
                            addCharacterToCharSet (charSet, (char)(code));
                    }
                    continue;

                case 'S':
                    for (int code = (int)(charSet.length); code >= 0; code--) {
                        if (!isREWhiteSpace (code))
                            addCharacterToCharSet (charSet, (char)(code));
                    }
                    continue;

                case 'w':
                    for (int code = (int)(charSet.length); code >= 0; code--) {
                        if (isWord ((char)code))
                            addCharacterToCharSet (charSet, (char)(code));
                    }
                    continue;

                case 'W':
                    for (int code = (int)(charSet.length); code >= 0; code--) {
                        if (!isWord ((char)code))
                            addCharacterToCharSet (charSet, (char)(code));
                    }
                    continue;

                default:
                    thisCh = esc;
                    break;
            }
        }

        if (pendingRange) {
            // thisCh closes the range opened by the previous character.
            if (foldCase) {
                addCharacterRangeToCharSet (charSet, upcase (rangeStart), upcase (thisCh));
                addCharacterRangeToCharSet (charSet, downcase (rangeStart), downcase (thisCh));
            } else {
                addCharacterRangeToCharSet (charSet, rangeStart, thisCh);
            }
            pendingRange = false;
            continue;
        }

        if (foldCase) {
            addCharacterToCharSet (charSet, upcase (thisCh));
            addCharacterToCharSet (charSet, downcase (thisCh));
        } else {
            addCharacterToCharSet (charSet, thisCh);
        }

        // A '-' opens a range only when at least one character follows it.
        if ((src < (end - 1)) && (gData.regexp.source [src] == '-')) {
            ++src;
            pendingRange = true;
            rangeStart = thisCh;
        }
    }
}
/*
 * Pushes a backtrack entry that resumes at (op, target). The
 * REBackTrackData constructor receives gData and takes its snapshot from it.
 */
private static void pushBackTrackState(REGlobalData gData, sbyte op, int target)
{
    REBackTrackData entry = new REBackTrackData (gData, op, target);
    gData.backTrackStackTop = entry;
}
/*
 * Compile the source of the class into a RECharSet.
 *
 * The conversion runs while holding a lock on charSet, and the converted
 * flag is re-checked inside the lock so the bitmap is built only once.
 */
private static void processCharSet(REGlobalData gData, RECharSet charSet)
{
    lock (charSet) {
        if (charSet.converted)
            return;

        processCharSetImpl (gData, charSet);
        charSet.converted = true;
    }
}
/*
 * Removes and returns the top entry of gData's state stack; the stack top
 * becomes that entry's `previous` link.
 */
private static REProgState popProgState(REGlobalData gData)
{
    REProgState top = gData.stateStackTop;
    gData.stateStackTop = top.previous;
    return top;
}
/*
 * Tries the compiled program at each position from start to end and stops
 * at the first position where executeREBytecode succeeds.
 *
 * On success gData.cp is where the bytecode run left it and gData.skipped
 * is the number of positions stepped over before the match began. Returns
 * false when no position matches.
 */
private static bool matchRegExp(REGlobalData gData, RECompiled re, char [] chars, int start, int end, bool multiline)
{
    gData.parens = (re.parenCount != 0) ? new long [re.parenCount] : null;

    gData.backTrackStackTop = null;
    gData.stateStackTop = null;

    gData.multiline = multiline;
    gData.regexp = re;
    gData.lastParen = 0;

    int anchorCh = gData.regexp.anchorCh;
    bool foldCase = (gData.regexp.flags & JSREG_FOLD) != 0;

    // The position just past the last character is tried as well, so that
    // end-of-input / end-of-line conditions can be detected.
    int pos = start;
    while (pos <= end) {
        if (anchorCh >= 0) {
            // The first node is a literal: step forward until that
            // character is found, or fail if it never occurs.
            while (true) {
                if (pos == end) {
                    return false;
                }
                char candidate = chars [pos];
                if (candidate == anchorCh
                    || (foldCase && upcase (candidate) == upcase ((char)anchorCh))) {
                    break;
                }
                ++pos;
            }
        }

        gData.cp = pos;
        for (int group = 0; group < re.parenCount; group++) {
            gData.set_parens (group, -1, 0);
        }

        bool matched = executeREBytecode (gData, chars, end);

        gData.backTrackStackTop = null;
        gData.stateStackTop = null;

        if (matched) {
            gData.skipped = pos - start;
            return true;
        }

        ++pos;
    }

    return false;
}
/*
 * Consecutive literal characters.
 *
 * Compares `length` characters of gData.regexp.source, starting at
 * matchChars, with the input at gData.cp (case-sensitive). On success
 * gData.cp is advanced by length; on failure it is left untouched.
 */
private static bool flatNMatcher(REGlobalData gData, int matchChars, int length, char [] chars, int end)
{
    int inputPos = gData.cp;
    if ((inputPos + length) > end)
        return false;

    int patternPos = matchChars;
    for (int remaining = length; remaining > 0; remaining--) {
        if (gData.regexp.source [patternPos] != chars [inputPos]) {
            return false;
        }
        patternPos++;
        inputPos++;
    }

    gData.cp += length;
    return true;
}
/*
 * Interprets the compiled program in gData.regexp.program against chars,
 * starting at input position gData.cp and treating `end` as the input limit.
 *
 * Returns true when REOP_END is reached. Returns false when an op fails and
 * gData's backtrack stack is empty.
 *
 * Control flow inside the loop:
 *   - a case that ends in `break` has produced `result`; the code after the
 *     switch then either fetches the next op (success) or backtracks;
 *   - a case that ends in `continue` has already loaded the next `op`/`pc`
 *     itself and skips the success/backtrack check.
 */
private static bool executeREBytecode(REGlobalData gData, char [] chars, int end)
{
    int pc = 0;
    sbyte [] program = gData.regexp.program;
    // The op/pc that REOP_ENDCHILD transfers control to.
    int currentContinuation_op;
    int currentContinuation_pc;
    bool result = false;

    currentContinuation_pc = 0;
    currentContinuation_op = REOP_END;
    if (debug) {
        System.Console.Out.WriteLine ("Input = \"" + new string (chars) + "\", start at " + gData.cp);
    }
    int op = program [pc++];
    for (; ; ) {
        if (debug) {
            System.Console.Out.WriteLine ("Testing at " + gData.cp + ", op = " + op);
        }
        switch (op) {

            case REOP_EMPTY:
                result = true;
                break;

            case REOP_BOL:
                // Succeeds at position 0, or (multiline only) right after a
                // line terminator.
                if (gData.cp != 0) {
                    if (gData.multiline || ((gData.regexp.flags & JSREG_MULTILINE) != 0)) {
                        if (!isLineTerm (chars [gData.cp - 1])) {
                            result = false;
                            break;
                        }
                    }
                    else {
                        result = false;
                        break;
                    }
                }
                result = true;
                break;

            case REOP_EOL:
                // Succeeds at `end`, or (multiline only) right before a line
                // terminator.
                if (gData.cp != end) {
                    if (gData.multiline || ((gData.regexp.flags & JSREG_MULTILINE) != 0)) {
                        if (!isLineTerm (chars [gData.cp])) {
                            result = false;
                            break;
                        }
                    }
                    else {
                        result = false;
                        break;
                    }
                }
                result = true;
                break;

            case REOP_WBDRY:
                // True when exactly one of the characters on either side of
                // cp is a word character (input edges count as non-word).
                result = ((gData.cp == 0 || !isWord (chars [gData.cp - 1])) ^ !((gData.cp < end) && isWord (chars [gData.cp])));
                break;

            case REOP_WNONBDRY:
                result = ((gData.cp == 0 || !isWord (chars [gData.cp - 1])) ^ ((gData.cp < end) && isWord (chars [gData.cp])));
                break;

            // The single-character tests below all follow one pattern: fail
            // at `end`, otherwise test chars [cp] and consume it on success.
            case REOP_DOT:
                result = (gData.cp != end && !isLineTerm (chars [gData.cp]));
                if (result) {
                    gData.cp++;
                }
                break;

            case REOP_DIGIT:
                result = (gData.cp != end && isDigit (chars [gData.cp]));
                if (result) {
                    gData.cp++;
                }
                break;

            case REOP_NONDIGIT:
                result = (gData.cp != end && !isDigit (chars [gData.cp]));
                if (result) {
                    gData.cp++;
                }
                break;

            case REOP_SPACE:
                result = (gData.cp != end && isREWhiteSpace (chars [gData.cp]));
                if (result) {
                    gData.cp++;
                }
                break;

            case REOP_NONSPACE:
                result = (gData.cp != end && !isREWhiteSpace (chars [gData.cp]));
                if (result) {
                    gData.cp++;
                }
                break;

            case REOP_ALNUM:
                result = (gData.cp != end && isWord (chars [gData.cp]));
                if (result) {
                    gData.cp++;
                }
                break;

            case REOP_NONALNUM:
                result = (gData.cp != end && !isWord (chars [gData.cp]));
                if (result) {
                    gData.cp++;
                }
                break;

            case REOP_FLAT: {
                    // Operands: <offset> <length> of a literal run.
                    int offset = getIndex (program, pc);
                    pc += INDEX_LEN;
                    int length = getIndex (program, pc);
                    pc += INDEX_LEN;
                    result = flatNMatcher (gData, offset, length, chars, end);
                }
                break;

            case REOP_FLATi: {
                    int offset = getIndex (program, pc);
                    pc += INDEX_LEN;
                    int length = getIndex (program, pc);
                    pc += INDEX_LEN;
                    result = flatNIMatcher (gData, offset, length, chars, end);
                }
                break;

            case REOP_FLAT1: {
                    // Operand: one program byte, masked to 0..255.
                    char matchCh = (char)(program [pc++] & 0xFF);
                    result = (gData.cp != end && chars [gData.cp] == matchCh);
                    if (result) {
                        gData.cp++;
                    }
                }
                break;

            case REOP_FLAT1i: {
                    char matchCh = (char)(program [pc++] & 0xFF);
                    result = (gData.cp != end && upcase (chars [gData.cp]) == upcase (matchCh));
                    if (result) {
                        gData.cp++;
                    }
                }
                break;

            case REOP_UCFLAT1: {
                    // Operand: the character, stored as an index-sized value.
                    char matchCh = (char)getIndex (program, pc);
                    pc += INDEX_LEN;
                    result = (gData.cp != end && chars [gData.cp] == matchCh);
                    if (result) {
                        gData.cp++;
                    }
                }
                break;

            case REOP_UCFLAT1i: {
                    char matchCh = (char)getIndex (program, pc);
                    pc += INDEX_LEN;
                    result = (gData.cp != end && upcase (chars [gData.cp]) == upcase (matchCh));
                    if (result) {
                        gData.cp++;
                    }
                }
                break;

            case REOP_ALT: {
                    // Save the continuation, register the op at pc + <offset>
                    // as the backtrack target, then run the op that follows
                    // the offset operand.
                    int nextpc;
                    sbyte nextop;
                    pushProgState (gData, 0, 0, null, currentContinuation_pc, currentContinuation_op);
                    nextpc = pc + getOffset (program, pc);
                    nextop = program [nextpc++];
                    pushBackTrackState (gData, nextop, nextpc);
                    pc += INDEX_LEN;
                    op = program [pc++];
                }
                continue;

            case REOP_JUMP: {
                    // Restore the continuation from the popped state and jump
                    // by <offset>.
                    int offset;
                    REProgState state = popProgState (gData);
                    currentContinuation_pc = state.continuation_pc;
                    currentContinuation_op = state.continuation_op;
                    offset = getOffset (program, pc);
                    pc += offset;
                    op = program [pc++];
                }
                continue;

            case REOP_LPAREN: {
                    // Record the capture start; length stays 0 until RPAREN.
                    int parenIndex = getIndex (program, pc);
                    pc += INDEX_LEN;
                    gData.set_parens (parenIndex, gData.cp, 0);
                    op = program [pc++];
                }
                continue;

            case REOP_RPAREN: {
                    // Close the capture: length = cp - recorded start.
                    int cap_index;
                    int parenIndex = getIndex (program, pc);
                    pc += INDEX_LEN;
                    cap_index = gData.parens_index (parenIndex);
                    gData.set_parens (parenIndex, cap_index, gData.cp - cap_index);
                    if (parenIndex > gData.lastParen)
                        gData.lastParen = parenIndex;
                    op = program [pc++];
                }
                continue;

            case REOP_BACKREF: {
                    int parenIndex = getIndex (program, pc);
                    pc += INDEX_LEN;
                    result = backrefMatcher (gData, parenIndex, chars, end);
                }
                break;

            case REOP_CLASS: {
                    // Operand: index into gData.regexp.classList.
                    int index = getIndex (program, pc);
                    pc += INDEX_LEN;
                    if (gData.cp != end) {
                        if (classMatcher (gData, gData.regexp.classList [index], chars [gData.cp])) {
                            gData.cp++;
                            result = true;
                            break;
                        }
                    }
                    result = false;
                }
                break;

            case REOP_ASSERT:
            case REOP_ASSERT_NOT: {
                    // Save the current backtrack top in the pushed state, then
                    // register the matching *TEST op (at pc + <offset>) as the
                    // backtrack target before running the assertion body.
                    sbyte testOp;
                    pushProgState (gData, 0, 0, gData.backTrackStackTop, currentContinuation_pc, currentContinuation_op);
                    if (op == REOP_ASSERT) {
                        testOp = REOP_ASSERTTEST;
                    }
                    else {
                        testOp = REOP_ASSERTNOTTEST;
                    }
                    pushBackTrackState (gData, testOp, pc + getOffset (program, pc));
                    pc += INDEX_LEN;
                    op = program [pc++];
                }
                continue;

            case REOP_ASSERTTEST:
            case REOP_ASSERTNOTTEST: {
                    // Restore cp and the backtrack stack from the popped
                    // state, then keep `result` for ASSERTTEST and invert it
                    // for ASSERTNOTTEST.
                    REProgState state = popProgState (gData);
                    gData.cp = state.index;
                    gData.backTrackStackTop = state.backTrack;
                    currentContinuation_pc = state.continuation_pc;
                    currentContinuation_op = state.continuation_op;
                    if (result) {
                        if (op == REOP_ASSERTTEST) {
                            result = true;
                        }
                        else {
                            result = false;
                        }
                    }
                    else {
                        if (op == REOP_ASSERTTEST) {
                            // Do nothing
                        }
                        else {
                            result = true;
                        }
                    }
                }
                break;

            case REOP_STAR:
            case REOP_PLUS:
            case REOP_OPT:
            case REOP_QUANT:
            case REOP_MINIMALSTAR:
            case REOP_MINIMALPLUS:
            case REOP_MINIMALOPT:
            case REOP_MINIMALQUANT: {
                    // Decode the quantifier into (min, max, greedy); max == -1
                    // is compared against explicitly by the REPEAT ops below.
                    int min, max;
                    bool greedy = false;
                    switch (op) {

                        case REOP_STAR:
                            greedy = true;
                            // fallthrough
                            goto case REOP_MINIMALSTAR;

                        case REOP_MINIMALSTAR:
                            min = 0;
                            max = -1;
                            break;

                        case REOP_PLUS:
                            greedy = true;
                            // fallthrough
                            goto case REOP_MINIMALPLUS;

                        case REOP_MINIMALPLUS:
                            min = 1;
                            max = -1;
                            break;

                        case REOP_OPT:
                            greedy = true;
                            // fallthrough
                            goto case REOP_MINIMALOPT;

                        case REOP_MINIMALOPT:
                            min = 0;
                            max = 1;
                            break;

                        case REOP_QUANT:
                            greedy = true;
                            // fallthrough
                            goto case REOP_MINIMALQUANT;

                        case REOP_MINIMALQUANT:
                            min = getOffset (program, pc);
                            pc += INDEX_LEN;
                            // See comments in emitREBytecode for " - 1" reason
                            max = getOffset (program, pc) - 1;
                            pc += INDEX_LEN;
                            break;

                        default:
                            throw Context.CodeBug ();

                    }
                    pushProgState (gData, min, max, null, currentContinuation_pc, currentContinuation_op);
                    if (greedy) {
                        // Greedy: enter the child first; REOP_REPEAT is both
                        // the continuation and the backtrack target.
                        currentContinuation_op = REOP_REPEAT;
                        currentContinuation_pc = pc;
                        pushBackTrackState (gData, REOP_REPEAT, pc);
                        /* Step over <parencount>, <parenindex> & <next> */
                        pc += 3 * INDEX_LEN;
                        op = program [pc++];
                    }
                    else {
                        if (min != 0) {
                            // At least one child is required: enter it.
                            currentContinuation_op = REOP_MINIMALREPEAT;
                            currentContinuation_pc = pc;
                            /* <parencount> <parenindex> & <next> */
                            pc += 3 * INDEX_LEN;
                            op = program [pc++];
                        }
                        else {
                            // Zero children allowed: skip the child now and
                            // leave REOP_MINIMALREPEAT as a backtrack option.
                            pushBackTrackState (gData, REOP_MINIMALREPEAT, pc);
                            popProgState (gData);
                            pc += 2 * INDEX_LEN; // <parencount> & <parenindex>
                            pc = pc + getOffset (program, pc);
                            op = program [pc++];
                        }
                    }
                }
                continue;

            case REOP_ENDCHILD:
                // Use the current continuation.
                pc = currentContinuation_pc;
                op = currentContinuation_op;
                continue;

            case REOP_REPEAT: {
                    REProgState state = popProgState (gData);
                    if (!result) {
                        //
                        // There's been a failure, see if we have enough
                        // children.
                        //
                        if (state.min == 0)
                            result = true;
                        currentContinuation_pc = state.continuation_pc;
                        currentContinuation_op = state.continuation_op;
                        pc += 2 * INDEX_LEN; /* <parencount> & <parenindex> */
                        pc = pc + getOffset (program, pc);
                        break;
                    }
                    else {
                        if (state.min == 0 && gData.cp == state.index) {
                            // matched an empty string, that'll get us nowhere
                            result = false;
                            currentContinuation_pc = state.continuation_pc;
                            currentContinuation_op = state.continuation_op;
                            pc += 2 * INDEX_LEN;
                            pc = pc + getOffset (program, pc);
                            break;
                        }
                        // One more child consumed: decrement the remaining
                        // min (unless already 0) and max (unless -1).
                        int new_min = state.min, new_max = state.max;
                        if (new_min != 0)
                            new_min--;
                        if (new_max != -1)
                            new_max--;
                        if (new_max == 0) {
                            // Upper bound reached: leave the loop successfully.
                            result = true;
                            currentContinuation_pc = state.continuation_pc;
                            currentContinuation_op = state.continuation_op;
                            pc += 2 * INDEX_LEN;
                            pc = pc + getOffset (program, pc);
                            break;
                        }
                        // Try another iteration; clear the captures that
                        // belong to the repeated child before re-entering it.
                        pushProgState (gData, new_min, new_max, null, state.continuation_pc, state.continuation_op);
                        currentContinuation_op = REOP_REPEAT;
                        currentContinuation_pc = pc;
                        pushBackTrackState (gData, REOP_REPEAT, pc);
                        int parenCount = getIndex (program, pc);
                        pc += INDEX_LEN;
                        int parenIndex = getIndex (program, pc);
                        pc += 2 * INDEX_LEN;
                        op = program [pc++];
                        for (int k = 0; k < parenCount; k++) {
                            gData.set_parens (parenIndex + k, -1, 0);
                        }
                    }
                }
                continue;

            case REOP_MINIMALREPEAT: {
                    REProgState state = popProgState (gData);
                    if (!result) {
                        //
                        // Non-greedy failure - try to consume another child.
                        //
                        if (state.max == -1 || state.max > 0) {
                            pushProgState (gData, state.min, state.max, null, state.continuation_pc, state.continuation_op);
                            currentContinuation_op = REOP_MINIMALREPEAT;
                            currentContinuation_pc = pc;
                            int parenCount = getIndex (program, pc);
                            pc += INDEX_LEN;
                            int parenIndex = getIndex (program, pc);
                            pc += 2 * INDEX_LEN;
                            for (int k = 0; k < parenCount; k++) {
                                gData.set_parens (parenIndex + k, -1, 0);
                            }
                            op = program [pc++];
                            continue;
                        }
                        else {
                            // Don't need to adjust pc since we're going to pop.
                            currentContinuation_pc = state.continuation_pc;
                            currentContinuation_op = state.continuation_op;
                            break;
                        }
                    }
                    else {
                        if (state.min == 0 && gData.cp == state.index) {
                            // Matched an empty string, that'll get us nowhere.
                            result = false;
                            currentContinuation_pc = state.continuation_pc;
                            currentContinuation_op = state.continuation_op;
                            break;
                        }
                        int new_min = state.min, new_max = state.max;
                        if (new_min != 0)
                            new_min--;
                        if (new_max != -1)
                            new_max--;
                        pushProgState (gData, new_min, new_max, null, state.continuation_pc, state.continuation_op);
                        if (new_min != 0) {
                            // Still below the minimum: run the child again.
                            currentContinuation_op = REOP_MINIMALREPEAT;
                            currentContinuation_pc = pc;
                            int parenCount = getIndex (program, pc);
                            pc += INDEX_LEN;
                            int parenIndex = getIndex (program, pc);
                            pc += 2 * INDEX_LEN;
                            for (int k = 0; k < parenCount; k++) {
                                gData.set_parens (parenIndex + k, -1, 0);
                            }
                            op = program [pc++];
                        }
                        else {
                            // Minimum satisfied: continue past the loop, with
                            // another iteration kept as a backtrack option.
                            currentContinuation_pc = state.continuation_pc;
                            currentContinuation_op = state.continuation_op;
                            pushBackTrackState (gData, REOP_MINIMALREPEAT, pc);
                            popProgState (gData);
                            pc += 2 * INDEX_LEN;
                            pc = pc + getOffset (program, pc);
                            op = program [pc++];
                        }
                        continue;
                    }
                }

            case REOP_END:
                return true;

            default:
                throw Context.CodeBug ();

        }
        /*
         * If the match failed and there's a backtrack option, take it.
         * Otherwise this is a complete and utter failure.
         */
        if (!result) {
            REBackTrackData backTrackData = gData.backTrackStackTop;
            if (backTrackData != null) {
                // Pop the entry and restore everything it snapshotted.
                gData.backTrackStackTop = backTrackData.previous;

                gData.lastParen = backTrackData.lastParen;

                // TODO: If backTrackData will no longer be used, then
                // TODO: there is no need to clone backTrackData.parens
                if (backTrackData.parens != null) {
                    gData.parens = new long [backTrackData.parens.Length];
                    backTrackData.parens.CopyTo (gData.parens, 0);
                }

                gData.cp = backTrackData.cp;

                gData.stateStackTop = backTrackData.stateStackTop;

                // NOTE(review): dereferences the restored stateStackTop
                // without a null check - presumably every backtrack entry is
                // pushed while a prog state is on the stack; confirm.
                currentContinuation_op = gData.stateStackTop.continuation_op;
                currentContinuation_pc = gData.stateStackTop.continuation_pc;
                pc = backTrackData.continuation_pc;
                op = backTrackData.continuation_op;
                continue;
            }
            else
                return false;
        }

        // The op succeeded: fetch the next one.
        op = program [pc++];
    }
}