// Parses the character(s) following a backslash as a "special" escape:
//   - a character category: \d \w \s \p{..} and the negated \D \W \S \P{..}
//   - a position assertion: \A \Z \z \G \b \B
//   - a backreference:      \1 .. \9, \k<name>, \k'name'
//
// Returns the parsed expression with ptr advanced past it. Returns null,
// with ptr restored to its value on entry, when the escape is none of the
// above. Throws a parse exception for a malformed \k reference.
private Expression ParseSpecial (RegexOptions options) {
	int start = ptr;
	bool ecma = IsECMAScript (options);
	Expression result = null;

	char code = pattern[ptr ++];
	switch (code) {

	// categories (the upper-case letter is the negated form)

	case 'd':
	case 'D':
		result = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, code == 'D');
		break;

	case 'w':
	case 'W':
		result = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, code == 'W');
		break;

	case 's':
	case 'S':
		result = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, code == 'S');
		break;

	case 'p':
	case 'P':
		// this is odd - ECMAScript isn't supposed to support Unicode,
		// yet \p{..} compiles and runs under the MS implementation
		// identically to canonical mode. That's why the value of ecma
		// is ignored here.
		result = new CharacterClass (ParseUnicodeCategory (), code == 'P');
		break;

	// positions

	case 'A':
		result = new PositionAssertion (Position.StartOfString);
		break;

	case 'Z':
		result = new PositionAssertion (Position.End);
		break;

	case 'z':
		result = new PositionAssertion (Position.EndOfString);
		break;

	case 'G':
		result = new PositionAssertion (Position.StartOfScan);
		break;

	case 'b':
		result = new PositionAssertion (Position.Boundary);
		break;

	case 'B':
		result = new PositionAssertion (Position.NonBoundary);
		break;

	// references

	case '1': case '2': case '3':
	case '4': case '5': case '6':
	case '7': case '8': case '9': {
		// step back so that ParseNumber starts on the first digit
		ptr --;
		int number = ParseNumber (10, 1, 0);
		if (number < 0) {
			ptr = start;
			return null;
		}

		// FIXME test if number is within number of assigned groups
		// this may present a problem for right-to-left matching

		Reference numbered = new BackslashNumber (IsIgnoreCase (options), ecma);
		refs.Add (numbered, number.ToString ());
		result = numbered;
		break;
	}

	case 'k': {
		// the name is delimited either as <name> or as 'name'
		char opener = pattern[ptr ++];
		if (opener != '<' && opener != '\'')
			throw NewParseException ("Malformed \\k<...> named backreference.");
		char closer = opener == '<' ? '>' : '\'';

		string name = ParseName ();
		if (name == null || pattern[ptr] != closer)
			throw NewParseException ("Malformed \\k<...> named backreference.");

		++ ptr;
		Reference named = new Reference (IsIgnoreCase (options));
		refs.Add (named, name);
		result = named;
		break;
	}

	default:
		result = null;
		break;
	}

	// nothing recognized: rewind so the caller can treat it as a plain escape
	if (result == null)
		ptr = start;

	return result;
}
// private methods

// Parses pattern text starting at ptr until the closing ')' of the current
// group (or the end of the pattern for the top-level group) and appends the
// result to 'group'.
//
//   group     - receives the parsed content. When it is a RegularExpression
//               the call is treated as the top level: the end of the pattern
//               terminates it and a ')' is an error.
//   options   - options in effect; may be changed while parsing by a
//               grouping construct that yields no expression (options are
//               passed by ref to ParseGroupingConstruct).
//   assertion - when non-null, '|' separates the assertion's true and false
//               branches instead of building an Alternation, and the
//               assertion itself is what gets appended to 'group'.
//
// Consecutive literal characters are accumulated in 'literal' and emitted as
// a single Literal expression whenever another kind of expression, a '|', or
// the end of the group is reached.
//
// Throws a parse exception for a stray quantifier, unbalanced parentheses or
// more than one '|' inside a conditional.
private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
	bool is_top_level = group is RegularExpression;

	Alternation alternation = null;
	string literal = null;

	Group current = new Group ();
	Expression expr = null;
	bool closed = false;

	while (true) {
		ConsumeWhitespace (IsIgnorePatternWhitespace (options));
		if (ptr >= pattern.Length)
			break;

		// (1) Parse for Expressions

		char ch = pattern[ptr ++];

		switch (ch) {
		case '^': {
			Position pos = IsMultiline (options) ? Position.StartOfLine : Position.Start;
			expr = new PositionAssertion (pos);
			break;
		}

		case '$': {
			Position pos = IsMultiline (options) ? Position.EndOfLine : Position.End;
			expr = new PositionAssertion (pos);
			break;
		}

		case '.': {
			Category cat = IsSingleline (options) ? Category.AnySingleline : Category.Any;
			expr = new CharacterClass (cat, false);
			break;
		}

		case '\\': {
			// a character escape yields a plain literal character;
			// otherwise try the special escapes (\d, \b, \1, ...)
			int c = ParseEscape (false);
			if (c >= 0)
				ch = (char)c;
			else {
				expr = ParseSpecial (options);

				if (expr == null)
					ch = pattern[ptr ++];		// default escape
			}
			break;
		}

		case '[': {
			expr = ParseCharacterClass (options);
			break;
		}

		case '(': {
			bool ignore_before = IsIgnoreCase (options);
			expr = ParseGroupingConstruct (ref options);
			if (expr == null) {
				// The construct produced no expression but may have
				// changed the options. A pending literal was parsed under
				// the previous case setting, so flush it with that
				// setting, not the new one.
				if (literal != null && IsIgnoreCase (options) != ignore_before) {
					current.AppendExpression (new Literal (literal, ignore_before));
					literal = null;
				}

				continue;
			}
			break;
		}

		case ')': {
			closed = true;
			goto EndOfGroup;
		}

		case '|': {
			if (literal != null) {
				current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
				literal = null;
			}

			if (assertion != null) {
				// conditional: first branch is the true expression,
				// second the false expression, a third is an error
				if (assertion.TrueExpression == null)
					assertion.TrueExpression = current;
				else if (assertion.FalseExpression == null)
					assertion.FalseExpression = current;
				else
					throw NewParseException ("Too many | in (?()|).");
			}
			else {
				if (alternation == null)
					alternation = new Alternation ();

				alternation.AddAlternative (current);
			}

			current = new Group ();
			continue;
		}

		case '*': case '+': case '?': {
			// a quantifier with nothing in front of it
			throw NewParseException ("Bad quantifier.");
		}

		default:
			break;		// literal character
		}

		ConsumeWhitespace (IsIgnorePatternWhitespace (options));

		// (2) Check for Repetitions

		if (ptr < pattern.Length) {
			char k = pattern[ptr];
			int min = 0, max = 0;
			bool lazy = false;
			bool haveRep = false;

			if (k == '?' || k == '*' || k == '+') {
				++ ptr;
				haveRep = true;

				switch (k) {
				case '?': min = 0; max = 1; break;
				case '*': min = 0; max = 0x7fffffff; break;
				case '+': min = 1; max = 0x7fffffff; break;
				}
			} else if (k == '{' && ptr + 1 < pattern.Length) {
				// '{' only counts as a quantifier when valid bounds
				// follow; otherwise rewind and treat it as a literal
				int saved_ptr = ptr;
				++ptr;
				haveRep = ParseRepetitionBounds (out min, out max, options);
				if (!haveRep)
					ptr = saved_ptr;
			}

			if (haveRep) {
				ConsumeWhitespace (IsIgnorePatternWhitespace (options));
				if (ptr < pattern.Length && pattern[ptr] == '?') {
					++ ptr;
					lazy = true;
				}

				// It doesn't make sense to assert a given position more than once.
				bool ignore_repetition = false;
				if (expr is PositionAssertion) {
					ignore_repetition = min > 0 && !lazy;
					max = 1;
				}

				if (!ignore_repetition) {
					Repetition repetition = new Repetition (min, max, lazy);

					// a quantifier after a literal applies to the
					// last character only
					if (expr == null)
						repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
					else
						repetition.Expression = expr;

					expr = repetition;
				}
			}
		}

		// (3) Append Expression and/or Literal

		if (expr == null) {
			if (literal == null)
				literal = "";
			literal += ch;
		}
		else {
			if (literal != null) {
				current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
				literal = null;
			}

			current.AppendExpression (expr);
			expr = null;
		}

		if (is_top_level && ptr >= pattern.Length)
			goto EndOfGroup;
	}

EndOfGroup:
	if (is_top_level && closed)
		throw NewParseException ("Too many )'s.");
	if (!is_top_level && !closed)
		throw NewParseException ("Not enough )'s.");

	// clean up literals and alternations

	if (literal != null)
		current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));

	if (assertion != null) {
		if (assertion.TrueExpression == null)
			assertion.TrueExpression = current;
		else
			assertion.FalseExpression = current;

		group.AppendExpression (assertion);
	}
	else if (alternation != null) {
		alternation.AddAlternative (current);
		group.AppendExpression (alternation);
	}
	else
		group.AppendExpression (current);
}
// Parses a bracketed character set. On entry ptr is on the first character
// after the opening '['; on return it is just past the closing ']'.
//
// Handles a leading '^' (negation), a leading ']' (taken literally),
// single characters, 'x-y' ranges, character escapes and the category
// escapes \d \w \s \p{..} with their negated upper-case forms.
//
// Returns the resulting CharacterClass. Throws a parse exception for a
// reversed range, a range whose upper end is a category (e.g. [a-\s]), or a
// set that is not terminated by ']'.
private Expression ParseCharacterClass (RegexOptions options) {
	bool negate = false;
	if (pattern[ptr] == '^') {
		negate = true;
		++ ptr;
	}

	bool ecma = IsECMAScript (options);
	CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));

	// a ']' directly after '[' or '[^' is a literal, not the terminator
	if (pattern[ptr] == ']') {
		cls.AddCharacter (']');
		++ ptr;
	}

	int c = -1;
	int last = -1;		// previous single character, or -1 if none can start a range
	bool range = false;	// true after 'x-' while waiting for the upper bound
	bool closed = false;
	while (ptr < pattern.Length) {
		c = pattern[ptr ++];

		if (c == ']') {
			closed = true;
			break;
		}

		// '-' is a range operator only when a single character precedes it
		if (c == '-' && last >= 0 && !range) {
			range = true;
			continue;
		}

		if (c == '\\') {
			c = ParseEscape (true);
			if (c >= 0)
				goto char_recognized;

			// didn't recognize escape
			c = pattern [ptr ++];
			switch (c) {
			case 'b':
				c = '\b';
				goto char_recognized;

			case 'd': case 'D':
				cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, c == 'D');
				break;

			case 'w': case 'W':
				cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, c == 'W');
				break;

			case 's': case 'S':
				cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, c == 'S');
				break;

			case 'p': case 'P':
				cls.AddCategory (ParseUnicodeCategory (), c == 'P');	// ignore ecma
				break;

			default:		// add escaped character
				goto char_recognized;
			}

			// if the pattern looks like [a-\s] ...
			// (c is an int: cast it so the message shows the escape
			// letter rather than its numeric code point)
			if (range)
				throw NewParseException ("character range cannot have category \\" + (char)c);

			last = -1;
			continue;
		}

	char_recognized:
		if (range) {
			// if 'range' is true, we know that 'last >= 0'
			// (both bounds are ints: cast them so the message shows
			// the characters rather than their code points)
			if (c < last)
				throw NewParseException ("[" + (char)last + "-" + (char)c + "] range in reverse order.");
			cls.AddRange ((char)last, (char)c);
			last = -1;
			range = false;
			continue;
		}

		cls.AddCharacter ((char)c);
		last = c;
	}

	if (!closed)
		throw NewParseException ("Unterminated [] set.");

	// a trailing 'x-' before ']' means the '-' was a literal
	if (range)
		cls.AddCharacter ('-');

	return cls;
}
// Parses a bracketed character set. On entry ptr is on the first character
// after the opening '['; on return it is just past the closing ']'.
//
// A leading '^' negates the set and a ']' in first position is a literal.
// Every '-' starts a pending range; if no single character precedes it, the
// '-' is added as a literal together with the character that follows.
// Category escapes (\d \w \s \p{..} and upper-case negations) are added as
// categories and cannot act as the lower bound of a range.
//
// Throws a parse exception for a reversed range or an unterminated set.
private Expression ParseCharacterClass (RegexOptions options) {
	bool negate = pattern[ptr] == '^';
	if (negate)
		++ ptr;

	bool ecma = IsECMAScript (options);
	CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));

	if (pattern[ptr] == ']') {
		cls.AddCharacter (']');
		++ ptr;
	}

	int cur = -1;
	int prev = -1;			// last single character added, -1 if none
	bool pending_range = false;	// a '-' has been seen, upper bound outstanding
	bool terminated = false;

	while (ptr < pattern.Length) {
		cur = pattern[ptr ++];

		if (cur == ']') {
			terminated = true;
			break;
		}

		if (cur == '-') {
			pending_range = true;
			continue;
		}

		if (cur == '\\') {
			cur = ParseEscape ();
			if (cur < 0) {
				// didn't recognize escape
				cur = pattern[ptr ++];

				bool is_category = true;
				switch (cur) {
				case 'd':
				case 'D':
					cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, cur == 'D');
					break;

				case 'w':
				case 'W':
					cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, cur == 'W');
					break;

				case 's':
				case 'S':
					cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, cur == 'S');
					break;

				case 'p':
				case 'P':
					cls.AddCategory (ParseUnicodeCategory (), cur == 'P');	// ignore ecma
					break;

				case 'b':
					cur = '\b';
					is_category = false;
					break;

				default:
					is_category = false;	// add escaped character
					break;
				}

				if (is_category) {
					// a category cannot be the lower bound of a range
					prev = -1;
					continue;
				}
			}
		}

		if (!pending_range) {
			cls.AddCharacter ((char)cur);
			prev = cur;
			continue;
		}

		if (cur < prev)
			throw NewParseException ("[x-y] range in reverse order.");

		if (prev < 0) {
			// no lower bound: the '-' was a literal
			cls.AddCharacter ((char)cur);
			cls.AddCharacter ('-');
		}
		else
			cls.AddRange ((char)prev, (char)cur);

		pending_range = false;
		prev = -1;
	}

	if (!terminated)
		throw NewParseException ("Unterminated [] set.");

	if (pending_range)
		cls.AddCharacter ('-');

	return cls;
}