// private methods

// Parses the contents of one group, starting at the current value of ptr, and
// appends the resulting expression tree to `group`.
//
//   group     - receives the parsed content. When it is a RegularExpression the
//               call is treated as the top level: parsing runs to the end of the
//               pattern and a ')' is an error. Otherwise a closing ')' is required.
//   options   - options in effect at the start of the group; inline option
//               constructs handled by ParseGroupingConstruct may update the local copy.
//   assertion - when non-null, '|' separates the true and false branches of this
//               assertion instead of building an Alternation, and the assertion
//               itself is what gets appended to `group`.
//
// Errors are reported by throwing the exception built by NewParseException.
private void ParseGroup(Group group, RegexOptions options, Assertion assertion) {
    bool is_top_level = group is RegularExpression;

    Alternation alternation = null;
    string literal = null;          // run of literal characters not yet emitted

    Group current = new Group();    // the alternative currently being built
    Expression expr = null;         // non-literal expression parsed in this iteration
    bool closed = false;            // set once the terminating ')' is consumed

    while (true) {
        ConsumeWhitespace(IsIgnorePatternWhitespace(options));
        if (ptr >= pattern.Length) {
            break;
        }

        // (1) Parse for Expressions

        char ch = pattern[ptr++];

        switch (ch) {
        case '^': {
            Position pos = IsMultiline(options) ? Position.StartOfLine : Position.Start;
            expr = new PositionAssertion(pos);
            break;
        }

        case '$': {
            Position pos = IsMultiline(options) ? Position.EndOfLine : Position.End;
            expr = new PositionAssertion(pos);
            break;
        }

        case '.': {
            Category cat = IsSingleline(options) ? Category.AnySingleline : Category.Any;
            expr = new CharacterClass(cat, false);
            break;
        }

        case '\\': {
            // A non-negative result from ParseEscape is a plain character; otherwise
            // try the special escapes, and finally fall back to taking the next
            // character literally.
            int c = ParseEscape(false);
            if (c >= 0) {
                ch = (char)c;
            } else {
                expr = ParseSpecial(options);
                if (expr == null) {
                    ch = pattern[ptr++];        // default escape
                }
            }
            break;
        }

        case '[': {
            expr = ParseCharacterClass(options);
            break;
        }

        case '(': {
            bool ignore = IsIgnoreCase(options);
            expr = ParseGroupingConstruct(ref options);
            if (expr == null) {
                // The construct produced no expression but may have changed the
                // options. If ignore-case flipped, the pending literal must not be
                // merged with the text that follows. It was read under the previous
                // setting, so it is emitted with that setting (`ignore`), not the
                // new one.
                if (literal != null && IsIgnoreCase(options) != ignore) {
                    current.AppendExpression(new Literal(literal, ignore));
                    literal = null;
                }

                continue;
            }
            break;
        }

        case ')': {
            closed = true;
            goto EndOfGroup;
        }

        case '|': {
            if (literal != null) {
                current.AppendExpression(new Literal(literal, IsIgnoreCase(options)));
                literal = null;
            }

            if (assertion != null) {
                if (assertion.TrueExpression == null) {
                    assertion.TrueExpression = current;
                } else if (assertion.FalseExpression == null) {
                    assertion.FalseExpression = current;
                } else {
                    throw NewParseException("Too many | in (?()|).");
                }
            } else {
                if (alternation == null) {
                    alternation = new Alternation();
                }

                alternation.AddAlternative(current);
            }

            current = new Group();
            continue;
        }

        case '*':
        case '+':
        case '?': {
            // A quantifier here has nothing in front of it to apply to.
            throw NewParseException("Bad quantifier.");
        }

        default:
            break;      // literal character
        }

        ConsumeWhitespace(IsIgnorePatternWhitespace(options));

        // (2) Check for Repetitions

        if (ptr < pattern.Length) {
            char k = pattern[ptr];
            int min = 0, max = 0;
            bool lazy = false;
            bool haveRep = false;

            if (k == '?' || k == '*' || k == '+') {
                ++ptr;
                haveRep = true;

                switch (k) {
                case '?': min = 0; max = 1; break;
                case '*': min = 0; max = 0x7fffffff; break;
                case '+': min = 1; max = 0x7fffffff; break;
                }
            } else if (k == '{' && ptr + 1 < pattern.Length) {
                // If the braces do not form valid bounds, rewind so that '{' is
                // picked up as an ordinary character on the next iteration.
                int saved_ptr = ptr;
                ++ptr;
                haveRep = ParseRepetitionBounds(out min, out max, options);
                if (!haveRep) {
                    ptr = saved_ptr;
                }
            }

            if (haveRep) {
                ConsumeWhitespace(IsIgnorePatternWhitespace(options));
                if (ptr < pattern.Length && pattern[ptr] == '?') {
                    ++ptr;
                    lazy = true;
                }

                // It doesn't make sense to assert a given position more than once.
                // A greedy quantifier with min > 0 is dropped entirely; any other
                // quantifier on a position assertion is capped at one repetition.
                bool ignore_repetition = false;
                if (expr is PositionAssertion) {
                    ignore_repetition = min > 0 && !lazy;
                    max = 1;
                }

                if (!ignore_repetition) {
                    Repetition repetition = new Repetition(min, max, lazy);

                    // With no expression, the quantifier applies to the single
                    // character just read, not to the whole pending literal.
                    if (expr == null) {
                        repetition.Expression = new Literal(ch.ToString(), IsIgnoreCase(options));
                    } else {
                        repetition.Expression = expr;
                    }

                    expr = repetition;
                }
            }
        }

        // (3) Append Expression and/or Literal

        if (expr == null) {
            if (literal == null) {
                literal = "";
            }
            literal += ch;
        } else {
            if (literal != null) {
                current.AppendExpression(new Literal(literal, IsIgnoreCase(options)));
                literal = null;
            }

            current.AppendExpression(expr);
            expr = null;
        }

        if (is_top_level && ptr >= pattern.Length) {
            goto EndOfGroup;
        }
    }

EndOfGroup:
    if (is_top_level && closed) {
        throw NewParseException("Too many )'s.");
    }
    if (!is_top_level && !closed) {
        throw NewParseException("Not enough )'s.");
    }

    // clean up literals and alternations

    if (literal != null) {
        current.AppendExpression(new Literal(literal, IsIgnoreCase(options)));
    }

    if (assertion != null) {
        if (assertion.TrueExpression == null) {
            assertion.TrueExpression = current;
        } else if (assertion.FalseExpression == null) {
            assertion.FalseExpression = current;
        } else {
            // Both branches were already filled by earlier '|' separators, so this
            // trailing alternative is a third one. Reject it instead of silently
            // replacing the false branch.
            throw NewParseException("Too many | in (?()|).");
        }

        group.AppendExpression(assertion);
    } else if (alternation != null) {
        alternation.AddAlternative(current);
        group.AppendExpression(alternation);
    } else {
        group.AppendExpression(current);
    }
}
// Parses the escape whose introducing backslash has already been consumed:
// character categories (\d \w \s \p and their negations), position assertions
// (\A \Z \z \G \b \B), numbered backreferences (\1..\9...) and named
// backreferences (\k<name> or \k'name').
//
// Returns the parsed expression, or null when the character at ptr does not
// start one of these escapes; in that case ptr is restored to its value on entry.
// A malformed \k reference throws the exception built by NewParseException.
//
// NOTE(review): the first read of pattern[ptr] is not bounds-checked here; this
// assumes the caller has already verified that a character follows the
// backslash - confirm against ParseEscape.
private Expression ParseSpecial(RegexOptions options) {
    int p = ptr;
    bool ecma = IsECMAScript(options);
    Expression expr = null;

    switch (pattern[ptr++]) {

    // categories

    case 'd':
        expr = new CharacterClass(ecma ? Category.EcmaDigit : Category.Digit, false);
        break;

    case 'w':
        expr = new CharacterClass(ecma ? Category.EcmaWord : Category.Word, false);
        break;

    case 's':
        expr = new CharacterClass(ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
        break;

    case 'p':
        // this is odd - ECMAScript isn't supposed to support Unicode,
        // yet \p{..} compiles and runs under the MS implementation
        // identically to canonical mode. That's why I'm ignoring the
        // value of ecma here.
        expr = new CharacterClass(ParseUnicodeCategory(), false);
        break;

    case 'D':
        expr = new CharacterClass(ecma ? Category.EcmaDigit : Category.Digit, true);
        break;

    case 'W':
        expr = new CharacterClass(ecma ? Category.EcmaWord : Category.Word, true);
        break;

    case 'S':
        expr = new CharacterClass(ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
        break;

    case 'P':
        expr = new CharacterClass(ParseUnicodeCategory(), true);
        break;

    // positions

    case 'A': expr = new PositionAssertion(Position.StartOfString); break;
    case 'Z': expr = new PositionAssertion(Position.End); break;
    case 'z': expr = new PositionAssertion(Position.EndOfString); break;
    case 'G': expr = new PositionAssertion(Position.StartOfScan); break;
    case 'b': expr = new PositionAssertion(Position.Boundary); break;
    case 'B': expr = new PositionAssertion(Position.NonBoundary); break;

    // references

    case '1': case '2': case '3': case '4': case '5':
    case '6': case '7': case '8': case '9': {
        // Step back so that ParseNumber sees the leading digit as well.
        ptr--;
        int n = ParseNumber(10, 1, 0);
        if (n < 0) {
            ptr = p;
            return null;
        }

        // FIXME test if number is within number of assigned groups
        // this may present a problem for right-to-left matching

        Reference reference = new BackslashNumber(IsIgnoreCase(options), ecma);
        refs.Add(reference, n.ToString());
        expr = reference;
        break;
    }

    case 'k': {
        // A pattern that ends right after \k has no delimiter to read; report it
        // as a malformed reference instead of indexing past the end of the pattern.
        if (ptr >= pattern.Length) {
            throw NewParseException("Malformed \\k<...> named backreference.");
        }

        char delim = pattern[ptr++];
        if (delim == '<') {
            delim = '>';
        } else if (delim != '\'') {
            throw NewParseException("Malformed \\k<...> named backreference.");
        }

        // The name must be followed by the matching closing delimiter; running
        // out of pattern before it counts as malformed too.
        string name = ParseName();
        if (name == null || ptr >= pattern.Length || pattern[ptr] != delim) {
            throw NewParseException("Malformed \\k<...> named backreference.");
        }

        ++ptr;
        Reference reference = new Reference(IsIgnoreCase(options));
        refs.Add(reference, name);
        expr = reference;
        break;
    }

    default:
        expr = null;
        break;
    }

    // Nothing recognised: leave the input position as it was found.
    if (expr == null) {
        ptr = p;
    }

    return expr;
}