// private methods

// Parses the contents of one group, starting at the current pattern position,
// and appends the result to `group`.
//
// group     - receives the parsed content; being a RegularExpression marks the
//             top level, which ends at the end of the pattern instead of at ')'.
// options   - options in effect; a grouping construct that yields no expression
//             may change them for the remainder of this group.
// assertion - when non-null, '|' separates the true and false branches of this
//             conditional instead of building an Alternation.
//
// Consecutive literal characters are merged into a single Literal. Errors are
// reported through NewParseException: a quantifier with nothing to repeat,
// unbalanced parentheses, or more than one '|' inside a conditional.
private void ParseGroup(Group group, RegexOptions options, Assertion assertion) {
    bool is_top_level = group is RegularExpression;

    Alternation alternation = null;
    string literal = null;

    Group current = new Group();
    Expression expr = null;
    bool closed = false;

    while (true) {
        ConsumeWhitespace(IsIgnorePatternWhitespace(options));
        if (ptr >= pattern.Length) {
            break;
        }

        // (1) Parse for Expressions

        char ch = pattern[ptr++];

        switch (ch) {
        case '^': {
            Position pos = IsMultiline(options) ? Position.StartOfLine : Position.Start;
            expr = new PositionAssertion(pos);
            break;
        }

        case '$': {
            Position pos = IsMultiline(options) ? Position.EndOfLine : Position.End;
            expr = new PositionAssertion(pos);
            break;
        }

        case '.': {
            Category cat = IsSingleline(options) ? Category.AnySingleline : Category.Any;
            expr = new CharacterClass(cat, false);
            break;
        }

        case '\\': {
            // A non-negative result is a single escaped character; otherwise try
            // the class/assertion/reference escapes, and finally take the next
            // character literally.
            int c = ParseEscape();
            if (c >= 0) {
                ch = (char)c;
            } else {
                expr = ParseSpecial(options);
                if (expr == null) {
                    ch = pattern[ptr++];    // default escape
                }
            }
            break;
        }

        case '[': {
            expr = ParseCharacterClass(options);
            break;
        }

        case '(': {
            bool ignore = IsIgnoreCase(options);
            expr = ParseGroupingConstruct(ref options);
            if (expr == null) {
                // The construct produced no expression but may have changed the
                // options. Text collected so far was written under the previous
                // case setting, so it is flushed with that setting, not the new one.
                if (literal != null && IsIgnoreCase(options) != ignore) {
                    current.AppendExpression(new Literal(literal, ignore));
                    literal = null;
                }

                continue;
            }
            break;
        }

        case ')': {
            closed = true;
            goto EndOfGroup;
        }

        case '|': {
            if (literal != null) {
                current.AppendExpression(new Literal(literal, IsIgnoreCase(options)));
                literal = null;
            }

            if (assertion != null) {
                if (assertion.TrueExpression == null) {
                    assertion.TrueExpression = current;
                } else if (assertion.FalseExpression == null) {
                    assertion.FalseExpression = current;
                } else {
                    throw NewParseException("Too many | in (?()|).");
                }
            } else {
                if (alternation == null) {
                    alternation = new Alternation();
                }
                alternation.AddAlternative(current);
            }

            current = new Group();
            continue;
        }

        case '*':
        case '+':
        case '?': {
            // A quantifier here has nothing in front of it to repeat.
            throw NewParseException("Bad quantifier.");
        }

        default:
            break;  // literal character
        }

        ConsumeWhitespace(IsIgnorePatternWhitespace(options));

        // (2) Check for Repetitions

        if (ptr < pattern.Length) {
            char k = pattern[ptr];
            int min = 0, max = 0;
            bool lazy = false;
            bool haveRep = false;

            if (k == '?' || k == '*' || k == '+') {
                ++ptr;
                haveRep = true;

                switch (k) {
                case '?': min = 0; max = 1; break;
                case '*': min = 0; max = 0x7fffffff; break;
                case '+': min = 1; max = 0x7fffffff; break;
                }
            } else if (k == '{' && ptr + 1 < pattern.Length) {
                // Not every '{' starts a quantifier; rewind if the bounds do not parse.
                int saved_ptr = ptr;
                ++ptr;
                haveRep = ParseRepetitionBounds(out min, out max, options);
                if (!haveRep) {
                    ptr = saved_ptr;
                }
            }

            if (haveRep) {
                ConsumeWhitespace(IsIgnorePatternWhitespace(options));
                if (ptr < pattern.Length && pattern[ptr] == '?') {
                    ++ptr;
                    lazy = true;
                }

                // It doesn't make sense to assert a given position more than once.
                bool ignore_repetition = false;
                if (expr is PositionAssertion) {
                    ignore_repetition = min > 0 && !lazy;
                    max = 1;
                }

                if (!ignore_repetition) {
                    Repetition repetition = new Repetition(min, max, lazy);

                    // A quantified plain character repeats only that character,
                    // not the literal text collected before it.
                    if (expr == null) {
                        repetition.Expression = new Literal(ch.ToString(), IsIgnoreCase(options));
                    } else {
                        repetition.Expression = expr;
                    }

                    expr = repetition;
                }
            }
        }

        // (3) Append Expression and/or Literal

        if (expr == null) {
            if (literal == null) {
                literal = "";
            }
            literal += ch;
        } else {
            if (literal != null) {
                current.AppendExpression(new Literal(literal, IsIgnoreCase(options)));
                literal = null;
            }

            current.AppendExpression(expr);
            expr = null;
        }

        if (is_top_level && ptr >= pattern.Length) {
            goto EndOfGroup;
        }
    }

EndOfGroup:
    if (is_top_level && closed) {
        throw NewParseException("Too many )'s.");
    }
    if (!is_top_level && !closed) {
        throw NewParseException("Not enough )'s.");
    }

    // clean up literals and alternations

    if (literal != null) {
        current.AppendExpression(new Literal(literal, IsIgnoreCase(options)));
    }

    if (assertion != null) {
        if (assertion.TrueExpression == null) {
            assertion.TrueExpression = current;
        } else {
            assertion.FalseExpression = current;
        }

        group.AppendExpression(assertion);
    } else if (alternation != null) {
        alternation.AddAlternative(current);
        group.AppendExpression(alternation);
    } else {
        group.AppendExpression(current);
    }
}
// Parses the escape following a backslash when it denotes a character class,
// a position assertion or a backreference rather than a single character.
//
// options - options in effect; ECMAScript selects the Ecma* categories and
//           ignore-case is passed on to backreferences.
//
// Returns the parsed expression, or null with ptr restored to its value on
// entry when the escape is none of these forms. A malformed \k reference is
// reported through NewParseException.
private Expression ParseSpecial(RegexOptions options) {
    int p = ptr;
    bool ecma = IsECMAScript(options);
    Expression expr = null;

    switch (pattern[ptr++]) {

    // categories

    case 'd':
        expr = new CharacterClass(ecma ? Category.EcmaDigit : Category.Digit, false);
        break;

    case 'w':
        expr = new CharacterClass(ecma ? Category.EcmaWord : Category.Word, false);
        break;

    case 's':
        expr = new CharacterClass(ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
        break;

    case 'p':
        // this is odd - ECMAScript isn't supposed to support Unicode,
        // yet \p{..} compiles and runs under the MS implementation
        // identically to canonical mode. That's why I'm ignoring the
        // value of ecma here.
        expr = new CharacterClass(ParseUnicodeCategory(), false);
        break;

    case 'D':
        expr = new CharacterClass(ecma ? Category.EcmaDigit : Category.Digit, true);
        break;

    case 'W':
        expr = new CharacterClass(ecma ? Category.EcmaWord : Category.Word, true);
        break;

    case 'S':
        expr = new CharacterClass(ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
        break;

    case 'P':
        expr = new CharacterClass(ParseUnicodeCategory(), true);
        break;

    // positions

    case 'A': expr = new PositionAssertion(Position.StartOfString); break;
    case 'Z': expr = new PositionAssertion(Position.End); break;
    case 'z': expr = new PositionAssertion(Position.EndOfString); break;
    case 'G': expr = new PositionAssertion(Position.StartOfScan); break;
    case 'b': expr = new PositionAssertion(Position.Boundary); break;
    case 'B': expr = new PositionAssertion(Position.NonBoundary); break;

    // references

    case '1': case '2': case '3': case '4': case '5':
    case '6': case '7': case '8': case '9': {
        // Step back so the number parser sees the first digit as well.
        ptr--;
        int n = ParseNumber(10, 1, 0);
        if (n < 0) {
            ptr = p;
            return(null);
        }

        // FIXME test if number is within number of assigned groups
        // this may present a problem for right-to-left matching

        Reference reference = new BackslashNumber(IsIgnoreCase(options), ecma);
        refs.Add(reference, n.ToString());
        expr = reference;
        break;
    }

    case 'k': {
        // A pattern that stops right after "\k" has no delimiter to read;
        // report it as a parse error instead of indexing past the end.
        if (ptr >= pattern.Length) {
            throw NewParseException("Malformed \\k<...> named backreference.");
        }

        char delim = pattern[ptr++];
        if (delim == '<') {
            delim = '>';
        } else if (delim != '\'') {
            throw NewParseException("Malformed \\k<...> named backreference.");
        }

        string name = ParseName();
        // Check the bounds first so an unterminated name (pattern ends before the
        // closing delimiter) is also reported as a parse error.
        if (name == null || ptr >= pattern.Length || pattern[ptr] != delim) {
            throw NewParseException("Malformed \\k<...> named backreference.");
        }

        ++ptr;
        Reference reference = new Reference(IsIgnoreCase(options));
        refs.Add(reference, name);
        expr = reference;
        break;
    }

    default:
        expr = null;
        break;
    }

    // Nothing recognised: leave the position untouched for the caller.
    if (expr == null) {
        ptr = p;
    }

    return(expr);
}
// Handles the backslash escapes that stand for something other than a single
// character: character categories (\d \w \s \p and their negations), position
// assertions (\A \Z \z \G \b \B), numbered backreferences (\1..\9...) and named
// backreferences (\k<name> or \k'name').
//
// options - current options; the ECMAScript flag picks the Ecma* categories and
//           the ignore-case flag is handed to the reference objects.
//
// Returns the expression, or null after rewinding ptr to where it was on entry
// if the escape is not recognised here. Malformed \k references raise the
// exception produced by NewParseException.
private Expression ParseSpecial (RegexOptions options)
{
    int p = ptr;
    bool ecma = IsECMAScript (options);
    Expression expr = null;

    switch (pattern[ptr ++]) {

    // categories

    case 'd':
        expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);
        break;

    case 'w':
        expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);
        break;

    case 's':
        expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
        break;

    case 'p':
        // this is odd - ECMAScript isn't supposed to support Unicode,
        // yet \p{..} compiles and runs under the MS implementation
        // identically to canonical mode. That's why I'm ignoring the
        // value of ecma here.
        expr = new CharacterClass (ParseUnicodeCategory (), false);
        break;

    case 'D':
        expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);
        break;

    case 'W':
        expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);
        break;

    case 'S':
        expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
        break;

    case 'P':
        expr = new CharacterClass (ParseUnicodeCategory (), true);
        break;

    // positions

    case 'A': expr = new PositionAssertion (Position.StartOfString); break;
    case 'Z': expr = new PositionAssertion (Position.End); break;
    case 'z': expr = new PositionAssertion (Position.EndOfString); break;
    case 'G': expr = new PositionAssertion (Position.StartOfScan); break;
    case 'b': expr = new PositionAssertion (Position.Boundary); break;
    case 'B': expr = new PositionAssertion (Position.NonBoundary); break;

    // references

    case '1': case '2': case '3': case '4': case '5':
    case '6': case '7': case '8': case '9': {
        // Re-read the digit that selected this case as part of the number.
        ptr --;
        int n = ParseNumber (10, 1, 0);
        if (n < 0) {
            ptr = p;
            return null;
        }

        // FIXME test if number is within number of assigned groups
        // this may present a problem for right-to-left matching

        Reference reference = new BackslashNumber (IsIgnoreCase (options), ecma);
        refs.Add (reference, n.ToString ());
        expr = reference;
        break;
    }

    case 'k': {
        // "\k" at the very end of the pattern: there is no delimiter to read,
        // so fail with the parse error rather than an index error.
        if (ptr >= pattern.Length)
            throw NewParseException ("Malformed \\k<...> named backreference.");

        char delim = pattern[ptr ++];
        if (delim == '<')
            delim = '>';
        else if (delim != '\'')
            throw NewParseException ("Malformed \\k<...> named backreference.");

        string name = ParseName ();
        // The bounds test comes before the delimiter test so that a name running
        // to the end of the pattern is reported as malformed too.
        if (name == null || ptr >= pattern.Length || pattern[ptr] != delim)
            throw NewParseException ("Malformed \\k<...> named backreference.");

        ++ ptr;
        Reference reference = new Reference (IsIgnoreCase (options));
        refs.Add (reference, name);
        expr = reference;
        break;
    }

    default:
        expr = null;
        break;
    }

    // Unrecognised escape: undo the character consumed by the switch.
    if (expr == null)
        ptr = p;

    return expr;
}
// private methods

// Parses a group body from the current pattern position and appends what it
// finds to `group`.
//
// group     - destination of the parsed content. When it is a RegularExpression
//             the group is the top level: it runs to the end of the pattern and
//             must not meet a ')'. Any other group must be closed by ')'.
// options   - options in effect. ParseGroupingConstruct receives them by
//             reference and may change them when it returns no expression.
// assertion - non-null while parsing the branches of a conditional; '|' then
//             fills TrueExpression and FalseExpression instead of creating an
//             Alternation.
//
// Runs of plain characters are gathered into one Literal. Parse errors (stray
// quantifier, unbalanced parentheses, too many '|' in a conditional) are
// raised through NewParseException.
private void ParseGroup (Group group, RegexOptions options, Assertion assertion)
{
    bool is_top_level = group is RegularExpression;

    Alternation alternation = null;
    string literal = null;

    Group current = new Group ();
    Expression expr = null;
    bool closed = false;

    while (true) {
        ConsumeWhitespace (IsIgnorePatternWhitespace (options));
        if (ptr >= pattern.Length)
            break;

        // (1) Parse for Expressions

        char ch = pattern[ptr ++];

        switch (ch) {
        case '^': {
            Position pos =
                IsMultiline (options) ? Position.StartOfLine : Position.Start;

            expr = new PositionAssertion (pos);
            break;
        }

        case '$': {
            Position pos =
                IsMultiline (options) ? Position.EndOfLine : Position.End;

            expr = new PositionAssertion (pos);
            break;
        }

        case '.': {
            Category cat =
                IsSingleline (options) ? Category.AnySingleline : Category.Any;

            expr = new CharacterClass (cat, false);
            break;
        }

        case '\\': {
            // ParseEscape yields a character code, or a negative value when the
            // escape is not a plain character; ParseSpecial is tried next, and
            // failing that the following character is taken as-is.
            int c = ParseEscape (false);

            if (c >= 0)
                ch = (char)c;
            else {
                expr = ParseSpecial (options);

                if (expr == null)
                    ch = pattern[ptr ++];       // default escape
            }
            break;
        }

        case '[': {
            expr = ParseCharacterClass (options);
            break;
        }

        case '(': {
            bool ignore = IsIgnoreCase (options);
            expr = ParseGroupingConstruct (ref options);
            if (expr == null) {
                // No expression came back, but the options may have changed.
                // The pending literal was typed under the old case setting, so
                // it has to be emitted with `ignore`, not with the new setting.
                if (literal != null && IsIgnoreCase (options) != ignore) {
                    current.AppendExpression (new Literal (literal, ignore));
                    literal = null;
                }

                continue;
            }
            break;
        }

        case ')': {
            closed = true;
            goto EndOfGroup;
        }

        case '|': {
            if (literal != null) {
                current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
                literal = null;
            }

            if (assertion != null) {
                if (assertion.TrueExpression == null)
                    assertion.TrueExpression = current;
                else if (assertion.FalseExpression == null)
                    assertion.FalseExpression = current;
                else
                    throw NewParseException ("Too many | in (?()|).");
            }
            else {
                if (alternation == null)
                    alternation = new Alternation ();

                alternation.AddAlternative (current);
            }

            current = new Group ();
            continue;
        }

        case '*': case '+': case '?': {
            // Reached only when no expression precedes the quantifier.
            throw NewParseException ("Bad quantifier.");
        }

        default:
            break;      // literal character
        }

        ConsumeWhitespace (IsIgnorePatternWhitespace (options));

        // (2) Check for Repetitions

        if (ptr < pattern.Length) {
            char k = pattern[ptr];
            int min = 0, max = 0;
            bool lazy = false;
            bool haveRep = false;

            if (k == '?' || k == '*' || k == '+') {
                ++ ptr;
                haveRep = true;

                switch (k) {
                case '?': min = 0; max = 1; break;
                case '*': min = 0; max = 0x7fffffff; break;
                case '+': min = 1; max = 0x7fffffff; break;
                }
            } else if (k == '{' && ptr + 1 < pattern.Length) {
                // '{' is only a quantifier if valid bounds follow; otherwise
                // rewind and let it be parsed as ordinary text.
                int saved_ptr = ptr;
                ++ptr;
                haveRep = ParseRepetitionBounds (out min, out max, options);
                if (!haveRep)
                    ptr = saved_ptr;
            }

            if (haveRep) {
                ConsumeWhitespace (IsIgnorePatternWhitespace (options));
                if (ptr < pattern.Length && pattern[ptr] == '?') {
                    ++ ptr;
                    lazy = true;
                }

                // It doesn't make sense to assert a given position more than once.
                bool ignore_repetition = false;
                if (expr is PositionAssertion) {
                    ignore_repetition = min > 0 && !lazy;
                    max = 1;
                }

                if (!ignore_repetition) {
                    Repetition repetition = new Repetition (min, max, lazy);

                    // With no expression the quantifier applies to the single
                    // character just read, not to the accumulated literal.
                    if (expr == null)
                        repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
                    else
                        repetition.Expression = expr;

                    expr = repetition;
                }
            }
        }

        // (3) Append Expression and/or Literal

        if (expr == null) {
            if (literal == null)
                literal = "";
            literal += ch;
        }
        else {
            if (literal != null) {
                current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
                literal = null;
            }

            current.AppendExpression (expr);
            expr = null;
        }

        if (is_top_level && ptr >= pattern.Length)
            goto EndOfGroup;
    }

EndOfGroup:
    if (is_top_level && closed)
        throw NewParseException ("Too many )'s.");
    if (!is_top_level && !closed)
        throw NewParseException ("Not enough )'s.");

    // clean up literals and alternations

    if (literal != null)
        current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));

    if (assertion != null) {
        if (assertion.TrueExpression == null)
            assertion.TrueExpression = current;
        else
            assertion.FalseExpression = current;

        group.AppendExpression (assertion);
    }
    else if (alternation != null) {
        alternation.AddAlternative (current);
        group.AppendExpression (alternation);
    }
    else
        group.AppendExpression (current);
}
// Parses a backslash escape that is a character category, a position assertion
// or a backreference.
//
// options - options in effect; the ECMAScript flag selects the Ecma* category
//           variants and the ignore-case flag is passed to backreferences.
//
// Returns the resulting expression. If the character at ptr does not start one
// of these escapes, ptr is reset to its value on entry and null is returned.
// A malformed \k<name> / \k'name' reference raises the exception built by
// NewParseException.
private Expression ParseSpecial(RegexOptions options)
{
    int start = ptr;
    bool ecma = IsECMAScript(options);
    Expression result = null;

    switch (pattern[ptr++])
    {
        // Character categories; upper-case letters are the negated forms.
        case 'd':
            result = new CharacterClass(ecma ? Category.EcmaDigit : Category.Digit, false);
            break;
        case 'w':
            result = new CharacterClass(ecma ? Category.EcmaWord : Category.Word, false);
            break;
        case 's':
            result = new CharacterClass(ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
            break;
        case 'p':
            // The ECMAScript flag is deliberately not consulted for \p.
            result = new CharacterClass(ParseUnicodeCategory(), false);
            break;
        case 'D':
            result = new CharacterClass(ecma ? Category.EcmaDigit : Category.Digit, true);
            break;
        case 'W':
            result = new CharacterClass(ecma ? Category.EcmaWord : Category.Word, true);
            break;
        case 'S':
            result = new CharacterClass(ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
            break;
        case 'P':
            result = new CharacterClass(ParseUnicodeCategory(), true);
            break;

        // Position assertions.
        case 'A':
            result = new PositionAssertion(Position.StartOfString);
            break;
        case 'Z':
            result = new PositionAssertion(Position.End);
            break;
        case 'z':
            result = new PositionAssertion(Position.EndOfString);
            break;
        case 'G':
            result = new PositionAssertion(Position.StartOfScan);
            break;
        case 'b':
            result = new PositionAssertion(Position.Boundary);
            break;
        case 'B':
            result = new PositionAssertion(Position.NonBoundary);
            break;

        // Numbered backreference.
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        {
            // Back up so the leading digit is parsed as part of the number.
            ptr--;
            int groupNumber = ParseNumber(10, 1, 0);
            if (groupNumber < 0)
            {
                ptr = start;
                return(null);
            }

            Reference numbered = new BackslashNumber(IsIgnoreCase(options), ecma);
            refs.Add(numbered, groupNumber.ToString());
            result = numbered;
            break;
        }

        // Named backreference.
        case 'k':
        {
            // The pattern may end right after "\k"; report that as a parse
            // error instead of reading past the end of the string.
            if (ptr >= pattern.Length)
            {
                throw NewParseException("Malformed \\k<...> named backreference.");
            }

            // '<' must be closed by '>'; a single quote is closed by itself.
            char closing = pattern[ptr++];
            if (closing == '<')
            {
                closing = '>';
            }
            else if (closing != '\'')
            {
                throw NewParseException("Malformed \\k<...> named backreference.");
            }

            string name = ParseName();
            // Bounds are checked before the delimiter so that a name reaching
            // the end of the pattern is reported as malformed as well.
            if (name == null || ptr >= pattern.Length || pattern[ptr] != closing)
            {
                throw NewParseException("Malformed \\k<...> named backreference.");
            }

            ptr++;
            Reference named = new Reference(IsIgnoreCase(options));
            refs.Add(named, name);
            result = named;
            break;
        }

        default:
            result = null;
            break;
    }

    // Not one of the escapes handled here: give the consumed character back.
    if (result == null)
    {
        ptr = start;
    }

    return(result);
}
// Parses one group body from the current pattern position into `group`.
//
// group     - receives the parsed content; a RegularExpression is the top level,
//             which ends with the pattern and must not contain an unmatched ')'.
// options   - options in effect; passed by reference to ParseGroupingConstruct,
//             which may change them when it returns no expression.
// assertion - when non-null, the '|' separator fills its TrueExpression and
//             FalseExpression instead of building an Alternation.
//
// Plain characters are accumulated and emitted as a single Literal. Errors
// (quantifier with nothing to repeat, unbalanced parentheses, too many '|' in a
// conditional) are raised through NewParseException.
private void ParseGroup(Group group, RegexOptions options, Assertion assertion)
{
    bool isTopLevel = group is RegularExpression;
    Alternation alternation = null;
    string pendingText = null;
    Group branch = new Group();
    Expression item = null;
    bool closed = false;

    // A ')' sets `closed` and continues, which ends the loop before any further
    // input is consumed.
    while (!closed)
    {
        ConsumeWhitespace(IsIgnorePatternWhitespace(options));
        if (ptr >= pattern.Length)
        {
            break;
        }

        // Step 1: read one element.
        char c = pattern[ptr++];
        switch (c)
        {
            case '^':
                item = new PositionAssertion(IsMultiline(options) ? Position.StartOfLine : Position.Start);
                break;

            case '$':
                item = new PositionAssertion(IsMultiline(options) ? Position.EndOfLine : Position.End);
                break;

            case '.':
                item = new CharacterClass(IsSingleline(options) ? Category.AnySingleline : Category.Any, false);
                break;

            case '\\':
            {
                // Non-negative: a single escaped character. Otherwise try the
                // class/assertion/reference escapes, then fall back to taking
                // the next character literally.
                int escaped = ParseEscape();
                if (escaped >= 0)
                {
                    c = (char)escaped;
                }
                else
                {
                    item = ParseSpecial(options);
                    if (item == null)
                    {
                        c = pattern[ptr++];
                    }
                }
                break;
            }

            case '[':
                item = ParseCharacterClass(options);
                break;

            case '(':
            {
                bool ignoreCaseBefore = IsIgnoreCase(options);
                item = ParseGroupingConstruct(ref options);
                if (item == null)
                {
                    // The construct yielded nothing but may have switched the
                    // case setting. The text gathered so far belongs to the old
                    // setting, so it is emitted with the saved value.
                    if (pendingText != null && IsIgnoreCase(options) != ignoreCaseBefore)
                    {
                        branch.AppendExpression(new Literal(pendingText, ignoreCaseBefore));
                        pendingText = null;
                    }
                    continue;
                }
                break;
            }

            case ')':
                closed = true;
                continue;

            case '|':
                if (pendingText != null)
                {
                    branch.AppendExpression(new Literal(pendingText, IsIgnoreCase(options)));
                    pendingText = null;
                }

                if (assertion != null)
                {
                    if (assertion.TrueExpression == null)
                    {
                        assertion.TrueExpression = branch;
                    }
                    else if (assertion.FalseExpression == null)
                    {
                        assertion.FalseExpression = branch;
                    }
                    else
                    {
                        throw NewParseException("Too many | in (?()|).");
                    }
                }
                else
                {
                    if (alternation == null)
                    {
                        alternation = new Alternation();
                    }
                    alternation.AddAlternative(branch);
                }

                branch = new Group();
                continue;

            case '*':
            case '+':
            case '?':
                // Nothing precedes the quantifier.
                throw NewParseException("Bad quantifier.");

            default:
                // Plain character; handled below through `c`.
                break;
        }

        ConsumeWhitespace(IsIgnorePatternWhitespace(options));

        // Step 2: look for a quantifier following the element.
        if (ptr < pattern.Length)
        {
            char next = pattern[ptr];
            int min = 0;
            int max = 0;
            bool lazy = false;
            bool quantified = false;

            if (next == '?' || next == '*' || next == '+')
            {
                ptr++;
                quantified = true;
                if (next == '?')
                {
                    min = 0;
                    max = 1;
                }
                else if (next == '*')
                {
                    min = 0;
                    max = int.MaxValue;
                }
                else
                {
                    min = 1;
                    max = int.MaxValue;
                }
            }
            else if (next == '{' && ptr + 1 < pattern.Length)
            {
                // Rewind when the braces do not hold valid bounds.
                int rewind = ptr;
                ptr++;
                quantified = ParseRepetitionBounds(out min, out max, options);
                if (!quantified)
                {
                    ptr = rewind;
                }
            }

            if (quantified)
            {
                ConsumeWhitespace(IsIgnorePatternWhitespace(options));
                if (ptr < pattern.Length && pattern[ptr] == '?')
                {
                    ptr++;
                    lazy = true;
                }

                Repetition repetition = new Repetition(min, max, lazy);
                // A quantified plain character repeats only that character, not
                // the text accumulated before it.
                if (item == null)
                {
                    repetition.Expression = new Literal(c.ToString(), IsIgnoreCase(options));
                }
                else
                {
                    repetition.Expression = item;
                }
                item = repetition;
            }
        }

        // Step 3: append the element, or extend the pending literal.
        if (item == null)
        {
            if (pendingText == null)
            {
                pendingText = string.Empty;
            }
            pendingText += c;
        }
        else
        {
            if (pendingText != null)
            {
                branch.AppendExpression(new Literal(pendingText, IsIgnoreCase(options)));
                pendingText = null;
            }
            branch.AppendExpression(item);
            item = null;
        }

        if (isTopLevel && ptr >= pattern.Length)
        {
            break;
        }
    }

    if (isTopLevel && closed)
    {
        throw NewParseException("Too many )'s.");
    }
    if (!isTopLevel && !closed)
    {
        throw NewParseException("Not enough )'s.");
    }

    // Emit whatever is still pending and attach the result to the group.
    if (pendingText != null)
    {
        branch.AppendExpression(new Literal(pendingText, IsIgnoreCase(options)));
    }

    if (assertion != null)
    {
        if (assertion.TrueExpression == null)
        {
            assertion.TrueExpression = branch;
        }
        else
        {
            assertion.FalseExpression = branch;
        }
        group.AppendExpression(assertion);
    }
    else if (alternation != null)
    {
        alternation.AddAlternative(branch);
        group.AppendExpression(alternation);
    }
    else
    {
        group.AppendExpression(branch);
    }
}