/// <summary>
/// Converts Perl match expression (only, without delimiters, options etc.) to .NET regular expression.
/// </summary>
/// <remarks>
/// The pattern is scanned once by a small state machine:
/// <c>state</c> 0 = outside of a character class, 1 = first character of a character class,
/// 2 = inside of a character class, 3 = just behind a '-' inside of a character class.
/// While outside of a character class, <c>inner_state</c> tracks how far a grouping or
/// back-reference construct ("(?P&lt;name&gt;", "(?P=name)", "\g{..}", "\k{..}", ...) has been read.
/// </remarks>
/// <param name="perlExpr">Perl regular expression to convert.</param>
/// <param name="opt">Regexp options - some of them must be processed by changes in match string.</param>
/// <returns>Resulting .NET regular expression.</returns>
/// <exception cref="ArgumentException">
/// The expression ends with a lone backslash, contains two consecutive quantifiers (ungreedy mode only),
/// uses an unknown or negated POSIX character class, or ranges from a UTF-32 character to nothing.
/// </exception>
private string ConvertRegex(string perlExpr, PerlRegexOptions opt)
{
    // Ranges in bracket expressions should be replaced with appropriate characters.

    // Assume no conversion will be performed and create the builder with the exact length;
    // it only grows when something is expanded. +1 is reserved for the Anchored '^'.
    StringBuilder result = new StringBuilder(perlExpr.Length + 1);

    // Anchored means that the pattern may match only at the start of the subject:
    // add '^' at the beginning unless the pattern already starts with one.
    if ((opt & PerlRegexOptions.Anchored) != 0 && (perlExpr.Length == 0 || perlExpr[0] != '^'))
    {
        result.Append('^');
    }

    // Set after a quantifier has been consumed (ungreedy mode); a second quantifier
    // directly behind the first one is reported as an error.
    bool last_quantifier = false;

    // Set when a range has just been completed (state 3 -> 2), e.g. "a-b-c":
    // the '-' directly following a finished range must not start another range.
    bool leaving_range = false;

    // The last character added inside the character class, so that state 3 can expand
    // the range from it; -1 when the previous item cannot be a range start.
    int range_from_character = -1;

    bool escaped = false;
    int state = 0;
    int inner_state = 0;

    // Cache of already added character pairs, valid within one character class [];
    // dropped when switching back to state 0.
    HashSet<uint> addedSurrogate2Ranges = null;

    int group_number = 0;
    int i = 0;
    while (i < perlExpr.Length)
    {
        int ch = perlExpr[i];

        escaped = false;
        if (ch == '\\' && !ParseEscapeCode(/*encoding,*/ perlExpr, ref i, ref ch, ref escaped))
        {
            i++;

            // A lone trailing backslash has nothing to escape. Fail with the exception type
            // this method uses for malformed patterns instead of indexing past the end.
            if (i >= perlExpr.Length)
            {
                throw new ArgumentException("Regex cannot end with backslash.");
            }

            ch = perlExpr[i];

            if (ch == 'g')
            {
                ++i;
                inner_state = 5; // skip 'g' from resulting pattern
                escaped = false;
                continue;
            }
            else if (ch == 'k')
            {
                inner_state = 11;
            }

            // Some characters (like '_') don't need to be escaped in .NET,
            // and escaping of non-ASCII characters is ignored as well.
            escaped = !(ch == '_' || ch > 0x7F);
        }

        switch (state)
        {
            case 0: // outside of character class
                if (escaped)
                {
                    result.Append('\\');
                    Append(result, ch);
                    last_quantifier = false;
                    break;
                }

                // In Perl regexps, named groups and references to them are written like this:
                //  (?P<name>...)
                //  (?<name>...)
                //  (?'name'...)
                //  (?P=name)
                //  (?:...)
                //  \k<name>  \k'name'  \k{name}  \g{name}
                // The 'P' has to be dropped and group names get GroupPrefix prepended.
                switch (inner_state)
                {
                    case 0:
                        if (ch == '(')
                        {
                            inner_state = 1;

                            // Look ahead and name anonymous groups.
                            // This is used to match the order of the results. As the perlre doc says:
                            // NOTE: While the notation of this construct [grouping] is the same as the similar
                            // function in .NET regexes, the behavior is not. In Perl the groups are numbered
                            // sequentially regardless of being named or not.
                            ++group_number;
                            if (i + 1 < perlExpr.Length)
                            {
                                if (perlExpr[i + 1] != '?')
                                {
                                    ++i;
                                    result.Append("(?<");
                                    result.Append(AnonymousGroupPrefix);
                                    result.Append(group_number);
                                    result.Append('>');
                                    continue;
                                }
                                else if (i + 2 < perlExpr.Length && perlExpr[i + 2] == ':')
                                {
                                    // Pseudo-group, don't count.
                                    --group_number;
                                }
                            }
                        }
                        else if (ch == '\\')
                        {
                            inner_state = 4; // no dedicated case: reset by 'default' on the next character
                        }
                        else
                        {
                            inner_state = 0;
                        }
                        break;

                    // groups
                    case 1: // '('
                        if (ch == '?')
                        {
                            inner_state = 2;
                        }
                        else if (ch != '(')
                        {
                            inner_state = 0;
                        }
                        // else: stay in inner_state == 1, because this can happen: ((?<blah>...))
                        // NOTE(review): a '(' seen here is appended as-is; it is neither counted in
                        // group_number nor given a generated name - confirm this is intended.
                        break;

                    case 2: // '(?'
                        if (ch == 'P')
                        {
                            i++;
                            inner_state = 3;
                            continue; // skip 'P' from resulting pattern
                        }
                        else if (ch == '<')
                        {
                            inner_state = 15;
                            break;
                        }
                        else if (ch == '\'')
                        {
                            i++;
                            result.Append('\'');
                            result.Append(GroupPrefix);
                            inner_state = 0;
                            continue;
                        }
                        inner_state = 0;
                        break;

                    case 3: // '(?P'
                        if (ch == '=')
                        {
                            ++i;
                            inner_state = 12;
                            continue; // skip '=' from resulting pattern
                        }
                        else if (ch != '<')
                        {
                            // P was part of neither "(?P<name> ... )" nor the '(?P=name)' back reference,
                            // so put it back to the pattern.
                            result.Append('P');
                        }
                        else
                        {
                            i++;
                            result.Append('<');
                            result.Append(GroupPrefix);
                            inner_state = 0;
                            continue;
                        }
                        inner_state = 0;
                        break;

                    // \g[0-9]{1,2} back references
                    case 5: // '\g'
                        result.Append('\\');
                        if (ch == '{')
                        {
                            i++;
                            inner_state = 6;
                            continue; // skip '{' from resulting pattern
                        }
                        else if (ch >= '0' && ch <= '9')
                        {
                            inner_state = 0; // just copy the rest of the pattern
                        }
                        else
                        {
                            result.Append('g'); // unexpected character after '\g', so put g back to pattern
                            inner_state = 0;
                        }
                        break;

                    case 6: // '\g{'
                        if (ch >= '0' && ch <= '9')
                        {
                            inner_state = 7;
                        }
                        else
                        {
                            // it can be a named group
                            result.Append("k<");
                            result.Append(GroupPrefix);
                            inner_state = 10;
                        }
                        break;

                    case 7: // '\g{[0-9]'
                        if (ch == '}')
                        {
                            i++;
                            inner_state = 9;
                            continue; // skip '}' from resulting pattern
                        }
                        else if (ch >= '0' && ch <= '9')
                        {
                            inner_state = 8;
                        }
                        else
                        {
                            // The name of the group starts with a digit: the digit is already in the
                            // output, so insert "k<" and the prefix in front of that single digit.
                            result.Insert(result.Length - 1, "k<");
                            result.Insert(result.Length - 1, GroupPrefix);
                            inner_state = 14;
                        }
                        break;

                    case 8: // '\g{[0-9][0-9]'
                        if (ch == '}')
                        {
                            i++;
                            inner_state = 9;
                            continue; // skip '}' from resulting pattern
                        }
                        else
                        {
                            // The name of the group starts with two digits which are already in the
                            // output: insert "k<" and the prefix in front of BOTH digits, then keep
                            // reading the name (state 14) so that the closing '}' becomes '>'.
                            result.Insert(result.Length - 2, "k<");
                            result.Insert(result.Length - 2, GroupPrefix);
                            inner_state = 14;
                        }
                        break;

                    case 9: // '\g{[0-9][0-9]?}'
                        if (ch >= '0' && ch <= '9')
                        {
                            // put this to the resulting pattern to separate the number of the
                            // reference from the number that follows
                            result.Append("(?#)");
                        }
                        inner_state = 0;
                        break;

                    // named back references
                    case 10: // '\g{.*?}' | '\k{.*?}'
                        if (ch == '}')
                        {
                            ++i;
                            result.Append('>');
                            inner_state = 0;
                            continue; // skip '}' from resulting pattern
                        }
                        break;

                    case 11: // '\k'
                        if (ch == '{')
                        {
                            i++;
                            inner_state = 10;
                            result.Append('<');
                            result.Append(GroupPrefix);
                            continue; // skip '{' from resulting pattern
                        }
                        else if (ch == '<')
                        {
                            i++;
                            result.Append('<');
                            result.Append(GroupPrefix);
                            inner_state = 0;
                            continue;
                        }
                        else if (ch == '\'')
                        {
                            i++;
                            result.Append('\'');
                            result.Append(GroupPrefix);
                            inner_state = 0;
                            continue;
                        }
                        inner_state = 0;
                        break;

                    // transforming '(?P=name)' to '\k<name>'
                    case 12: // '(?P='
                        // "(?" was already put in the pattern, so replace it with "\k"
                        result[result.Length - 2] = '\\';
                        result[result.Length - 1] = 'k';
                        // add '<' so it is '\k<'
                        result.Append('<');
                        result.Append(GroupPrefix);
                        inner_state = 13;
                        break;

                    case 13: // '(?P=.*?'
                        if (ch == ')')
                        {
                            ++i;
                            result.Append('>');
                            inner_state = 0;
                            continue; // skip ')' from resulting pattern
                        }
                        break;

                    case 14: // '\g{[0-9].*?'
                        if (ch == '}')
                        {
                            i++;
                            inner_state = 9;
                            result.Append(">");
                            continue; // skip '}' from resulting pattern
                        }
                        break;

                    case 15: // '(?<'
                        // Add the group prefix only if it is not a look-behind assertion:
                        //  (?<!  negative
                        //  (?<=  positive
                        if (ch != '!' && ch != '=')
                        {
                            result.Append(GroupPrefix);
                        }
                        inner_state = 0;
                        break;

                    default:
                        inner_state = 0;
                        break;
                }

                if ((opt & PerlRegexOptions.Ungreedy) != 0)
                {
                    // match quantifier ?,*,+,{n,m} at the position i:
                    Match m = quantifiers.Match(perlExpr, i);

                    // quantifier matched; a '?' directly behind '(' is the grouping construct '(?', not a quantifier
                    if (m.Success && (m.Value != "?" || i == 0 || perlExpr[i - 1] != '('))
                    {
                        // two quantifiers:
                        if (last_quantifier)
                        {
                            throw new ArgumentException(LibResources.GetString("regexp_duplicate_quantifier", i));
                        }

                        // append quantifier:
                        result.Append(perlExpr, i, m.Length);
                        i += m.Length;

                        if (i < perlExpr.Length && perlExpr[i] == '?')
                        {
                            // skip question mark to make the quantifier greedy:
                            i++;
                        }
                        else if (i < perlExpr.Length && perlExpr[i] == '+')
                        {
                            // TODO: possessive quantifiers are not supported here; the '+' is skipped.
                            i++;
                        }
                        else
                        {
                            // add question mark to make the quantifier lazy:
                            if (result.Length != 0 && result[result.Length - 1] == '?')
                            {
                                // HACK: Due to the issue in .NET regex we can't use "??" because it isn't interpreted correctly!!
                                // (for example "^(ab)??$" matches with "abab", but it shouldn't!!)
                            }
                            else
                            {
                                result.Append('?');
                            }
                        }

                        last_quantifier = true;
                        continue;
                    }
                }

                last_quantifier = false;

                if (ch == '$' && (opt & PerlRegexOptions.DollarMatchesEndOfStringOnly) != 0)
                {
                    // replaces '$' with '\z':
                    result.Append(@"\z");
                    break;
                }

                if (ch == '[')
                {
                    state = 1;
                }

                Append(result, ch);
                break;

            case 1: // first character of character class
                if (escaped)
                {
                    result.Append('\\');
                    Append(result, ch);
                    range_from_character = ch;
                    state = 2;
                    break;
                }

                // special characters:
                if (ch == '^' || ch == ']' || ch == '-')
                {
                    Append(result, ch);
                }
                else
                {
                    // other characters are not consumed here, for example [[:space:]abc] will not match if the first
                    // [ is appended here.
                    state = 2;
                    goto case 2;
                }
                break;

            case 2: // inside of character class
                if (escaped)
                {
                    result.Append('\\');
                    Append(result, ch);
                    range_from_character = ch;
                    leaving_range = false;
                    break;
                }

                if (ch == '-' && !leaving_range)
                {
                    state = 3;
                    break;
                }
                leaving_range = false;

                // posix character classes
                Match match = posixCharClasses.Match(perlExpr.Substring(i), 0);
                if (match.Success)
                {
                    string chars = PosixRegExp.BracketExpression.CountCharacterClass(match.Groups[2].Value);
                    if (chars == null)
                    {
                        throw new ArgumentException(/*TODO*/ String.Format("Unknown character class '{0}'", match.Groups[2].Value));
                    }

                    if (match.Groups[1].Value.Length > 0)
                    {
                        throw new ArgumentException(/*TODO*/ "POSIX character classes negation not supported.");
                    }

                    result.Append(chars);
                    range_from_character = -1;  // -1 means it cannot start a range
                    i += match.Length - 1;      // +1 is added just behind the switch
                    break;
                }

                if (ch == ']')
                {
                    addedSurrogate2Ranges = null;   // drop the cache of ranges
                    state = 0;
                }

                // append <ch>
                range_from_character = ch;

                if (ch == '-')
                {
                    result.Append("\\x2d");
                }
                else
                {
                    AppendEscaped(result, ch);
                }
                break;

            case 3: // range: previous character was '-'
                if (!escaped && ch == ']')
                {
                    if (range_from_character > char.MaxValue)
                    {
                        throw new ArgumentException("Cannot range from an UTF-32 character to unknown.");
                    }

                    result.Append("-]");
                    addedSurrogate2Ranges = null;   // drop the cache of ranges
                    state = 0;
                    break;
                }

                // initialize the cache of already added character ranges, invalidated at the end of character class
                if (addedSurrogate2Ranges == null)
                {
                    addedSurrogate2Ranges = new HashSet<uint>();
                }

                if (ch != range_from_character)
                {
                    // <from>-<ch>:
                    // 1. <utf16>-<utf16>
                    // 2. <utf16>-<utf32>
                    // 3. <utf32>-<utf32>
                    if (range_from_character <= char.MaxValue)
                    {
                        if (ch <= char.MaxValue)
                        {
                            // In a single-byte encoding the byte order of the symbols can differ from
                            // their Unicode order; detect that and enumerate the range explicitly.
                            // (not tested with other modes)
                            var seqBreak = false;
                            byte from = 0;
                            byte to = 0;
                            if (encoding.IsSingleByte)
                            {
                                var bytes = encoding.GetBytes(new char[] { (char)range_from_character });
                                from = bytes[0];
                                bytes = encoding.GetBytes(new char[] { (char)ch });
                                to = bytes[0];

                                var lastChar = range_from_character;
                                for (int j = from + 1; j <= to; j++)
                                {
                                    var chars = encoding.GetChars(new[] { (byte)j });
                                    if (chars[0] - lastChar != 1)
                                    {
                                        seqBreak = true;
                                        break;
                                    }
                                    lastChar = chars[0];
                                }
                            }

                            // 1.
                            if (!seqBreak)
                            {
                                result.Append('-');
                                AppendEscaped(result, ch);
                            }
                            else
                            {
                                // The counter is an int on purpose: a byte counter wraps from 255 to 0
                                // and would never leave the loop when 'to' is 255.
                                for (int b = from + 1; b <= to; b++)
                                {
                                    var chars = encoding.GetChars(new[] { (byte)b });
                                    AppendEscaped(result, chars[0]);
                                }
                            }
                        }
                        else
                        {
                            // 2.
                            result.Append('-');
                            AppendEscaped(result, char.MaxValue);

                            // count <char.MaxValue+1>-<ch>
                            CountUTF32Range(result, char.MaxValue + 1, ch, addedSurrogate2Ranges);
                        }
                    }
                    else
                    {
                        // 3. utf32 range
                        // NOTE(review): presumably removes the two UTF-16 units of the range start
                        // appended earlier - confirm against AppendEscaped.
                        result.Length -= 2;
                        CountUTF32Range(result, range_from_character, ch, addedSurrogate2Ranges);
                    }
                }

                state = 2;
                leaving_range = true;
                range_from_character = -1;
                break;
        }

        i++;
    }

    return ConvertPossesiveToAtomicGroup(result);
}
/// <summary>
/// Reads the modifier characters of a Perl-style pattern and splits them into options that
/// map directly onto <see cref="RegexOptions"/> and Perl-only flags.
/// </summary>
/// <param name="pattern">Text containing the modifiers.</param>
/// <param name="start">Index of the first modifier; every character from here to the end is read as a modifier.</param>
/// <param name="dotNetOptions">Receives the options with a direct .NET counterpart.</param>
/// <param name="extraOptions">Receives the flags that have to be handled separately.</param>
private static void ParseRegexOptions(StringUtils.UniformWrapper pattern, int start, out RegexOptions dotNetOptions, out PerlRegexOptions extraOptions)
{
    dotNetOptions = RegexOptions.None;
    extraOptions = PerlRegexOptions.None;

    int position = start;
    while (position < pattern.Length)
    {
        char modifier = pattern[position];
        position++;

        switch (modifier)
        {
            // modifiers translated to .NET options
            case 'i': // PCRE_CASELESS
                dotNetOptions |= RegexOptions.IgnoreCase;
                break;
            case 'm': // PCRE_MULTILINE
                dotNetOptions |= RegexOptions.Multiline;
                break;
            case 's': // PCRE_DOTALL
                dotNetOptions |= RegexOptions.Singleline;
                break;
            case 'x': // PCRE_EXTENDED
                dotNetOptions |= RegexOptions.IgnorePatternWhitespace;
                break;

            // modifiers kept as Perl-specific flags
            case 'e': // evaluate as PHP code
                extraOptions |= PerlRegexOptions.Evaluate;
                break;
            case 'A': // PCRE_ANCHORED
                extraOptions |= PerlRegexOptions.Anchored;
                break;
            case 'D': // PCRE_DOLLAR_ENDONLY
                extraOptions |= PerlRegexOptions.DollarMatchesEndOfStringOnly;
                break;
            case 'U': // PCRE_UNGREEDY
                extraOptions |= PerlRegexOptions.Ungreedy;
                break;
            case 'u': // PCRE_UTF8
                extraOptions |= PerlRegexOptions.UTF8;
                break;

            case 'S': // spend more time studying the pattern - nothing to do
                break;

            case 'X': // PCRE_EXTRA - reported as a warning
                PhpException.Throw(PhpError.Warning, LibResources.GetString("modifier_not_supported", modifier));
                break;

            default: // anything else is reported as a notice
                PhpException.Throw(PhpError.Notice, LibResources.GetString("modifier_unknown", modifier));
                break;
        }
    }

    // 'D' together with 'm' is reported as an inconsistent combination
    bool multiline = (dotNetOptions & RegexOptions.Multiline) != 0;
    bool dollarEndOnly = (extraOptions & PerlRegexOptions.DollarMatchesEndOfStringOnly) != 0;
    if (multiline && dollarEndOnly)
    {
        PhpException.Throw(PhpError.Notice, LibResources.GetString("modifiers_inconsistent", 'D', 'm'));
    }
}
/// <summary>
/// Converts Perl match expression (only, without delimiters, options etc.) to .NET regular expression.
/// </summary>
/// <remarks>
/// The pattern is scanned once by a small state machine:
/// <c>state</c> 0 = outside of a character class, 1 = first character of a character class,
/// 2 = inside of a character class, 3 = just behind a '-' inside of a character class.
/// <c>group_state</c> tracks the prefix "(?" so that the 'P' of "(?P&lt;name&gt;" can be dropped.
/// </remarks>
/// <param name="perlExpr">Perl regular expression to convert.</param>
/// <param name="opt">Regexp options - some of them must be processed by changes in match string.</param>
/// <param name="encoding">Encoding.</param>
/// <returns>Resulting .NET regular expression.</returns>
/// <exception cref="ArgumentException">
/// The expression ends with a lone backslash, contains two consecutive quantifiers (ungreedy mode only),
/// uses an unknown or negated POSIX character class, or contains a range that cannot be expanded.
/// </exception>
private static string ConvertRegex(string perlExpr, PerlRegexOptions opt, Encoding/*!*/ encoding)
{
    // Ranges in bracket expressions should be replaced with appropriate characters.

    // Assume no conversion will be performed and create the builder with the exact length;
    // it only grows when a range is expanded. +1 is reserved for the Anchored '^'.
    StringBuilder result = new StringBuilder(perlExpr.Length + 1);

    // Anchored means that the pattern may match only at the start of the subject:
    // add '^' at the beginning unless the pattern already starts with one.
    if ((opt & PerlRegexOptions.Anchored) != 0 && (perlExpr.Length == 0 || perlExpr[0] != '^'))
    {
        result.Append('^');
    }

    // Set after a quantifier has been consumed (ungreedy mode); a second quantifier
    // directly behind the first one is reported as an error.
    bool last_quantifier = false;

    // Set when a range has just been completed (state 3 -> 2), e.g. "a-b-c":
    // the '-' directly following a finished range must not start another range.
    bool leaving_range = false;

    bool escaped = false;
    int state = 0;
    int group_state = 0;

    int i = 0;
    while (i < perlExpr.Length)
    {
        char ch = perlExpr[i];

        escaped = false;
        if (ch == '\\' && !ParseEscapeCode(encoding, perlExpr, ref i, ref ch, ref escaped))
        {
            i++;

            // A lone trailing backslash has nothing to escape. Fail with the exception type
            // this method uses for malformed patterns instead of indexing past the end.
            if (i >= perlExpr.Length)
            {
                throw new ArgumentException("Regex cannot end with backslash.");
            }

            ch = perlExpr[i];

            // some characters (like '_') don't need to be escaped in .NET
            escaped = (ch != '_');
        }

        switch (state)
        {
            case 0: // outside of character class
                if (escaped)
                {
                    result.Append('\\');
                    result.Append(ch);
                    last_quantifier = false;
                    break;
                }

                // In Perl regexps, named groups are written like this: "(?P<name> ... )".
                // If such a group is starting here, the 'P' character has to be skipped.
                switch (group_state)
                {
                    case 0: // nothing read yet
                        group_state = (ch == '(') ? 1 : 0;
                        break;

                    case 1: // '('
                        if (ch == '?')
                        {
                            group_state = 2;
                        }
                        else if (ch != '(')
                        {
                            group_state = 0;
                        }
                        // else: stay in state 1 - another '(' may itself start "(?P<", as in "((?P<name>...))"
                        break;

                    case 2: // '(?'
                        // The prefix has been fully read. Always leave this state, otherwise every
                        // later 'P' anywhere in the pattern would be dropped as well.
                        group_state = 0;
                        if (ch == 'P')
                        {
                            // NOTE(review): the 'P' is dropped whatever follows it, so "(?P=name)" and
                            // "(?P>name)" are not translated to their .NET equivalents here.
                            i++;
                            continue; // skip 'P' from resulting pattern
                        }
                        break;
                }

                if ((opt & PerlRegexOptions.Ungreedy) != 0)
                {
                    // match quantifier ?,*,+,{n,m} at the position i:
                    Match m = quantifiers.Match(perlExpr, i);

                    // quantifier matched; a '?' directly behind '(' is the grouping construct '(?', not a quantifier
                    if (m.Success && (m.Value != "?" || i == 0 || perlExpr[i - 1] != '('))
                    {
                        // two quantifiers:
                        if (last_quantifier)
                        {
                            throw new ArgumentException("regexp_duplicate_quantifier");
                        }

                        // append quantifier:
                        result.Append(perlExpr, i, m.Length);
                        i += m.Length;

                        if (i < perlExpr.Length && perlExpr[i] == '?')
                        {
                            // skip question mark to make the quantifier greedy:
                            i++;
                        }
                        else if (i < perlExpr.Length && perlExpr[i] == '+')
                        {
                            // TODO: possessive quantifiers are not supported yet; the '+' is skipped.
                            i++;
                        }
                        else
                        {
                            // add question mark to make the quantifier lazy:
                            if (result.Length != 0 && result[result.Length - 1] == '?')
                            {
                                // HACK: Due to the issue in .NET regex we can't use "??" because it isn't interpreted correctly!!
                                // (for example "^(ab)??$" matches with "abab", but it shouldn't!!)
                            }
                            else
                            {
                                result.Append('?');
                            }
                        }

                        last_quantifier = true;
                        continue;
                    }
                }

                last_quantifier = false;

                if (ch == '$' && (opt & PerlRegexOptions.DollarMatchesEndOfStringOnly) != 0)
                {
                    // replaces '$' with '\z':
                    result.Append(@"\z");
                    break;
                }

                if (ch == '[')
                {
                    state = 1;
                }

                result.Append(ch);
                break;

            case 1: // first character of character class
                if (escaped)
                {
                    result.Append('\\');
                    result.Append(ch);
                    state = 2;
                    break;
                }

                // special characters:
                if (ch == '^' || ch == ']' || ch == '-')
                {
                    result.Append(ch);
                }
                else
                {
                    // other characters are not consumed here, for example [[:space:]abc] will not match if the first
                    // [ is appended here.
                    state = 2;
                    goto case 2;
                }
                break;

            case 2: // inside of character class
                if (escaped)
                {
                    result.Append('\\');
                    result.Append(ch);
                    leaving_range = false;
                    break;
                }

                if (ch == '-' && !leaving_range)
                {
                    state = 3;
                    break;
                }
                leaving_range = false;

                // posix character classes
                Match match = posixCharClasses.Match(perlExpr.Substring(i), 0);
                if (match.Success)
                {
                    string chars = CountCharacterClass(match.Groups[2].Value);
                    if (chars == null)
                    {
                        throw new ArgumentException(/*TODO*/ String.Format("Unknown character class '{0}'", match.Groups[2].Value));
                    }

                    if (match.Groups[1].Value.Length > 0)
                    {
                        throw new ArgumentException(/*TODO*/ "POSIX character classes negation not supported.");
                    }

                    result.Append(chars);
                    i += match.Length - 1; // +1 is added just behind the switch
                    break;
                }

                if (ch == ']')
                {
                    state = 0;
                }

                if (ch == '-')
                {
                    result.Append("\\x2d");
                }
                else
                {
                    result.Append(ch);
                }
                break;

            case 3: // range: previous character was '-'
                if (!escaped && ch == ']')
                {
                    result.Append("-]");
                    state = 0;
                    break;
                }

                // The range starts at the last character written to the output.
                string range;
                int error;
                if (!CountRange(result[result.Length - 1], ch, out range, out error, encoding))
                {
                    // error code 1 gets a second chance via CountUnicodeRange; anything else is fatal
                    if ((error != 1) || (!CountUnicodeRange(result[result.Length - 1], ch, out range)))
                    {
                        throw new ArgumentException("range_first_character_greater");
                    }
                }

                // left boundary is duplicated, but doesn't matter...
                result.Append(EscapeBracketExpressionSpecialChars(range));
                state = 2;
                leaving_range = true;
                break;
        }

        i++;
    }

    return result.ToString();
}
/// <summary>
/// Reads the modifier characters of a Perl-style pattern and splits them into options that
/// map directly onto <see cref="RegexOptions"/> and Perl-only flags.
/// </summary>
/// <param name="pattern">Text containing the modifiers.</param>
/// <param name="start">Index of the first modifier; every character from here to the end is read as a modifier.</param>
/// <param name="dotNetOptions">Receives the options with a direct .NET counterpart.</param>
/// <param name="extraOptions">Receives the flags that have to be handled separately.</param>
/// <exception cref="ArgumentException">
/// A modifier is unsupported ('X') or unknown, or 'D' is combined with 'm'.
/// </exception>
private static void ParseRegexOptions(StringBuilder pattern, int start, out RegexOptions dotNetOptions, out PerlRegexOptions extraOptions)
{
    dotNetOptions = RegexOptions.None;
    extraOptions = PerlRegexOptions.None;

    for (int i = start; i < pattern.Length; i++)
    {
        char option = pattern[i];

        switch (option)
        {
            case 'i': // PCRE_CASELESS
                dotNetOptions |= RegexOptions.IgnoreCase;
                break;

            case 'm': // PCRE_MULTILINE
                dotNetOptions |= RegexOptions.Multiline;
                break;

            case 's': // PCRE_DOTALL
                dotNetOptions |= RegexOptions.Singleline;
                break;

            case 'x': // PCRE_EXTENDED
                dotNetOptions |= RegexOptions.IgnorePatternWhitespace;
                break;

            case 'e': // evaluate as PHP code
                extraOptions |= PerlRegexOptions.Evaluate;
                break;

            case 'A': // PCRE_ANCHORED
                extraOptions |= PerlRegexOptions.Anchored;
                break;

            case 'D': // PCRE_DOLLAR_ENDONLY
                extraOptions |= PerlRegexOptions.DollarMatchesEndOfStringOnly;
                break;

            case 'S': // spend more time studying the pattern - ignore
                break;

            case 'U': // PCRE_UNGREEDY
                extraOptions |= PerlRegexOptions.Ungreedy;
                break;

            case 'u': // PCRE_UTF8
                extraOptions |= PerlRegexOptions.Utf8;
                break;

            case 'X': // PCRE_EXTRA
                // ArgumentException derives from Exception, so existing catch blocks keep working;
                // the message now names the offending modifier.
                throw new ArgumentException("Modifier not supported: '" + option + "'");

            default:
                throw new ArgumentException("Modifier unknown: '" + option + "'");
        }
    }

    // inconsistent options check:
    if ((dotNetOptions & RegexOptions.Multiline) != 0 &&
        (extraOptions & PerlRegexOptions.DollarMatchesEndOfStringOnly) != 0)
    {
        throw new ArgumentException("Modifier inconsistent: 'D' cannot be combined with 'm'");
    }
}
/// <summary>
/// Converts Perl match expression (only, without delimiters, options etc.) to .NET regular expression.
/// </summary>
/// <remarks>
/// The pattern is scanned once by a small state machine:
/// <c>state</c> 0 = outside of a character class, 1 = first character of a character class,
/// 2 = inside of a character class, 3 = just behind a '-' inside of a character class.
/// <c>group_state</c> tracks the prefix "(?" so that the 'P' of "(?P&lt;name&gt;" can be dropped.
/// </remarks>
/// <param name="perlExpr">Perl regular expression to convert.</param>
/// <param name="opt">Regexp options - some of them must be processed by changes in match string.</param>
/// <param name="encoding">Encoding.</param>
/// <returns>Resulting .NET regular expression.</returns>
/// <exception cref="ArgumentException">
/// The expression ends with a lone backslash, contains two consecutive quantifiers (ungreedy mode only),
/// uses an unknown or negated POSIX character class, or contains a range that cannot be expanded.
/// </exception>
private static string ConvertRegex(string perlExpr, PerlRegexOptions opt, Encoding /*!*/ encoding)
{
    // Ranges in bracket expressions should be replaced with appropriate characters.

    // Assume no conversion will be performed and create the builder with the exact length;
    // it only grows when a range is expanded. +1 is reserved for the Anchored '^'.
    StringBuilder result = new StringBuilder(perlExpr.Length + 1);

    // Anchored means that the pattern may match only at the start of the subject:
    // add '^' at the beginning unless the pattern already starts with one.
    bool anchored = (opt & PerlRegexOptions.Anchored) != 0;
    if (anchored && (perlExpr.Length == 0 || perlExpr[0] != '^'))
    {
        result.Append('^');
    }

    // Set after a quantifier has been consumed (ungreedy mode); a second quantifier
    // directly behind the first one is reported as an error.
    bool last_quantifier = false;

    // Set when a range has just been completed (state 3 -> 2), e.g. "a-b-c":
    // the '-' directly following a finished range must not start another range.
    bool leaving_range = false;

    bool escaped = false;
    int state = 0;
    int group_state = 0;

    int i = 0;
    while (i < perlExpr.Length)
    {
        char ch = perlExpr[i];

        escaped = false;
        if (ch == '\\' && !ParseEscapeCode(encoding, perlExpr, ref i, ref ch, ref escaped))
        {
            i++;

            // A pattern ending in a single backslash escapes nothing; report it as a malformed
            // pattern rather than reading one character past the end of the string.
            if (i >= perlExpr.Length)
            {
                throw new ArgumentException("Regex cannot end with backslash.");
            }

            ch = perlExpr[i];

            // some characters (like '_') don't need to be escaped in .NET
            escaped = (ch != '_');
        }

        switch (state)
        {
            case 0: // outside of character class
                if (escaped)
                {
                    result.Append('\\');
                    result.Append(ch);
                    last_quantifier = false;
                    break;
                }

                // In Perl regexps, named groups are written like this: "(?P<name> ... )".
                // If such a group is starting here, the 'P' character has to be skipped.
                switch (group_state)
                {
                    case 0: // nothing read yet
                        group_state = (ch == '(') ? 1 : 0;
                        break;

                    case 1: // '('
                        if (ch == '?')
                        {
                            group_state = 2;
                        }
                        else if (ch != '(')
                        {
                            group_state = 0;
                        }
                        // else: a directly nested '(' keeps state 1, so "((?P<name>...))" is still recognised
                        break;

                    case 2: // '(?'
                        // Leave this state unconditionally: staying here would make the converter
                        // swallow every later 'P' of the pattern, not just the one behind "(?".
                        group_state = 0;
                        if (ch == 'P')
                        {
                            // NOTE(review): the 'P' is dropped whatever follows it, so "(?P=name)" and
                            // "(?P>name)" are not translated to their .NET equivalents here.
                            i++;
                            continue; // skip 'P' from resulting pattern
                        }
                        break;
                }

                if ((opt & PerlRegexOptions.Ungreedy) != 0)
                {
                    // match quantifier ?,*,+,{n,m} at the position i:
                    Match m = quantifiers.Match(perlExpr, i);

                    // quantifier matched; a '?' directly behind '(' is the grouping construct '(?', not a quantifier
                    if (m.Success && (m.Value != "?" || i == 0 || perlExpr[i - 1] != '('))
                    {
                        // two quantifiers:
                        if (last_quantifier)
                        {
                            throw new ArgumentException("regexp_duplicate_quantifier");
                        }

                        // append quantifier:
                        result.Append(perlExpr, i, m.Length);
                        i += m.Length;

                        if (i < perlExpr.Length && perlExpr[i] == '?')
                        {
                            // skip question mark to make the quantifier greedy:
                            i++;
                        }
                        else if (i < perlExpr.Length && perlExpr[i] == '+')
                        {
                            // TODO: possessive quantifiers are not supported yet; the '+' is skipped.
                            i++;
                        }
                        else
                        {
                            // add question mark to make the quantifier lazy:
                            if (result.Length != 0 && result[result.Length - 1] == '?')
                            {
                                // HACK: Due to the issue in .NET regex we can't use "??" because it isn't interpreted correctly!!
                                // (for example "^(ab)??$" matches with "abab", but it shouldn't!!)
                            }
                            else
                            {
                                result.Append('?');
                            }
                        }

                        last_quantifier = true;
                        continue;
                    }
                }

                last_quantifier = false;

                if (ch == '$' && (opt & PerlRegexOptions.DollarMatchesEndOfStringOnly) != 0)
                {
                    // replaces '$' with '\z':
                    result.Append(@"\z");
                    break;
                }

                if (ch == '[')
                {
                    state = 1;
                }

                result.Append(ch);
                break;

            case 1: // first character of character class
                if (escaped)
                {
                    result.Append('\\');
                    result.Append(ch);
                    state = 2;
                    break;
                }

                // special characters:
                if (ch == '^' || ch == ']' || ch == '-')
                {
                    result.Append(ch);
                }
                else
                {
                    // other characters are not consumed here, for example [[:space:]abc] will not match if the first
                    // [ is appended here.
                    state = 2;
                    goto case 2;
                }
                break;

            case 2: // inside of character class
                if (escaped)
                {
                    result.Append('\\');
                    result.Append(ch);
                    leaving_range = false;
                    break;
                }

                if (ch == '-' && !leaving_range)
                {
                    state = 3;
                    break;
                }
                leaving_range = false;

                // posix character classes
                Match match = posixCharClasses.Match(perlExpr.Substring(i), 0);
                if (match.Success)
                {
                    string chars = CountCharacterClass(match.Groups[2].Value);
                    if (chars == null)
                    {
                        throw new ArgumentException(/*TODO*/ String.Format("Unknown character class '{0}'", match.Groups[2].Value));
                    }

                    if (match.Groups[1].Value.Length > 0)
                    {
                        throw new ArgumentException(/*TODO*/ "POSIX character classes negation not supported.");
                    }

                    result.Append(chars);
                    i += match.Length - 1; // +1 is added just behind the switch
                    break;
                }

                if (ch == ']')
                {
                    state = 0;
                }

                if (ch == '-')
                {
                    result.Append("\\x2d");
                }
                else
                {
                    result.Append(ch);
                }
                break;

            case 3: // range: previous character was '-'
                if (!escaped && ch == ']')
                {
                    result.Append("-]");
                    state = 0;
                    break;
                }

                // The range starts at the last character written to the output.
                string range;
                int error;
                if (!CountRange(result[result.Length - 1], ch, out range, out error, encoding))
                {
                    // error code 1 gets a second chance via CountUnicodeRange; anything else is fatal
                    if ((error != 1) || (!CountUnicodeRange(result[result.Length - 1], ch, out range)))
                    {
                        throw new ArgumentException("range_first_character_greater");
                    }
                }

                // left boundary is duplicated, but doesn't matter...
                result.Append(EscapeBracketExpressionSpecialChars(range));
                state = 2;
                leaving_range = true;
                break;
        }

        i++;
    }

    return result.ToString();
}
/// <summary>
/// Reads the modifier characters of a Perl-style pattern and splits them into options that
/// map directly onto <see cref="RegexOptions"/> and Perl-only flags.
/// </summary>
/// <remarks>
/// Modifiers without a case below (including 'X' / PCRE_EXTRA) are silently ignored.
/// </remarks>
/// <param name="pattern">Text containing the modifiers.</param>
/// <param name="start">Index of the first modifier; every character from here to the end is read as a modifier.</param>
/// <param name="dotNetOptions">Receives the options with a direct .NET counterpart.</param>
/// <param name="extraOptions">Receives the flags that have to be handled separately.</param>
/// <exception cref="ArgumentException">'D' is combined with 'm'.</exception>
private static void ParseRegexOptions(StringBuilder pattern, int start, out RegexOptions dotNetOptions, out PerlRegexOptions extraOptions)
{
    dotNetOptions = RegexOptions.None;
    extraOptions = PerlRegexOptions.None;

    for (int i = start; i < pattern.Length; i++)
    {
        char option = pattern[i];

        switch (option)
        {
            case 'i': // PCRE_CASELESS
                dotNetOptions |= RegexOptions.IgnoreCase;
                break;

            case 'm': // PCRE_MULTILINE
                dotNetOptions |= RegexOptions.Multiline;
                break;

            case 's': // PCRE_DOTALL
                dotNetOptions |= RegexOptions.Singleline;
                break;

            case 'x': // PCRE_EXTENDED
                dotNetOptions |= RegexOptions.IgnorePatternWhitespace;
                break;

            case 'e': // evaluate as PHP code
                extraOptions |= PerlRegexOptions.Evaluate;
                break;

            case 'A': // PCRE_ANCHORED
                extraOptions |= PerlRegexOptions.Anchored;
                break;

            case 'D': // PCRE_DOLLAR_ENDONLY
                extraOptions |= PerlRegexOptions.DollarMatchesEndOfStringOnly;
                break;

            case 'S': // spend more time studying the pattern - ignore
                break;

            case 'U': // PCRE_UNGREEDY
                extraOptions |= PerlRegexOptions.Ungreedy;
                break;

            case 'u': // PCRE_UTF8
                extraOptions |= PerlRegexOptions.UTF8;
                break;

            default:
                // 'X' (PCRE_EXTRA) and unknown modifiers are deliberately not reported here.
                break;
        }
    }

    // inconsistent options check:
    if ((dotNetOptions & RegexOptions.Multiline) != 0 &&
        (extraOptions & PerlRegexOptions.DollarMatchesEndOfStringOnly) != 0)
    {
        // ArgumentException derives from Exception, so existing catch blocks keep working.
        throw new ArgumentException("Modifier inconsistent: 'D' cannot be combined with 'm'");
    }
}