示例#1
0
		/// <summary>
		/// Converts Perl match expression (only, without delimiters, options etc.) to .NET regular expression.
		/// </summary>
		/// <remarks>
		/// The pattern is scanned once by a small state machine. <c>state</c> tracks the position relative
		/// to a character class (0 = outside, 1 = first character of the class, 2 = inside the class,
		/// 3 = just behind a range '-'); <c>inner_state</c> tracks partially recognized group and back
		/// reference constructs while outside of a character class.
		/// </remarks>
		/// <param name="perlExpr">Perl regular expression to convert.</param>
		/// <param name="opt">Regexp options - some of them must be processed by changes in match string.</param>
		/// <returns>Resulting .NET regular expression.</returns>
		/// <exception cref="ArgumentException">
		/// The pattern ends with a lone backslash, contains two quantifiers in a row (checked in ungreedy
		/// mode only), uses an unknown or negated POSIX character class, or leaves a range starting with
		/// a UTF-32 character open at the end of a character class.
		/// </exception>
		private string ConvertRegex(string perlExpr, PerlRegexOptions opt)
		{
			// Ranges in bracket expressions should be replaced with appropriate characters

			// assume no conversion will be performed, create string builder with exact length. Only in
			// case there is a range StringBuilder would be prolonged, +1 for Anchored
			StringBuilder result = new StringBuilder(perlExpr.Length + 1);

			// Anchored means that the string should match only at the start of the string, add '^'
			// at the beginning if there is no one
			if ((opt & PerlRegexOptions.Anchored) != 0 && (perlExpr.Length == 0 || perlExpr[0] != '^'))
				result.Append('^');

			// set to true after a quantifier is matched, if there is second quantifier just behind the
			// first it is an error
			bool last_quantifier = false;

			// set when state 3 (range) has just switched back to state 2 - ie. "a-b-c"
			// (we need to make a difference here because second "-" shouldn't be expanded)
			bool leaving_range = false;

			// remember the last character added in the character class, so in state 3 we can expand the
			// range as properly as possible; -1 means there is no usable range start
			int range_from_character = -1;

			bool escaped = false;
			int state = 0;
			int inner_state = 0;

			// cache handed over to CountUTF32Range while expanding ranges within one character class [],
			// dropped when switching back to state 0
			HashSet<uint> addedSurrogate2Ranges = null;

			int group_number = 0;
			int i = 0;
			while (i < perlExpr.Length)
			{
				int ch = perlExpr[i];

				escaped = false;
				if (ch == '\\' && !ParseEscapeCode(/*encoding,*/ perlExpr, ref i, ref ch, ref escaped))
				{
					i++;

					// a pattern cannot end with a lone backslash; report it like any other malformed
					// pattern instead of reading behind the end of the string
					if (i >= perlExpr.Length)
						throw new ArgumentException("Regex cannot end with backslash.");

					ch = perlExpr[i];

					if (ch == 'g')
					{
						++i;
						inner_state = 5; // skip 'g' from resulting pattern
						escaped = false;
						continue;
					}
					else if (ch == 'k')
					{
						inner_state = 11;
						escaped = true;
					}

					// some characters (like '_') don't need to be escaped in .net
					// and ignore escaping of unicode sequence of characters
					if (ch == '_' || (int)ch > 0x7F) escaped = false; else escaped = true;
				}

				switch (state)
				{
					case 0: // outside of character class
						if (escaped)
						{
							result.Append('\\');
							Append(result, ch);
							last_quantifier = false;
							break;
						}

						// In perl regexps, named groups are written like this: "(?P<name> ... )"
						//  (\k<name>...)
						//  (\k'name'...)
						//  (\k{name}...)
						//  (\g{name}...)
						//  (?'name'...)
						//  (?<name>...)
						//  (?P=name)
						//  (?:...)

						// If the group is starting here, we need to skip the 'P' character (see inner states 2 and 3)
						switch (inner_state)
						{
							case 0:
								if (ch == '(')
								{
									inner_state = 1;

									// Look-ahead and name anonymous groups.
									// This is used to match the order of the results.
									// As perlre doc says:
									// NOTE: While the notation of this construct [grouping] is the same as the similar function in .NET regexes,
									// the behavior is not. In Perl the groups are numbered sequentially regardless of being named or not.
									++group_number;
									if (i + 1 < perlExpr.Length)
									{
										if (perlExpr[i + 1] != '?')
										{
											// plain "(": emit it as an explicitly named group and go on
											++i;
											result.Append("(?<");
											result.Append(AnonymousGroupPrefix);
											result.Append(group_number);
											result.Append('>');
											continue;
										}
										else if (i + 2 < perlExpr.Length && perlExpr[i + 2] == ':')
										{
											// Pseudo-group, don't count.
											--group_number;
										}
									}
								}
								else if (ch == '\\')
									inner_state = 4; // no handler for 4: reset by the default branch on the next character
								else
									inner_state = 0;

								break;

							// groups
							case 1: // '('
								if (ch == '?')
									inner_state = 2;
								else if (ch != '(') // stay in inner_state == 1, because this can happen: ((?<blah>...))
									inner_state = 0;
								break;

							case 2: // '(?'
								if (ch == 'P')
								{
									i++;
									inner_state = 3;
									continue; // skip 'P' from resulting pattern
								}
								else if (ch == '<')
								{
									inner_state = 15;
									break;
								}
								else if (ch == '\'')
								{
									i++;
									result.Append('\'');
									result.Append(GroupPrefix);

									inner_state = 0;
									continue;
								}

								inner_state = 0;
								break;

							case 3: // '(?P'
								if (ch == '=')
								{
									++i;
									inner_state = 12;
									continue; // skip '=' from resulting pattern
								}
								else if (ch != '<')
								{
									// P was neither part of "(?P<name> ... )" nor of the '(?P=name)' back
									// reference, so put it back to the pattern
									result.Append('P');
								}
								else
								{
									i++;
									result.Append('<');
									result.Append(GroupPrefix);

									inner_state = 0;
									continue;
								}

								inner_state = 0;
								break;

							// \g[0-9]{1,2} back references
							case 5: // '\g'

								result.Append('\\');

								if (ch == '{')
								{
									i++;
									inner_state = 6;
									continue; // skip '{' from resulting pattern
								}
								else if (ch >= '0' && ch <= '9')
								{
									inner_state = 0; // just copy the rest of the pattern
								}
								else
								{
									result.Append('g'); // unexpected character after '\g', so put g back to pattern
									inner_state = 0;
								}
								break;

							case 6: // '\g{'

								if (ch >= '0' && ch <= '9')
								{
									inner_state = 7;
								}
								else
								{
									// it can be named group
									result.Append("k<");
									result.Append(GroupPrefix);
									inner_state = 10;
								}

								break;

							case 7: // '\g{[0-9]'

								if (ch == '}')
								{
									i++;
									inner_state = 9;
									continue; // skip '}' from resulting pattern
								}
								else if (ch >= '0' && ch <= '9')
								{
									inner_state = 8;
								}
								else
								{
									// name of the group starts with a number;
									// put "k<" and the group prefix in front of the single digit already emitted
									result.Insert(result.Length - 1, "k<");
									result.Insert(result.Length - 1, GroupPrefix);
									inner_state = 14;
								}

								break;

							case 8: // '\g{[0-9][0-9]'

								if (ch == '}')
								{
									i++;
									inner_state = 9;
									continue; // skip '}' from resulting pattern
								}
								else
								{
									// name of the group starts with a number;
									// two digits have already been emitted, so both "k<" and the group prefix
									// have to be inserted in front of them
									result.Insert(result.Length - 2, "k<");
									result.Insert(result.Length - 2, GroupPrefix);

									// keep collecting the name until the closing '}' (handled by state 14)
									inner_state = 14;
								}

								break;

							case 9: // '\g{[0-9][0-9]?}'

								if (ch >= '0' && ch <= '9')
								{
									result.Append("(?#)"); // put this to the resulting pattern to separate number of the reference from number that follows
								}

								inner_state = 0;

								break;

							// named back references
							case 10: // '\g{.*?}' | '\k{.*?}'

								if (ch == '}')
								{
									++i;
									result.Append('>');
									inner_state = 0;
									continue; // skip '}' from resulting pattern
								}

								break;

							case 11: // '\k'

								if (ch == '{')
								{
									i++;
									inner_state = 10;
									result.Append('<');
									result.Append(GroupPrefix);
									continue; // skip '{' from resulting pattern
								}
								else if (ch == '<')
								{
									i++;
									result.Append('<');
									result.Append(GroupPrefix);

									inner_state = 0;
									continue;
								}
								else if (ch == '\'')
								{
									i++;
									result.Append('\'');
									result.Append(GroupPrefix);

									inner_state = 0;
									continue;
								}

								inner_state = 0;

								break;

							// transforming '(?P=name)' to '\k<name>'
							case 12: // '(?P='

								// (? was already put in the pattern, so replace it with '\k'
								result[result.Length - 2] = '\\';
								result[result.Length - 1] = 'k';

								// add '<' so it is '\k<'
								result.Append('<');
								result.Append(GroupPrefix);

								inner_state = 13;

								break;

							case 13: // '(?P=.*?'

								if (ch == ')')
								{
									++i;
									result.Append('>');
									inner_state = 0;
									continue; // skip ')' from resulting pattern
								}

								break;

							case 14: // '\g{[0-9].*?'

								if (ch == '}')
								{
									i++;
									inner_state = 9;
									result.Append(">");
									continue; // skip '}' from resulting pattern
								}

								break;

							case 15: // '(?<'

								// Add group prefix only if it's not lookbehind assertions
								// (?<! negative
								// (?<= positive
								if (ch != '!' && ch != '=')
								{
									result.Append(GroupPrefix);
								}

								inner_state = 0;

								break;

							default: inner_state = 0; break;
						}

						if ((opt & PerlRegexOptions.Ungreedy) != 0)
						{
							// match quantifier ?,*,+,{n,m} at the position i:
							Match m = quantifiers.Match(perlExpr, i);

							// quantifier matched; quantifier '?' hasn't to be preceded by '(' - a grouping construct '(?'
							if (m.Success && (m.Value != "?" || i == 0 || perlExpr[i - 1] != '('))
							{
								// two quantifiers:
								if (last_quantifier)
									throw new ArgumentException(LibResources.GetString("regexp_duplicate_quantifier", i));

								// append quantifier:
								result.Append(perlExpr, i, m.Length);
								i += m.Length;

								if (i < perlExpr.Length && perlExpr[i] == '?')
								{
									// skip question mark to make the quantifier greedy:
									i++;
								}
								else if (i < perlExpr.Length && perlExpr[i] == '+')
								{
									// TODO: we do not yet support possesive quantifiers
									//       so we just skip the attribute it and pray
									//       nobody will ever realize :-)
									i++;
								}
								else
								{
									// add question mark to make the quantifier lazy:
									if (result.Length != 0 && result[result.Length - 1] == '?')
									{
										// HACK: Due to the issue in .NET regex we can't use "??" because it isn't interpreted correctly!!
										// (for example "^(ab)??$" matches with "abab", but it shouldn't!!)
									}
									else
										result.Append('?');
								}

								last_quantifier = true;
								continue;
							}
						}

						last_quantifier = false;

						if (ch == '$' && (opt & PerlRegexOptions.DollarMatchesEndOfStringOnly) != 0)
						{
							// replaces '$' with '\z':
							result.Append(@"\z");
							break;
						}

						if (ch == '[')
							state = 1;

						Append(result, ch);
						break;

					case 1: // first character of character class
						if (escaped)
						{
							result.Append('\\');
							Append(result, ch);
							range_from_character = ch;
							state = 2;
							break;
						}

						// special characters:
						if (ch == '^' || ch == ']' || ch == '-')
						{
							Append(result, ch);
						}
						else
						{
							// other characters are not consumed here, for example [[:space:]abc] will not match if the first
							// [ is appended here.
							state = 2;
							goto case 2;
						}
						break;

					case 2: // inside of character class
						if (escaped)
						{
							result.Append('\\');
							Append(result, ch);
							range_from_character = ch;
							leaving_range = false;
							break;
						}

						if (ch == '-' && !leaving_range)
						{
							state = 3;
							break;
						}
						leaving_range = false;

						// posix character classes
						Match match = posixCharClasses.Match(perlExpr.Substring(i), 0);
						if (match.Success)
						{
							string chars = PosixRegExp.BracketExpression.CountCharacterClass(match.Groups[2].Value);
							if (chars == null)
								throw new ArgumentException(/*TODO*/ String.Format("Unknown character class '{0}'", match.Groups[2].Value));

							if (match.Groups[1].Value.Length > 0)
								throw new ArgumentException(/*TODO*/ "POSIX character classes negation not supported.");

							result.Append(chars);
							range_from_character = -1;  // -1 means, it is not rangable :)
							i += match.Length - 1; // +1 is added just behind the switch
							break;
						}

						if (ch == ']')
						{
							addedSurrogate2Ranges = null;   // drop the cache of ranges
							state = 0;
						}

						// append <ch>
						range_from_character = ch;

						if (ch == '-')
							result.Append("\\x2d");
						else
							AppendEscaped(result, ch);

						break;

					case 3: // range, previous character was '-'
						if (!escaped && ch == ']')
						{
							if (range_from_character > char.MaxValue)
								throw new ArgumentException("Cannot range from an UTF-32 character to unknown.");

							// the '-' was the last character of the class, keep it as a literal
							result.Append("-]");
							addedSurrogate2Ranges = null;   // drop the cache of ranges
							state = 0;
							break;
						}

						if (addedSurrogate2Ranges == null)
							addedSurrogate2Ranges = new HashSet<uint>();    // initialize the cache of already added character ranges, invalidated at the end of character class

						if (ch != range_from_character)
						{
							// <from>-<ch>:

							// 1. <utf16>-<utf16>
							// 2. <utf16>-<utf32>
							// 3. <utf32>-<utf32>
							if (range_from_character <= char.MaxValue)
							{
								if (ch <= char.MaxValue)
								{
									// symbol order can be different, not tested with other modes
									// NOTE(review): 'encoding' is not a parameter of this method - presumably a member
									// of the enclosing class; confirm.
									var seqBreak = false;
									byte from = 0;
									byte to = 0;
									if (encoding.IsSingleByte)
									{
										var bytes = encoding.GetBytes(new char[] { (char)range_from_character });
										from = bytes[0];
										bytes = encoding.GetBytes(new char[] { (char)ch });
										to = bytes[0];

										// check whether consecutive bytes of the encoding decode to consecutive
										// characters; if they do not, the range cannot be passed on as "a-b"
										var lastChar = range_from_character;
										for (int j = from + 1; j <= to; j++)
										{
											var chars = encoding.GetChars(new[] { (byte)j });
											if (chars[0] - lastChar != 1)
											{
												seqBreak = true;
												break;
											}
											lastChar = chars[0];
										}
									}

									// 1.
									if (!seqBreak)
									{
										result.Append('-');
										AppendEscaped(result, ch);
									}
									else
									{
										// enumerate the characters of the range one by one (the first one is
										// already in the result); the counter is an 'int' on purpose - a 'byte'
										// counter wraps around when 'to' is 0xFF and the loop would never end
										for (int b = from + 1; b <= to; b++)
										{
											var chars = encoding.GetChars(new[] { (byte)b });
											AppendEscaped(result, chars[0]);
										}
									}
								}
								else
								{
									// 2.
									result.Append('-');
									AppendEscaped(result, char.MaxValue);

									// count <char.MaxValue+1>-<ch>
									CountUTF32Range(result, char.MaxValue + 1, ch, addedSurrogate2Ranges);
								}
							}
							else
							{
								// 3. utf32 range
								// NOTE(review): presumably removes the two UTF-16 units emitted for the range
								// start before the whole range is generated - confirm against AppendEscaped.
								result.Length -= 2;
								CountUTF32Range(result, range_from_character, ch, addedSurrogate2Ranges);
							}
						}

						state = 2;
						leaving_range = true;
						range_from_character = -1;
						break;
				}

				i++;
			}

			return ConvertPossesiveToAtomicGroup(result);
		}
示例#2
0
		/// <summary>
		/// Translates the modifier characters of a pattern into .NET regex options and into the extra
		/// Perl-specific options that have no direct .NET counterpart.
		/// </summary>
		/// <param name="pattern">Text containing the modifier characters.</param>
		/// <param name="start">Index of the first modifier character in <paramref name="pattern"/>.</param>
		/// <param name="dotNetOptions">Receives the options that map directly to <see cref="RegexOptions"/>.</param>
		/// <param name="extraOptions">Receives the remaining recognized options.</param>
		private static void ParseRegexOptions(StringUtils.UniformWrapper pattern, int start,
		  out RegexOptions dotNetOptions, out PerlRegexOptions extraOptions)
		{
			dotNetOptions = RegexOptions.None;
			extraOptions = PerlRegexOptions.None;

			int position = start;
			while (position < pattern.Length)
			{
				char modifier = pattern[position];

				// flags contributed by this single modifier; both stay empty for modifiers that are
				// ignored or reported
				RegexOptions netFlag = RegexOptions.None;
				PerlRegexOptions perlFlag = PerlRegexOptions.None;

				switch (modifier)
				{
					// modifiers with a direct .NET equivalent
					case 'i': netFlag = RegexOptions.IgnoreCase; break;               // PCRE_CASELESS
					case 'm': netFlag = RegexOptions.Multiline; break;                // PCRE_MULTILINE
					case 's': netFlag = RegexOptions.Singleline; break;               // PCRE_DOTALL
					case 'x': netFlag = RegexOptions.IgnorePatternWhitespace; break;  // PCRE_EXTENDED

					// modifiers that are only remembered for later processing
					case 'e': perlFlag = PerlRegexOptions.Evaluate; break;                       // evaluate as PHP code
					case 'A': perlFlag = PerlRegexOptions.Anchored; break;                       // PCRE_ANCHORED
					case 'D': perlFlag = PerlRegexOptions.DollarMatchesEndOfStringOnly; break;   // PCRE_DOLLAR_ENDONLY
					case 'U': perlFlag = PerlRegexOptions.Ungreedy; break;                       // PCRE_UNGREEDY
					case 'u': perlFlag = PerlRegexOptions.UTF8; break;                           // PCRE_UTF8

					// spend more time studying the pattern - nothing to do here
					case 'S':
						break;

					// PCRE_EXTRA - recognized, but reported as unsupported
					case 'X':
						PhpException.Throw(PhpError.Warning, LibResources.GetString("modifier_not_supported", modifier));
						break;

					default:
						PhpException.Throw(PhpError.Notice, LibResources.GetString("modifier_unknown", modifier));
						break;
				}

				dotNetOptions |= netFlag;
				extraOptions |= perlFlag;

				position++;
			}

			// inconsistent options check: 'D' together with 'm' is reported
			bool multiline = (dotNetOptions & RegexOptions.Multiline) != 0;
			bool dollarEndOnly = (extraOptions & PerlRegexOptions.DollarMatchesEndOfStringOnly) != 0;

			if (!multiline || !dollarEndOnly)
				return;

			PhpException.Throw(PhpError.Notice, LibResources.GetString("modifiers_inconsistent", 'D', 'm'));
		}
		/// <summary>
		/// Converts Perl match expression (only, without delimiters, options etc.) to .NET regular expression.
		/// </summary>
		/// <remarks>
		/// The pattern is scanned once by a small state machine. <c>state</c> tracks the position relative
		/// to a character class (0 = outside, 1 = first character of the class, 2 = inside the class,
		/// 3 = just behind a range '-'); <c>group_state</c> tracks a just opened "(?" construct so that
		/// the 'P' of "(?P&lt;name&gt;...)" can be dropped.
		/// </remarks>
		/// <param name="perlExpr">Perl regular expression to convert.</param>
		/// <param name="opt">Regexp options - some of them must be processed by changes in match string.</param>
		/// <param name="encoding">Encoding.</param>
		/// <returns>Resulting .NET regular expression.</returns>
		/// <exception cref="ArgumentException">
		/// The pattern ends with a lone backslash, contains two quantifiers in a row (checked in ungreedy
		/// mode only), uses an unknown or negated POSIX character class, or contains a character range
		/// that cannot be expanded.
		/// </exception>
		private static string ConvertRegex(string perlExpr, PerlRegexOptions opt, Encoding/*!*/ encoding)
		{
			// Ranges in bracket expressions should be replaced with appropriate characters

			// assume no conversion will be performed, create string builder with exact length. Only in
			// case there is a range StringBuilder would be prolonged, +1 for Anchored
			StringBuilder result = new StringBuilder(perlExpr.Length + 1);

			// Anchored means that the string should match only at the start of the string, add '^'
			// at the beginning if there is no one
			if ((opt & PerlRegexOptions.Anchored) != 0 && (perlExpr.Length == 0 || perlExpr[0] != '^'))
				result.Append('^');

			// set to true after a quantifier is matched, if there is second quantifier just behind the
			// first it is an error
			bool last_quantifier = false;

			// set when state 3 (range) has just switched back to state 2 - ie. "a-b-c"
			// (we need to make a difference here because second "-" shouldn't be expanded)
			bool leaving_range = false;

			bool escaped = false;
			int state = 0;
			int group_state = 0;

			int i = 0;
			while (i < perlExpr.Length)
			{
				char ch = perlExpr[i];

				escaped = false;
				if (ch == '\\' && !ParseEscapeCode(encoding, perlExpr, ref i, ref ch, ref escaped))
				{
					i++;

					// a pattern cannot end with a lone backslash; report it like any other malformed
					// pattern instead of reading behind the end of the string
					if (i >= perlExpr.Length)
						throw new ArgumentException("Regex cannot end with backslash.");

					ch = perlExpr[i];

					// some characters (like '_') don't need to be escaped in .net
					if (ch == '_') escaped = false; else escaped = true;
				}

				switch (state)
				{
					case 0: // outside of character class
						if (escaped)
						{
							result.Append('\\');
							result.Append(ch);
							last_quantifier = false;
							break;
						}

						// In perl regexps, named groups are written like this: "(?P<name> ... )"
						// If the group is starting here, we need to skip the 'P' character (see group_state 2)
						switch (group_state)
						{
							case 0: // nothing recognized yet
								group_state = (ch == '(') ? 1 : 0;
								break;

							case 1: // '('
								// stay in group_state == 1 on another '(', because this can happen: ((?P<name>...))
								if (ch == '?')
									group_state = 2;
								else if (ch != '(')
									group_state = 0;
								break;

							case 2: // '(?'
								// the construct is decided by this character; always start over, otherwise
								// every later 'P' in the pattern would be dropped as well
								group_state = 0;
								if (ch == 'P') { i++; continue; } // skip 'P' from resulting pattern
								break;
						}

						if ((opt & PerlRegexOptions.Ungreedy) != 0)
						{
							// match quantifier ?,*,+,{n,m} at the position i:
							Match m = quantifiers.Match(perlExpr, i);

							// quantifier matched; quantifier '?' hasn't to be preceded by '(' - a grouping construct '(?'
							if (m.Success && (m.Value != "?" || i == 0 || perlExpr[i - 1] != '('))
							{
								// two quantifiers:
								if (last_quantifier)
									throw new ArgumentException("regexp_duplicate_quantifier");

								// append quantifier:
								result.Append(perlExpr, i, m.Length);
								i += m.Length;

								if (i < perlExpr.Length && perlExpr[i] == '?')
								{
									// skip question mark to make the quantifier greedy:
									i++;
								}
								else if (i < perlExpr.Length && perlExpr[i] == '+')
								{
									// TODO: we do not yet support possesive quantifiers
									//       so we just skip the attribute it and pray
									//       nobody will ever realize :-)
									i++;
								}
								else
								{
									// add question mark to make the quantifier lazy:
									if (result.Length != 0 && result[result.Length - 1] == '?')
									{
										// HACK: Due to the issue in .NET regex we can't use "??" because it isn't interpreted correctly!!
										// (for example "^(ab)??$" matches with "abab", but it shouldn't!!)
									}
									else
										result.Append('?');
								}

								last_quantifier = true;
								continue;
							}
						}

						last_quantifier = false;

						if (ch == '$' && (opt & PerlRegexOptions.DollarMatchesEndOfStringOnly) != 0)
						{
							// replaces '$' with '\z':
							result.Append(@"\z");
							break;
						}

						if (ch == '[')
							state = 1;

						result.Append(ch);
						break;

					case 1: // first character of character class
						if (escaped)
						{
							result.Append('\\');
							result.Append(ch);
							state = 2;
							break;
						}

						// special characters:
						if (ch == '^' || ch == ']' || ch == '-')
						{
							result.Append(ch);
						}
						else
						{
							// other characters are not consumed here, for example [[:space:]abc] will not match if the first
							// [ is appended here.
							state = 2;
							goto case 2;
						}
						break;

					case 2: // inside of character class
						if (escaped)
						{
							result.Append('\\');
							result.Append(ch);
							leaving_range = false;
							break;
						}

						if (ch == '-' && !leaving_range)
						{
							state = 3;
							break;
						}
						leaving_range = false;

						// posix character classes
						Match match = posixCharClasses.Match(perlExpr.Substring(i), 0);
						if (match.Success)
						{
							string chars = CountCharacterClass(match.Groups[2].Value);
							if (chars == null)
								throw new ArgumentException(/*TODO*/ String.Format("Unknown character class '{0}'", match.Groups[2].Value));

							if (match.Groups[1].Value.Length > 0)
								throw new ArgumentException(/*TODO*/ "POSIX character classes negation not supported.");

							result.Append(chars);
							i += match.Length - 1; // +1 is added just behind the switch
							break;
						}

						if (ch == ']')
							state = 0;
						if (ch == '-')
							result.Append("\\x2d");
						else
							result.Append(ch);
						break;

					case 3: // range, previous character was '-'
						if (!escaped && ch == ']')
						{
							// the '-' was the last character of the class, keep it as a literal
							result.Append("-]");
							state = 0;
							break;
						}

						// expand the range between the last emitted character and the current one
						string range;
						int error;
						if (!CountRange(result[result.Length - 1], ch, out range, out error, encoding))
						{
							// CountUnicodeRange is tried only when the first attempt failed with error code 1
							if ((error != 1) || (!CountUnicodeRange(result[result.Length - 1], ch, out range)))
							{
								//Debug.Assert(error == 2);
								throw new ArgumentException("range_first_character_greater");
							}
						}
						result.Append(EscapeBracketExpressionSpecialChars(range)); // left boundary is duplicated, but doesn't matter...
						state = 2;
						leaving_range = true;
						break;
				}

				i++;
			}

			return result.ToString();
		}
		/// <summary>
		/// Translates the PCRE modifier letters stored in <paramref name="pattern"/> from index
		/// <paramref name="start"/> to the end of the builder into .NET and Perl-specific option flags.
		/// </summary>
		/// <param name="pattern">Builder holding the modifier letters; read only, never modified here.</param>
		/// <param name="start">Index of the first modifier letter in <paramref name="pattern"/>.</param>
		/// <param name="dotNetOptions">Receives the modifiers that map directly to <see cref="RegexOptions"/> (i, m, s, x).</param>
		/// <param name="extraOptions">Receives the modifiers that have no direct .NET counterpart (e, A, D, U, u).</param>
		/// <exception cref="Exception">
		/// The modifier 'X' or any unrecognized letter was found, or both 'm' and 'D' were specified.
		/// </exception>
		private static void ParseRegexOptions(StringBuilder pattern, int start,
		                                      out RegexOptions dotNetOptions, out PerlRegexOptions extraOptions)
		{
			dotNetOptions = RegexOptions.None;
			extraOptions = PerlRegexOptions.None;

			for (int i = start; i < pattern.Length; i++)
			{
				char option = pattern[i];

				switch (option)
				{
					case 'i': // PCRE_CASELESS
						dotNetOptions |= RegexOptions.IgnoreCase;
						break;

					case 'm': // PCRE_MULTILINE
						dotNetOptions |= RegexOptions.Multiline;
						break;

					case 's': // PCRE_DOTALL
						dotNetOptions |= RegexOptions.Singleline;
						break;

					case 'x': // PCRE_EXTENDED
						dotNetOptions |= RegexOptions.IgnorePatternWhitespace;
						break;

					case 'e': // evaluate as PHP code
						extraOptions |= PerlRegexOptions.Evaluate;
						break;

					case 'A': // PCRE_ANCHORED
						extraOptions |= PerlRegexOptions.Anchored;
						break;

					case 'D': // PCRE_DOLLAR_ENDONLY
						extraOptions |= PerlRegexOptions.DollarMatchesEndOfStringOnly;
						break;

					case 'S': // spend more time studying the pattern - accepted, but has no effect here
						break;

					case 'U': // PCRE_UNGREEDY
						extraOptions |= PerlRegexOptions.Ungreedy;
						break;

					case 'u': // PCRE_UTF8
						extraOptions |= PerlRegexOptions.Utf8;
						break;

					// No 'break' after the throws below: the end of these sections is unreachable anyway.
					case 'X': // PCRE_EXTRA
						throw new Exception("Modifier not supported");

					default:
						throw new Exception("Modifier unknown");
				}
			}

			// inconsistent options check: 'm' and 'D' are rejected when used together
			if
				(
				(dotNetOptions & RegexOptions.Multiline) != 0 &&
				(extraOptions & PerlRegexOptions.DollarMatchesEndOfStringOnly) != 0
				)
			{
				throw new Exception("Modifier inconsistent");
			}
		}
示例#5
0
        /// <summary>
        /// Converts Perl match expression (only, without delimiters, options etc.) to .NET regular expression.
        /// </summary>
        /// <remarks>
        /// The expression is scanned once, character by character, by a small state machine:
        /// state 0 is outside of a character class, state 1 is the first character after '[',
        /// state 2 is inside of a character class and state 3 is right after a '-' inside of a class.
        /// Outside of character classes the 'P' of the Perl named group syntax "(?P&lt;name&gt;" is removed,
        /// '$' is replaced by "\z" when <see cref="PerlRegexOptions.DollarMatchesEndOfStringOnly"/> is set and
        /// the greediness of quantifiers is inverted when <see cref="PerlRegexOptions.Ungreedy"/> is set.
        /// Inside of character classes POSIX classes and ranges are replaced by the strings returned
        /// from the helper methods.
        /// </remarks>
        /// <param name="perlExpr">Perl regular expression to convert.</param>
        /// <param name="opt">Regexp options - some of them must be processed by changes in match string.</param>
        /// <param name="encoding">Encoding; handed over to the escape code parser and to the range expansion.</param>
        /// <returns>Resulting .NET regular expression.</returns>
        /// <exception cref="ArgumentException">
        /// Two quantifiers follow each other (checked in ungreedy mode only), a POSIX character class is
        /// unknown or negated, or a character range could not be expanded.
        /// </exception>
        private static string ConvertRegex(string perlExpr, PerlRegexOptions opt, Encoding /*!*/ encoding)
        {
            // Ranges in bracket expressions should be replaced with appropriate characters

            // assume no conversion will be performed, create string builder with exact length. Only in
            // case there is a range StringBuilder would be prolonged, +1 for Anchored
            StringBuilder result = new StringBuilder(perlExpr.Length + 1);

            // Anchored means that the string should match only at the start of the string, add '^'
            // at the beginning if there is no one
            if ((opt & PerlRegexOptions.Anchored) != 0 && (perlExpr.Length == 0 || perlExpr[0] != '^'))
            {
                result.Append('^');
            }

            // set to true after a quantifier is matched, if there is second quantifier just behind the
            // first it is an error
            bool last_quantifier = false;

            // true right after a range has been expanded (state 3 -> state 2), ie. in "a-b-c" the second
            // '-' must not start another range, it is emitted as a literal instead
            bool leaving_range = false;

            bool escaped = false;
            int  state   = 0;

            // progress of the detection of "(?P" outside of character classes:
            // 0 - nothing matched, 1 - "(" matched, 2 - "(?" matched
            int group_state = 0;

            int i = 0;

            while (i < perlExpr.Length)
            {
                char ch = perlExpr[i];

                escaped = false;
                if (ch == '\\' && !ParseEscapeCode(encoding, perlExpr, ref i, ref ch, ref escaped))
                {
                    // the escape sequence was not handled by the parser, take the next character as it is
                    i++;
                    // NOTE(review): if the parser leaves 'i' on a backslash which is the last character of the
                    // expression, the indexer below is out of range - confirm against ParseEscapeCode.
                    //Debug.Assert(i < perlExpr.Length, "Regex cannot end with backslash.");
                    ch = perlExpr[i];

                    // some characters (like '_') don't need to be escaped in .net
                    escaped = (ch != '_');
                }

                switch (state)
                {
                case 0:     // outside of character class
                    if (escaped)
                    {
                        result.Append('\\');
                        result.Append(ch);
                        last_quantifier = false;

                        // an escaped character is never a part of "(?P", restart the detection
                        group_state = 0;
                        break;
                    }

                    // In perl regexps, named groups are written like this: "(?P<name> ... )"
                    // .NET does not know the 'P', so it is skipped when it immediately follows "(?".
                    if (group_state == 2 && ch == 'P')
                    {
                        // the detection has to be restarted here, otherwise every later 'P' would be skipped, too
                        group_state = 0;
                        i++;
                        continue;
                    }

                    if (ch == '(')
                    {
                        // also reached from the states 1 and 2, so that "((?P<name>" is recognized
                        group_state = 1;
                    }
                    else if (group_state == 1 && ch == '?')
                    {
                        group_state = 2;
                    }
                    else
                    {
                        group_state = 0;
                    }

                    if ((opt & PerlRegexOptions.Ungreedy) != 0)
                    {
                        // match quantifier ?,*,+,{n,m} at the position i:
                        Match m = quantifiers.Match(perlExpr, i);

                        // quantifier matched; quantifier '?' must not be preceded by '(' - a grouping construct '(?'
                        if (m.Success && (m.Value != "?" || i == 0 || perlExpr[i - 1] != '('))
                        {
                            // two quantifiers:
                            if (last_quantifier)
                            {
                                throw new ArgumentException("regexp_duplicate_quantifier");
                            }

                            // append quantifier:
                            result.Append(perlExpr, i, m.Length);
                            i += m.Length;

                            if (i < perlExpr.Length && perlExpr[i] == '?')
                            {
                                // skip question mark to make the quantifier greedy:
                                i++;
                            }
                            else if (i < perlExpr.Length && perlExpr[i] == '+')
                            {
                                // TODO: possessive quantifiers are not supported yet,
                                //       the '+' attribute is just skipped
                                i++;
                            }
                            else if (result.Length == 0 || result[result.Length - 1] != '?')
                            {
                                // add question mark to make the quantifier lazy.
                                // HACK: Due to the issue in .NET regex we can't use "??" because it isn't interpreted correctly!!
                                // (for example "^(ab)??$" matches with "abab", but it shouldn't!!)
                                // That is why nothing is appended when the quantifier itself is '?'.
                                result.Append('?');
                            }

                            last_quantifier = true;

                            // 'i' already points behind the quantifier, do not increment it once more
                            continue;
                        }
                    }

                    last_quantifier = false;

                    if (ch == '$' && (opt & PerlRegexOptions.DollarMatchesEndOfStringOnly) != 0)
                    {
                        // replaces '$' with '\z':
                        result.Append(@"\z");
                        break;
                    }

                    if (ch == '[')
                    {
                        state = 1;
                    }

                    result.Append(ch);
                    break;

                case 1:     // first character of character class
                    if (escaped)
                    {
                        result.Append('\\');
                        result.Append(ch);
                        state = 2;
                        break;
                    }

                    // special characters; the state is not changed, so the following character
                    // is handled as the first one again:
                    if (ch == '^' || ch == ']' || ch == '-')
                    {
                        result.Append(ch);
                    }
                    else
                    {
                        // other characters are not consumed here, for example [[:space:]abc] will not match if the first
                        // [ is appended here.
                        state = 2;
                        goto case 2;
                    }
                    break;

                case 2:     // inside of character class
                    if (escaped)
                    {
                        result.Append('\\');
                        result.Append(ch);
                        leaving_range = false;
                        break;
                    }

                    if (ch == '-' && !leaving_range)
                    {
                        // nothing is appended yet, state 3 decides what the '-' means
                        state = 3;
                        break;
                    }
                    leaving_range = false;

                    // posix character classes
                    Match match = posixCharClasses.Match(perlExpr.Substring(i), 0);
                    if (match.Success)
                    {
                        string chars = CountCharacterClass(match.Groups[2].Value);
                        if (chars == null)
                        {
                            throw new ArgumentException(/*TODO*/ String.Format("Unknown character class '{0}'", match.Groups[2].Value));
                        }

                        if (match.Groups[1].Value.Length > 0)
                        {
                            throw new ArgumentException(/*TODO*/ "POSIX character classes negation not supported.");
                        }

                        result.Append(chars);
                        i += match.Length - 1;     // +1 is added just behind the switch
                        break;
                    }

                    if (ch == ']')
                    {
                        state = 0;
                    }

                    if (ch == '-')
                    {
                        // a '-' just behind an expanded range is a literal, emit it as a hexadecimal escape
                        result.Append("\\x2d");
                    }
                    else
                    {
                        result.Append(ch);
                    }
                    break;

                case 3:     // range previous character was '-'
                    if (!escaped && ch == ']')
                    {
                        // the '-' was the last character of the class, so it is a literal
                        result.Append("-]");
                        state = 0;
                        break;
                    }

                    // the last character appended so far is taken as the lower boundary of the range
                    string range;
                    int    error;
                    if (!CountRange(result[result.Length - 1], ch, out range, out error, encoding))
                    {
                        // the unicode expansion is tried only when the error code is 1
                        if ((error != 1) || (!CountUnicodeRange(result[result.Length - 1], ch, out range)))
                        {
                            //Debug.Assert(error == 2);
                            throw new ArgumentException("range_first_character_greater");
                        }
                    }
                    result.Append(EscapeBracketExpressionSpecialChars(range));     // left boundary is duplicated, but doesn't matter...
                    state         = 2;
                    leaving_range = true;
                    break;
                }

                i++;
            }

            return result.ToString();
        }
示例#6
0
        /// <summary>
        /// Reads the PCRE modifier letters stored in <paramref name="pattern"/> from index
        /// <paramref name="start"/> to the end of the builder and converts them to option flags.
        /// </summary>
        /// <param name="pattern">Builder holding the modifier letters; it is only read.</param>
        /// <param name="start">Index of the first modifier letter.</param>
        /// <param name="dotNetOptions">Receives the flags of the modifiers i, m, s and x.</param>
        /// <param name="extraOptions">Receives the flags of the modifiers e, A, D, U and u.</param>
        /// <exception cref="Exception">Both 'm' and 'D' were specified.</exception>
        private static void ParseRegexOptions(StringBuilder pattern, int start,
                                              out RegexOptions dotNetOptions, out PerlRegexOptions extraOptions)
        {
            dotNetOptions = RegexOptions.None;
            extraOptions  = PerlRegexOptions.None;

            int position = start;
            while (position < pattern.Length)
            {
                // Letters without a case below (including 'X', PCRE_EXTRA) are skipped silently.
                switch (pattern[position])
                {
                // modifiers with a direct .NET counterpart
                case 'i': dotNetOptions |= RegexOptions.IgnoreCase;              break;     // PCRE_CASELESS
                case 'm': dotNetOptions |= RegexOptions.Multiline;               break;     // PCRE_MULTILINE
                case 's': dotNetOptions |= RegexOptions.Singleline;              break;     // PCRE_DOTALL
                case 'x': dotNetOptions |= RegexOptions.IgnorePatternWhitespace; break;     // PCRE_EXTENDED

                // modifiers without a direct .NET counterpart
                case 'e': extraOptions |= PerlRegexOptions.Evaluate;                     break;     // evaluate as PHP code
                case 'A': extraOptions |= PerlRegexOptions.Anchored;                     break;     // PCRE_ANCHORED
                case 'D': extraOptions |= PerlRegexOptions.DollarMatchesEndOfStringOnly; break;     // PCRE_DOLLAR_ENDONLY
                case 'U': extraOptions |= PerlRegexOptions.Ungreedy;                     break;     // PCRE_UNGREEDY
                case 'u': extraOptions |= PerlRegexOptions.UTF8;                         break;     // PCRE_UTF8

                // study the pattern - recognized, sets no flag
                case 'S': break;
                }

                position++;
            }

            // 'm' together with 'D' is reported as an inconsistent combination
            bool multiline     = (dotNetOptions & RegexOptions.Multiline) != 0;
            bool dollarEndOnly = (extraOptions & PerlRegexOptions.DollarMatchesEndOfStringOnly) != 0;

            if (multiline && dollarEndOnly)
            {
                throw new Exception("Modifier inconsistent");
            }
        }