Beispiel #1
0
		private Expression ParseSpecial (RegexOptions options) {
			int p = ptr;
			bool ecma = IsECMAScript (options);
			Expression expr = null;
			
			switch (pattern[ptr ++]) {

			// categories

			case 'd':
				expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);
				break;
				
			case 'w':
				expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);
				break;
				
			case 's':
				expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
				break;
				
			case 'p':
				// this is odd - ECMAScript isn't supposed to support Unicode,
				// yet \p{..} compiles and runs under the MS implementation
				// identically to canonical mode. That's why I'm ignoring the
				// value of ecma here.
			
				expr = new CharacterClass (ParseUnicodeCategory (), false);
				break;
				
			case 'D':
				expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);
				break;
				
			case 'W':
				expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);
				break;
				
			case 'S':
				expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
				break;
				
			case 'P':
				expr = new CharacterClass (ParseUnicodeCategory (), true);
				break;

			// positions

			case 'A': expr = new PositionAssertion (Position.StartOfString); break;
			case 'Z': expr = new PositionAssertion (Position.End); break;
			case 'z': expr = new PositionAssertion (Position.EndOfString); break;
			case 'G': expr = new PositionAssertion (Position.StartOfScan); break;
			case 'b': expr = new PositionAssertion (Position.Boundary); break;
			case 'B': expr = new PositionAssertion (Position.NonBoundary); break;
			
			// references

			case '1': case '2': case '3': case '4': case '5':
			case '6': case '7': case '8': case '9': {
				ptr --;
				int n = ParseNumber (10, 1, 0);
				if (n < 0) {
					ptr = p;
					return null;
				}

				// FIXME test if number is within number of assigned groups
				// this may present a problem for right-to-left matching

				Reference reference = new BackslashNumber (IsIgnoreCase (options), ecma);
				refs.Add (reference, n.ToString ());
				expr = reference;
				break;
			}

			case 'k': {
				char delim = pattern[ptr ++];
				if (delim == '<')
					delim = '>';
				else if (delim != '\'')
					throw NewParseException ("Malformed \\k<...> named backreference.");

				string name = ParseName ();
				if (name == null || pattern[ptr] != delim)
					throw NewParseException ("Malformed \\k<...> named backreference.");

				++ ptr;
				Reference reference = new Reference (IsIgnoreCase (options));
				refs.Add (reference, name);
				expr = reference;
				break;
			}

			default:
				expr = null;
				break;
			}

			if (expr == null)
				ptr = p;

			return expr;
		}
Beispiel #2
0
		// private methods

		private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
			bool is_top_level = group is RegularExpression;
		
			Alternation alternation = null;
			string literal = null;

			Group current = new Group ();
			Expression expr = null;
			bool closed = false;

			while (true) {
				ConsumeWhitespace (IsIgnorePatternWhitespace (options));
				if (ptr >= pattern.Length)
					break;
				
				// (1) Parse for Expressions
			
				char ch = pattern[ptr ++];
				
				switch (ch) {
				case '^': {
					Position pos =
						IsMultiline (options) ? Position.StartOfLine : Position.Start;
					expr = new PositionAssertion (pos);
					break;
				}

				case '$': {
					Position pos =
						IsMultiline (options) ? Position.EndOfLine : Position.End;
					expr = new PositionAssertion (pos);
					break;
				}

				case '.': {
					Category cat =
						IsSingleline (options) ? Category.AnySingleline : Category.Any;
					expr = new CharacterClass (cat, false);
					break;
				}

				case '\\': {
					int c = ParseEscape (false);
					if (c >= 0)
						ch = (char)c;
					else {
						expr = ParseSpecial (options);

						if (expr == null)
							ch = pattern[ptr ++];		// default escape
					}
					break;
				}

				case '[': {
					expr = ParseCharacterClass (options);
					break;
				}

				case '(': {
					bool ignore = IsIgnoreCase (options);
					expr = ParseGroupingConstruct (ref options);
					if (expr == null) {
						if (literal != null && IsIgnoreCase (options) != ignore) {
							current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
							literal = null;
						}

						continue;
					}
					break;
				}

				case ')': {
					closed = true;
					goto EndOfGroup;
				}

				case '|': {
					if (literal != null) {
						current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
						literal = null;
					}

					if (assertion != null) {
						if (assertion.TrueExpression == null)
							assertion.TrueExpression = current;
						else if (assertion.FalseExpression == null)
							assertion.FalseExpression = current;
						else
							throw NewParseException ("Too many | in (?()|).");
					}
					else {
						if (alternation == null)
							alternation = new Alternation ();

						alternation.AddAlternative (current);
					}

					current = new Group ();
					continue;
				}

				case '*': case '+': case '?': {
					throw NewParseException ("Bad quantifier.");
				}

				default: 
					break;		// literal character
				}

				ConsumeWhitespace (IsIgnorePatternWhitespace (options));
				
				// (2) Check for Repetitions
				
				if (ptr < pattern.Length) {
					char k = pattern[ptr];
					int min = 0, max = 0;
					bool lazy = false;
					bool haveRep = false;


					if (k == '?' || k == '*' || k == '+') {
						++ ptr;
						haveRep = true;

						switch (k) {
						case '?': min = 0; max = 1; break;
						case '*': min = 0; max = 0x7fffffff; break;
						case '+': min = 1; max = 0x7fffffff; break;
						}
					} else if (k == '{' && ptr + 1 < pattern.Length) {
						int saved_ptr = ptr;
						++ptr;
						haveRep = ParseRepetitionBounds (out min, out max, options);
						if (!haveRep)
							ptr = saved_ptr;
					}

					if (haveRep) {
						ConsumeWhitespace (IsIgnorePatternWhitespace (options));
						if (ptr < pattern.Length && pattern[ptr] == '?') {
							++ ptr;
							lazy = true;
						}

						//It doesn't make sense to assert a given position more than once.
						bool ignore_repetition = false;
						if (expr is PositionAssertion) {
							ignore_repetition = min > 0 && !lazy;
							max = 1;
						}

						if (!ignore_repetition) {
							Repetition repetition = new Repetition (min, max, lazy);
	
							if (expr == null)
								repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
							else
								repetition.Expression = expr;
	
							expr = repetition;
						}
					}
				}

				// (3) Append Expression and/or Literal

				if (expr == null) {
					if (literal == null)
						literal = "";
					literal += ch;
				}
				else {
					if (literal != null) {
						current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
						literal = null;
					}

					current.AppendExpression (expr);
					expr = null;
				}

				if (is_top_level && ptr >= pattern.Length)
					goto EndOfGroup;
			}

		EndOfGroup:
			if (is_top_level && closed)
				throw NewParseException ("Too many )'s.");
			if (!is_top_level && !closed)
				throw NewParseException ("Not enough )'s.");
				
		
			// clean up literals and alternations

			if (literal != null)
				current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));

			if (assertion != null) {
				if (assertion.TrueExpression == null)
					assertion.TrueExpression = current;
				else
					assertion.FalseExpression = current;
				
				group.AppendExpression (assertion);
			}
			else if (alternation != null) {
				alternation.AddAlternative (current);
				group.AppendExpression (alternation);
			}
			else
				group.AppendExpression (current);
		}
Beispiel #3
0
		private Expression ParseCharacterClass (RegexOptions options) {
			bool negate = false;
			if (pattern[ptr] == '^') {
				negate = true;
				++ ptr;
			}
			
			bool ecma = IsECMAScript (options);
			CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));

			if (pattern[ptr] == ']') {
				cls.AddCharacter (']');
				++ ptr;
			}

			int c = -1;
			int last = -1;
			bool range = false;
			bool closed = false;
			while (ptr < pattern.Length) {
				c = pattern[ptr ++];

				if (c == ']') {
					closed = true;
					break;
				}

				if (c == '-' && last >= 0 && !range) {
					range = true;
					continue;
				}

				if (c == '\\') {
					c = ParseEscape (true);
					if (c >= 0)
						goto char_recognized;

					// didn't recognize escape
					c = pattern [ptr ++];
					switch (c) {
					case 'b':
						c = '\b';
						goto char_recognized;

					case 'd': case 'D':
						cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, c == 'D');
						break;
						
					case 'w': case 'W':
						cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, c == 'W');
						break;
						
					case 's': case 'S':
						cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, c == 'S');
						break;
						
					case 'p': case 'P':
						cls.AddCategory (ParseUnicodeCategory (), c == 'P');	// ignore ecma
						break;

					default:		// add escaped character
						goto char_recognized;
					}

					// if the pattern looks like [a-\s] ...
					if (range)
						throw NewParseException ("character range cannot have category \\" + c);

					last = -1;
					continue;
				}

			char_recognized:
				if (range) {
					// if 'range' is true, we know that 'last >= 0'
					if (c < last)
						throw NewParseException ("[" + last + "-" + c + "] range in reverse order.");
					cls.AddRange ((char)last, (char)c);
					last = -1;
					range = false;
					continue;
				}

				cls.AddCharacter ((char)c);
				last = c;
			}

			if (!closed)
				throw NewParseException ("Unterminated [] set.");

			if (range)
				cls.AddCharacter ('-');

			return cls;
		}
		private Expression ParseCharacterClass (RegexOptions options) {
			bool negate, ecma;
			if (pattern[ptr] == '^') {
				negate = true;
				++ ptr;
			}
			else
				negate = false;
			
			ecma = IsECMAScript (options);
			CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));

			if (pattern[ptr] == ']') {
				cls.AddCharacter (']');
				++ ptr;
			}

			int c = -1;
			int last = -1;
			bool range = false;
			bool closed = false;
			while (ptr < pattern.Length) {
				c = pattern[ptr ++];

				if (c == ']') {
					closed = true;
					break;
				}
				
				if (c == '-') {
					range = true;
					continue;
				}

				if (c == '\\') {
					c = ParseEscape ();
					if (c < 0) {
						// didn't recognize escape

						c = pattern[ptr ++];
						switch (c) {
						case 'b': c = '\b'; break;

						case 'd':
							cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, false);
							last = -1;
							continue;
							
						case 'w':
							cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, false);
							last = -1;
							continue;
							
						case 's':
							cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
							last = -1;
							continue;
							
						case 'p':
							cls.AddCategory (ParseUnicodeCategory (), false);	// ignore ecma
							last = -1;
							continue;
							
						case 'D':
							cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, true);
							last = -1;
							continue;
							
						case 'W':
							cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, true);
							last = -1;
							continue;
							
						case 'S':
							cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
							last = -1;
							continue;
							
						case 'P':
							cls.AddCategory (ParseUnicodeCategory (), true);
							last = -1;
							continue;

						default: break;		// add escaped character
						}
					}
				}

				if (range) {
					if (c < last)
						throw NewParseException ("[x-y] range in reverse order.");

					if (last >=0 )
						cls.AddRange ((char)last, (char)c);
					else {
						cls.AddCharacter ((char)c);
						cls.AddCharacter ('-');
					}

					range = false;
					last = -1;
				}
				else {
					cls.AddCharacter ((char)c);
					last = c;
				}
			}

			if (!closed)
				throw NewParseException ("Unterminated [] set.");

			if (range)
				cls.AddCharacter ('-');

			return cls;
		}