// Checks that a QuantifierToken built directly from each supported quantifier
// syntax (with a null target) reports the expected minimum/maximum occurrence
// counts and laziness flag.
public void QuantifierConstructorTest( ) {
    // Each entry pairs a quantifier pattern with the values the token must expose.
    // An open-ended upper bound is represented by int.MaxValue.
    var goodPatterns = new[] {
        new { Pattern = "?", MinOccurrences = 0, MaxOccurrences = 1, IsLazy = false },
        new { Pattern = "+", MinOccurrences = 1, MaxOccurrences = int.MaxValue, IsLazy = false },
        new { Pattern = "+?", MinOccurrences = 1, MaxOccurrences = int.MaxValue, IsLazy = true },
        new { Pattern = "*", MinOccurrences = 0, MaxOccurrences = int.MaxValue, IsLazy = false },
        new { Pattern = "*?", MinOccurrences = 0, MaxOccurrences = int.MaxValue, IsLazy = true },
        new { Pattern = "{1,2}", MinOccurrences = 1, MaxOccurrences = 2, IsLazy = false },
        new { Pattern = "{,2}", MinOccurrences = 0, MaxOccurrences = 2, IsLazy = false },
        new { Pattern = "{1,}", MinOccurrences = 1, MaxOccurrences = int.MaxValue, IsLazy = false },
        new { Pattern = "{3}", MinOccurrences = 3, MaxOccurrences = 3, IsLazy = false },
    };

    foreach (var p in goodPatterns) {
        var node = new QuantifierToken(p.Pattern, null);

        // Assert.AreEqual takes the expected value first and the actual value second,
        // so that the framework's own failure text labels the two values correctly.
        Assert.AreEqual(p.MinOccurrences, node.MinOccurrences,
                        string.Format("pattern: {0}, expected: {1}, actual: {2}", p.Pattern, p.MinOccurrences, node.MinOccurrences));
        Assert.AreEqual(p.MaxOccurrences, node.MaxOccurrences,
                        string.Format("pattern: {0}, expected: {1}, actual: {2}", p.Pattern, p.MaxOccurrences, node.MaxOccurrences));
        Assert.AreEqual(p.IsLazy, node.IsLazy,
                        string.Format("pattern: {0}, expected: {1}, actual: {2}", p.Pattern, p.IsLazy, node.IsLazy));
    }
}
// Checks that Token.Tokenize turns the literal "b" followed by a quantified "a"
// into a GroupToken whose content is the literal "b" and then a QuantifierToken
// targeting the literal "a", with the expected occurrence bounds and laziness.
public void TokenizeQuantifierTest( ) {
    // Each entry pairs a full pattern with the values its quantifier must expose.
    // An open-ended upper bound is represented by int.MaxValue.
    var patterns = new[] {
        new { Pattern = "ba?", MinOccurrences = 0, MaxOccurrences = 1, Lazy = false },
        new { Pattern = "ba+", MinOccurrences = 1, MaxOccurrences = int.MaxValue, Lazy = false },
        new { Pattern = "ba+?", MinOccurrences = 1, MaxOccurrences = int.MaxValue, Lazy = true },
        new { Pattern = "ba*", MinOccurrences = 0, MaxOccurrences = int.MaxValue, Lazy = false },
        new { Pattern = "ba*?", MinOccurrences = 0, MaxOccurrences = int.MaxValue, Lazy = true },
        new { Pattern = "ba{1,2}", MinOccurrences = 1, MaxOccurrences = 2, Lazy = false },
        new { Pattern = "ba{,2}", MinOccurrences = 0, MaxOccurrences = 2, Lazy = false },
        new { Pattern = "ba{1,}", MinOccurrences = 1, MaxOccurrences = int.MaxValue, Lazy = false },
    };

    foreach (var p in patterns) {
        Token root = Token.Tokenize(p.Pattern);
        Assert.IsInstanceOfType(root, typeof(GroupToken));
        // The type was asserted just above, so a direct cast is used instead of
        // an unchecked `as` that would only surface as a null dereference.
        var rootGroup = (GroupToken)root;

        // The leading "b" must stay a plain literal, untouched by the quantifier.
        Token first = rootGroup.Content[0];
        Assert.AreEqual(Token.TokenType.Literal, first.Type);
        Assert.AreEqual("b", first.Text);

        Token result = rootGroup.Content[1];
        Assert.IsInstanceOfType(result, typeof(QuantifierToken));
        var quantifier = (QuantifierToken)result;

        // The quantifier must wrap only the "a" that immediately precedes it.
        Assert.AreEqual(Token.TokenType.Literal, quantifier.Target.Type);
        Assert.AreEqual("a", quantifier.Target.Text);

        // Expected value first, actual second, matching the assertions above.
        Assert.AreEqual(p.MinOccurrences, quantifier.MinOccurrences,
                        string.Format("pattern: {0}, expected: {1}, actual: {2}", p.Pattern, p.MinOccurrences, quantifier.MinOccurrences));
        Assert.AreEqual(p.MaxOccurrences, quantifier.MaxOccurrences,
                        string.Format("pattern: {0}, expected: {1}, actual: {2}", p.Pattern, p.MaxOccurrences, quantifier.MaxOccurrences));
        Assert.AreEqual(p.Lazy, quantifier.IsLazy,
                        string.Format("pattern: {0}, expected: {1}, actual: {2}", p.Pattern, p.Lazy, quantifier.IsLazy));
    }
}
/** Transform a plain sequence of Token into a tree-like structure,
 *  wrapping tokens into the appropriate subclass where needed.
 *
 *  Tokens as returned by findTokens() are not of much use on their own because
 *  they often refer to other tokens located near them, either before or after.
 *  Also, regexes have a structure, given by round brackets; in order to
 *  recognise and preserve this structure some further processing of the
 *  stream of tokens is needed.
 *
 *  A tree is a natural data type to represent the grammar and the structure
 *  of a regex: node types specify the meaning and node children represent
 *  the "arguments" of each component.
 *
 *  \param tokens A sequence of tokens.
 *  \return The root group of the organised tree.
 *  \throws ParsingException if two groups share a name, lookarounds are nested,
 *          a quantifier has nothing to apply to, or parentheses are unbalanced
 *          (in either direction).
 */
private static GroupToken regroupTokens(IEnumerable <Token> tokens) {
    int groupCount = 0;
    var groups = new Stack <GroupToken>( );
    var names = new HashSet <string>( );   // groups with the same name are not allowed
    bool insideLookaround = false;         // nested lookarounds are not allowed

    // The root group is pushed once as a sentinel; every opened group (or
    // lookaround) pushes its parent on top of it, so a well-formed pattern
    // ends with exactly this one entry left on the stack.
    var current = new GroupToken(string.Empty, groupCount++);
    groups.Push(current);

    foreach (Token t in tokens) {
        switch (t.Type) {
            case TokenType.GroupStart:
                var newGroup = new GroupToken(t.Text, groupCount++);

                // HashSet.Add returns false when the name is already present.
                if (newGroup.Name != null && !names.Add(newGroup.Name)) {
                    throw new ParsingException("multiple groups with the same name are not allowed");
                }

                current.Content.Add(newGroup);
                groups.Push(current);
                current = newGroup;
                break;

            case TokenType.GroupEnd:
                // Only the root sentinel is left: this ")" has no matching opener.
                // Report it as a parsing error instead of letting a later Pop()
                // fail on an empty stack.
                if (groups.Count <= 1) {
                    throw new ParsingException("unbalanced parenthesis");
                }

                if (current.Index == lookaheadIndex) {
                    // The LookaheadToken was already added to its parent when it was opened.
                    insideLookaround = false;
                    current = groups.Pop( );
                } else if (current.Index == lookbehindIndex) {
                    // All the content has been collected: build the lookbehind now
                    // and attach it to the parent group.
                    insideLookaround = false;
                    var lookbehind = new LookbehindToken(current.Text, current);
                    current = groups.Pop( );
                    current.Content.Add(lookbehind);
                } else {
                    current = groups.Pop( );
                }
                break;

            case TokenType.Quantifier:
                // A quantifier applies to the token right before it; with nothing
                // before it in the current group the pattern is malformed.
                if (!current.Content.Any( )) {
                    throw new ParsingException("quantifier without a target");
                }

                // Replace the preceding token with a quantifier wrapping it.
                Token target = current.Content.Last( );
                current.Content.Remove(target);
                var quantifier = new QuantifierToken(t.Text, target);
                current.Content.Add(quantifier);
                break;

            case TokenType.Lookahead:
                if (insideLookaround) {
                    throw new ParsingException("nested lookarounds are not allowed");
                }
                insideLookaround = true;

                /* mark the target as belonging to a lookahead, this will allow us to
                 * update insideLookaround when we find the corresponding GroupEnd */
                var lookahead = new LookaheadToken(t.Text, new GroupToken { Index = lookaheadIndex });
                current.Content.Add(lookahead);
                groups.Push(current);
                current = ( GroupToken )lookahead.Target;
                break;

            case TokenType.Lookbehind:
                if (insideLookaround) {
                    throw new ParsingException("nested lookarounds are not allowed");
                }
                insideLookaround = true;

                // the actual lookbehind will be created once all its content has been collected
                var lookbehindGroup = new GroupToken(t.Text, lookbehindIndex);
                groups.Push(current);
                current = lookbehindGroup;
                break;

            default:
                current.Content.Add(t);
                break;
        }
    }

    // Anything above the root sentinel is a group that was opened but never closed.
    if (groups.Count > 1) {
        throw new ParsingException("unbalanced parenthesis");
    }

    return(groups.Pop( ));
}