/// <summary>
/// Fetch the next lexem match from the iterator, failing when none can be produced.
/// </summary>
/// <param name="iterator">Iterator queried through <c>TryMatchNext</c>.</param>
/// <returns>The match produced by the iterator.</returns>
/// <remarks>
/// When <c>TryMatchNext</c> reports failure, the exception built by
/// <c>CreateException</c> with the iterator's current position is thrown.
/// </remarks>
public static Match<Lexem> NextOrThrow(IMatchIterator<Lexem> iterator)
{
    if (iterator.TryMatchNext(out var next))
    {
        return next;
    }

    throw CreateException("unrecognized character", iterator.Position);
}
/// <summary>
/// Convert a flat run of lexems into a sequence node, one child node per lexem
/// (or per character class).
/// </summary>
/// <param name="iterator">Iterator supplying the lexems that follow <paramref name="match"/>.</param>
/// <param name="match">First lexem of the sequence.</param>
/// <param name="atTopLevel">
/// Whether reaching the end lexem is a valid way to finish; when false, reaching
/// it raises an "unfinished sequence" error instead.
/// </param>
/// <returns>Sequence node built from every node collected before the end lexem.</returns>
public static Node MatchSequence(IMatchIterator<Lexem> iterator, Match<Lexem> match, bool atTopLevel)
{
    var collected = new List<Node>();

    for (;;)
    {
        var kind = match.Value.Type;
        Node current;

        if (kind == LexemType.ClassBegin)
        {
            // Character class: parsing starts at the lexem following the opening one
            current = MatchClass(iterator, NextOrThrow(iterator));
        }
        else if (kind == LexemType.End)
        {
            // End of input is the only exit of this loop, and only valid at top level
            if (!atTopLevel)
            {
                throw CreateException("unfinished sequence", iterator.Position);
            }

            return Node.CreateSequence(collected);
        }
        else if (kind == LexemType.Escape)
        {
            current = Node.CreateCharacter(match.Value.Replacement);
        }
        else if (kind == LexemType.Wildcard)
        {
            current = Wildcard;
        }
        else if (kind == LexemType.ZeroOrMore)
        {
            // Unbounded repetition (max = -1) of the wildcard node
            current = Node.CreateRepeat(Wildcard, 0, -1);
        }
        else
        {
            // Any other lexem stands for its own first captured character
            current = Node.CreateCharacter(match.Capture[0]);
        }

        match = NextOrThrow(iterator);
        collected.Add(current);
    }
}
/// <summary>
/// Parse the inside of a repeat specifier: an optional minimum count, optionally
/// followed by a comma and an optional maximum count, terminated by the repeat
/// end lexem.
/// </summary>
/// <param name="iterator">Iterator supplying the lexems that follow <paramref name="match"/>.</param>
/// <param name="match">First lexem after the one that opened the repeat specifier.</param>
/// <returns>
/// Minimum and maximum counts. A missing minimum yields 0; without a comma the
/// maximum equals the minimum; a comma with no digits after it yields a maximum
/// of -1.
/// </returns>
/// <remarks>
/// The repeat end lexem is checked but not stepped over: on return the iterator
/// has not been advanced past it. Errors (maximum below minimum, unparsable
/// count, missing repeat end) are raised through <c>CreateException</c>.
/// </remarks>
private static (int min, int max) MatchRepeat(IMatchIterator<Lexem> iterator, Match<Lexem> match)
{
    // Consume a run of digit lexems starting at the current match and convert it
    // to a number; returns the fallback when there is no digit at all. Leaves
    // "match" on the first non-digit lexem.
    int ReadCount(int fallback)
    {
        var digits = new StringBuilder();

        while (match.Value.Type == LexemType.Digit)
        {
            digits.Append(match.Capture[0]);
            match = NextOrThrow(iterator);
        }

        if (digits.Length == 0)
        {
            return fallback;
        }

        // TryParse instead of Parse: an overly long digit run must be reported as
        // a pattern error with its position rather than escaping as a raw
        // OverflowException. Invariant culture and NumberStyles.None accept plain
        // digits only, independently of the current thread culture.
        if (!int.TryParse(
                digits.ToString(),
                System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture,
                out var count))
        {
            throw CreateException("invalid repeat count", iterator.Position);
        }

        return count;
    }

    var min = ReadCount(0);
    int max;

    if (match.Value.Type == LexemType.Comma)
    {
        match = NextOrThrow(iterator);
        max = ReadCount(-1);

        // A negative maximum means "no upper bound" and is always acceptable
        if (max >= 0 && max < min)
        {
            throw CreateException("invalid repeat sequence", iterator.Position);
        }
    }
    else
    {
        max = min;
    }

    if (match.Value.Type != LexemType.RepeatEnd)
    {
        throw CreateException("expected end of repeat specifier", iterator.Position);
    }

    return (min, max);
}
/// <summary>
/// Parse one or more sequences separated by alternative lexems and combine them
/// into a single alternative node.
/// </summary>
/// <param name="iterator">Iterator supplying the lexems that follow <paramref name="match"/>.</param>
/// <param name="match">First lexem of the first sequence.</param>
/// <param name="atTopLevel">Forwarded unchanged to <c>MatchSequence</c> for every branch.</param>
/// <returns>
/// The alternative node, together with the lexem returned by the last
/// <c>MatchSequence</c> call (the first one that is not an alternative
/// separator).
/// </returns>
public static (Node, Match<Lexem>) MatchAlternative(IMatchIterator<Lexem> iterator, Match<Lexem> match, bool atTopLevel)
{
    var branches = new List<Node>();

    // Each iteration parses one branch; the iterator clause only runs when the
    // branch stopped on an alternative separator, and steps past it.
    for (var current = match; ; current = NextOrThrow(iterator))
    {
        var parsed = MatchSequence(iterator, current, atTopLevel);

        branches.Add(parsed.Item1);

        if (parsed.Item2.Value.Type != LexemType.Alternative)
        {
            return (Node.CreateAlternative(branches), parsed.Item2);
        }
    }
}
/// <summary>
/// Parse a sequence of elements (literal characters, escapes, wildcards,
/// character classes or parenthesized sub-expressions), each optionally
/// followed by a repeat specifier, into a sequence node.
/// </summary>
/// <param name="iterator">Iterator supplying the lexems that follow <paramref name="match"/>.</param>
/// <param name="match">First lexem of the sequence.</param>
/// <param name="atTopLevel">
/// True when the sequence is not enclosed in parentheses: the end lexem then
/// finishes the sequence and a sequence end lexem is taken as a literal
/// character. False inside parentheses: the sequence end lexem finishes the
/// sequence and the end lexem raises an "unfinished parenthesis" error.
/// </param>
/// <returns>
/// The sequence node and the lexem that stopped the sequence, which has not been
/// stepped over: an alternative lexem, the end lexem (top level only) or the
/// sequence end lexem (nested only).
/// </returns>
public static (Node, Match<Lexem>) MatchSequence(IMatchIterator<Lexem> iterator, Match<Lexem> match, bool atTopLevel)
{
    var sequenceNodes = new List<Node>();

    while (true)
    {
        // Match literal character, character class or sub-expression
        Match<Lexem> nextMatch;
        Node node;

        switch (match.Value.Type)
        {
            case LexemType.Alternative:
                return (Node.CreateSequence(sequenceNodes), match);

            case LexemType.End:
                if (!atTopLevel)
                {
                    throw CreateException("unfinished parenthesis", iterator.Position);
                }

                return (Node.CreateSequence(sequenceNodes), match);

            case LexemType.ClassBegin:
                node = MatchClass(iterator, NextOrThrow(iterator));
                nextMatch = NextOrThrow(iterator);

                break;

            case LexemType.Escape:
                node = Node.CreateCharacter(match.Value.Replacement);
                nextMatch = NextOrThrow(iterator);

                break;

            case LexemType.SequenceBegin:
                // The nested call stops ON the closing sequence end lexem (any
                // alternative is consumed by MatchAlternative and the end lexem
                // throws), so the returned lexem carries no information here and
                // is stepped over to reach whatever follows the parenthesis.
                var (alternativeNode, _) = MatchAlternative(iterator, NextOrThrow(iterator), false);

                node = alternativeNode;
                nextMatch = NextOrThrow(iterator);

                break;

            case LexemType.SequenceEnd:
                if (!atTopLevel)
                {
                    // Hand back the closing lexem itself rather than the one after
                    // it: if the lexem following the parenthesis were returned and
                    // happened to be an alternative (as in "(a)|b"), the enclosing
                    // MatchAlternative would mistake it for a separator located
                    // inside the parentheses and keep parsing the group.
                    return (Node.CreateSequence(sequenceNodes), match);
                }

                // Unbalanced closing lexem at top level is a literal character
                node = Node.CreateCharacter(match.Capture[0]);
                nextMatch = NextOrThrow(iterator);

                break;

            case LexemType.Wildcard:
                // Wildcard is a character node spanning the whole char range
                node = Node.CreateCharacter(new[] { new NodeRange(char.MinValue, char.MaxValue) });
                nextMatch = NextOrThrow(iterator);

                break;

            default:
                node = Node.CreateCharacter(match.Capture[0]);
                nextMatch = NextOrThrow(iterator);

                break;
        }

        // Match repeat specifier if any; a maximum of -1 stands for "unbounded"
        int max;
        int min;

        switch (nextMatch.Value.Type)
        {
            case LexemType.OneOrMore:
                (min, max) = (1, -1);
                match = NextOrThrow(iterator);

                break;

            case LexemType.RepeatBegin:
                (min, max) = MatchRepeat(iterator, NextOrThrow(iterator));
                // MatchRepeat stops on the repeat end lexem, step over it
                match = NextOrThrow(iterator);

                break;

            case LexemType.ZeroOrMore:
                (min, max) = (0, -1);
                match = NextOrThrow(iterator);

                break;

            case LexemType.ZeroOrOne:
                (min, max) = (0, 1);
                match = NextOrThrow(iterator);

                break;

            default:
                // No specifier: exactly one occurrence, and the lexem just read
                // becomes the start of the next element
                (min, max) = (1, 1);
                match = nextMatch;

                break;
        }

        sequenceNodes.Add(Node.CreateRepeat(node, min, max));
    }
}
/// <summary>
/// Parse the content of a character class into a character node made of ranges.
/// </summary>
/// <param name="iterator">Iterator supplying the lexems that follow <paramref name="match"/>.</param>
/// <param name="match">First lexem after the one that opened the class.</param>
/// <returns>Character node built from every range collected before the class end lexem.</returns>
/// <remarks>
/// A leading negate lexem is rejected with <see cref="NotImplementedException"/>.
/// A class end lexem in first position is taken as a literal character. The
/// class end lexem that terminates the class is not stepped over.
/// </remarks>
private static Node MatchClass(IMatchIterator<Lexem> iterator, Match<Lexem> match)
{
    // Negation marker is recognized in first position but not supported
    if (match.Value.Type == LexemType.Negate)
    {
        throw new NotImplementedException("negated character classes are not supported yet");
    }

    var ranges = new List<NodeRange>();

    // A class end lexem in first position is a literal, not a terminator
    if (match.Value.Type == LexemType.ClassEnd)
    {
        ranges.Add(new NodeRange(match.Capture[0], match.Capture[0]));
        match = NextOrThrow(iterator);
    }

    for (;;)
    {
        var firstKind = match.Value.Type;

        if (firstKind == LexemType.End)
        {
            throw CreateException("unfinished characters class", iterator.Position);
        }

        if (firstKind == LexemType.ClassEnd)
        {
            return Node.CreateCharacter(ranges);
        }

        // Current lexem is a character; it may turn out to be the lower bound
        // of a range depending on what follows
        char first;

        if (firstKind == LexemType.Escape)
        {
            first = match.Value.Replacement;
        }
        else
        {
            first = match.Capture[0];
        }

        match = NextOrThrow(iterator);

        // Single character unless a range lexem follows (e.g. "a-z")
        char last = first;

        if (match.Value.Type == LexemType.Range)
        {
            match = NextOrThrow(iterator);

            var lastKind = match.Value.Type;

            if (lastKind == LexemType.End)
            {
                throw CreateException("unfinished characters class", iterator.Position);
            }

            if (lastKind == LexemType.Escape)
            {
                last = match.Value.Replacement;
            }
            else
            {
                last = match.Capture[0];
            }

            match = NextOrThrow(iterator);
        }

        ranges.Add(new NodeRange(first, last));
    }
}
/// <summary>
/// Compile regular pattern into graph of non-deterministic states leading to given value.
/// </summary>
/// <param name="iterator">Lexem match iterator the pattern is read from.</param>
/// <returns>
/// A <see cref="Node"/>; presumably the root of the compiled graph, to be
/// confirmed against the implementations in derived classes.
/// </returns>
/// <remarks>
/// NOTE(review): the summary mentions a "given value" but this signature takes
/// no such parameter; confirm where the value is attached.
/// </remarks>
protected abstract Node CreateGraph(IMatchIterator <Lexem> iterator);