// Look up the production stored for a (nonterminal, terminal) pair.
//
// var  - nonterminal selecting the row of the table
// term - terminal selecting the entry inside that row
//
// Returns the stored production, or null when the table has no row for
// the nonterminal or the row has no entry for the terminal.
public ISymbol[] Get(Nonterminal var, Terminal term)
{
    // TryGetValue performs a single hash lookup per level instead of the
    // ContainsKey + indexer pair.
    Dictionary<Terminal, ISymbol[]> row;
    if (!table.TryGetValue(var, out row))
    {
        return null;
    }

    ISymbol[] production;
    if (!row.TryGetValue(term, out production))
    {
        return null;
    }

    return production;
}
// Create a parser for the given grammar.
//
// grammar  - grammar to parse with; its LL(1) parse table is built here
//            via CreateLL1ParseTable()
// syncTerm - terminal stored as the synchronising token used during
//            error recovery
//
// Throws ArgumentNullException when grammar is null and ArgumentException
// (message "GRAMMAR NOT LL(1)") when CreateLL1ParseTable() returns null.
// Both derive from Exception, so existing catch (Exception) handlers
// keep working.
public Parser(CFG grammar, Terminal syncTerm)
{
    if (grammar == null)
    {
        throw new ArgumentNullException("grammar");
    }

    // Validate before touching any field: a null table means no parser
    // can be built for this grammar.
    ParseTable parseTable = grammar.CreateLL1ParseTable();
    if (parseTable == null)
    {
        throw new ArgumentException("GRAMMAR NOT LL(1)");
    }

    this.grammar = grammar;
    this.table = parseTable;
    this.start = grammar.StartSymbol;
    this.syncTerm = syncTerm;
}
// Add a production for some (nonterminal, terminal) pair to the table.
//
// var        - nonterminal selecting the row of the table
// term       - terminal selecting the entry inside that row
// production - symbols to store for the pair
//
// A row is created on first use of a nonterminal. An existing entry for
// the same pair is overwritten.
public void Add(Nonterminal var, Terminal term, ISymbol[] production)
{
    // Fetch the row once and reuse it, rather than probing the outer
    // dictionary separately for the check, the insert and the store.
    Dictionary<Terminal, ISymbol[]> row;
    if (!table.TryGetValue(var, out row))
    {
        row = new Dictionary<Terminal, ISymbol[]>();
        table[var] = row;
    }

    row[term] = production;
}
// Builds the Mini-PL language definition: the token regexes and token
// types, the scanner, the context-free grammar and finally the parser.
//
// Statement order matters here: tokenTypes, terminals and nonterminals are
// filled in a fixed order and their .Values are handed to
// TokenType.CombinedAutomaton and to the CFG constructor.
// NOTE(review): whether those consumers depend on insertion order is not
// visible from this block -- confirm before reordering anything.
//
// Grammar defined below (names are the Nonterminal labels):
//
//   PROG        -> STMTS
//   STMTS       -> STMTS_HEAD STMTS_TAIL
//   STMTS_HEAD  -> STMT ";"
//   STMTS_TAIL  -> STMTS_HEAD STMTS_TAIL | epsilon
//   STMT        -> DECL
//                | identifier ":=" EXPR
//                | "for" identifier "in" EXPR ".." EXPR "do" STMTS "end" "for"
//                | "read" identifier
//                | "print" EXPR
//                | "assert" "(" EXPR ")"
//   DECL        -> "var" identifier ":" type DECL_ASSIGN
//   DECL_ASSIGN -> ":=" EXPR | epsilon
//   EXPR        -> UNARY_OP | OPND BINARY_OP
//   UNARY_OP    -> unary_operator OPND
//   BINARY_OP   -> binary_operator OPND | epsilon
//   OPND        -> int | string | identifier | "(" EXPR ")"
private MiniPL()
{
    // Create NFA-type things for tokens using regular operations
    // (Regex here is the project's own regex builder; the exact semantics
    // of Concat/Union/Not/Star/Plus are assumed from their names -- TODO confirm)
    Regex reBlockCommentStart = Regex.Concat("/*");
    Regex reBlockCommentEnd = Regex.Concat("*/");
    // "//" followed by any run of characters other than '\n'
    Regex reLineComment = Regex.Concat("//").Concat(Regex.Not('\n').Star());
    // line comments are folded into the whitespace regex
    Regex reWhitespace = Regex.Union(" \t\r\n").Star().Union(reLineComment);
    // '"', then any number of (backslash + any char) or (char other than '"' and '\'), then '"'
    Regex reString = Regex.Char('"').Concat(Regex.Char('\\').Concat(Regex.Any()).Union(Regex.Not('"', '\\')).Star()).Concat(Regex.Char('"'));
    Regex reBinaryOperator = Regex.Union("+-*/<=&");
    Regex reUnaryOperator = Regex.Char('!');
    Regex reKeyword = Regex.Union(Regex.Concat("var"), Regex.Concat("for"), Regex.Concat("end"), Regex.Concat("in"), Regex.Concat("do"), Regex.Concat("read"), Regex.Concat("print"), Regex.Concat("assert"));
    Regex reType = Regex.Union(Regex.Concat("bool"), Regex.Concat("int"), Regex.Concat("string"));
    Regex reParenRight = Regex.Char(')'), reParenLeft = Regex.Char('('), reColon = Regex.Char(':'), reSemicolon = Regex.Char(';'), reAssignment = Regex.Concat(":="), reDots = Regex.Concat("..");
    // a letter followed by any number of letters, digits or underscores
    Regex reIdentifier = Regex.Union(Regex.Range('A', 'Z'), Regex.Range('a', 'z'))
        .Concat(Regex.Union(Regex.Range('A', 'Z'), Regex.Range('a', 'z'), Regex.Range('0', '9'), Regex.Char('_')).Star());
    // one or more decimal digits
    Regex reInteger = Regex.Range('0', '9').Plus();

    // Define token types
    // (the dictionary key and the TokenType name differ for some entries,
    // e.g. key "binary_op" vs. name "binary op")
    tokenTypes["block_comment_start"] = new TokenType("block_comment_start", reBlockCommentStart);
    tokenTypes["block_comment_end"] = new TokenType("block_comment_end", reBlockCommentEnd);
    tokenTypes["int"] = new TokenType("int", reInteger);
    tokenTypes["whitespace"] = new TokenType("whitespace", reWhitespace, priority: TokenType.Priority.Whitespace);
    tokenTypes["string"] = new TokenType("string", reString);
    tokenTypes["binary_op"] = new TokenType("binary op", reBinaryOperator);
    tokenTypes["unary_op"] = new TokenType("unary op", reUnaryOperator);
    // keyword and type regexes also match reIdentifier; they are given
    // Priority.Keyword -- presumably so they win over "identifier", verify in TokenType
    tokenTypes["keyword"] = new TokenType("keyword", reKeyword, priority: TokenType.Priority.Keyword);
    tokenTypes["type"] = new TokenType("type", reType,
        priority: TokenType.Priority.Keyword);
    tokenTypes["left_paren"] = new TokenType("left paren", reParenLeft);
    tokenTypes["right_paren"] = new TokenType("right paren", reParenRight);
    tokenTypes["colon"] = new TokenType("colon", reColon);
    tokenTypes["semicolon"] = new TokenType("semicolon", reSemicolon);
    tokenTypes["assignment"] = new TokenType("assignment", reAssignment);
    tokenTypes["dots"] = new TokenType("dots", reDots);
    tokenTypes["identifier"] = new TokenType("identifier", reIdentifier);

    // create combined automaton and scanner object
    // (the scanner is also given the block comment start/end token types;
    // how it uses them is defined in Scanner, not here)
    TokenAutomaton automaton = TokenType.CombinedAutomaton(tokenTypes.Values.ToArray());
    scanner = new Scanner(automaton, tokenTypes["block_comment_start"], tokenTypes["block_comment_end"]);

    // Define nonterminal variables of CFG
    nonterminals["program"] = new Nonterminal("PROG");
    nonterminals["statements"] = new Nonterminal("STMTS");
    nonterminals["statements_head"] = new Nonterminal("STMTS_HEAD");
    nonterminals["statements_tail"] = new Nonterminal("STMTS_TAIL");
    nonterminals["statement"] = new Nonterminal("STMT");
    nonterminals["declaration"] = new Nonterminal("DECL");
    nonterminals["declaration_assignment"] = new Nonterminal("DECL_ASSIGN");
    nonterminals["expression"] = new Nonterminal("EXPR");
    nonterminals["unary_operation"] = new Nonterminal("UNARY_OP");
    nonterminals["binary_operation"] = new Nonterminal("BINARY_OP");
    nonterminals["operand"] = new Nonterminal("OPND");

    // Define terminal variables of CFG
    // Terminals are built either from a TokenType (identifier, type, string,
    // int, operators) or from a literal string (keywords and punctuation).
    terminals["identifier"] = new Terminal(tokenTypes["identifier"]);
    terminals["assert"] = new Terminal("assert");
    terminals["print"] = new Terminal("print");
    terminals["read"] = new Terminal("read");
    terminals["for"] = new Terminal("for");
    terminals["in"] = new Terminal("in");
    terminals["end"] = new Terminal("end");
    terminals["do"] = new Terminal("do");
    terminals["var"] = new Terminal("var");
    terminals["type"] = new Terminal(tokenTypes["type"]);
    terminals["string"] = new Terminal(tokenTypes["string"]);
    terminals["int"] = new
        Terminal(tokenTypes["int"]);
    terminals[")"] = new Terminal(")");
    terminals["("] = new Terminal("(");
    terminals[".."] = new Terminal("..");
    terminals[":="] = new Terminal(":=");
    terminals[":"] = new Terminal(":");
    terminals[";"] = new Terminal(";");
    terminals["binary_operator"] = new Terminal(tokenTypes["binary_op"]);
    terminals["unary_operator"] = new Terminal(tokenTypes["unary_op"]);

    // Create the Mini-PL grammar
    // (start symbol first, then all terminals and all nonterminals)
    grammar = new CFG(nonterminals["program"], terminals.Values, nonterminals.Values);

    // define production rules for the grammar
    // PROG -> STMTS
    grammar.AddProductionRule(nonterminals["program"], new ISymbol[] { nonterminals["statements"] });
    // STMTS -> STMTS_HEAD STMTS_TAIL
    grammar.AddProductionRule(nonterminals["statements"], new ISymbol[] { nonterminals["statements_head"], nonterminals["statements_tail"] });
    // STMTS_HEAD -> STMT ";"
    grammar.AddProductionRule(nonterminals["statements_head"], new ISymbol[] { nonterminals["statement"], terminals[";"] });
    // STMTS_TAIL -> STMTS_HEAD STMTS_TAIL | epsilon
    grammar.AddProductionRule(nonterminals["statements_tail"], new ISymbol[] { nonterminals["statements_head"], nonterminals["statements_tail"] });
    grammar.AddProductionRule(nonterminals["statements_tail"], new ISymbol[] { Terminal.EPSILON });
    // STMT -> DECL
    grammar.AddProductionRule(nonterminals["statement"], new ISymbol[] { nonterminals["declaration"] });
    // STMT -> identifier ":=" EXPR
    grammar.AddProductionRule(nonterminals["statement"], new ISymbol[] { terminals["identifier"], terminals[":="], nonterminals["expression"] });
    // STMT -> "for" identifier "in" EXPR ".." EXPR "do" STMTS "end" "for"
    grammar.AddProductionRule(nonterminals["statement"], new ISymbol[] { terminals["for"], terminals["identifier"], terminals["in"], nonterminals["expression"], terminals[".."], nonterminals["expression"], terminals["do"], nonterminals["statements"], terminals["end"], terminals["for"] });
    // STMT -> "read" identifier
    grammar.AddProductionRule(nonterminals["statement"], new ISymbol[] { terminals["read"], terminals["identifier"] });
    // STMT -> "print" EXPR
    grammar.AddProductionRule(nonterminals["statement"], new ISymbol[] { terminals["print"], nonterminals["expression"] });
    // STMT -> "assert" "(" EXPR ")"
    grammar.AddProductionRule(nonterminals["statement"], new ISymbol[] { terminals["assert"],
        terminals["("], nonterminals["expression"], terminals[")"] });
    // DECL -> "var" identifier ":" type DECL_ASSIGN
    grammar.AddProductionRule(nonterminals["declaration"], new ISymbol[] { terminals["var"], terminals["identifier"], terminals[":"], terminals["type"], nonterminals["declaration_assignment"] });
    // DECL_ASSIGN -> ":=" EXPR | epsilon
    grammar.AddProductionRule(nonterminals["declaration_assignment"], new ISymbol[] { terminals[":="], nonterminals["expression"] });
    grammar.AddProductionRule(nonterminals["declaration_assignment"], new ISymbol[] { Terminal.EPSILON });
    // EXPR -> UNARY_OP | OPND BINARY_OP
    grammar.AddProductionRule(nonterminals["expression"], new ISymbol[] { nonterminals["unary_operation"] });
    grammar.AddProductionRule(nonterminals["expression"], new ISymbol[] { nonterminals["operand"], nonterminals["binary_operation"] });
    // UNARY_OP -> unary_operator OPND
    grammar.AddProductionRule(nonterminals["unary_operation"], new ISymbol[] { terminals["unary_operator"], nonterminals["operand"] });
    // BINARY_OP -> binary_operator OPND | epsilon
    grammar.AddProductionRule(nonterminals["binary_operation"], new ISymbol[] { terminals["binary_operator"], nonterminals["operand"] });
    grammar.AddProductionRule(nonterminals["binary_operation"], new ISymbol[] { Terminal.EPSILON });
    // OPND -> int | string | identifier | "(" EXPR ")"
    grammar.AddProductionRule(nonterminals["operand"], new ISymbol[] { terminals["int"] });
    grammar.AddProductionRule(nonterminals["operand"], new ISymbol[] { terminals["string"] });
    grammar.AddProductionRule(nonterminals["operand"], new ISymbol[] { terminals["identifier"] });
    grammar.AddProductionRule(nonterminals["operand"], new ISymbol[] { terminals["("], nonterminals["expression"], terminals[")"] });

    // use ; as synchronizing token for Mini-PL
    parser = new Parser(grammar, terminals[";"]);
}
// Create a parse tree leaf for the given terminal symbol.
//
// terminal - the grammar terminal this leaf stands for; stored as-is
//            (no null check is performed)
public ParseLeaf(Terminal terminal)
{
    this.terminal = terminal;
}
// Parse the given stream of tokens with a table-driven, stack-based
// predictive parse and return the resulting parse tree.
//
// tokenSource - tokens to parse
//               NOTE(review): the loop reads tokenStream.Current without
//               checking MoveNext(), so this assumes the source yields at
//               least one token and ends with an EOF token -- confirm
//               against the scanner.
//
// Side effects: resets isValidParseTree to true and errors to a new list.
// Lexical and syntax errors found on the way are appended to errors;
// Synchronize() clears isValidParseTree when it has to drop symbols.
//
// Two stacks are kept in lockstep: symbolStack holds the grammar symbols
// still expected, treeStack holds the parse tree node created for each of
// those symbols.
public ParseTree Parse(IEnumerable<Token> tokenSource)
{
    isValidParseTree = true;
    errors = new List<Error>();

    Stack<ISymbol> symbolStack = new Stack<ISymbol>();
    symbolStack.Push(Terminal.EOF);
    symbolStack.Push(start);

    ParseTree parseTree = new ParseTree(start);
    Stack<IParseNode> treeStack = new Stack<IParseNode>();
    treeStack.Push(new ParseLeaf(Terminal.EOF));
    treeStack.Push(parseTree);

    // IEnumerator<T> is IDisposable; dispose it even when parsing throws
    // so that iterator-based token sources get to run their cleanup.
    using (IEnumerator<Token> tokenStream = tokenSource.GetEnumerator())
    {
        tokenStream.MoveNext();

        while (symbolStack.Count > 0)
        {
            if (Program.debug)
            {
                Console.WriteLine("=========================================================");
                Console.WriteLine(" PARSE: Stack " + SymbolsToString(symbolStack));
                Console.WriteLine(" PARSE: expecting " + symbolStack.Peek());
                Console.WriteLine(" PARSE: token " + tokenStream.Current);
            }

            // ignore error tokens: record them as lexical errors and move on
            if (tokenStream.Current.Type == TokenType.ERROR)
            {
                if (Program.debug)
                {
                    Console.WriteLine(" PARSE: skipping error token");
                }
                errors.Add(new LexicalError(tokenStream.Current));
                tokenStream.MoveNext();
                continue;
            }

            if (symbolStack.Peek() is Terminal)
            {
                Terminal term = symbolStack.Peek() as Terminal;
                ParseLeaf leaf = treeStack.Peek() as ParseLeaf;

                if (term == Terminal.EPSILON)
                {
                    // epsilon production was used: consumes no token, just
                    // drop the symbol and its node from the stacks
                    if (Program.debug)
                    {
                        Console.WriteLine(" PARSE: ignore epsilon");
                    }
                    symbolStack.Pop();
                    treeStack.Pop();
                }
                else if (term.Matches(tokenStream.Current))
                {
                    // current token matches the top of the parse stack,
                    // attach it to the leaf and consume it
                    if (Program.debug)
                    {
                        Console.WriteLine(" PARSE: Terminal match");
                    }
                    leaf.Token = tokenStream.Current;
                    tokenStream.MoveNext();
                    symbolStack.Pop();
                    treeStack.Pop();
                }
                else
                {
                    // current token does not match, recover from error
                    if (Program.debug)
                    {
                        Console.WriteLine(" PARSE: Error, Terminal mismatch");
                    }
                    errors.Add(new SyntaxError(tokenStream.Current));
                    Synchronize(symbolStack, treeStack, tokenStream);
                }
            }
            else // top of stack is a nonterminal
            {
                Nonterminal nonterminal = symbolStack.Pop() as Nonterminal;
                IParseNode popped = treeStack.Pop();
                ParseTree subtree = popped as ParseTree;

                ISymbol[] production = table.Get(nonterminal, tokenStream.Current);

                if (production == null)
                {
                    // cannot derive the current token from the nonterminal
                    // at the top of the stack
                    if (Program.debug)
                    {
                        Console.WriteLine(" PARSE: Error, No such production");
                    }
                    // restore both stacks so Synchronize sees the state
                    // from before the pop
                    symbolStack.Push(nonterminal);
                    treeStack.Push(popped);
                    errors.Add(new SyntaxError(tokenStream.Current));
                    Synchronize(symbolStack, treeStack, tokenStream);
                }
                else
                {
                    // use the production specified by the parse table,
                    // add a node per symbol to the parse tree
                    if (Program.debug)
                    {
                        Console.WriteLine(" PARSE: Using production " + SymbolsToString(production));
                    }

                    // Walk the production right to left: pushing in reverse
                    // leaves the leftmost symbol on top of the stacks, and
                    // inserting at index 0 keeps the children in
                    // left-to-right order.
                    for (int i = production.Length - 1; i >= 0; i--)
                    {
                        IParseNode treeChild;
                        if (production[i] is Terminal)
                        {
                            treeChild = new ParseLeaf(production[i] as Terminal);
                        }
                        else
                        {
                            treeChild = new ParseTree(production[i] as Nonterminal);
                        }
                        subtree.Children.Insert(0, treeChild);
                        treeStack.Push(treeChild);
                        symbolStack.Push(production[i]);
                    }
                }
            }
        }
    }

    if (Program.debug)
    {
        Console.WriteLine(parseTree);
    }

    return parseTree;
}
// Uses phrase-level error recovery with First and Follow sets to recover
// from a bad parser state.
//
// symbolStack - grammar symbols still expected by the parser
// treeStack   - parse tree nodes, popped together with symbolStack
// tokenStream - token enumerator positioned at the offending token
//
// Each round either returns (a good state was found), drops the symbol on
// top of the stacks (marking the parse tree invalid), or discards the
// current token. Throws Exception("OUT OF TOKENS") when a token has to be
// discarded but the stream has no further token.
private void Synchronize(Stack<ISymbol> symbolStack, Stack<IParseNode> treeStack, IEnumerator<Token> tokenStream)
{
    // Loop until a good state is found (return statements) or no more
    // symbols are left on the stack.
    while (symbolStack.Count > 0)
    {
        Token current = tokenStream.Current;
        ISymbol top = symbolStack.Peek();

        if (Program.debug)
        {
            Console.WriteLine(" PARSE: Synchronize token " + current + " symbol " + top);
        }

        // no recovery from end of file, empty the parse stack
        if (current.Type == TokenType.EOF)
        {
            while (symbolStack.Count > 0)
            {
                isValidParseTree = false;
                symbolStack.Pop();
                treeStack.Pop();
            }
            if (Program.debug)
            {
                Console.WriteLine("PARSE: Unexpected EOF");
            }
            return;
        }

        bool dropSymbol;

        if (top is Terminal)
        {
            Terminal expected = top as Terminal;

            if (expected.Matches(current))
            {
                // good state reached
                if (Program.debug)
                {
                    Console.WriteLine(" PARSE: Token matches");
                }
                return;
            }

            // special case: a synchronising token like ';' must not be
            // skipped over, so the expected symbol is removed instead
            dropSymbol = syncTerm.Matches(current);
        }
        else
        {
            Nonterminal pending = top as Nonterminal;

            if (table.Get(pending, current) != null)
            {
                // good state reached (current token in First set of the
                // symbol at the top of the stack)
                if (Program.debug)
                {
                    Console.WriteLine(" PARSE: Valid production exists");
                }
                return;
            }

            // if the current token could be matched by something in the
            // Follow set of the symbol at the top of the stack, then that
            // symbol is skipped
            dropSymbol = false;
            foreach (ISymbol follower in grammar.Follow(pending))
            {
                bool followerAccepts =
                    (follower is Terminal && (follower as Terminal).Matches(current)) ||
                    (follower is Nonterminal && table.Get(follower as Nonterminal, current) != null);

                if (followerAccepts)
                {
                    dropSymbol = true;
                    break;
                }
            }
        }

        if (dropSymbol)
        {
            if (Program.debug)
            {
                Console.WriteLine(" PARSE: Discarding symbol " + symbolStack.Peek());
            }
            isValidParseTree = false;
            symbolStack.Pop();
            treeStack.Pop();
            continue;
        }

        // default action: discard the current token and keep recovering
        if (Program.debug)
        {
            Console.WriteLine(" PARSE: Discard token");
        }
        if (!tokenStream.MoveNext())
        {
            throw new Exception("OUT OF TOKENS");
        }
    }
}