        /// <summary>
        /// Scans "bbb /* aaa" with a scanner that is given the block-comment start and
        /// end token types, and expects the only token produced to be "bbb" of type
        /// <c>ttB</c> (the comment is opened but never closed).
        /// </summary>
        public void Scanner_NestedCommentHackTest3()
        {
            Regex a = Regex.Char('a').Star();
            Regex b = Regex.Char('b').Star();

            // Hacky solution that allows for nested comments: the comment delimiters are
            // ordinary token types that are additionally handed to the Scanner constructor.
            Regex reBlockCommentStart = Regex.Concat("/*");
            Regex reBlockCommentEnd   = Regex.Concat("*/");
            Regex whitespace          = Regex.Union(" \t\n").Star();

            TokenType ttWhitespace        = new TokenType("Whitespace", whitespace, priority: TokenType.Priority.Whitespace);
            TokenType ttA                 = new TokenType("a*", a);
            TokenType ttB                 = new TokenType("b*", b);
            TokenType ttBlockCommentStart = new TokenType("block comment start", reBlockCommentStart);
            TokenType ttBlockCommentEnd   = new TokenType("block comment end", reBlockCommentEnd);

            TokenAutomaton automaton = TokenType.CombinedAutomaton(ttA, ttB, ttBlockCommentStart, ttBlockCommentEnd, ttWhitespace);

            string text = "bbb /* aaa";

            Scanner            sc              = new Scanner(automaton, ttBlockCommentStart, ttBlockCommentEnd);
            IEnumerable<Token> tokenEnumerable = sc.Tokenize(text, yieldEOF: false);
            List<Token>        tokens          = new List<Token>(tokenEnumerable);

            string[] expectedTokens = { "bbb" };
            Assert.AreEqual(expectedTokens.Length, tokens.Count);

            // Both assertions belong to the loop body: each token must match in lexeme and type.
            for (int i = 0; i < expectedTokens.Length; i++)
            {
                Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
                Assert.AreEqual(ttB, tokens[i].Type);
            }
        }
        /// <summary>
        /// Scans a text that interleaves "bbb" with several shapes of block comment
        /// (single-line, multi-line, star-padded, containing slashes) using a single
        /// block-comment regex with whitespace priority, and expects exactly seven
        /// "bbb" tokens of type <c>ttB</c>.
        /// </summary>
        public void Scanner_BlockCommentTest()
        {
            Regex a = Regex.Char('a').Star();
            Regex b = Regex.Char('b').Star();

            // block comment regex /\*([^*]|(\*+[^*/]))*\*+/
            // NOTE(review): the original expression was truncated in this file; it is rebuilt
            // here from the regex above and the inputs below - confirm against history.
            // Fresh Regex instances are created for every sub-expression rather than reused.
            Regex blockComment = Regex.Concat("/*")
                                 .Concat(Regex.Not('*')
                                         .Union(Regex.Char('*').Plus()
                                                .Concat(Regex.Not('*', '/')))
                                         .Star())
                                 .Concat(Regex.Char('*').Plus())
                                 .Concat(Regex.Char('/'));
            Regex whitespace = Regex.Union(" \t\n").Star();

            TokenType ttWhitespace   = new TokenType("Whitespace", whitespace, priority: TokenType.Priority.Whitespace);
            TokenType ttA            = new TokenType("a*", a);
            TokenType ttB            = new TokenType("b*", b);
            // Named "block comment" (was the copy-pasted label "line comment").
            TokenType ttBlockComment = new TokenType("block comment", blockComment, priority: TokenType.Priority.Whitespace);

            TokenAutomaton automaton = TokenType.CombinedAutomaton(ttA, ttB, ttBlockComment, ttWhitespace);

            string text = "/* aaa */   \n" +
                          "bbb         \n" +
                          "/* aaa */   \n" +
                          "bbb         \n" +
                          "/* aaa      \n" +
                          "   aaa */   \n" +
                          "bbb         \n" +
                          "/*          \n" +
                          " * aaa      \n" +
                          " */         \n" +
                          "bbb         \n" +
                          "/***        \n" +
                          " * aaa      \n" +
                          " ***/       \n" +
                          "bbb         \n" +
                          "/*/ aaa /*/ \n" +
                          "/*****/     \n" +
                          "/*///*/     \n" +
                          "bbb         \n" +
                          "/***        \n" +
                          " * aaa      \n" +
                          " */         \n" +
                          "bbb        ";

            Scanner            sc              = new Scanner(automaton);
            IEnumerable<Token> tokenEnumerable = sc.Tokenize(text, yieldEOF: false);
            List<Token>        tokens          = new List<Token>(tokenEnumerable);

            string[] expectedTokens = { "bbb", "bbb", "bbb", "bbb", "bbb", "bbb", "bbb" };
            Assert.AreEqual(expectedTokens.Length, tokens.Count);

            // Both assertions belong to the loop body: each token must match in lexeme and type.
            for (int i = 0; i < expectedTokens.Length; i++)
            {
                Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
                Assert.AreEqual(ttB, tokens[i].Type);
            }
        }
        /// <summary>
        /// Combines the concatenation "aa" with the single-character union of 'a' and 'b'
        /// and checks the lexemes produced for "aababaabaa".
        /// </summary>
        public void Regex_UnionTest()
        {
            Regex aa = Regex.Concat("aa");
            Regex ab = Regex.Union("ab");

            TokenType aaType = new TokenType("aa", aa);
            TokenType abType = new TokenType("a|b", ab);

            TokenAutomaton automaton = TokenType.CombinedAutomaton(aaType, abType);
            List<Token>    tokens    = GetTokens(automaton, "aababaabaa");

            // "aa" is expected wherever two consecutive 'a's are available; single
            // characters fall back to the a|b token type.
            string[] expectedTokens = { "aa", "b", "a", "b", "aa", "b", "aa" };
            Assert.AreEqual(expectedTokens.Length, tokens.Count);

            for (int i = 0; i < expectedTokens.Length; i++)
            {
                Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
            }
        }
        /// <summary>
        /// Checks that two double-quoted strings separated by a space are scanned as
        /// string, whitespace, string. Whitespace has default priority here, so it is
        /// expected to appear in the token list.
        /// </summary>
        public void Scanner_StringTest()
        {
            Regex whitespace = Regex.Union(" \t\n").Star();
            // A quote, any run of non-quote characters, and a closing quote (no escapes).
            Regex str = Regex.Char('"').Concat(Regex.Not('"').Star()).Concat(Regex.Char('"'));

            TokenType ttWhitespace = new TokenType("Whitespace", whitespace);
            TokenType ttString     = new TokenType("string", str);

            TokenAutomaton automaton = TokenType.CombinedAutomaton(ttWhitespace, ttString);
            List<Token>    tokens    = GetTokens(automaton, "\"asdf\" \"sdfg\"");

            string[] expectedTokens = { "\"asdf\"", " ", "\"sdfg\"" };

            Assert.AreEqual(expectedTokens.Length, tokens.Count);
            for (int i = 0; i < expectedTokens.Length; i++)
            {
                Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
            }
        }
        /// <summary>
        /// Checks that runs of spaces, tabs and newlines between single 'a' characters
        /// are each returned as one whitespace token with the full run as its lexeme.
        /// </summary>
        public void Scanner_WhitespaceTest()
        {
            Regex whitespace = Regex.Union(" \t\n").Star();
            Regex a          = Regex.Char('a');

            // Whitespace is given default priority, so its tokens are part of the expected output.
            TokenType ttWhitespace = new TokenType("Whitespace", whitespace);
            TokenType ttA          = new TokenType("a", a);

            TokenAutomaton automaton = TokenType.CombinedAutomaton(ttA, ttWhitespace);
            List<Token>    tokens    = GetTokens(automaton, "a   a  \t\na  \n  a   \t\t   a     \n\t a ");

            string[] expectedTokens = { "a", "   ", "a", "  \t\n", "a", "  \n  ", "a", "   \t\t   ", "a", "     \n\t ", "a", " " };

            Assert.AreEqual(expectedTokens.Length, tokens.Count);
            for (int i = 0; i < expectedTokens.Length; i++)
            {
                Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
            }
        }
        /// <summary>
        /// Scans a mix of keywords ("int", "bool") and ordinary lowercase words with
        /// keyword- and whitespace-priority token types, and checks that only the six
        /// word lexemes come back (no whitespace tokens). Only lexemes are asserted,
        /// not token types.
        /// </summary>
        public void Scanner_TokenPriorityTest()
        {
            // keywords should be matched over regular tokens and whitespace should not be returned
            Regex keywords   = Regex.Union(Regex.Concat("int"), Regex.Concat("bool"));
            Regex words      = Regex.Range('a', 'z').Plus();
            Regex whitespace = Regex.Union(" \t\n").Star();

            TokenType ttKeyword    = new TokenType("keyword", keywords, priority: TokenType.Priority.Keyword);
            TokenType ttWords      = new TokenType("words", words);
            TokenType ttWhitespace = new TokenType("Whitespace", whitespace, priority: TokenType.Priority.Whitespace);

            TokenAutomaton automaton = TokenType.CombinedAutomaton(ttWords, ttWhitespace, ttKeyword);
            List<Token>    tokens    = GetTokens(automaton, "in int ints bool bools boo");

            string[] expectedTokens = { "in", "int", "ints", "bool", "bools", "boo" };

            Assert.AreEqual(expectedTokens.Length, tokens.Count);
            for (int i = 0; i < expectedTokens.Length; i++)
            {
                Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
            }
        }
        /// <summary>
        /// Checks scanning of optionally negative integers separated by single spaces.
        /// Whitespace has default priority, so the separating spaces are expected in
        /// the token list.
        /// </summary>
        public void Scanner_IntegerTest()
        {
            Regex whitespace = Regex.Union(" \t\n").Star();

            // -?[1-9][0-9]* | 0
            // The '0' alternative is required because the expected tokens below include
            // the lexeme "0", which the first alternative cannot match.
            // NOTE(review): the original expression was unterminated in this file; the
            // final alternative is reconstructed from the expected output - confirm.
            Regex integer = Regex.Char('-').Maybe()
                            .Concat(Regex.Range('1', '9'))
                            .Concat(Regex.Range('0', '9').Star())
                            .Union(Regex.Char('0'));

            TokenType ttWhitespace = new TokenType("Whitespace", whitespace);
            TokenType ttInteger    = new TokenType("integer", integer);

            TokenAutomaton automaton = TokenType.CombinedAutomaton(ttInteger, ttWhitespace);
            List<Token>    tokens    = GetTokens(automaton, "1234 0 -99 -1");

            string[] expectedTokens = { "1234", " ", "0", " ", "-99", " ", "-1" };

            Assert.AreEqual(expectedTokens.Length, tokens.Count);
            for (int i = 0; i < expectedTokens.Length; i++)
            {
                Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
            }
        }
        /// <summary>
        /// Checks scanning of double-quoted strings that may contain backslash escapes
        /// (including escaped quotes and escaped backslashes). The input ends with a
        /// lone quote, which is expected to come back as its own final token.
        /// </summary>
        public void Scanner_EscapeStringTest()
        {
            Regex whitespace = Regex.Union(" \t\n").Star();

            // construct the regex "(\\.|[^"\\])*"
            // Separate Regex instances are used for the opening and the closing quote.
            Regex strBegin = Regex.Char('"');
            Regex strEnd   = Regex.Char('"');
            Regex strBody  = Regex.Char('\\').Concat(Regex.Any()).Union(Regex.Not('"', '\\')).Star();
            Regex str      = strBegin.Concat(strBody).Concat(strEnd);

            TokenType ttWhitespace = new TokenType("Whitespace", whitespace);
            TokenType ttString     = new TokenType("string", str);

            TokenAutomaton automaton = TokenType.CombinedAutomaton(ttWhitespace, ttString);
            List<Token>    tokens    = GetTokens(automaton, "\"as\\ndf\\\"\" \"sdfg\\\\\" \"\\\\\"\"");

            string[] expectedTokens = { "\"as\\ndf\\\"\"", " ", "\"sdfg\\\\\"", " ", "\"\\\\\"", "\"" };

            Assert.AreEqual(expectedTokens.Length, tokens.Count);
            for (int i = 0; i < expectedTokens.Length; i++)
            {
                Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
            }
        }
        /// <summary>
        /// Checks that "//" line comments (given whitespace priority) are dropped from
        /// the token list, whether they follow code on the same line or occupy a whole
        /// line, leaving only the two "aaa" tokens.
        /// </summary>
        public void Scanner_LineCommentTest()
        {
            Regex a           = Regex.Char('a').Star();
            // "//" followed by everything up to, but not including, the newline.
            Regex lineComment = Regex.Concat("//").Concat(Regex.Not('\n').Star());
            Regex whitespace  = Regex.Union(" \t\n").Star();

            TokenType ttWhitespace  = new TokenType("Whitespace", whitespace, priority: TokenType.Priority.Whitespace);
            TokenType ttA           = new TokenType("a*", a);
            TokenType ttLineComment = new TokenType("line comment", lineComment, priority: TokenType.Priority.Whitespace);

            TokenAutomaton automaton = TokenType.CombinedAutomaton(ttA, ttLineComment, ttWhitespace);

            // NOTE(review): the original concatenation ended in a dangling '+'. The last
            // line is reconstructed so that the input contains the second "aaa" that the
            // expected tokens require - confirm against history.
            string text = "aaa//aaa\n" +
                          "//aaaaaaa\n" +
                          "aaa";

            List<Token> tokens = GetTokens(automaton, text);

            string[] expectedTokens = { "aaa", "aaa" };
            Assert.AreEqual(expectedTokens.Length, tokens.Count);
            for (int i = 0; i < expectedTokens.Length; i++)
            {
                Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
            }
        }
        /// <summary>
        /// Combines the single character 'b' with its complement and checks that every
        /// character of the input becomes its own token, typed "b" for 'b' and "not b"
        /// for everything else.
        /// </summary>
        public void Regex_NotTest()
        {
            Regex b    = Regex.Char('b');
            Regex notb = Regex.Not('b');

            TokenType bType    = new TokenType("b", b);
            TokenType notbType = new TokenType("not b", notb);

            TokenAutomaton automaton = TokenType.CombinedAutomaton(bType, notbType);
            List<Token>    tokens    = GetTokens(automaton, "9jQbksjhbQ3b");

            string[] expectedTokens = { "9", "j", "Q", "b", "k", "s", "j", "h", "b", "Q", "3", "b" };
            Assert.AreEqual(expectedTokens.Length, tokens.Count);

            for (int i = 0; i < expectedTokens.Length; i++)
            {
                Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);

                // The two type-name assertions are mutually exclusive; exactly one applies
                // to each token depending on its expected lexeme.
                if (expectedTokens[i] == "b")
                {
                    Assert.AreEqual("b", tokens[i].Type.Name);
                }
                else
                {
                    Assert.AreEqual("not b", tokens[i].Type.Name);
                }
            }
        }
        /// <summary>
        /// Builds the Mini-PL language definition: the token regexes and token types,
        /// the combined automaton and scanner, the grammar symbols and production
        /// rules, and finally the parser.
        /// </summary>
        /// <remarks>
        /// Writes to <c>tokenTypes</c>, <c>scanner</c>, <c>nonterminals</c>,
        /// <c>terminals</c>, <c>grammar</c> and <c>parser</c>, which are declared
        /// outside this block (presumably members of the enclosing class - confirm).
        /// NOTE(review): the constructor is private; how instances are obtained is not
        /// visible here.
        /// </remarks>
        private MiniPL()
            // Create NFA-type things for tokens using regular operations
            Regex reBlockCommentStart = Regex.Concat("/*");
            Regex reBlockCommentEnd   = Regex.Concat("*/");

            // "//" followed by everything up to, but not including, the newline
            Regex reLineComment = Regex.Concat("//").Concat(Regex.Not('\n').Star());

            // Line comments are folded into the whitespace regex, so they share the
            // whitespace token type (and its priority) defined below.
            Regex reWhitespace = Regex.Union(" \t\r\n").Star().Union(reLineComment);

            // String literal: a quote, then any number of (backslash + any character) or
            // (any character other than quote and backslash), then a closing quote.
            Regex reString         = Regex.Char('"').Concat(Regex.Char('\\').Concat(Regex.Any()).Union(Regex.Not('"', '\\')).Star()).Concat(Regex.Char('"'));
            // Each operator is a single character taken from the given string.
            Regex reBinaryOperator = Regex.Union("+-*/<=&");
            Regex reUnaryOperator  = Regex.Char('!');
            Regex reKeyword        = Regex.Union(Regex.Concat("var"), Regex.Concat("for"), Regex.Concat("end"), Regex.Concat("in"),
                                                 Regex.Concat("do"), Regex.Concat("read"), Regex.Concat("print"), Regex.Concat("assert"));
            Regex reType = Regex.Union(Regex.Concat("bool"), Regex.Concat("int"), Regex.Concat("string"));

            Regex reParenRight = Regex.Char(')'),
                  reParenLeft  = Regex.Char('('),
                  reColon      = Regex.Char(':'),
                  reSemicolon  = Regex.Char(';'),
                  reAssignment = Regex.Concat(":="),
                  reDots       = Regex.Concat("..");

            // Identifier: a letter followed by any number of letters, digits or underscores.
            Regex reIdentifier = Regex.Union(Regex.Range('A', 'Z'), Regex.Range('a', 'z'))
                                 .Concat(Regex.Union(Regex.Range('A', 'Z'), Regex.Range('a', 'z'), Regex.Range('0', '9'), Regex.Char('_')).Star());
            // Integer literal: one or more digits (no sign).
            Regex reInteger = Regex.Range('0', '9').Plus();

            // Define token types
            // Note: the "int" token type is the integer LITERAL; the type name "int" in
            // source text is matched by the "type" token type instead.
            // Keywords and type names get keyword priority; whitespace gets whitespace
            // priority; every other token type uses the default priority.
            tokenTypes["block_comment_start"] = new TokenType("block_comment_start", reBlockCommentStart);
            tokenTypes["block_comment_end"]   = new TokenType("block_comment_end", reBlockCommentEnd);
            tokenTypes["int"]         = new TokenType("int", reInteger);
            tokenTypes["whitespace"]  = new TokenType("whitespace", reWhitespace, priority: TokenType.Priority.Whitespace);
            tokenTypes["string"]      = new TokenType("string", reString);
            tokenTypes["binary_op"]   = new TokenType("binary op", reBinaryOperator);
            tokenTypes["unary_op"]    = new TokenType("unary op", reUnaryOperator);
            tokenTypes["keyword"]     = new TokenType("keyword", reKeyword, priority: TokenType.Priority.Keyword);
            tokenTypes["type"]        = new TokenType("type", reType, priority: TokenType.Priority.Keyword);
            tokenTypes["left_paren"]  = new TokenType("left paren", reParenLeft);
            tokenTypes["right_paren"] = new TokenType("right paren", reParenRight);
            tokenTypes["colon"]       = new TokenType("colon", reColon);
            tokenTypes["semicolon"]   = new TokenType("semicolon", reSemicolon);
            tokenTypes["assignment"]  = new TokenType("assignment", reAssignment);
            tokenTypes["dots"]        = new TokenType("dots", reDots);
            tokenTypes["identifier"]  = new TokenType("identifier", reIdentifier);

            // create combined automaton and scanner object
            TokenAutomaton automaton = TokenType.CombinedAutomaton(tokenTypes.Values.ToArray());

            // The block-comment delimiter token types are passed to the scanner separately
            // from the automaton.
            scanner = new Scanner(automaton, tokenTypes["block_comment_start"], tokenTypes["block_comment_end"]);

            // Define nonterminal variables of CFG
            nonterminals["program"]                = new Nonterminal("PROG");
            nonterminals["statements"]             = new Nonterminal("STMTS");
            nonterminals["statements_head"]        = new Nonterminal("STMTS_HEAD");
            nonterminals["statements_tail"]        = new Nonterminal("STMTS_TAIL");
            nonterminals["statement"]              = new Nonterminal("STMT");
            nonterminals["declaration"]            = new Nonterminal("DECL");
            nonterminals["declaration_assignment"] = new Nonterminal("DECL_ASSIGN");
            nonterminals["expression"]             = new Nonterminal("EXPR");
            nonterminals["unary_operation"]        = new Nonterminal("UNARY_OP");
            nonterminals["binary_operation"]       = new Nonterminal("BINARY_OP");
            nonterminals["operand"]                = new Nonterminal("OPND");

            // Define terminal variables of CFG
            // Terminals are built either from a token type or from a literal string.
            terminals["identifier"]      = new Terminal(tokenTypes["identifier"]);
            terminals["assert"]          = new Terminal("assert");
            terminals["print"]           = new Terminal("print");
            terminals["read"]            = new Terminal("read");
            terminals["for"]             = new Terminal("for");
            terminals["in"]              = new Terminal("in");
            terminals["end"]             = new Terminal("end");
            terminals["do"]              = new Terminal("do");
            terminals["var"]             = new Terminal("var");
            terminals["type"]            = new Terminal(tokenTypes["type"]);
            terminals["string"]          = new Terminal(tokenTypes["string"]);
            terminals["int"]             = new Terminal(tokenTypes["int"]);
            terminals[")"]               = new Terminal(")");
            terminals["("]               = new Terminal("(");
            terminals[".."]              = new Terminal("..");
            terminals[":="]              = new Terminal(":=");
            terminals[":"]               = new Terminal(":");
            terminals[";"]               = new Terminal(";");
            terminals["binary_operator"] = new Terminal(tokenTypes["binary_op"]);
            terminals["unary_operator"]  = new Terminal(tokenTypes["unary_op"]);

            // Create the Mini-PL grammar
            // NOTE(review): the first argument is presumably the start symbol - confirm
            // against the CFG constructor.
            grammar = new CFG(nonterminals["program"], terminals.Values, nonterminals.Values);

            // define production rules for the grammar
            // PROG       -> STMTS
            // STMTS      -> STMTS_HEAD STMTS_TAIL
            // STMTS_HEAD -> STMT ";"
            // STMTS_TAIL -> STMTS_HEAD STMTS_TAIL | epsilon
            grammar.AddProductionRule(nonterminals["program"], new ISymbol[] { nonterminals["statements"] });
            grammar.AddProductionRule(nonterminals["statements"], new ISymbol[] { nonterminals["statements_head"], nonterminals["statements_tail"] });
            grammar.AddProductionRule(nonterminals["statements_head"], new ISymbol[] { nonterminals["statement"], terminals[";"] });
            grammar.AddProductionRule(nonterminals["statements_tail"], new ISymbol[] { nonterminals["statements_head"], nonterminals["statements_tail"] });
            grammar.AddProductionRule(nonterminals["statements_tail"], new ISymbol[] { Terminal.EPSILON });

            // STMT -> DECL
            //       | identifier ":=" EXPR
            //       | "for" identifier "in" EXPR ".." EXPR "do" STMTS "end" "for"
            //       | "read" identifier
            //       | "print" EXPR
            //       | "assert" "(" EXPR ")"
            grammar.AddProductionRule(nonterminals["statement"], new ISymbol[] { nonterminals["declaration"] });
            grammar.AddProductionRule(nonterminals["statement"], new ISymbol[] { terminals["identifier"], terminals[":="], nonterminals["expression"] });
            grammar.AddProductionRule(nonterminals["statement"], new ISymbol[] { terminals["for"], terminals["identifier"], terminals["in"], nonterminals["expression"], terminals[".."], nonterminals["expression"], terminals["do"],
                                                                                 nonterminals["statements"], terminals["end"], terminals["for"] });
            grammar.AddProductionRule(nonterminals["statement"], new ISymbol[] { terminals["read"], terminals["identifier"] });
            grammar.AddProductionRule(nonterminals["statement"], new ISymbol[] { terminals["print"], nonterminals["expression"] });
            grammar.AddProductionRule(nonterminals["statement"], new ISymbol[] { terminals["assert"], terminals["("], nonterminals["expression"], terminals[")"] });

            // DECL        -> "var" identifier ":" type DECL_ASSIGN
            // DECL_ASSIGN -> ":=" EXPR | epsilon
            grammar.AddProductionRule(nonterminals["declaration"], new ISymbol[] { terminals["var"], terminals["identifier"], terminals[":"], terminals["type"], nonterminals["declaration_assignment"] });
            grammar.AddProductionRule(nonterminals["declaration_assignment"], new ISymbol[] { terminals[":="], nonterminals["expression"] });
            grammar.AddProductionRule(nonterminals["declaration_assignment"], new ISymbol[] { Terminal.EPSILON });

            // EXPR -> UNARY_OP | OPND BINARY_OP
            grammar.AddProductionRule(nonterminals["expression"], new ISymbol[] { nonterminals["unary_operation"] });
            grammar.AddProductionRule(nonterminals["expression"], new ISymbol[] { nonterminals["operand"], nonterminals["binary_operation"] });

            // UNARY_OP -> unary_operator OPND
            grammar.AddProductionRule(nonterminals["unary_operation"], new ISymbol[] { terminals["unary_operator"], nonterminals["operand"] });

            // BINARY_OP -> binary_operator OPND | epsilon
            // (an expression therefore holds at most one binary operator unless an
            // operand is itself a parenthesised expression)
            grammar.AddProductionRule(nonterminals["binary_operation"], new ISymbol[] { terminals["binary_operator"], nonterminals["operand"] });
            grammar.AddProductionRule(nonterminals["binary_operation"], new ISymbol[] { Terminal.EPSILON });

            // OPND -> int | string | identifier | "(" EXPR ")"
            grammar.AddProductionRule(nonterminals["operand"], new ISymbol[] { terminals["int"] });
            grammar.AddProductionRule(nonterminals["operand"], new ISymbol[] { terminals["string"] });
            grammar.AddProductionRule(nonterminals["operand"], new ISymbol[] { terminals["identifier"] });
            grammar.AddProductionRule(nonterminals["operand"], new ISymbol[] { terminals["("], nonterminals["expression"], terminals[")"] });

            // use ; as synchronizing token for Mini-PL
            parser = new Parser(grammar, terminals[";"]);