public void Regex_ConcatTest()
{
    Regex ab = Regex.Concat("ab");
    TokenType abType = new TokenType("ab", ab);
    TokenAutomaton automaton = abType.Automaton();

    List<Token> tokens = GetTokens(automaton, "ababab");

    Assert.AreEqual(3, tokens.Count);
    for (int i = 0; i < 3; i++)
        Assert.AreEqual("ab", tokens[i].Lexeme);
}
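// The tests in this file rely on a GetTokens helper that is not part of this
// excerpt. A minimal sketch of it, assuming the Scanner.Tokenize API used in
// Scanner_BlockCommentTest below (the helper's exact signature is an assumption):
private static List<Token> GetTokens(TokenAutomaton automaton, string text)
{
    // run the scanner over the text and collect all tokens, excluding EOF
    Scanner sc = new Scanner(automaton);
    return new List<Token>(sc.Tokenize(text, yieldEOF: false));
}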
public void Scanner_BlockCommentTest()
{
    Regex a = Regex.Char('a').Star();
    Regex b = Regex.Char('b').Star();
    // block comment regex: /\*([^*]|(\*+[^*/]))*\*+/
    // note: a single regular expression cannot match *nested* comments;
    // see Scanner_NestedCommentHackTest3 for the scanner-level workaround
    Regex blockComment = Regex.Concat("/*")
        .Concat(Regex.Not('*')
            .Union(Regex.Char('*').Plus()
                .Concat(Regex.Not('*', '/')))
            .Star())
        .Concat(Regex.Char('*').Plus().Concat(Regex.Char('/')));
    Regex whitespace = Regex.Union(" \t\n").Star();

    TokenType ttWhitespace = new TokenType("Whitespace", whitespace, priority: TokenType.Priority.Whitespace);
    TokenType ttA = new TokenType("a*", a);
    TokenType ttB = new TokenType("b*", b);
    TokenType ttBlockComment = new TokenType("block comment", blockComment, priority: TokenType.Priority.Whitespace);
    TokenAutomaton automaton = TokenType.CombinedAutomaton(ttA, ttB, ttBlockComment, ttWhitespace);

    string text = "/* aaa */ \n" +
                  "bbb \n" +
                  "/* aaa */ \n" +
                  "bbb \n" +
                  "/* aaa \n" +
                  " aaa */ \n" +
                  "bbb \n" +
                  "/* \n" +
                  " * aaa \n" +
                  " */ \n" +
                  "bbb \n" +
                  "/*** \n" +
                  " * aaa \n" +
                  " ***/ \n" +
                  "bbb \n" +
                  "/*/ aaa /*/ \n" +
                  "/*****/ \n" +
                  "/*///*/ \n" +
                  "bbb \n" +
                  "/*** \n" +
                  " * aaa \n" +
                  " */ \n" +
                  "bbb ";

    Scanner sc = new Scanner(automaton);
    IEnumerable<Token> tokenEnumerable = sc.Tokenize(text, yieldEOF: false);
    List<Token> tokens = new List<Token>(tokenEnumerable);

    string[] expectedTokens = { "bbb", "bbb", "bbb", "bbb", "bbb", "bbb", "bbb" };
    Assert.AreEqual(expectedTokens.Length, tokens.Count);
    for (int i = 0; i < expectedTokens.Length; i++)
    {
        Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
        Assert.AreEqual(ttB, tokens[i].Type);
    }
}
public void Regex_CharacterTest()
{
    Regex a = Regex.Char('a');
    TokenType aType = new TokenType("a", a);
    TokenAutomaton automaton = aType.Automaton();

    List<Token> tokens = GetTokens(automaton, "aaaaa");

    Assert.AreEqual(5, tokens.Count);
    for (int i = 0; i < 5; i++)
        Assert.AreEqual("a", tokens[i].Lexeme);
}
public void Regex_MaybeTest()
{
    Regex ab = Regex.Char('a').Maybe().Concat(Regex.Char('b'));
    TokenType abType = new TokenType("a?b", ab);
    TokenAutomaton automaton = abType.Automaton();

    List<Token> tokens = GetTokens(automaton, "babbab");

    string[] expectedTokens = { "b", "ab", "b", "ab" };
    Assert.AreEqual(expectedTokens.Length, tokens.Count);
    for (int i = 0; i < expectedTokens.Length; i++)
        Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
}
public void Regex_AnyTest()
{
    Regex any = Regex.Any();
    TokenType anyType = new TokenType("any", any);
    TokenAutomaton automaton = anyType.Automaton();

    List<Token> tokens = GetTokens(automaton, "9jQbksjhbQ3b");

    string[] expectedTokens = { "9", "j", "Q", "b", "k", "s", "j", "h", "b", "Q", "3", "b" };
    Assert.AreEqual(expectedTokens.Length, tokens.Count);
    for (int i = 0; i < expectedTokens.Length; i++)
    {
        Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
        Assert.AreEqual(anyType, tokens[i].Type);
    }
}
public void Scanner_ErrorTest()
{
    Regex a = Regex.Char('a');
    TokenType aToken = new TokenType("a", a);
    TokenAutomaton automaton = aToken.Automaton();

    List<Token> tokens = GetTokens(automaton, "ccaacaacc");

    string[] expectedTokens = { "c", "c", "a", "a", "c", "a", "a", "c", "c" };
    Assert.AreEqual(expectedTokens.Length, tokens.Count);
    for (int i = 0; i < expectedTokens.Length; i++)
    {
        Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
        if (tokens[i].Lexeme == "a")
            Assert.AreEqual(aToken, tokens[i].Type);
        else
            Assert.AreEqual(TokenType.ERROR, tokens[i].Type);
    }
}
public void Regex_NotTest()
{
    Regex b = Regex.Char('b');
    Regex notb = Regex.Not('b');
    TokenType bType = new TokenType("b", b);
    TokenType notbType = new TokenType("not b", notb);
    TokenAutomaton automaton = TokenType.CombinedAutomaton(bType, notbType);

    List<Token> tokens = GetTokens(automaton, "9jQbksjhbQ3b");

    string[] expectedTokens = { "9", "j", "Q", "b", "k", "s", "j", "h", "b", "Q", "3", "b" };
    Assert.AreEqual(expectedTokens.Length, tokens.Count);
    for (int i = 0; i < expectedTokens.Length; i++)
    {
        Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
        if (expectedTokens[i] == "b")
            Assert.AreEqual("b", tokens[i].Type.Name);
        else
            Assert.AreEqual("not b", tokens[i].Type.Name);
    }
}
public void Scanner_StringTest()
{
    Regex whitespace = Regex.Union(" \t\n").Star();
    Regex str = Regex.Char('"').Concat(Regex.Not('"').Star()).Concat(Regex.Char('"'));

    TokenType ttWhitespace = new TokenType("Whitespace", whitespace);
    TokenType ttString = new TokenType("string", str);
    TokenAutomaton automaton = TokenType.CombinedAutomaton(ttWhitespace, ttString);

    List<Token> tokens = GetTokens(automaton, "\"asdf\" \"sdfg\"");

    string[] expectedTokens = { "\"asdf\"", " ", "\"sdfg\"" };
    Assert.AreEqual(expectedTokens.Length, tokens.Count);
    for (int i = 0; i < expectedTokens.Length; i++)
        Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
}
// A terminal can match either a specific lexeme string or any token of a given
// token type. This constructor creates a terminal that matches by token type.
public Terminal(TokenType tokenType)
{
    this.matchedTokenType = tokenType;
}
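// The string-matching counterpart is used when the Mini-PL grammar is defined
// below (e.g. new Terminal("assert")). A sketch of it, assuming a matchedString
// field analogous to matchedTokenType (the field name is illustrative):
public Terminal(string lexeme)
{
    this.matchedString = lexeme;
}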
public void Scanner_NestedCommentHackTest3()
{
    Regex a = Regex.Char('a').Star();
    Regex b = Regex.Char('b').Star();
    // hacky solution that allows for nested comments: the comment delimiters are
    // tokenized separately so the scanner itself can handle the nesting
    Regex reBlockCommentStart = Regex.Concat("/*");
    Regex reBlockCommentEnd = Regex.Concat("*/");
    Regex whitespace = Regex.Union(" \t\n").Star();

    TokenType ttWhitespace = new TokenType("Whitespace", whitespace, priority: TokenType.Priority.Whitespace);
    TokenType ttA = new TokenType("a*", a);
    TokenType ttB = new TokenType("b*", b);
    TokenType ttBlockCommentStart = new TokenType("block comment start", reBlockCommentStart);
    TokenType ttBlockCommentEnd = new TokenType("block comment end", reBlockCommentEnd);
    TokenAutomaton automaton = TokenType.CombinedAutomaton(ttA, ttB, ttBlockCommentStart, ttBlockCommentEnd, ttWhitespace);

    string text = "bbb /* aaa";

    Scanner sc = new Scanner(automaton, ttBlockCommentStart, ttBlockCommentEnd);
    IEnumerable<Token> tokenEnumerable = sc.Tokenize(text, yieldEOF: false);
    List<Token> tokens = new List<Token>(tokenEnumerable);

    string[] expectedTokens = { "bbb" };
    Assert.AreEqual(expectedTokens.Length, tokens.Count);
    for (int i = 0; i < expectedTokens.Length; i++)
    {
        Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
        Assert.AreEqual(ttB, tokens[i].Type);
    }
}
public void Scanner_IntegerTest()
{
    Regex whitespace = Regex.Union(" \t\n").Star();
    // integer regex: -?[1-9][0-9]*|0 (no leading zeros; the union applies to the
    // whole concatenation, so "-0" is not matched)
    Regex integer = Regex.Char('-').Maybe()
        .Concat(Regex.Range('1', '9'))
        .Concat(Regex.Range('0', '9').Star())
        .Union(Regex.Char('0'));

    TokenType ttWhitespace = new TokenType("Whitespace", whitespace);
    TokenType ttInteger = new TokenType("integer", integer);
    TokenAutomaton automaton = TokenType.CombinedAutomaton(ttInteger, ttWhitespace);

    List<Token> tokens = GetTokens(automaton, "1234 0 -99 -1");

    string[] expectedTokens = { "1234", " ", "0", " ", "-99", " ", "-1" };
    Assert.AreEqual(expectedTokens.Length, tokens.Count);
    for (int i = 0; i < expectedTokens.Length; i++)
        Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
}
public void Scanner_LineCommentTest()
{
    Regex a = Regex.Char('a').Star();
    Regex lineComment = Regex.Concat("//").Concat(Regex.Not('\n').Star());
    Regex whitespace = Regex.Union(" \t\n").Star();

    TokenType ttWhitespace = new TokenType("Whitespace", whitespace, priority: TokenType.Priority.Whitespace);
    TokenType ttA = new TokenType("a*", a);
    TokenType ttLineComment = new TokenType("line comment", lineComment, priority: TokenType.Priority.Whitespace);
    TokenAutomaton automaton = TokenType.CombinedAutomaton(ttA, ttLineComment, ttWhitespace);

    string text = "aaa//aaa\n" +
                  "//aaaaaaa\n" +
                  "aaa//aaa";

    List<Token> tokens = GetTokens(automaton, text);

    string[] expectedTokens = { "aaa", "aaa" };
    Assert.AreEqual(expectedTokens.Length, tokens.Count);
    for (int i = 0; i < expectedTokens.Length; i++)
    {
        Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
    }
}
// Remember the most recent valid token, so the scanner can fall back to it
// (longest match) when a longer match attempt fails later in the input.
private void StoreToken(TokenType type)
{
    lastToken = type.CreateToken(charBuffer, startRow, startCol);
    endRow = row;
    endCol = col;
}
// Constructs a scanner from a TokenAutomaton (the combined DFA of all token types).
// Optionally, token types marking the start and end of block comments can be given,
// so that nested comments can be handled (a single regular expression cannot do this).
public Scanner(TokenAutomaton automaton, TokenType blockCommentStart = null, TokenType blockCommentEnd = null)
{
    this.automaton = automaton;
    this.blockCommentStart = blockCommentStart;
    this.blockCommentEnd = blockCommentEnd;
}
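// Example usage, as in Scanner_NestedCommentHackTest3 above, where
// ttBlockCommentStart and ttBlockCommentEnd are token types matching "/*" and "*/":
//
//   Scanner sc = new Scanner(automaton, ttBlockCommentStart, ttBlockCommentEnd);
//   List<Token> tokens = new List<Token>(sc.Tokenize(text, yieldEOF: false));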
public void Regex_RangeTest()
{
    Regex az = Regex.Range('a', 'z');
    TokenType azType = new TokenType("az", az);
    TokenAutomaton automaton = azType.Automaton();

    List<Token> tokens = GetTokens(automaton, "abcdefghijklmnopqrstuvwxyz");

    string[] expectedTokens = { "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
                                "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z" };
    Assert.AreEqual(expectedTokens.Length, tokens.Count);
    for (int i = 0; i < expectedTokens.Length; i++)
        Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
}
public void Scanner_TokenPriorityTest()
{
    // keywords should be matched over regular tokens, and whitespace should not be returned
    Regex keywords = Regex.Union(Regex.Concat("int"), Regex.Concat("bool"));
    Regex words = Regex.Range('a', 'z').Plus();
    Regex whitespace = Regex.Union(" \t\n").Star();

    TokenType ttKeyword = new TokenType("keyword", keywords, priority: TokenType.Priority.Keyword);
    TokenType ttWords = new TokenType("words", words);
    TokenType ttWhitespace = new TokenType("Whitespace", whitespace, priority: TokenType.Priority.Whitespace);
    TokenAutomaton automaton = TokenType.CombinedAutomaton(ttWords, ttWhitespace, ttKeyword);

    List<Token> tokens = GetTokens(automaton, "in int ints bool bools boo");

    string[] expectedTokens = { "in", "int", "ints", "bool", "bools", "boo" };
    Assert.AreEqual(expectedTokens.Length, tokens.Count);
    for (int i = 0; i < expectedTokens.Length; i++)
        Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
}
public void Regex_UnionTest()
{
    Regex aa = Regex.Concat("aa");
    Regex ab = Regex.Union("ab");
    TokenType aaType = new TokenType("aa", aa);
    TokenType abType = new TokenType("a|b", ab);
    TokenAutomaton automaton = TokenType.CombinedAutomaton(aaType, abType);

    List<Token> tokens = GetTokens(automaton, "aababaabaa");

    string[] expectedTokens = { "aa", "b", "a", "b", "aa", "b", "aa" };
    Assert.AreEqual(expectedTokens.Length, tokens.Count);
    for (int i = 0; i < expectedTokens.Length; i++)
        Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
}
public void Scanner_WhitespaceTest()
{
    Regex whitespace = Regex.Union(" \t\n").Star();
    Regex a = Regex.Char('a');
    TokenType ttWhitespace = new TokenType("Whitespace", whitespace);
    TokenType ttA = new TokenType("a", a);
    TokenAutomaton automaton = TokenType.CombinedAutomaton(ttA, ttWhitespace);

    List<Token> tokens = GetTokens(automaton, "a a \t\na \n a \t\t a \n\t a ");

    string[] expectedTokens = { "a", " ", "a", " \t\n", "a", " \n ", "a", " \t\t ", "a", " \n\t ", "a", " " };
    Assert.AreEqual(expectedTokens.Length, tokens.Count);
    for (int i = 0; i < expectedTokens.Length; i++)
        Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
}
public void Regex_PlusTest()
{
    Regex ba = Regex.Char('b').Concat(Regex.Char('a').Plus());
    TokenType baType = new TokenType("ba+", ba);
    TokenAutomaton automaton = baType.Automaton();

    List<Token> tokens = GetTokens(automaton, "baababab");

    string[] expectedTokens = { "baa", "ba", "ba", "b" };
    Assert.AreEqual(expectedTokens.Length, tokens.Count);
    for (int i = 0; i < expectedTokens.Length; i++)
        Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
    // the trailing "b" is not followed by any 'a', so it cannot match ba+
    Assert.AreEqual(TokenType.ERROR, tokens[3].Type);
}
public void Scanner_EscapeStringTest()
{
    Regex whitespace = Regex.Union(" \t\n").Star();
    // construct the regex "(\\.|[^"\\])*"
    Regex strBegin = Regex.Char('"');
    Regex strEnd = Regex.Char('"');
    Regex strBody = Regex.Char('\\').Concat(Regex.Any()).Union(Regex.Not('"', '\\')).Star();
    Regex str = strBegin.Concat(strBody).Concat(strEnd);

    TokenType ttWhitespace = new TokenType("Whitespace", whitespace);
    TokenType ttString = new TokenType("string", str);
    TokenAutomaton automaton = TokenType.CombinedAutomaton(ttWhitespace, ttString);

    List<Token> tokens = GetTokens(automaton, "\"as\\ndf\\\"\" \"sdfg\\\\\" \"\\\\\"\"");

    // the final lone '"' opens an unterminated string, so it is returned as a separate token
    string[] expectedTokens = { "\"as\\ndf\\\"\"", " ", "\"sdfg\\\\\"", " ", "\"\\\\\"", "\"" };
    Assert.AreEqual(expectedTokens.Length, tokens.Count);
    for (int i = 0; i < expectedTokens.Length; i++)
        Assert.AreEqual(expectedTokens[i], tokens[i].Lexeme);
}
private MiniPL()
{
    // Build the token regexes (NFAs) using the regular operations
    Regex reBlockCommentStart = Regex.Concat("/*");
    Regex reBlockCommentEnd = Regex.Concat("*/");
    Regex reLineComment = Regex.Concat("//").Concat(Regex.Not('\n').Star());
    Regex reWhitespace = Regex.Union(" \t\r\n").Star().Union(reLineComment);
    Regex reString = Regex.Char('"')
        .Concat(Regex.Char('\\').Concat(Regex.Any()).Union(Regex.Not('"', '\\')).Star())
        .Concat(Regex.Char('"'));
    Regex reBinaryOperator = Regex.Union("+-*/<=&");
    Regex reUnaryOperator = Regex.Char('!');
    Regex reKeyword = Regex.Union(Regex.Concat("var"), Regex.Concat("for"), Regex.Concat("end"),
                                  Regex.Concat("in"), Regex.Concat("do"), Regex.Concat("read"),
                                  Regex.Concat("print"), Regex.Concat("assert"));
    Regex reType = Regex.Union(Regex.Concat("bool"), Regex.Concat("int"), Regex.Concat("string"));
    Regex reParenRight = Regex.Char(')'), reParenLeft = Regex.Char('('),
          reColon = Regex.Char(':'), reSemicolon = Regex.Char(';'),
          reAssignment = Regex.Concat(":="), reDots = Regex.Concat("..");
    Regex reIdentifier = Regex.Union(Regex.Range('A', 'Z'), Regex.Range('a', 'z'))
        .Concat(Regex.Union(Regex.Range('A', 'Z'), Regex.Range('a', 'z'),
                            Regex.Range('0', '9'), Regex.Char('_')).Star());
    Regex reInteger = Regex.Range('0', '9').Plus();

    // Define token types
    tokenTypes["block_comment_start"] = new TokenType("block_comment_start", reBlockCommentStart);
    tokenTypes["block_comment_end"] = new TokenType("block_comment_end", reBlockCommentEnd);
    tokenTypes["int"] = new TokenType("int", reInteger);
    tokenTypes["whitespace"] = new TokenType("whitespace", reWhitespace, priority: TokenType.Priority.Whitespace);
    tokenTypes["string"] = new TokenType("string", reString);
    tokenTypes["binary_op"] = new TokenType("binary op", reBinaryOperator);
    tokenTypes["unary_op"] = new TokenType("unary op", reUnaryOperator);
    tokenTypes["keyword"] = new TokenType("keyword", reKeyword, priority: TokenType.Priority.Keyword);
    tokenTypes["type"] = new TokenType("type", reType, priority: TokenType.Priority.Keyword);
    tokenTypes["left_paren"] = new TokenType("left paren", reParenLeft);
    tokenTypes["right_paren"] = new TokenType("right paren", reParenRight);
    tokenTypes["colon"] = new TokenType("colon", reColon);
    tokenTypes["semicolon"] = new TokenType("semicolon", reSemicolon);
    tokenTypes["assignment"] = new TokenType("assignment", reAssignment);
    tokenTypes["dots"] = new TokenType("dots", reDots);
    tokenTypes["identifier"] = new TokenType("identifier", reIdentifier);

    // Create the combined automaton and the scanner object
    TokenAutomaton automaton = TokenType.CombinedAutomaton(tokenTypes.Values.ToArray());
    scanner = new Scanner(automaton, tokenTypes["block_comment_start"], tokenTypes["block_comment_end"]);

    // Define the nonterminal symbols of the CFG
    nonterminals["program"] = new Nonterminal("PROG");
    nonterminals["statements"] = new Nonterminal("STMTS");
    nonterminals["statements_head"] = new Nonterminal("STMTS_HEAD");
    nonterminals["statements_tail"] = new Nonterminal("STMTS_TAIL");
    nonterminals["statement"] = new Nonterminal("STMT");
    nonterminals["declaration"] = new Nonterminal("DECL");
    nonterminals["declaration_assignment"] = new Nonterminal("DECL_ASSIGN");
    nonterminals["expression"] = new Nonterminal("EXPR");
    nonterminals["unary_operation"] = new Nonterminal("UNARY_OP");
    nonterminals["binary_operation"] = new Nonterminal("BINARY_OP");
    nonterminals["operand"] = new Nonterminal("OPND");

    // Define the terminal symbols of the CFG
    terminals["identifier"] = new Terminal(tokenTypes["identifier"]);
    terminals["assert"] = new Terminal("assert");
    terminals["print"] = new Terminal("print");
    terminals["read"] = new Terminal("read");
    terminals["for"] = new Terminal("for");
    terminals["in"] = new Terminal("in");
    terminals["end"] = new Terminal("end");
    terminals["do"] = new Terminal("do");
    terminals["var"] = new Terminal("var");
    terminals["type"] = new Terminal(tokenTypes["type"]);
    terminals["string"] = new Terminal(tokenTypes["string"]);
    terminals["int"] = new Terminal(tokenTypes["int"]);
    terminals[")"] = new Terminal(")");
    terminals["("] = new Terminal("(");
    terminals[".."] = new Terminal("..");
    terminals[":="] = new Terminal(":=");
    terminals[":"] = new Terminal(":");
    terminals[";"] = new Terminal(";");
    terminals["binary_operator"] = new Terminal(tokenTypes["binary_op"]);
    terminals["unary_operator"] = new Terminal(tokenTypes["unary_op"]);

    // Create the Mini-PL grammar
    grammar = new CFG(nonterminals["program"], terminals.Values, nonterminals.Values);

    // Define the production rules of the grammar
    grammar.AddProductionRule(nonterminals["program"], new ISymbol[] { nonterminals["statements"] });
    grammar.AddProductionRule(nonterminals["statements"], new ISymbol[] { nonterminals["statements_head"], nonterminals["statements_tail"] });
    grammar.AddProductionRule(nonterminals["statements_head"], new ISymbol[] { nonterminals["statement"], terminals[";"] });
    grammar.AddProductionRule(nonterminals["statements_tail"], new ISymbol[] { nonterminals["statements_head"], nonterminals["statements_tail"] });
    grammar.AddProductionRule(nonterminals["statements_tail"], new ISymbol[] { Terminal.EPSILON });
    grammar.AddProductionRule(nonterminals["statement"], new ISymbol[] { nonterminals["declaration"] });
    grammar.AddProductionRule(nonterminals["statement"], new ISymbol[] { terminals["identifier"], terminals[":="], nonterminals["expression"] });
    grammar.AddProductionRule(nonterminals["statement"], new ISymbol[] { terminals["for"], terminals["identifier"], terminals["in"], nonterminals["expression"], terminals[".."], nonterminals["expression"], terminals["do"], nonterminals["statements"], terminals["end"], terminals["for"] });
    grammar.AddProductionRule(nonterminals["statement"], new ISymbol[] { terminals["read"], terminals["identifier"] });
    grammar.AddProductionRule(nonterminals["statement"], new ISymbol[] { terminals["print"], nonterminals["expression"] });
    grammar.AddProductionRule(nonterminals["statement"], new ISymbol[] { terminals["assert"], terminals["("], nonterminals["expression"], terminals[")"] });
    grammar.AddProductionRule(nonterminals["declaration"], new ISymbol[] { terminals["var"], terminals["identifier"], terminals[":"], terminals["type"], nonterminals["declaration_assignment"] });
    grammar.AddProductionRule(nonterminals["declaration_assignment"], new ISymbol[] { terminals[":="], nonterminals["expression"] });
    grammar.AddProductionRule(nonterminals["declaration_assignment"], new ISymbol[] { Terminal.EPSILON });
    grammar.AddProductionRule(nonterminals["expression"], new ISymbol[] { nonterminals["unary_operation"] });
    grammar.AddProductionRule(nonterminals["expression"], new ISymbol[] { nonterminals["operand"], nonterminals["binary_operation"] });
    grammar.AddProductionRule(nonterminals["unary_operation"], new ISymbol[] { terminals["unary_operator"], nonterminals["operand"] });
    grammar.AddProductionRule(nonterminals["binary_operation"], new ISymbol[] { terminals["binary_operator"], nonterminals["operand"] });
    grammar.AddProductionRule(nonterminals["binary_operation"], new ISymbol[] { Terminal.EPSILON });
    grammar.AddProductionRule(nonterminals["operand"], new ISymbol[] { terminals["int"] });
    grammar.AddProductionRule(nonterminals["operand"], new ISymbol[] { terminals["string"] });
    grammar.AddProductionRule(nonterminals["operand"], new ISymbol[] { terminals["identifier"] });
    grammar.AddProductionRule(nonterminals["operand"], new ISymbol[] { terminals["("], nonterminals["expression"], terminals[")"] });

    // use ';' as the synchronizing token for error recovery in the Mini-PL parser
    parser = new Parser(grammar, terminals[";"]);
}
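// For reference, a small Mini-PL program derivable from the grammar above
// (PROG -> STMTS -> STMTS_HEAD STMTS_TAIL -> ...); every statement, including
// the for-loop as a whole, is terminated by the synchronizing ';':
//
//   var n : int := 3;
//   for i in 0..n do
//     print i;
//   end for;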
// Assign a token type to the accepting end state of this automaton, so that
// inputs it accepts produce tokens of the given class.
public void DefineTokenClass(TokenType tokenClass)
{
    this.end.tokenType = tokenClass;
}
public Token(TokenType type, String lexeme, int row, int col)
{
    this.type = type;
    this.lexeme = lexeme;
    this.textPosition = new Position(row, col);
}