Exemplo n.º 1
0
        /// <summary>
        /// Builds and configures the lexer used to scan grammar definition files.
        /// </summary>
        /// <remarks>
        /// All rules are registered on a single <c>Lexer&lt;int, int&gt;</c> instance and are
        /// bound to one or more lexer states. Three locals are captured by the rule
        /// lambdas and act as shared scanning state:
        /// <c>lex_pattern</c> (string/regex literal being assembled in grammar context),
        /// <c>str_buf</c> (raw embedded code being accumulated) and
        /// <c>code_statement</c> (set when a ";" was seen in the current code block).
        /// Rule order matters: as the COMMA rules below note, a more restricted rule has
        /// to be registered before the more general one.
        /// </remarks>
        /// <returns>The fully configured lexer.</returns>
        public Lexer <int, int> CreateLexer()
        {
            // State shared between the rule actions below (captured by the lambdas).
            LexPattern    lex_pattern    = null;  // grammar string/regex under construction
            StringBuilder str_buf        = null;  // embedded code collected so far
            bool          code_statement = false; // true once ";" was seen inside a code block

            // presumably the last three arguments are: initial state, EOF token, error token -- TODO confirm against Lexer ctor
            var lexer = new Lexer <int, int>(StringRep.CreateInt <Symbols>(), StringRep.CreateInt <States>(), States.GRAMMAR, Symbols.EOF, Symbols.Error);

            string whitespace_pattern = "[ \r\n\f\t\u000b]+";

            // ----- keywords and section openers ---------------------------------------
            lexer.AddStringRule("using", Symbols.USING, States.GRAMMAR);
            lexer.AddStringRule("namespace", Symbols.NAMESPACE, States.GRAMMAR);

            // "parser" / "lexer" / "options" open a dedicated section which is closed by ";"
            lexer.AddStringAction("parser", match =>
            {
                match.Token = Symbols.PARSER;
                lexer.PushState(States.FACTORY_SECTION);
            }, States.GRAMMAR);
            lexer.AddStringAction("lexer", match =>
            {
                match.Token = Symbols.LEXER;
                lexer.PushState(States.FACTORY_SECTION);
            }, States.GRAMMAR);

            lexer.AddStringAction("options", match =>
            {
                match.Token = Symbols.OPTIONS;
                lexer.PushState(States.OPTIONS_SECTION);
            }, States.GRAMMAR);

            // ";" terminates the options/factory section and returns to the previous state
            lexer.AddStringAction(";", match =>
            {
                match.Token = Symbols.SEMI;
                lexer.PopState();
            }, States.OPTIONS_SECTION, States.FACTORY_SECTION);
            lexer.AddStringRule("terminals", Symbols.TERMINALS, States.GRAMMAR);
            lexer.AddStringRule("var", Symbols.VAR, States.GRAMMAR);
            lexer.AddStringRule("types", Symbols.TYPES, States.GRAMMAR);
            lexer.AddStringRule("patterns", Symbols.PATTERNS, States.GRAMMAR);
            lexer.AddStringRule("tokens", Symbols.TOKENS, States.GRAMMAR);
            // context-dependent keywords: the leading array lists the symbol(s) the rule is tied to
            lexer.AddStringRule(new[] { Symbols.TOKENS }, "int", Symbols.INT, States.GRAMMAR);
            lexer.AddStringRule(new[] { Symbols.STATES }, "int", Symbols.INT, States.GRAMMAR);
            lexer.AddStringRule(new[] { Symbols.LEXER }, "override", Symbols.OVERRIDE, States.FACTORY_SECTION);
            lexer.AddStringRule(new[] { Symbols.PARSER }, "override", Symbols.OVERRIDE, States.FACTORY_SECTION);
            lexer.AddStringRule("states", Symbols.STATES, States.GRAMMAR);
            lexer.AddStringRule("precedence", Symbols.PRECEDENCE, States.GRAMMAR);
            lexer.AddStringRule("parsing", Symbols.PARSING, States.GRAMMAR);
            lexer.AddStringRule("scanning", Symbols.SCANNING, States.GRAMMAR);
            lexer.AddStringRule("end", Symbols.END, States.GRAMMAR);
            lexer.AddStringRule("%EOF", Symbols.EOF_ACTION, States.GRAMMAR);
            lexer.AddRegexRule("%empty", Symbols.EMPTY, States.GRAMMAR);
            lexer.AddRegexRule("%mark", Symbols.MARK, States.GRAMMAR);
            lexer.AddRegexRule(IdentifierPattern, Symbols.IDENTIFIER, States.GRAMMAR, States.OPTIONS_SECTION, States.FACTORY_SECTION);

            // ----- punctuation and operators ------------------------------------------
            lexer.AddStringRule(";", Symbols.SEMI, States.GRAMMAR);
            lexer.AddStringRule(":", Symbols.COLON, States.GRAMMAR, States.FACTORY_SECTION);
            lexer.AddStringRule("=", Symbols.EQ, States.GRAMMAR);
            lexer.AddStringRule("->", Symbols.RARROW, States.GRAMMAR);
            lexer.AddStringRule("|", Symbols.PIPE, States.GRAMMAR);
            lexer.AddStringRule("?", Symbols.QUESTION_MARK, States.GRAMMAR);
            lexer.AddStringRule(".", Symbols.DOT, States.GRAMMAR, States.FACTORY_SECTION);
            lexer.AddStringRule("[", Symbols.LBRACKET, States.GRAMMAR);
            lexer.AddStringRule("]", Symbols.RBRACKET, States.GRAMMAR);
            lexer.AddStringRule("(", Symbols.LPAREN, States.GRAMMAR);
            lexer.AddStringRule(")", Symbols.RPAREN, States.GRAMMAR);
            lexer.AddStringRule("<", Symbols.LANGLE, States.GRAMMAR, States.FACTORY_SECTION);
            lexer.AddStringRule(">", Symbols.RANGLE, States.GRAMMAR, States.FACTORY_SECTION);
            lexer.AddStringRule("*", Symbols.ASTERISK, States.GRAMMAR);
            lexer.AddStringRule("+", Symbols.PLUS, States.GRAMMAR, States.OPTIONS_SECTION, States.FACTORY_SECTION);
            lexer.AddStringRule("++", Symbols.PLUSPLUS, States.GRAMMAR);
            lexer.AddStringRule("+?", Symbols.PLUS_OPT, States.GRAMMAR);
            lexer.AddStringRule("-", Symbols.MINUS, States.GRAMMAR, States.OPTIONS_SECTION, States.FACTORY_SECTION);
            lexer.AddStringRule("^", Symbols.ACCENT, States.GRAMMAR);
            lexer.AddStringRule("#", Symbols.HASH, States.GRAMMAR);
            lexer.AddStringRule("@", Symbols.AT, States.GRAMMAR);

            // ----- strings and characters in code --------------------------------------------
            // we are just rewriting input controlling when the string ends, so we can
            // analyze real C# code properly (in short we want to know if we are in C# code, or in C# string)
            lexer.AddStringAction(@"\\", match => str_buf.Append(match.Text),
                                  States.STR_CODE, States.CHAR_CODE);
            lexer.AddStringAction("\\\"", match => str_buf.Append(match.Text),
                                  States.STR_CODE, States.CHAR_CODE);
            lexer.AddStringAction("\\\'", match => str_buf.Append(match.Text),
                                  States.STR_CODE, States.CHAR_CODE);
            // doubled quote is the escaped quote of a verbatim string -- it does not end the string
            lexer.AddStringAction("\"\"", match => str_buf.Append(match.Text),
                                  States.VERBATIM_STR_CODE);

            lexer.AddStringAction("'", match => // start character in code
            {
                str_buf.Append(match.Text);
                lexer.PushState(States.CHAR_CODE);
            }, States.CODE_BLOCK, States.CODE_EXPR, States.IN_CODE_MACRO);
            lexer.AddStringAction("\"", match => // start string in code
            {
                str_buf.Append(match.Text);
                lexer.PushState(States.STR_CODE);
            }, States.CODE_BLOCK, States.CODE_EXPR, States.IN_CODE_MACRO);
            lexer.AddStringAction("@\"", match => // start verbatim string in code
            {
                str_buf.Append(match.Text);
                lexer.PushState(States.VERBATIM_STR_CODE);
            }, States.CODE_BLOCK, States.CODE_EXPR, States.IN_CODE_MACRO);
            lexer.AddStringAction("\"", match =>  // end string in code
            {
                str_buf.Append(match.Text);
                lexer.PopState();
            }, States.STR_CODE, States.VERBATIM_STR_CODE);
            lexer.AddStringAction("'", match =>  // end character in code
            {
                str_buf.Append(match.Text);
                lexer.PopState();
            }, States.CHAR_CODE);
            // ----- string and regex common --------------------------------------------
            // (verbatim strings are intentionally absent from this list -- they may span lines)
            lexer.AddRegexAction("\r|\n", match => { match.Token = Symbols.Error; match.Value = "New line not allowed inside a string/regex"; },
                                 States.REGEX_GRAMMAR, States.STR_CODE, States.STR_GRAMMAR, States.CHAR_CODE);
            // two backslashes
            lexer.AddStringAction(@"\\", match => lex_pattern.AddSpecial(match.Text, @"\"),
                                  States.STR_GRAMMAR, States.REGEX_GRAMMAR);

            // ----- anything else for string in code ------------------------------------
            lexer.AddRegexAction(".", match => str_buf.Append(match.Text), States.STR_CODE, States.VERBATIM_STR_CODE, States.CHAR_CODE);
            // A verbatim string may contain line breaks and no other rule of this state matches them
            // (the comment rule further down spells out "\n" and "\r" next to ".", so "." is not
            // expected to cover line breaks); copy them through like any other character.
            lexer.AddRegexAction("\r|\n", match => str_buf.Append(match.Text), States.VERBATIM_STR_CODE);

            // ----- string --------------------------------------------------------------
            lexer.AddStringAction("\"", match => // start string in grammar
            {
                lex_pattern = new LexPattern(LexPattern.TypeEnum.String);
                lexer.PushState(States.STR_GRAMMAR);
            }, States.GRAMMAR);

            lexer.AddStringAction("\"", match => // end string in grammar
            {
                match.Value = lex_pattern.SetStringComparison(StringCaseComparison.Sensitive);
                lex_pattern = null;
                match.Token = Symbols.STRING;
                lexer.PopState();
            }, States.STR_GRAMMAR);
            lexer.AddStringAction("\"i", match => // end string in grammar (case insensitive variant)
            {
                match.Value = lex_pattern.SetStringComparison(StringCaseComparison.Insensitive);
                lex_pattern = null;
                match.Token = Symbols.STRING;
                lexer.PopState();
            }, States.STR_GRAMMAR);

            //http://msdn.microsoft.com/en-us/library/aa691087%28v=vs.71%29.aspx
            //http://msdn.microsoft.com/en-us/library/aa664669%28v=vs.71%29.aspx
            //http://blogs.msdn.com/b/csharpfaq/archive/2004/03/12/what-character-escape-sequences-are-available.aspx
            lexer.AddStringAction(@"\0", match => lex_pattern.AddSpecial(match.Text, "\0"), States.STR_GRAMMAR);
            lexer.AddStringAction(@"\a", match => lex_pattern.AddSpecial(match.Text, "\a"), States.STR_GRAMMAR);
            lexer.AddStringAction(@"\b", match => lex_pattern.AddSpecial(match.Text, "\b"), States.STR_GRAMMAR);
            lexer.AddStringAction(@"\f", match => lex_pattern.AddSpecial(match.Text, "\f"), States.STR_GRAMMAR);
            lexer.AddStringAction(@"\n", match => lex_pattern.AddSpecial(match.Text, "\n"), States.STR_GRAMMAR);
            lexer.AddStringAction(@"\r", match => lex_pattern.AddSpecial(match.Text, "\r"), States.STR_GRAMMAR);
            lexer.AddStringAction(@"\t", match => lex_pattern.AddSpecial(match.Text, "\t"), States.STR_GRAMMAR);
            lexer.AddStringAction(@"\v", match => lex_pattern.AddSpecial(match.Text, "\v"), States.STR_GRAMMAR);
            lexer.AddStringAction(@"\'", match => lex_pattern.AddSpecial(match.Text, "\'"), States.STR_GRAMMAR);
            lexer.AddStringAction(@"\""", match => lex_pattern.AddSpecial(match.Text, "\""), States.STR_GRAMMAR);
            //http://msdn.microsoft.com/en-us/library/bb311038.aspx
            // Substring(2) strips the two-character prefix (backslash + x/u/U), leaving the hex digits
            lexer.AddRegexAction("\\\\x[0-9a-fA-F]{1,4}",
                                 match => lex_pattern.AddHexCode(match.Text, match.Text.Substring(2)),
                                 States.STR_GRAMMAR);
            lexer.AddRegexAction("\\\\u[0-9a-fA-F]{4}",
                                 match => lex_pattern.AddHexCode(match.Text, match.Text.Substring(2)),
                                 States.STR_GRAMMAR);
            lexer.AddRegexAction("\\\\U[0-9a-fA-F]{8}",
                                 match => lex_pattern.AddHexCode(match.Text, match.Text.Substring(2)),
                                 States.STR_GRAMMAR);

            // Any other "backslash + character" pair is an unknown escape. The pattern has to be
            // an escaped backslash followed by "." (any character); a bare "\." would match a
            // literal dot instead and flag every "." inside a grammar string as an error.
            lexer.AddRegexAction(@"\\.", match => { match.Token = Symbols.Error; match.Value = "Unrecognized escape sequence \"" + match.Text.Substring(1) + "\""; }, States.STR_GRAMMAR);
            // backslash not followed by anything the rule above could consume
            lexer.AddStringAction(@"\", match => { match.Token = Symbols.Error; match.Value = "Empty escape sequence"; }, States.STR_GRAMMAR);
            lexer.AddRegexAction(".", match => lex_pattern.Add(match.Text), States.STR_GRAMMAR);

            // ----- regex --------------------------------------------------------------

            lexer.AddStringAction("/", match => // regex start
            {
                lex_pattern = new LexPattern(LexPattern.TypeEnum.Regex);
                lexer.PushState(States.REGEX_GRAMMAR);
            }, States.GRAMMAR);
            lexer.AddStringAction("/", match => // regex end
            {
                match.Value = lex_pattern.SetStringComparison(StringCaseComparison.Sensitive);
                lex_pattern = null;
                match.Token = Symbols.REGEX;
                lexer.PopState();
            }, States.REGEX_GRAMMAR);
            lexer.AddStringAction("/i", match => // regex end (case insensitive variant)
            {
                match.Value = lex_pattern.SetStringComparison(StringCaseComparison.Insensitive);
                lex_pattern = null;
                match.Token = Symbols.REGEX;
                lexer.PopState();
            }, States.REGEX_GRAMMAR);

            // backslash and end-of-regex: keep only the slash, drop the escaping backslash
            lexer.AddStringAction(@"\/", match => lex_pattern.Add(match.Text.Substring(1)), States.REGEX_GRAMMAR);
            lexer.AddRegexAction(".", match => lex_pattern.Add(match.Text), States.REGEX_GRAMMAR);

            // ---- whitespaces --------------------------
            // keep the variable for macro clean from whitespaces
            lexer.AddRegexAction(new [] { Symbols.LMACRO }, whitespace_pattern, _ => { }, States.IN_CODE_MACRO);
            lexer.AddRegexAction(whitespace_pattern, match => str_buf.Append(match.Text), States.CODE_BLOCK, States.CODE_EXPR, States.IN_CODE_MACRO);
            lexer.AddRegexAction(whitespace_pattern, _ => { }, States.GRAMMAR, States.OPTIONS_SECTION, States.FACTORY_SECTION);
            // ----------- macros ----------------------

            lexer.AddStringAction("$(", match =>
            {
                // emit the code collected so far before the macro token itself
                lexer.PrependToken(Symbols.CODE_SNIPPET, new CodeSnippet(str_buf.ToString(), code_statement));
                code_statement = false;
                str_buf.Clear();

                match.Token = Symbols.LMACRO;
                lexer.PushState(States.IN_CODE_MACRO);
            }, States.CODE_BLOCK, States.IN_CODE_MACRO);

            lexer.AddStringAction(":", match =>
            {
                if (lexer.NestingCounter > 0)
                {
                    // inside nested parentheses the colon is plain code
                    str_buf.Append(match.Text);
                }
                else
                {
                    // keeping macro variable clean
                    if (str_buf.ToString().Trim().Length > 0)
                    {
                        lexer.PrependToken(Symbols.CODE_SNIPPET, new CodeSnippet(str_buf.ToString(), code_statement));
                    }

                    str_buf.Clear();
                    code_statement = false;
                    match.Token    = Symbols.COLON;
                }
            }, States.IN_CODE_MACRO);

            lexer.AddStringAction("(", match =>
            {
                str_buf.Append(match.Text);
                ++lexer.NestingCounter;
            }, States.IN_CODE_MACRO);
            lexer.AddStringAction(")", match =>
            {
                if (lexer.NestingCounter > 0)
                {
                    // closes a parenthesis opened inside the macro, not the macro itself
                    str_buf.Append(match.Text);
                    --lexer.NestingCounter;
                }
                else
                {
                    // keeping macro variable clean
                    if (str_buf.ToString().Trim().Length > 0)
                    {
                        lexer.PrependToken(Symbols.CODE_SNIPPET, new CodeSnippet(str_buf.ToString(), code_statement));
                    }

                    str_buf.Clear();
                    code_statement = false;
                    match.Token    = Symbols.RMACRO;
                    lexer.PopState();
                }
            }, States.IN_CODE_MACRO);
            // ----------- expressions ----------------------

            // this is not 100% correct, because after COMMA can be LBRACE, so this is not CODE_EXPR but CODE_BLOCK
            // so we have to fix this later
            lexer.AddStringAction(new [] { Symbols.RARROW, Symbols.IDENTIFIER }, ",", match =>
            {
                match.Token = Symbols.COMMA;
                lexer.PushState(States.CODE_EXPR);
                str_buf        = new StringBuilder();
                code_statement = false;
            }, States.GRAMMAR);
            // not an expression, but it has to be below the more restricted COMMA rule
            lexer.AddStringRule(",", Symbols.COMMA, States.GRAMMAR, States.OPTIONS_SECTION, States.FACTORY_SECTION);

            lexer.AddStringAction(";", match => // end of code expression
            {
                lexer.PopState();
                lexer.PrependToken(Symbols.CODE_SNIPPET, new CodeSnippet(str_buf.ToString(), false));
                match.Token = Symbols.SEMI;
                str_buf     = null;
            }, States.CODE_EXPR);
            // ----------- code block ----------------------

            lexer.AddStringAction("(", match => // start code block
            {
                match.Token = Symbols.LPAREN;
                lexer.PushState(States.CODE_BLOCK);
                str_buf        = new StringBuilder();
                code_statement = false;
            }, States.FACTORY_SECTION);
            lexer.AddStringAction("{", match => // start code block
            {
                match.Token = Symbols.LBRACE;
                lexer.PushState(States.CODE_BLOCK);
                str_buf        = new StringBuilder();
                code_statement = false;
            }, States.GRAMMAR, States.FACTORY_SECTION);
            lexer.AddStringAction(new [] { Symbols.RARROW, Symbols.IDENTIFIER, Symbols.COMMA },
                                  "{", match => // start code block -- this is the correction of the previous too eager switch to CODE_EXPR
            {
                match.Token = Symbols.LBRACE;
                lexer.PopState(); // remove the previous CODE_EXPR
                lexer.PushState(States.CODE_BLOCK);
                str_buf        = new StringBuilder();
                code_statement = false;
            }, States.CODE_EXPR);

            // nested "{" / "(" inside a code block: plain code, but the state stack tracks the depth
            lexer.AddStringAction("{", match =>
            {
                str_buf.Append(match.Text);
                lexer.PushState(States.CODE_BLOCK);
            }, States.CODE_BLOCK);
            lexer.AddStringAction("(", match =>
            {
                str_buf.Append(match.Text);
                lexer.PushState(States.CODE_BLOCK);
            }, States.CODE_BLOCK);

            lexer.AddStringAction("}", match =>
            {
                lexer.PopState();
                if (lexer.State != States.CODE_BLOCK)
                {
                    // left the outermost code block -- emit the collected code and the closing token
                    lexer.PrependToken(Symbols.CODE_SNIPPET, new CodeSnippet(str_buf.ToString(), code_statement));
                    str_buf     = null;
                    match.Token = Symbols.RBRACE;
                }
                else
                {
                    str_buf.Append(match.Text);
                }
            }, States.CODE_BLOCK);
            lexer.AddStringAction(")", match =>
            {
                lexer.PopState();
                if (lexer.State != States.CODE_BLOCK)
                {
                    // left the outermost code block -- emit the collected code and the closing token
                    lexer.PrependToken(Symbols.CODE_SNIPPET, new CodeSnippet(str_buf.ToString(), code_statement));
                    str_buf     = null;
                    match.Token = Symbols.RPAREN;
                }
                else
                {
                    str_buf.Append(match.Text);
                }
            }, States.CODE_BLOCK);
            // identifier with dollar sign ("$") in front
            lexer.AddRegexAction("\\" + CodePiece.PlaceholderSigil + IdentifierPattern, match =>
            {
                if (str_buf.Length > 0)
                {
                    lexer.PrependToken(Symbols.CODE_SNIPPET, new CodeSnippet(str_buf.ToString(), code_statement));
                }
                str_buf.Clear();
                code_statement = false;

                match.Value = match.Text.Substring(1); // drop the sigil
                match.Token = Symbols.CODE_PLACEHOLDER;
            }, States.CODE_BLOCK, States.CODE_EXPR, States.IN_CODE_MACRO);
            lexer.AddRegexAction(IdentifierPattern, match =>
            {
                if (str_buf.Length > 0)
                {
                    lexer.PrependToken(Symbols.CODE_SNIPPET, new CodeSnippet(str_buf.ToString(), code_statement));
                }
                str_buf.Clear();
                code_statement = false;
                match.Token    = Symbols.IDENTIFIER;
            }, States.CODE_BLOCK, States.CODE_EXPR, States.IN_CODE_MACRO);

            lexer.AddStringAction(";", match => { code_statement = true; str_buf.Append(match.Text); }, States.CODE_BLOCK);
            lexer.AddRegexAction(".", match => str_buf.Append(match.Text), States.CODE_BLOCK, States.CODE_EXPR, States.IN_CODE_MACRO);

            // ---- comments -----------------------------------------
            // COMMENT is listed among the states of "/*" as well, so block comments nest
            lexer.AddStringAction("/*", _ => lexer.PushState(States.COMMENT), States.GRAMMAR, States.OPTIONS_SECTION, States.CODE_BLOCK, States.COMMENT, States.CODE_EXPR, States.IN_CODE_MACRO, States.FACTORY_SECTION);
            lexer.AddStringAction("*/", _ => lexer.PopState(), States.COMMENT);
            lexer.AddRegexAction(".|\n|\r", _ => { }, States.COMMENT);

            lexer.AddStringAction("*/", match => { match.Value = "Unmatched */"; match.Token = Symbols.Error; }, States.GRAMMAR, States.OPTIONS_SECTION, States.CODE_BLOCK, States.CODE_EXPR, States.IN_CODE_MACRO, States.FACTORY_SECTION);

            // single-liners comment
            // NOTE(review): the pattern requires a terminating "\n", so a "//" comment on the very
            // last line of the input (no trailing newline) is not recognized by this rule -- confirm
            // whether that case needs handling.
            lexer.AddRegexAction("//.*\n", _ => { }, States.GRAMMAR, States.OPTIONS_SECTION, States.CODE_BLOCK, States.CODE_EXPR, States.IN_CODE_MACRO, States.FACTORY_SECTION);

            // -------------------------------------------------------

            lexer.EofAction = match =>
            {
                if (!lexer.IsValidEofState)
                {
                    match.Value = "Invalid state at EOF";
                    match.Token = Symbols.Error;
                }
                else
                {
                    match.Token = Symbols.EOF;
                }
            };

            return lexer;
        }