/// <summary>
/// Fetches the next token from the source string.
/// </summary>
/// <remarks>
/// Tokens whose symbol code is -1 are consumed silently and never returned.
/// Once the input is exhausted, every call returns a token carrying
/// <c>GrammarParser.CODE_END</c> and an empty value.
/// </remarks>
/// <returns>The next token discovered in the input string.</returns>
public Token GetNextToken()
{
    // Skipped tokens are consumed by looping rather than by recursing, so a long
    // run of skipped tokens cannot grow the call stack.
    while (position < input.Length)
    {
        int matchLength = 0;
        int symbolCode = -2;
        int endOfLine;

        if (previousSymbolCode == GrammarParser.CODE_HEADER)
        {
            // HEADER_CODE, the code that is to be put at the beginning of the
            // generated source file
            symbolCode = GrammarParser.CODE_HEADERCODE;
            var endOfHeaderMatch = re_endOfHeader.Match(input, position);
            if (!endOfHeaderMatch.Success)
                matchLength = input.Length - position;
            else
                matchLength = endOfHeaderMatch.Index - position;
        }
        else if (previousSymbolCode == GrammarParser.CODE_EQUALS)
        {
            // REGEX, a regular expression defining a token; it runs to the end of the line
            symbolCode = GrammarParser.CODE_REGEX;
            endOfLine = input.IndexOf('\n', position);
            if (endOfLine == -1)
                matchLength = input.Length - position;
            else
                matchLength = endOfLine - position;
        }
        else
        {
            switch (input[position])
            {
                case '<':
                    symbolCode = GrammarParser.CODE_LANGLE;
                    matchLength = 1;
                    break;
                case '>':
                    symbolCode = GrammarParser.CODE_RANGLE;
                    matchLength = 1;
                    break;
                case '.':
                    symbolCode = GrammarParser.CODE_DOT;
                    matchLength = 1;
                    break;
                case '|':
                    symbolCode = GrammarParser.CODE_OR;
                    matchLength = 1;
                    break;
                case '=':
                    symbolCode = GrammarParser.CODE_EQUALS;
                    matchLength = 1;
                    break;
                case ':':
                    // The bounds check keeps a ':' near the end of the input from
                    // indexing past the string.
                    if ((position + 2 < input.Length)
                        && (input[position + 1] == ':')
                        && (input[position + 2] == '='))
                    {
                        symbolCode = GrammarParser.CODE_DERIVES;
                        matchLength = 3;
                    }
                    else
                    {
                        // Report the first character that actually breaks the "::=" sequence.
                        int offender = position + 1;
                        if ((offender < input.Length) && (input[offender] == ':'))
                            offender++;

                        if (offender < input.Length)
                            reportParsingError(string.Format("Unexpected character '{0}' following ':'.", input[offender]));
                        else
                            reportParsingError("Unexpected end of input following ':'.");
                    }
                    break;
                case '"':
                    symbolCode = GrammarParser.CODE_QUOTED;
                    int endOfQuotes = input.IndexOf('"', position + 1);
                    if (endOfQuotes == -1)
                        matchLength = input.Length - position;
                    else
                        matchLength = (endOfQuotes + 1) - position;
                    break;
                case '(':
                    var match = re_regexOpts.Match(input, position);
                    // A failed match reports Index 0, so Success has to be checked as well.
                    if (match.Success && (match.Index == position))
                    {
                        symbolCode = GrammarParser.CODE_REGEXOPTS;
                        matchLength = match.Length;
                    }
                    else
                    {
                        // Quote the offending text up to the closing parenthesis or the
                        // end of the line, whichever comes first.
                        int endOfSetter = input.IndexOfAny(new char[] { ')', '\r', '\n' }, position);
                        string offendingText;
                        if (endOfSetter == -1)
                            offendingText = input.Substring(position);
                        else if (input[endOfSetter] == ')')
                            offendingText = input.Substring(position, (endOfSetter + 1) - position);
                        else
                            offendingText = input.Substring(position, endOfSetter - position);

                        reportParsingError(string.Format("\"{0}\" is not a proper regular expression option setter.", offendingText));
                    }
                    break;
                case '#':
                    // A comment running to the end of the line (newline included).
                    endOfLine = input.IndexOf('\n', position + 1);
                    symbolCode = GrammarParser.CODE_SKIP;
                    if (endOfLine == -1)
                        matchLength = input.Length - position;
                    else
                        matchLength = (endOfLine + 1) - position;
                    break;
                case '%':
                    var keyword = re_identifier.Match(input, position + 1);
                    // Only an identifier that starts right after '%' counts as the keyword.
                    string keywordText = (keyword.Success && (keyword.Index == position + 1))
                        ? keyword.Value
                        : "";
                    switch (keywordText)
                    {
                        case "header":
                            symbolCode = GrammarParser.CODE_HEADER;
                            break;
                        case "lexer":
                            symbolCode = GrammarParser.CODE_LEXER;
                            break;
                        case "null":
                            symbolCode = GrammarParser.CODE_NULL;
                            break;
                        case "parser":
                            symbolCode = GrammarParser.CODE_PARSER;
                            break;
                        case "start":
                            symbolCode = GrammarParser.CODE_START;
                            break;
                        case "type":
                            symbolCode = GrammarParser.CODE_TYPE;
                            break;
                        case "userobject":
                            symbolCode = GrammarParser.CODE_USEROBJECT;
                            break;
                        default:
                            reportParsingError(string.Format("Unknown keyword \"%{0}\".", keywordText));
                            break;
                    }
                    matchLength = 1 + keywordText.Length;
                    break;
                case '{':
                    symbolCode = GrammarParser.CODE_CODE;
                    matchLength = findEndOfCodeBlock(position) - position;
                    break;
                default:
                    var whitespaceMatch = re_whitespace.Match(input, position);
                    if (whitespaceMatch.Success && (whitespaceMatch.Index == position))
                    {
                        symbolCode = GrammarParser.CODE_SKIP;
                        matchLength = whitespaceMatch.Length;
                    }
                    else
                    {
                        var identifierMatch = re_identifier.Match(input, position);
                        if (identifierMatch.Success && (identifierMatch.Index == position))
                        {
                            symbolCode = GrammarParser.CODE_IDENTIFIER;
                            matchLength = identifierMatch.Length;
                        }
                        else
                        {
                            reportParsingError(string.Format("Unexpected character '{0}'.", input[position]));
                        }
                    }
                    break;
            }
        }

        Token token = new Token();
        token.LineNumber = lineNumber;
        token.ColumnNumber = columnNumber;
        token.SymbolCode = symbolCode;
        token.Value = input.Substring(position, matchLength);

        // Recompute the human-readable position; matched text may span several lines.
        int lastNewLine = token.Value.LastIndexOf('\n');
        if (lastNewLine == -1)
        {
            columnNumber += matchLength;
        }
        else
        {
            columnNumber = token.Value.Length - lastNewLine;
            foreach (char c in token.Value)
                if (c == '\n')
                    lineNumber++;
        }

        position += matchLength;
        previousSymbolCode = symbolCode;

        // Symbol code -1 marks text that is skipped; anything else is handed to the caller.
        if (token.SymbolCode != -1)
            return token;
    }

    // End of input: token == $end
    Token finalToken = new Token();
    finalToken.LineNumber = lineNumber;
    finalToken.ColumnNumber = columnNumber;
    finalToken.SymbolCode = GrammarParser.CODE_END;
    finalToken.Value = "";
    hasTokens = false;
    previousSymbolCode = 0;
    return finalToken;
}

/// <summary>
/// Finds where the brace-delimited code block opening at <paramref name="start"/> ends.
/// Nested braces are tracked; braces inside // and /* */ comments and inside quoted
/// literals are ignored.
/// </summary>
/// <param name="start">Index of the opening '{' in the input.</param>
/// <returns>
/// The index just past the matching '}', or the input length when the block
/// (or a comment or literal inside it) is not terminated.
/// </returns>
private int findEndOfCodeBlock(int start)
{
    int blockDepth = 1;
    int blockPosition = start + 1;

    while ((blockDepth > 0) && (blockPosition < input.Length))
    {
        switch (input[blockPosition])
        {
            case '{':
                blockDepth++;
                blockPosition++;
                break;
            case '}':
                blockDepth--;
                blockPosition++;
                break;
            case '/':
                // Look ahead only when there is a character to look at.
                char next = (blockPosition + 1 < input.Length) ? input[blockPosition + 1] : '\0';
                if (next == '/')
                {
                    int endOfLine = input.IndexOf('\n', blockPosition + 2);
                    blockPosition = (endOfLine == -1) ? input.Length : endOfLine + 1;
                }
                else if (next == '*')
                {
                    int endOfComment = input.IndexOf("*/", blockPosition + 2);
                    blockPosition = (endOfComment == -1) ? input.Length : endOfComment + 2;
                }
                else
                {
                    blockPosition++;
                }
                break;
            case '\'':
            case '"':
                Regex quoteContents = input[blockPosition] == '"'
                    ? re_doubleQuoteContents
                    : re_singleQuoteContents;
                int literalLength = quoteContents.Match(input, blockPosition + 1).Length;
                // Opening quote + contents + closing quote.
                blockPosition += 1 + literalLength + 1;
                break;
            default:
                blockPosition++;
                break;
        }
    }

    // An unterminated literal can step past the end of the input; clamp so the
    // caller's Substring stays in range.
    if (blockPosition > input.Length)
        blockPosition = input.Length;

    return blockPosition;
}
/// <summary>
/// Fetches the next token from the source string.
/// </summary>
/// <remarks>
/// Tokens whose symbol code is -1 are consumed silently and never returned.
/// Once the input is exhausted, every call returns a token with symbol code 0
/// and an empty value.
/// </remarks>
/// <returns>The next token discovered in the input string.</returns>
/// <exception cref="ParsingException">
/// Thrown when no token definition matches at the current position, or when the
/// match there is empty and the lexer therefore cannot advance.
/// </exception>
public Token GetNextToken()
{
    // Ignored tokens are consumed by looping rather than by recursing, so a long
    // run of ignored tokens cannot grow the call stack.
    while (position < input.Length)
    {
        Match match = regex.Match(input, position);

        // If the regex does not match text starting exactly at the current position,
        // we have hit something no token definition describes; report it.
        if (!match.Success || (match.Index > position))
        {
            string unexpectedCharacter = null;
            switch (input[position])
            {
                case '\n':
                    unexpectedCharacter = "\\n";
                    break;
                case '\r':
                    unexpectedCharacter = "\\r";
                    break;
                case '\t':
                    unexpectedCharacter = "\\t";
                    break;
                default:
                    unexpectedCharacter = input[position].ToString();
                    break;
            }
            throw new ParsingException(
                string.Format("Unexpected character '{0}' encountered by the lexer.", unexpectedCharacter),
                lineNumber, columnNumber);
        }

        // An empty match would leave the position unchanged, so every later call
        // would see the very same match; fail loudly instead of spinning.
        if (match.Length == 0)
        {
            throw new ParsingException(
                "A token definition matched an empty string, so the lexer cannot advance.",
                lineNumber, columnNumber);
        }

        Token token = new Token();
        token.LineNumber = lineNumber;
        token.ColumnNumber = columnNumber;

        // The regex contains capture groups named __0, __1, ...; lower-numbered groups
        // take precedence (documented behaviour of the regex | operator). For the first
        // group that matched, groupSymbolCodes tells which terminal it is.
        for (int i = 0; i < groupSymbolCodes.Count; i++)
        {
            if (match.Groups["__" + i.ToString()].Success)
            {
                token.SymbolCode = groupSymbolCodes[i];
                token.Value = match.Value;
                break;
            }
        }

        position += match.Length;

        // Recompute the human-readable position; matched text may span several lines.
        int lastNewLine = match.Value.LastIndexOf('\n');
        if (lastNewLine == -1)
        {
            columnNumber += match.Length;
        }
        else
        {
            columnNumber = match.Length - lastNewLine;
            foreach (char c in match.Value)
                if (c == '\n')
                    lineNumber++;
        }

        // Symbol code -1 marks text to be ignored; anything else goes to the caller.
        if (token.SymbolCode != -1)
            return token;
    }

    // End of input: token == $end
    Token finalToken = new Token();
    finalToken.LineNumber = lineNumber;
    finalToken.ColumnNumber = columnNumber;
    finalToken.SymbolCode = 0;
    finalToken.Value = "";
    hasTokens = false;
    return finalToken;
}
/// <summary>
/// Reads the grammar specification stored at <paramref name="specificationPath"/>
/// and builds the corresponding <see cref="Grammar"/>.
/// </summary>
/// <remarks>
/// When compiled with BOOTSTRAP, the specification is processed by the hand-written
/// code below, which assumes a well-formed token order and validates only symbol
/// usage and regular expressions. Otherwise the specification is parsed by a parser
/// built from runtime data embedded in the assembly resources.
/// </remarks>
/// <param name="specificationPath">Path of the grammar specification file.</param>
/// <param name="warningMessages">Receives the warnings produced while processing the specification.</param>
/// <returns>The grammar described by the specification.</returns>
/// <exception cref="InvalidSpecificationException">
/// Thrown by the BOOTSTRAP implementation when at least one error was found; it
/// carries all collected errors followed by the warnings.
/// </exception>
public static Grammar ParseGrammar(string specificationPath, out IList<string> warningMessages)
{
#if BOOTSTRAP
    GrammarLexer lexer = new GrammarLexer();
    lexer.SourceString = File.ReadAllText(specificationPath);
    Token token = lexer.GetNextToken();

    // Errors and warnings are collected here. When an error occurs we keep reading and
    // only at the end throw one exception carrying all errors and warnings; when nothing
    // serious happens, the list of warnings is handed back to the caller.
    List<string> errorMessages = new List<string>();
    warningMessages = new List<string>();

    // The code to be inserted at the start of the generated code.
    string headerCode = null;

    // Names of all symbols; doubles as the lookup table from symbol code to symbol name.
    List<string> symbolNames = new List<string>();
    // The inverse of symbolNames: maps a symbol name to its code.
    Dictionary<string, int> symbolCodes = new Dictionary<string, int>();

    // Regular expressions defining the terminal symbols. They are kept separately instead
    // of being merged right away, so that when compiling the combined regex fails we can
    // easily test which expressions are to blame.
    List<string> regexes = new List<string>();
    // Where each expression in regexes was found, for more detailed messages.
    List<int> regexLines = new List<int>();
    List<int> regexColumns = new List<int>();
    // The code of the symbol each expression describes, or -1 when the expression matches
    // text that is to be ignored. This is runtime data used directly by the lexer.
    List<int> groupSymbolCodes = new List<int>();
    // The combined regular expression the lexer scans tokens with; the second part of the
    // lexer's runtime data.
    Regex regex = null;

    // Name of the pseudo-terminal whose tokens are discarded instead of being sent to the parser.
    string nullTerminal = "";

    // Global options for the .NET regex engine (case insensitive, multiline...),
    // together with the place they were found at.
    string regexOpts = null;
    int regexOptsLine = -1;
    int regexOptsColumn = -1;

    if (token.SymbolCode == CODE_HEADER)
    {
        token = lexer.GetNextToken();
        headerCode = token.Value;
        token = lexer.GetNextToken();
    }

    // skipping LEXER
    token = lexer.GetNextToken();

    if (token.SymbolCode == CODE_NULL)
    {
        token = lexer.GetNextToken();
        nullTerminal = token.Value;
        token = lexer.GetNextToken();
    }

    if (token.SymbolCode == CODE_REGEXOPTS)
    {
        regexOpts = token.Value;
        regexOptsLine = token.LineNumber;
        regexOptsColumn = token.ColumnNumber;
        token = lexer.GetNextToken();
    }

    symbolNames.Add("$end");
    symbolCodes["$end"] = 0;

    while (token.SymbolCode == CODE_IDENTIFIER)
    {
        string symbol = token.Value;
        token = lexer.GetNextToken();
        // skipping EQUALS
        token = lexer.GetNextToken();

        // Remember where the regex itself sits before moving on to the next token,
        // so error messages point at the expression rather than at what follows it.
        string capturingRegex = token.Value;
        int regexLine = token.LineNumber;
        int regexColumn = token.ColumnNumber;
        token = lexer.GetNextToken();

        if (symbol == nullTerminal)
        {
            groupSymbolCodes.Add(-1);
        }
        else
        {
            int terminalCode;
            if (!symbolCodes.TryGetValue(symbol, out terminalCode))
            {
                symbolNames.Add(symbol);
                terminalCode = symbolNames.Count - 1;
                symbolCodes[symbol] = terminalCode;
            }
            groupSymbolCodes.Add(terminalCode);
        }

        regexes.Add(capturingRegex);
        regexLines.Add(regexLine);
        regexColumns.Add(regexColumn);
    }

    StringBuilder pattern = new StringBuilder();

    if (regexOpts != null)
    {
        pattern.Append(regexOpts);
    }

    for (int i = 0; i < regexes.Count; i++)
    {
        // The user's expressions are joined with '|' in the order they were given (in the
        // .NET regex engine, alternatives further to the left take precedence), each one
        // wrapped in a capture group named __i, where i is its ordinal counted from 0.
        if (i != 0)
        {
            pattern.Append('|');
        }
        pattern.AppendFormat("(?<{0}>{1})", "__" + i.ToString(), regexes[i]);
    }

    try
    {
        regex = new Regex(pattern.ToString(), RegexOptions.Compiled);
    }
    catch (ArgumentException)
    {
        // The combined pattern is invalid; test the pieces one by one to find the culprits.
        // The options are tested only when some were given: a null pattern would raise
        // ArgumentNullException, which is itself an ArgumentException.
        if (regexOpts != null)
        {
            try
            {
                new Regex(regexOpts);
            }
            catch (ArgumentException)
            {
                errorMessages.Add(string.Format("{0},{1}: The RegEx options are invalid.",
                    regexOptsLine, regexOptsColumn));
            }
        }

        for (int i = 0; i < regexes.Count; i++)
        {
            try
            {
                new Regex(regexes[i]);
            }
            catch (ArgumentException)
            {
                errorMessages.Add(string.Format("{0},{1}: This regular expression is invalid.",
                    regexLines[i], regexColumns[i]));
            }
        }
    }

    int numTerminals = symbolNames.Count;

    // skipping PARSER
    token = lexer.GetNextToken();

    // Nonterminals that appeared on the left-hand side of some rule.
    HashSet<int> reducibleNonterminals = new HashSet<int>();
    // Nonterminals that appeared on the right-hand side of some rule.
    HashSet<int> usedNonterminals = new HashSet<int>();
    bool[] terminalUsed = new bool[numTerminals];

    List<ProductionWithAction> productions = new List<ProductionWithAction>();

    symbolNames.Add("$start");
    symbolCodes["$start"] = numTerminals;
    // <$start> is put here as an exception, because we do not even want anyone to place
    // it on the right-hand side of a rule.
    usedNonterminals.Add(symbolCodes["$start"]);

    // skipping START
    token = lexer.GetNextToken();
    // skipping LANGLE
    token = lexer.GetNextToken();

    string startSymbol = token.Value;
    int existingStartCode;
    if (symbolCodes.TryGetValue(startSymbol, out existingStartCode) && existingStartCode < numTerminals)
    {
        errorMessages.Add(string.Format("{0},{1}: The nonterminal <{2}> shares it's name with a terminal symbol.",
            token.LineNumber, token.ColumnNumber, startSymbol));
    }
    token = lexer.GetNextToken();
    // skipping RANGLE
    token = lexer.GetNextToken();

    symbolNames.Add(startSymbol);
    symbolCodes[startSymbol] = symbolNames.Count - 1;

    string userObjectType = null;

    if (token.SymbolCode == CODE_USEROBJECT)
    {
        token = lexer.GetNextToken();
        userObjectType = readTypeName(lexer, ref token);
    }

    // Our rule 0, which aptly describes how the grammar is augmented.
    productions.Add(new ProductionWithAction(new Production(
        symbolCodes["$start"], new int[] { symbolCodes[startSymbol], symbolCodes["$end"] }), "{ return _1; }"));
    reducibleNonterminals.Add(symbolCodes["$start"]);
    usedNonterminals.Add(symbolCodes[startSymbol]);
    terminalUsed[symbolCodes["$end"]] = true;

    var typeMappings = new Dictionary<string, string>();

    // Processing of the rules.
    while (token.SymbolCode != CODE_END)
    {
        if (token.SymbolCode == CODE_TYPE)
        {
            token = lexer.GetNextToken();
            // skipping LANGLE
            token = lexer.GetNextToken();

            string nonterminal = token.Value;
            registerNonterminal(token, numTerminals, symbolNames, symbolCodes, errorMessages);

            token = lexer.GetNextToken();
            // skipping RANGLE
            token = lexer.GetNextToken();

            typeMappings.Add(nonterminal, readTypeName(lexer, ref token));
        }
        else
        {
            // Extract the symbol on the left-hand side and process it.
            // skipping LANGLE
            token = lexer.GetNextToken();

            int lhsSymbolCode = registerNonterminal(token, numTerminals, symbolNames, symbolCodes, errorMessages);
            reducibleNonterminals.Add(lhsSymbolCode);

            token = lexer.GetNextToken();
            // skipping RANGLE
            token = lexer.GetNextToken();

            // Process the right-hand side, which may consist of several symbol lists
            // separated by '|'. Each of those lists forms one rule of its own.
            while ((token.SymbolCode == CODE_DERIVES) || (token.SymbolCode == CODE_OR))
            {
                token = lexer.GetNextToken();
                List<int> rhsSymbols = new List<int>();

                // Stop at the end of input as well: the lexer keeps returning CODE_END,
                // so a missing code block would otherwise loop here forever.
                while ((token.SymbolCode != CODE_CODE) && (token.SymbolCode != CODE_END))
                {
                    int rhsSymbolCode = -1;

                    if (token.SymbolCode == CODE_LANGLE)
                    {
                        // skipping LANGLE
                        token = lexer.GetNextToken();

                        rhsSymbolCode = registerNonterminal(token, numTerminals, symbolNames, symbolCodes, errorMessages);
                        usedNonterminals.Add(rhsSymbolCode);

                        token = lexer.GetNextToken();
                        // skipping RANGLE
                        token = lexer.GetNextToken();
                    }
                    else
                    {
                        string rhsSymbol = token.Value;

                        // A bare name has to be a terminal. A nonterminal's name written
                        // without angle brackets does not denote a defined terminal, and
                        // its code must not be used to index terminalUsed.
                        int knownCode;
                        if (symbolCodes.TryGetValue(rhsSymbol, out knownCode) && knownCode < numTerminals)
                        {
                            rhsSymbolCode = knownCode;
                            terminalUsed[rhsSymbolCode] = true;
                        }
                        else
                        {
                            errorMessages.Add(string.Format("{0},{1}: The terminal '{2}' is used but not defined.",
                                token.LineNumber, token.ColumnNumber, rhsSymbol));
                        }

                        token = lexer.GetNextToken();
                    }

                    rhsSymbols.Add(rhsSymbolCode);
                }

                if (token.SymbolCode != CODE_CODE)
                {
                    errorMessages.Add(string.Format("{0},{1}: Unexpected end of input; a production of <{2}> has no action code block.",
                        token.LineNumber, token.ColumnNumber, symbolNames[lhsSymbolCode]));
                    break;
                }

                string code = token.Value;
                token = lexer.GetNextToken();

                productions.Add(new ProductionWithAction(new Production(lhsSymbolCode, rhsSymbols), code));
            }
        }
    }

    // ToArray forces the lazy Intersect to be evaluated now, so the set mutations in the
    // next two statements do not break its enumeration.
    int[] theGoodOnes = usedNonterminals.Intersect(reducibleNonterminals).ToArray();
    usedNonterminals.ExceptWith(theGoodOnes);
    reducibleNonterminals.ExceptWith(theGoodOnes);

    foreach (int nonterminal in usedNonterminals)
    {
        warningMessages.Add(string.Format("Warning: The nonterminal <{0}> isn't reducible.", symbolNames[nonterminal]));
    }

    foreach (int nonterminal in reducibleNonterminals)
    {
        warningMessages.Add(string.Format("Warning: The nonterminal <{0}> is defined but never used.", symbolNames[nonterminal]));
    }

    for (int terminal = 0; terminal < numTerminals; terminal++)
    {
        if (!terminalUsed[terminal])
        {
            warningMessages.Add(string.Format("Warning: The terminal '{0}' is defined but never used.", symbolNames[terminal]));
        }
    }

    if (errorMessages.Count > 0)
    {
        throw new InvalidSpecificationException(errorMessages.Concat(warningMessages));
    }

    // Everything has been read and checked. Now the rules are sorted by their left-hand
    // side and renumbered, and for every nonterminal we compute the index at which its
    // rules start.
    Production[] productionsArray = new Production[productions.Count];
    productionsArray[0] = productions[0].Production;
    string[] actions = new string[productions.Count];
    actions[0] = productions[0].Action;

    IEnumerable<ProductionWithAction> sortedProductions =
        productions.GetRange(1, productions.Count - 1).OrderBy(prod => prod.Production.LHSSymbol);

    int k = 1;
    foreach (ProductionWithAction productionWithAction in sortedProductions)
    {
        productionsArray[k] = productionWithAction.Production;
        productionsArray[k].ProductionCode = k;
        actions[k] = productionWithAction.Action;
        k++;
    }

    int numNonterminals = symbolCodes.Count - numTerminals;

    // One extra trailing slot holds the end offset of the last nonterminal's rules.
    int[] nonterminalProductionOffset = new int[numNonterminals + 1];

    int offset = 0;
    for (int nonterminal = 0; nonterminal < numNonterminals; nonterminal++)
    {
        nonterminalProductionOffset[nonterminal] = offset;
        while ((offset < productionsArray.Length) && (productionsArray[offset].LHSSymbol == numTerminals + nonterminal))
        {
            offset++;
        }
    }
    nonterminalProductionOffset[nonterminalProductionOffset.Length - 1] = offset;

    string[] nonterminalTypes = new string[numNonterminals];
    foreach (var typeMapping in typeMappings)
    {
        nonterminalTypes[symbolCodes[typeMapping.Key] - numTerminals] = typeMapping.Value;
    }

    // And now we just wrap it all up and hand it over.
    GrammarDefinition grammarDefinition = new GrammarDefinition(symbolNames.ToArray(), productionsArray,
        nonterminalProductionOffset, numTerminals);
    LexerData lexerData = new LexerData(regex, groupSymbolCodes);
    GrammarCode grammarCode = new GrammarCode(headerCode, actions, nonterminalTypes, userObjectType);

    Grammar grammar = new Grammar(grammarDefinition, lexerData, grammarCode);

    return grammar;
#else
    LexerData lexerData;
    ParserData parserData;

    Grammar.ReadRuntimeDataFromStream(
        new MemoryStream(YetAnotherParserGenerator.Properties.Resources.SpecificationGrammar),
        out lexerData, out parserData);

    GrammarLexer lexer = new GrammarLexer();
    Parser parser = new Parser(parserData);

    GrammarParserLocals locals = new GrammarParserLocals(specificationPath, out warningMessages);

    lexer.SourceString = File.ReadAllText(specificationPath);

    return (Grammar)parser.Parse(lexer, locals);
#endif
}

#if BOOTSTRAP
/// <summary>
/// Reads a type name, which is either a single quoted string (returned without its
/// first and last character) or a sequence of tokens joined by dots.
/// </summary>
/// <param name="lexer">The lexer supplying the tokens.</param>
/// <param name="token">
/// On entry the first token of the type name; on exit the first token after it.
/// </param>
/// <returns>The type name as text.</returns>
private static string readTypeName(GrammarLexer lexer, ref Token token)
{
    if (token.SymbolCode == CODE_QUOTED)
    {
        string quoted = token.Value.Substring(1, token.Value.Length - 2);
        token = lexer.GetNextToken();
        return quoted;
    }

    StringBuilder typeBuilder = new StringBuilder(token.Value);
    token = lexer.GetNextToken();
    while (token.SymbolCode == CODE_DOT)
    {
        typeBuilder.Append(".");
        token = lexer.GetNextToken();
        typeBuilder.Append(token.Value);
        token = lexer.GetNextToken();
    }
    return typeBuilder.ToString();
}

/// <summary>
/// Looks up the nonterminal named by <paramref name="nameToken"/>, assigning it the next
/// free symbol code when the name is new. A name already taken by a terminal is reported
/// as an error and the terminal's code is returned.
/// </summary>
/// <param name="nameToken">The token holding the nonterminal's name and position.</param>
/// <param name="numTerminals">Number of terminal symbols; codes below it denote terminals.</param>
/// <param name="symbolNames">Symbol names indexed by symbol code; extended for new names.</param>
/// <param name="symbolCodes">Symbol codes indexed by name; extended for new names.</param>
/// <param name="errorMessages">Receives the error message on a clash with a terminal.</param>
/// <returns>The symbol code registered under the name.</returns>
private static int registerNonterminal(Token nameToken, int numTerminals, List<string> symbolNames,
    Dictionary<string, int> symbolCodes, List<string> errorMessages)
{
    string name = nameToken.Value;
    int code;

    if (symbolCodes.TryGetValue(name, out code))
    {
        if (code < numTerminals)
        {
            errorMessages.Add(string.Format("{0},{1}: The nonterminal <{2}> shares it's name with a terminal symbol.",
                nameToken.LineNumber, nameToken.ColumnNumber, name));
        }
        return code;
    }

    symbolNames.Add(name);
    code = symbolNames.Count - 1;
    symbolCodes[name] = code;
    return code;
}
#endif