// Tokenise a single line, and also report on how indented this line is
private TokenList TokeniseLine(string context, string input, out int lineIndentation, int lineNumber) {

    // The tokens we found on this line
    var tokens = new TokenList();

    // Replace tabs with four spaces
    input = input.Replace("\t", "    ");

    bool freeTextMode = false;

    // Find any whitespace at the start of a line
    var initialIndentRule = new Regex("^\\s+");

    // If there's whitespace at the start of the line, this line is indented
    if (initialIndentRule.IsMatch(input)) {
        // Record how indented this line is
        lineIndentation = initialIndentRule.Match(input).Length;
    } else {
        // There's no whitespace at the start of the line,
        // so this line's indentation level is zero.
        lineIndentation = 0;
    }

    // Keeps track of how much of the line we have left to parse
    int columnNumber = lineIndentation;

    // Are we at the start of the line? ie do we disregard token rules that
    // can't be at the start of a line?
    bool startOfLine = true;

    // While we have text left to parse in this line...
    while (columnNumber < input.Length) {

        // Keep track of whether we successfully found a rule to parse the next token
        var matched = false;

        // Check each rule to see if it matches
        foreach (var tokenRule in tokenRules) {

            // Is the next chunk of text a token?
            if (tokenRule.regex != null) {

                // Attempt to match this
                var match = tokenRule.regex.Match(input, columnNumber);

                // Bail out if this either failed to match, or matched
                // further out in the text
                if (match.Success == false || match.Index > columnNumber) {
                    continue;
                }

                // Bail out if this is the first token and we aren't allowed to
                // match this rule at the start
                if (tokenRule.canBeginLine == false && startOfLine == true) {
                    continue;
                }

                // Bail out if this match was zero-length
                if (match.Length == 0) {
                    continue;
                }

                switch (tokenRule.freeTextMode) {
                case TokenRule.FreeTextMode.Begins:
                    freeTextMode = true;
                    break;
                case TokenRule.FreeTextMode.Ends:
                    freeTextMode = false;
                    break;
                case TokenRule.FreeTextMode.DoNotMatch:
                    if (freeTextMode == true) {
                        // Do not match, UNLESS we should make an exception
                        if (tokenRule.freeTextModeException == null ||
                            tokenRule.freeTextModeException(tokens) == false) {
                            continue;
                        }
                    }
                    break;
                }

                // Record the token only if we care
                // about it (ie it's not whitespace)
                if (tokenRule.discard == false) {
                    Token token;

                    // If this token type's rule had a capture group,
                    // store that
                    if (match.Captures.Count > 0) {
                        token = new Token(tokenRule.type, match.Captures[0].Value);
                    } else {
                        token = new Token(tokenRule.type);
                    }

                    // If this was a string, lop off the quotes at the start and
                    // end, and un-escape the quotes and slashes
                    if (token.type == TokenType.String) {
                        string processedString = token.value as string;
                        processedString = processedString.Substring(1, processedString.Length - 2);
                        processedString = processedString.Replace("\\\\", "\\");
                        processedString = processedString.Replace("\\\"", "\"");
                        token.value = processedString;
                    }

                    // Record where the token was found
                    token.lineNumber = lineNumber;
                    token.columnNumber = columnNumber;
                    token.context = context;

                    // Add it to the token stream
                    tokens.Add(token);

                    // If this is a token that 'resets' the fact
                    // that we're at the start of the line (like '->' does),
                    // then record that; otherwise, record that we're past
                    // the start of the line and are now allowed to start
                    // matching additional types of tokens
                    if (tokenRule.resetsLine) {
                        startOfLine = true;
                    } else {
                        startOfLine = false;
                    }
                }

                // We've advanced through the string
                columnNumber += match.Length;

                // Record that we successfully found a type for this token
                matched = true;

                // We've matched a token type, stop trying to
                // match it against others
                break;
            }
        }

        if (matched == false) {
            // We've exhausted the list of possible token types, so we've
            // failed to interpret this string! Bail out!
            throw new InvalidOperationException("Failed to interpret token " + input);
        }
    }

    // Merge multiple runs of text on the same line
    var tokensToReturn = new TokenList();

    foreach (var token in tokens) {
        Token lastToken = null;

        // Did we previously add a token?
        if (tokensToReturn.Count > 0) {
            lastToken = tokensToReturn[tokensToReturn.Count - 1];
        }

        // Was the last token in tokensToReturn a Text, AND
        // this token is a Text?
        if (lastToken != null &&
            lastToken.type == TokenType.Text &&
            token.type == TokenType.Text) {

            // Merge the texts!
            var str = (string)tokensToReturn[tokensToReturn.Count - 1].value;
            str += (string)token.value;
            lastToken.value = str;
        } else {
            tokensToReturn.Add(token);
        }
    }

    // Attach text between << and >> to the << token
    for (int i = 0; i < tokensToReturn.Count; i++) {
        if (i == tokensToReturn.Count - 1) {
            // don't bother checking if we're the last token in the line
            continue;
        }

        var startToken = tokensToReturn[i];

        if (startToken.type == TokenType.BeginCommand) {
            int startIndex = tokensToReturn[i + 1].columnNumber;
            int endIndex = -1;

            // Find the next >> token
            for (int j = i; j < tokensToReturn.Count; j++) {
                var endToken = tokensToReturn[j];
                if (endToken.type == TokenType.EndCommand) {
                    endIndex = endToken.columnNumber;
                    break;
                }
            }

            if (endIndex != -1) {
                var text = input.Substring(startIndex, endIndex - startIndex);
                startToken.associatedRawText = text;
            }
        }
    }

    // Return the list of tokens we found
    return tokensToReturn;
}
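
// Illustrative note (not part of the tokeniser itself): assuming a hypothetical rule set that
// tokenises a command line such as `<<set $gold to 5>>` into BeginCommand, some inner tokens,
// and EndCommand, the raw-text pass above would attach roughly "set $gold to 5" (the text
// between the two delimiters) to the BeginCommand token via associatedRawText, so the
// unparsed command text travels alongside its individual tokens.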
TokenList TokeniseLine(string line, int lineNumber) {

    var lineTokens = new Stack<Token>();

    // Replace tabs with four spaces
    line = line.Replace("\t", "    ");

    // Strip out \r's
    line = line.Replace("\r", "");

    // Record the indentation level if the previous state wants us to
    var thisIndentation = LineIndentation(line);
    var previousIndentation = indentationStack.Peek();

    if (shouldTrackNextIndentation && thisIndentation > previousIndentation.Key) {
        // If we are more indented than before, emit an
        // indent token and record this new indent level
        indentationStack.Push(new KeyValuePair<int, bool>(thisIndentation, true));

        var indent = new Token(TokenType.Indent, lineNumber, previousIndentation.Key);
        indent.value = "".PadLeft(thisIndentation - previousIndentation.Key);

        shouldTrackNextIndentation = false;

        lineTokens.Push(indent);
    } else if (thisIndentation < previousIndentation.Key) {
        // If we are less indented, emit a dedent for every
        // indentation level that we passed on the way back to 0 that also
        // emitted an indentation token.
        // At the same time, remove those indent levels from the stack
        while (thisIndentation < indentationStack.Peek().Key) {
            var topLevel = indentationStack.Pop();
            if (topLevel.Value) {
                var dedent = new Token(TokenType.Dedent, lineNumber, 0);
                lineTokens.Push(dedent);
            }
        }
    }

    // Now that we're past any initial indentation, start
    // finding tokens.
    int columnNumber = thisIndentation;

    var whitespace = new Regex(@"\s*");

    while (columnNumber < line.Length) {

        // If we're about to hit a line comment, abort processing the line
        // immediately
        if (line.Substring(columnNumber).StartsWith(LINE_COMMENT)) {
            break;
        }

        var matched = false;

        foreach (var rule in currentState.tokenRules) {
            var match = rule.regex.Match(line, columnNumber);

            if (match.Success == false || match.Length == 0) {
                continue;
            }

            string tokenText;

            if (rule.type == TokenType.Text) {
                // if this is text, then back up to the most recent text
                // delimiting token, and treat everything from there as
                // the text.
                //
                // we do this because we don't want this:
                //     <<flip Harley3 +1>>
                // to get matched as this:
                //     BeginCommand Identifier("flip") Text("Harley3 +1") EndCommand
                // instead, we want to match it as this:
                //     BeginCommand Text("flip Harley3 +1") EndCommand

                int textStartIndex = thisIndentation;

                if (lineTokens.Count > 0) {
                    while (lineTokens.Peek().type == TokenType.Identifier) {
                        lineTokens.Pop();
                    }

                    var startDelimiterToken = lineTokens.Peek();
                    textStartIndex = startDelimiterToken.columnNumber;

                    if (startDelimiterToken.type == TokenType.Indent) {
                        textStartIndex += startDelimiterToken.value.Length;
                    }
                    if (startDelimiterToken.type == TokenType.Dedent) {
                        textStartIndex = thisIndentation;
                    }
                }

                columnNumber = textStartIndex;

                var textEndIndex = match.Index + match.Length;
                tokenText = line.Substring(textStartIndex, textEndIndex - textStartIndex);
            } else {
                tokenText = match.Value;
            }

            columnNumber += tokenText.Length;

            // If this was a string, lop off the quotes at the start and
            // end, and un-escape the quotes and slashes
            if (rule.type == TokenType.String) {
                tokenText = tokenText.Substring(1, tokenText.Length - 2);
                tokenText = tokenText.Replace(@"\\", @"\");
                tokenText = tokenText.Replace(@"\""", @"""");
            }

            var token = new Token(rule.type, lineNumber, columnNumber, tokenText);
            token.delimitsText = rule.delimitsText;

            lineTokens.Push(token);

            if (rule.entersState != null) {
                if (states.ContainsKey(rule.entersState) == false) {
                    throw new TokeniserException(lineNumber, columnNumber,
                        "Unknown tokeniser state " + rule.entersState);
                }

                EnterState(states[rule.entersState]);

                if (shouldTrackNextIndentation == true) {
                    if (indentationStack.Peek().Key < thisIndentation) {
                        indentationStack.Push(new KeyValuePair<int, bool>(thisIndentation, false));
                    }
                }
            }

            matched = true;
            break;
        }

        if (matched == false) {
            throw TokeniserException.ExpectedTokensFromState(lineNumber, columnNumber, currentState);
        }

        // consume any lingering whitespace before the next token
        var lastWhitespace = whitespace.Match(line, columnNumber);
        if (lastWhitespace != null) {
            columnNumber += lastWhitespace.Length;
        }
    }

    var listToReturn = new TokenList(lineTokens.ToArray());
    listToReturn.Reverse();

    return listToReturn;
}
// Given an input string, parse it and return the list of tokens
public TokenList Tokenise(string context, string input) {

    // The total collection of all tokens in this input
    var tokens = new TokenList();

    // Start by chopping up the input into lines
    var lines = input.Split(new char[] { '\n', '\r' }, StringSplitOptions.None);

    // Keep track of which column each new indent started
    var indentLevels = new Stack<int>();

    // Start at indent 0
    indentLevels.Push(0);

    var lineNum = 0;

    foreach (var line in lines) {

        int newIndentLevel;

        // Get the tokens, plus the indentation level of this line
        var lineTokens = TokeniseLine(context, line, out newIndentLevel, lineNum);

        if (newIndentLevel > indentLevels.Peek()) {
            // We are now more indented than the last indent.
            // Emit an "indent" token, and push this new indent onto the stack.
            var indent = new Token(TokenType.Indent);
            indent.lineNumber = lineNum;
            indent.context = context;
            tokens.Add(indent);

            indentLevels.Push(newIndentLevel);
        } else if (newIndentLevel < indentLevels.Peek()) {
            // We are less indented than the last indent.
            // We may have dedented more than a single indent level, though, so
            // check this against all indent levels we know about
            while (newIndentLevel < indentLevels.Peek()) {
                // We've gone down an indent, holy crap, dedent it!
                var dedent = new Token(TokenType.Dedent);
                dedent.lineNumber = lineNum;
                dedent.context = context;
                tokens.Add(dedent);

                indentLevels.Pop();
            }
        }

        // Add the list of tokens that were in this line
        tokens.AddRange(lineTokens);

        // Update line number
        lineNum++;
    }

    // Dedent if there's any indentation left (ie we reached the
    // end of the file and it was still indented)
    // (we stop at the second-last one because we pushed 'indent 0' at the start,
    // and popping that would emit an unbalanced dedent token)
    while (indentLevels.Count > 1) {
        indentLevels.Pop();

        var dedent = new Token(TokenType.Dedent);
        dedent.lineNumber = lineNum;
        dedent.context = context;
        tokens.Add(dedent);
    }

    // Finish up with an ending token
    tokens.Add(new Token(TokenType.EndOfInput));

    // yay we're done
    return tokens;
}
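
// The indent/dedent handling above follows the same idea Python-style tokenisers use: keep a
// stack of indent columns, emit an Indent token when a line is deeper than the top of the
// stack, and emit one Dedent per popped level when a line is shallower. The sketch below is a
// minimal, self-contained illustration of that idea only; IndentationSketch and Walk are
// hypothetical names and are not part of the tokeniser's API.
static class IndentationSketch
{
    // Given the measured indentation of each line, return one marker per event, in the
    // same order Tokenise emits them: indentation tokens first, then the line's own tokens.
    public static System.Collections.Generic.List<string> Walk(int[] indentPerLine)
    {
        var markers = new System.Collections.Generic.List<string>();
        var levels = new System.Collections.Generic.Stack<int>();
        levels.Push(0); // start at indent 0, as Tokenise does

        foreach (var indent in indentPerLine) {
            if (indent > levels.Peek()) {
                // Deeper than before: open a new indentation level
                markers.Add("Indent");
                levels.Push(indent);
            } else {
                // Shallower (possibly by several levels): close each level we passed
                while (indent < levels.Peek()) {
                    markers.Add("Dedent");
                    levels.Pop();
                }
            }
            markers.Add("Line");
        }

        // Close any levels still open at end of input, stopping before the initial 0
        while (levels.Count > 1) {
            markers.Add("Dedent");
            levels.Pop();
        }

        return markers;
    }
}
// For example, line indentations { 0, 4, 4, 8, 0 } yield:
// Line, Indent, Line, Line, Indent, Line, Dedent, Dedent, Line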