// Produce the complete token stream for the given node text.
// NOTE(review): 'title' is accepted for interface compatibility but is not
// read by this body — confirm whether callers rely on it being recorded.
public TokenList Tokenise(string title, string text) {

    // Reset the per-run lexer state: a fresh indentation stack seeded with
    // "column 0, no indent token emitted", and no pending indent tracking.
    indentationStack = new Stack <KeyValuePair <int, bool> > ();
    indentationStack.Push(new KeyValuePair <int, bool>(0, false));
    shouldTrackNextIndentation = false;

    var result = new TokenList();

    currentState = defaultState;

    // Split the input into individual lines, appending one blank line so
    // that the run always finishes back at zero indentation.
    var sourceLines = new List <string>(text.Split('\n')) { "" };

    // Tokenise each line in turn; line numbers are 1-based.
    var currentLine = 1;
    foreach (var sourceLine in sourceLines) {
        result.AddRange(this.TokeniseLine(sourceLine, currentLine));
        currentLine += 1;
    }

    // Terminate the stream with an explicit end-of-input marker.
    result.Add(new Token(TokenType.EndOfInput, currentLine, 0));

    return result;
}
// Tokenise a single line of input, reporting (via the out parameter) how
// indented the line is. Returns the tokens found on the line, with adjacent
// Text tokens merged and raw command text attached to BeginCommand tokens.
//
// context         - label recorded on every emitted token (for diagnostics)
// input           - the raw text of the line
// lineIndentation - out: number of leading whitespace characters
// lineNumber      - the line's number, recorded on every emitted token
//
// Throws InvalidOperationException when no token rule matches the remaining
// text of the line.
private TokenList TokeniseLine(string context, string input, out int lineIndentation, int lineNumber) {

    // The tokens we found on this line
    var tokens = new TokenList();

    // Normalise tabs to spaces.
    // NOTE(review): the original comment said "four spaces" but the literal
    // here is a single space — confirm the intended width against the
    // canonical source; the collapsed rendering may have lost characters.
    input = input.Replace("\t", " ");

    // While true, rules marked DoNotMatch are skipped (unless their
    // freeTextModeException says otherwise).
    bool freeTextMode = false;

    // Find any whitespace at the start of a line
    var initialIndentRule = new Regex("^\\s+");

    // If there's whitespace at the start of the line, this line is indented
    if (initialIndentRule.IsMatch(input)) {
        // Record how indented this line is
        lineIndentation = initialIndentRule.Match(input).Length;
    } else {
        // There's no whitespace at the start of the line,
        // so this line's indentation level is zero.
        lineIndentation = 0;
    }

    // Keeps track of how much of the line we have left to parse;
    // we begin past the leading indentation.
    int columnNumber = lineIndentation;

    // Are we at the start of the line? ie do we disregard token rules that
    // can't be at the start of a line?
    bool startOfLine = true;

    // While we have text left to parse in this line..
    while (columnNumber < input.Length) {

        // Keep track of whether we successfully found a rule to parse
        // the next token
        var matched = false;

        // Check each rule to see if it matches
        foreach (var tokenRule in tokenRules) {

            // Is the next chunk of text a token?
            if (tokenRule.regex != null) {

                // Attempt to match this
                var match = tokenRule.regex.Match(input, columnNumber);

                // Bail out if this either failed to match, or matched
                // further out in the text (i.e. not at the current column)
                if (match.Success == false || match.Index > columnNumber) {
                    continue;
                }

                // Bail out if this is the first token and we aren't allowed to
                // match this rule at the start
                if (tokenRule.canBeginLine == false && startOfLine == true) {
                    continue;
                }

                // Bail out if this match was zero-length (would never advance
                // columnNumber and loop forever)
                if (match.Length == 0) {
                    continue;
                }

                // Update free-text mode, or honour it for rules that must
                // not match while it is active.
                switch (tokenRule.freeTextMode) {
                case TokenRule.FreeTextMode.Begins:
                    freeTextMode = true;
                    break;
                case TokenRule.FreeTextMode.Ends:
                    freeTextMode = false;
                    break;
                case TokenRule.FreeTextMode.DoNotMatch:
                    if (freeTextMode == true) {
                        // Do not match, UNLESS we should make an exception
                        // (the exception callback inspects the tokens found
                        // so far on this line)
                        if (tokenRule.freeTextModeException == null ||
                            tokenRule.freeTextModeException(tokens) == false) {
                            continue;
                        }
                    }
                    break;
                }

                // Record the token only if we care
                // about it (ie it's not whitespace)
                if (tokenRule.discard == false) {
                    Token token;

                    // If this token type's rule had a capture group,
                    // store that as the token's value
                    if (match.Captures.Count > 0) {
                        token = new Token(tokenRule.type, match.Captures[0].Value);
                    } else {
                        token = new Token(tokenRule.type);
                    }

                    // If this was a string, lop off the quotes at the start and
                    // end, and un-escape the quotes and slashes
                    if (token.type == TokenType.String) {
                        string processedString = token.value as string;
                        processedString = processedString.Substring(1, processedString.Length - 2);
                        processedString = processedString.Replace("\\\\", "\\");
                        processedString = processedString.Replace("\\\"", "\"");
                        token.value = processedString;
                    }

                    // Record where the token was found
                    token.lineNumber = lineNumber;
                    token.columnNumber = columnNumber;
                    token.context = context;

                    // Add it to the token stream
                    tokens.Add(token);

                    // If this is a token that 'resets' the fact
                    // that we're at the start of the line (like '->' does),
                    // then record that; otherwise, record that we're past
                    // the start of the line and are now allowed to start
                    // matching additional types of tokens
                    if (tokenRule.resetsLine) {
                        startOfLine = true;
                    } else {
                        startOfLine = false;
                    }
                }

                // We've advanced through the string
                columnNumber += match.Length;

                // Record that we successfully found a type for this token
                matched = true;

                // We've matched a token type, stop trying to
                // match it against others
                break;
            }
        }

        if (matched == false) {
            // We've exhausted the list of possible token types, so we've
            // failed to interpret this string! Bail out!
            throw new InvalidOperationException("Failed to interpret token " + input);
        }
    }

    // Merge multiple runs of text on the same line into a single Text token
    var tokensToReturn = new TokenList();

    foreach (var token in tokens) {
        Token lastToken = null;

        // Did we previously add a token?
        if (tokensToReturn.Count > 0) {
            lastToken = tokensToReturn [tokensToReturn.Count - 1];
        }

        // Was the last token in tokensToReturn a Text, AND
        // this token is a Text?
        if (lastToken != null &&
            lastToken.type == TokenType.Text &&
            token.type == TokenType.Text) {

            // Merge the texts!
            var str = (string)tokensToReturn[tokensToReturn.Count - 1].value;
            str += (string)token.value;
            lastToken.value = str;
        } else {
            tokensToReturn.Add(token);
        }
    }

    // Attach the raw text between << and >> to the << token, so later
    // stages can see the command's original text
    for (int i = 0; i < tokensToReturn.Count; i++) {
        if (i == tokensToReturn.Count - 1) {
            // don't bother checking if we're the last token in the line
            continue;
        }
        var startToken = tokensToReturn[i];
        if (startToken.type == TokenType.BeginCommand) {
            // Raw text starts at the token immediately after <<
            int startIndex = tokensToReturn[i + 1].columnNumber;
            int endIndex = -1;

            // Find the next >> token
            for (int j = i; j < tokensToReturn.Count; j++) {
                var endToken = tokensToReturn [j];
                if (endToken.type == TokenType.EndCommand) {
                    endIndex = endToken.columnNumber;
                    break;
                }
            }

            if (endIndex != -1) {
                var text = input.Substring(startIndex, endIndex - startIndex);
                startToken.associatedRawText = text;
            }
        }
    }

    // Return the list of tokens we found
    return(tokensToReturn);
}
// Given an input string, parse it and return the full list of tokens,
// including synthetic Indent/Dedent tokens derived from each line's
// leading whitespace, and a final EndOfInput token.
public TokenList Tokenise(string context, string input) {

    // The total collection of all tokens in this input
    var output = new TokenList();

    // Chop the input into lines. (Splitting on both '\n' and '\r' with
    // StringSplitOptions.None means CRLF input yields empty lines, which
    // are tokenised like any other line.)
    var rawLines = input.Split(new char[] { '\n', '\r' }, StringSplitOptions.None);

    // Stack of columns at which each open indent began; the sentinel 0
    // represents "not indented at all".
    var indentStack = new Stack <int>();
    indentStack.Push(0);

    var lineIndex = 0;

    foreach (var rawLine in rawLines) {

        // Tokenise the line, learning its indentation as a side effect.
        int indent;
        var lineTokens = TokeniseLine(context, rawLine, out indent, lineIndex);

        if (indent > indentStack.Peek()) {
            // Deeper than the current indent: emit a single Indent token
            // and remember the new column.
            var indentToken = new Token(TokenType.Indent);
            indentToken.lineNumber = lineIndex;
            indentToken.context = context;
            output.Add(indentToken);
            indentStack.Push(indent);
        } else {
            // Shallower (possibly by several levels): emit one Dedent per
            // indent level we have closed. Does nothing if the level is
            // unchanged.
            while (indent < indentStack.Peek()) {
                indentStack.Pop();
                var dedentToken = new Token(TokenType.Dedent);
                dedentToken.lineNumber = lineIndex;
                dedentToken.context = context;
                output.Add(dedentToken);
            }
        }

        // The line's own tokens follow any Indent/Dedent markers.
        output.AddRange(lineTokens);

        lineIndex += 1;
    }

    // Close any indents still open at end of file. We stop above the
    // sentinel 0 so we never emit an unbalanced Dedent.
    while (indentStack.Count > 1) {
        indentStack.Pop();
        var trailingDedent = new Token(TokenType.Dedent);
        trailingDedent.lineNumber = lineIndex;
        trailingDedent.context = context;
        output.Add(trailingDedent);
    }

    // Finish up with an ending token
    output.Add(new Token(TokenType.EndOfInput));

    return output;
}