Example #1
        // Tokenise a single line, and also report on how indented this line is.
        //
        // context:         copied onto every token recorded here (token.context).
        // input:           the text of the line; tabs are expanded to four spaces
        //                  before anything else happens.
        // lineIndentation: (out) length of the leading whitespace run, measured
        //                  after tab expansion, or 0 if there is none.
        // lineNumber:      copied onto every token recorded here (token.lineNumber).
        //
        // Throws InvalidOperationException ("Failed to interpret token ...") when
        // 'matched' is still false after the token rules have been tried.
        //
        // NOTE(review): in this listing the braces are absent, and several
        // statements that the comments refer to (the 'else' of the indentation
        // check, the "bail out" actions, adding the token to the stream, the
        // final return) are not visible. Confirm the control flow against the
        // original source before relying on it.
        private TokenList TokeniseLine(string context, string input, out int lineIndentation, int lineNumber)
            // The tokens we found on this line
            var tokens = new TokenList();

            // Replace tabs with four spaces
            input = input.Replace("\t", "    ");

            // Consulted below for rules whose freeTextMode is DoNotMatch;
            // switched on/off by rules marked Begins/Ends.
            bool freeTextMode = false;

            // Find any whitespace at the start of a line
            var initialIndentRule = new Regex("^\\s+");

            // If there's whitespace at the start of the line, this line is indented
            if (initialIndentRule.IsMatch(input))
                // Record how indented this line is
                lineIndentation = initialIndentRule.Match(input).Length;
                // There's no whitespace at the start of the line,
                // so this line's indentation level is zero.
                // NOTE(review): presumably the 'else' branch of the check above.
                lineIndentation = 0;

            // Keeps track of how much of the line we have left to parse.
            // Starts just past the leading whitespace.
            int columnNumber = lineIndentation;

            // Are we at the start of the line? ie do we disregard token rules that
            // can't be at the start of a line?
            bool startOfLine = true;

            // While we have text left to parse in this line..
            while (columnNumber < input.Length)
                // Keep track of whether we successfully found a rule to parse the next token
                var matched = false;

                // Check each rule to see if it matches
                foreach (var tokenRule in tokenRules)
                    // Is the next chunk of text a token?
                    if (tokenRule.regex != null)
                        // Attempt to match this. The search starts at columnNumber
                        // but may succeed further along, hence the Index check below.
                        var match = tokenRule.regex.Match(input, columnNumber);

                        // Bail out if this either failed to match, or matched
                        // further out in the text
                        // NOTE(review): the action taken by this and the next two
                        // checks is not visible here — presumably each skips to the
                        // next rule.
                        if (match.Success == false || match.Index > columnNumber)

                        // Bail out if this is the first token and we aren't allowed to
                        // match this rule at the start
                        if (tokenRule.canBeginLine == false && startOfLine == true)

                        // Bail out if this match was zero-length
                        if (match.Length == 0)

                        // Update or consult free-text mode according to this rule's setting
                        switch (tokenRule.freeTextMode)
                        case TokenRule.FreeTextMode.Begins:
                            freeTextMode = true;

                        case TokenRule.FreeTextMode.Ends:
                            freeTextMode = false;

                        case TokenRule.FreeTextMode.DoNotMatch:
                            if (freeTextMode == true)
                                // Do not match, UNLESS we should make an exception.
                                // The exception callback, when present, is given the
                                // tokens found so far on this line.
                                if (tokenRule.freeTextModeException == null ||
                                    tokenRule.freeTextModeException(tokens) == false)

                        // Record the token only if we care
                        // about it (ie it's not whitespace)
                        if (tokenRule.discard == false)
                            Token token;

                            // If this token type's rule had a capture group,
                            // store that
                            // NOTE(review): on a successful Match, Captures holds the
                            // match itself, so Captures[0].Value looks like the whole
                            // matched text rather than a capture group's — confirm intent.
                            if (match.Captures.Count > 0)
                                token = new Token(tokenRule.type, match.Captures[0].Value);
                                token = new Token(tokenRule.type);

                            // If this was a string, lop off the quotes at the start and
                            // end, and un-escape the quotes and slashes
                            if (token.type == TokenType.String)
                                // NOTE(review): 'as' yields null if value is not a
                                // string, which would make the next line throw.
                                string processedString = token.value as string;
                                // Drops the first and last character; needs at least
                                // two characters of text.
                                processedString = processedString.Substring(1, processedString.Length - 2);

                                // \\ becomes \ and then \" becomes "
                                processedString = processedString.Replace("\\\\", "\\");
                                processedString = processedString.Replace("\\\"", "\"");
                                token.value     = processedString;

                            // Record where the token was found. columnNumber has not
                            // been advanced yet, so this is where the token begins.
                            token.lineNumber   = lineNumber;
                            token.columnNumber = columnNumber;
                            token.context      = context;

                            // Add it to the token stream
                            // NOTE(review): the statement doing so is not visible here.

                            // If this is a token that 'resets' the fact
                            // that we're at the start of the line (like '->' does),
                            // then record that; otherwise, record that we're past
                            // the start of the line and are now allowed to start
                            // matching additional types of tokens
                            if (tokenRule.resetsLine)
                                startOfLine = true;
                                startOfLine = false;

                        // We've advanced through the string (judging by the
                        // indentation, this applies to discarded tokens too)
                        columnNumber += match.Length;

                        // Record that we successfully found a type for this token
                        matched = true;

                        // We've matched a token type, stop trying to
                        // match it against others

                if (matched == false)
                    // We've exhausted the list of possible token types, so we've
                    // failed to interpret this string! Bail out!

                    throw new InvalidOperationException("Failed to interpret token " + input);

            // Merge multiple runs of text on the same line
            var tokensToReturn = new TokenList();

            // NOTE(review): the statement that appends a non-merged token to
            // tokensToReturn is not visible in this listing.
            foreach (var token in tokens)
                Token lastToken = null;

                // Did we previously add a token?
                if (tokensToReturn.Count > 0)
                    lastToken = tokensToReturn [tokensToReturn.Count - 1];

                // Was the last token in tokensToReturn a Text, AND
                // this token is a Text?
                if (lastToken != null &&
                    lastToken.type == TokenType.Text &&
                    token.type == TokenType.Text)
                    // Merge the texts! Append this token's text to the last
                    // token already in tokensToReturn.
                    var str = (string)tokensToReturn[tokensToReturn.Count - 1].value;
                    str            += (string)token.value;
                    lastToken.value = str;

            // Attach text between << and >> to the << token
            for (int i = 0; i < tokensToReturn.Count; i++)
                if (i == tokensToReturn.Count - 1)
                    // don't bother checking if we're the last token in the line
                    // (this also keeps the [i + 1] access below in range)
                var startToken = tokensToReturn[i];
                if (startToken.type == TokenType.BeginCommand)
                    // The raw text starts at the column of the token following <<
                    int startIndex = tokensToReturn[i + 1].columnNumber;
                    int endIndex   = -1;
                    // Find the next >> token
                    // NOTE(review): no early exit is visible here; as shown, the
                    // scan would keep the last >> on the line — confirm.
                    for (int j = i; j < tokensToReturn.Count; j++)
                        var endToken = tokensToReturn [j];
                        if (endToken.type == TokenType.EndCommand)
                            endIndex = endToken.columnNumber;

                    // Only attach raw text when a closing >> was found
                    if (endIndex != -1)
                        var text = input.Substring(startIndex, endIndex - startIndex);
                        startToken.associatedRawText = text;

            // Return the list of tokens we found
Example #2
        // Tokenise one line of input using the rules of the current tokeniser
        // state, creating Indent/Dedent tokens from the change in indentation.
        //
        // line:       the text of the line; tabs are expanded to four spaces and
        //             any '\r' characters are removed first.
        // lineNumber: passed to every token created here, and to any
        //             TokeniserException that is thrown.
        //
        // Reads and updates members declared outside this listing:
        // indentationStack, shouldTrackNextIndentation, currentState, states,
        // LINE_COMMENT, LineIndentation() and EnterState().
        //
        // Throws TokeniserException when a rule names a state that is not in
        // 'states', or when 'matched' is still false after the rules were tried.
        //
        // NOTE(review): in this listing the braces are absent, and the statements
        // that put tokens onto lineTokens, leave the loops early, and return the
        // result are not visible. Confirm the control flow against the original
        // source before relying on it.
        TokenList TokeniseLine(string line, int lineNumber)
            // Tokens found on this line; the most recently found token is on top
            var lineTokens = new Stack <Token> ();

            // Replace tabs with four spaces
            line = line.Replace("\t", "    ");

            // Strip out \r's
            line = line.Replace("\r", "");

            // Record the indentation level if the previous state wants us to

            // Each indentationStack entry pairs an indentation width (Key) with
            // whether an Indent token was created for it (Value).
            var thisIndentation     = LineIndentation(line);
            var previousIndentation = indentationStack.Peek();

            if (shouldTrackNextIndentation && thisIndentation > previousIndentation.Key)
                // If we are more indented than before, emit an
                // indent token and record this new indent level
                indentationStack.Push(new KeyValuePair <int, bool>(thisIndentation, true));

                // The Indent token sits at the previous indentation column and its
                // value is a run of spaces as wide as the increase.
                var indent = new Token(TokenType.Indent, lineNumber, previousIndentation.Key);
                indent.value = "".PadLeft(thisIndentation - previousIndentation.Key);

                shouldTrackNextIndentation = false;

            else if (thisIndentation < previousIndentation.Key)
                // If we are less indented, emit a dedent for every
                // indentation level that we passed on the way back to 0 that also
                // emitted an indentation token.
                // at the same time, remove those indent levels from the stack

                while (thisIndentation < indentationStack.Peek().Key)
                    var topLevel = indentationStack.Pop();

                    // Only levels that produced an Indent token get a Dedent
                    if (topLevel.Value)
                        var dedent = new Token(TokenType.Dedent, lineNumber, 0);

            // Now that we're past any initial indentation, start
            // finding tokens.
            int columnNumber = thisIndentation;

            // Matches a (possibly empty) run of whitespace
            var whitespace = new Regex(@"\s*");

            while (columnNumber < line.Length)
                // If we're about to hit a line comment, abort processing line
                // immediately
                // NOTE(review): the statement that aborts is not visible here.
                if (line.Substring(columnNumber).StartsWith(LINE_COMMENT))

                var matched = false;

                foreach (var rule in currentState.tokenRules)
                    // NOTE(review): there is no check that match.Index equals
                    // columnNumber — presumably the rule patterns are anchored; confirm.
                    var match = rule.regex.Match(line, columnNumber);

                    // Failed or zero-length matches are not usable
                    // NOTE(review): the action taken is not visible here —
                    // presumably skips to the next rule.
                    if (match.Success == false || match.Length == 0)

                    string tokenText;

                    if (rule.type == TokenType.Text)
                        // if this is text, then back up to the most recent text
                        // delimiting token, and treat everything from there as
                        // the text.
                        // we do this because we don't want this:
                        //    <<flip Harley3 +1>>
                        // to get matched as this:
                        //    BeginCommand Identifier("flip") Text("Harley3 +1") EndCommand
                        // instead, we want to match it as this:
                        //    BeginCommand Text("flip Harley3 +1") EndCommand

                        int textStartIndex = thisIndentation;

                        if (lineTokens.Count > 0)
                            // NOTE(review): the loop body is not visible here. If it
                            // pops identifiers, Peek() throws once the stack is empty
                            // — confirm a non-identifier token is always present.
                            while (lineTokens.Peek().type == TokenType.Identifier)

                            // Text starts at the column recorded on the delimiting token
                            var startDelimiterToken = lineTokens.Peek();
                            textStartIndex = startDelimiterToken.columnNumber;
                            // An Indent token records the previous indentation column,
                            // so skip over the spaces it stands for
                            if (startDelimiterToken.type == TokenType.Indent)
                                textStartIndex += startDelimiterToken.value.Length;
                            // Dedent tokens are created at column 0; text starts at
                            // this line's indentation instead
                            if (startDelimiterToken.type == TokenType.Dedent)
                                textStartIndex = thisIndentation;

                        // Rewind so that the advance below is counted from the text start
                        columnNumber = textStartIndex;

                        var textEndIndex = match.Index + match.Length;

                        tokenText = line.Substring(textStartIndex, textEndIndex - textStartIndex);
                        // NOTE(review): presumably the 'else' branch (non-Text rules).
                        tokenText = match.Value;

                    // Advance past the token, before any un-quoting shortens tokenText
                    columnNumber += tokenText.Length;

                    // If this was a string, lop off the quotes at the start and
                    // end, and un-escape the quotes and slashes
                    if (rule.type == TokenType.String)
                        tokenText = tokenText.Substring(1, tokenText.Length - 2);

                        // \\ becomes \ and then \" becomes "
                        tokenText = tokenText.Replace(@"\\", @"\");
                        tokenText = tokenText.Replace(@"\""", @"""");

                    // As written, columnNumber has already been advanced, so the
                    // token is given the column just past its own text.
                    var token = new Token(rule.type, lineNumber, columnNumber, tokenText);

                    token.delimitsText = rule.delimitsText;


                    // Switch tokeniser state if this rule asks for it
                    if (rule.entersState != null)
                        if (states.ContainsKey(rule.entersState) == false)
                            throw new TokeniserException(lineNumber, columnNumber, "Unknown tokeniser state " + rule.entersState);
                        EnterState(states [rule.entersState]);

                        // Record this line's indentation as a level that did not
                        // produce an Indent token (Value == false)
                        if (shouldTrackNextIndentation == true)
                            if (indentationStack.Peek().Key < thisIndentation)
                                indentationStack.Push(new KeyValuePair <int, bool>(thisIndentation, false));

                    matched = true;


                // No rule of the current state matched at this position
                if (matched == false)
                    throw TokeniserException.ExpectedTokensFromState(lineNumber, columnNumber, currentState);

                // consume any lingering whitespace before the next token
                var lastWhitespace = whitespace.Match(line, columnNumber);
                // NOTE(review): Regex.Match does not return null, so this check
                // is always true; Length is 0 when no whitespace follows.
                if (lastWhitespace != null)
                    columnNumber += lastWhitespace.Length;

            // Stack<T>.ToArray() lists the most recently pushed token first.
            // NOTE(review): presumably the list is reversed before being
            // returned — not visible here.
            var listToReturn = new TokenList(lineTokens.ToArray());


Example #3
        // Given an input string, parse it and return the list of tokens.
        //
        // context: passed through to TokeniseLine and copied onto the Indent and
        //          Dedent tokens created here.
        // input:   the full text to tokenise; it is split into lines on '\n'
        //          and '\r'.
        //
        // An EndOfInput token is added after all lines have been processed.
        //
        // NOTE(review): in this listing the braces are absent, and the statements
        // that push/pop indentLevels, add tokens to 'tokens', advance lineNum and
        // return the result are not visible. Confirm the control flow against the
        // original source before relying on it.
        public TokenList Tokenise(string context, string input)
            // The total collection of all tokens in this input
            var tokens = new TokenList();

            // Start by chopping up the input into lines
            // NOTE(review): '\n' and '\r' are separate delimiters and empty
            // entries are kept, so a "\r\n" line ending yields an extra empty
            // entry. That would skew line numbers if lineNum advances per
            // entry — confirm.
            var lines = input.Split(new char[] { '\n', '\r' }, StringSplitOptions.None);

            // Keep track of which column each new indent started
            var indentLevels = new Stack <int>();

            // Start at indent 0
            // NOTE(review): the push of 0 is not visible here; Peek() below
            // throws on an empty stack.

            // Line numbers are counted from zero
            var lineNum = 0;

            foreach (var line in lines)
                int newIndentLevel;

                // Get the tokens, plus the indentation level of this line
                var lineTokens = TokeniseLine(context, line, out newIndentLevel, lineNum);

                if (newIndentLevel > indentLevels.Peek())
                    // We are now more indented than the last indent.
                    // Emit a "indent" token, and push this new indent onto the stack.
                    var indent = new Token(TokenType.Indent);
                    indent.lineNumber = lineNum;
                    indent.context    = context;
                else if (newIndentLevel < indentLevels.Peek())
                    // We are less indented than the last indent.
                    // We may have dedented more than a single indent level, though, so
                    // check this against all indent levels we know about

                    // NOTE(review): the pop that lets this loop terminate is not
                    // visible here.
                    while (newIndentLevel < indentLevels.Peek())
                        // We've gone down an indent, holy crap, dedent it!
                        var dedent = new Token(TokenType.Dedent);
                        dedent.lineNumber = lineNum;
                        dedent.context = context;

                // Add the list of tokens that were in this line

                // Update line number

            // Dedent if there's any indentations left (ie we reached the
            // end of the file and it was still indented)
            // (we stop at the second-last one because we pushed 'indent 0' at the start,
            // and popping that would emit an unbalanced dedent token
            // NOTE(review): the pop that lets this loop terminate is not visible here.
            while (indentLevels.Count > 1)
                var dedent = new Token(TokenType.Dedent);
                dedent.lineNumber = lineNum;
                dedent.context    = context;

            // Finish up with an ending token (no lineNumber or context is set on it)
            tokens.Add(new Token(TokenType.EndOfInput));

            // yay we're done