Example #1
        public TokenList Tokenise(string title, string text)
        {
            // Do some initial setup
            indentationStack = new Stack<KeyValuePair<int, bool>>();
            indentationStack.Push(new KeyValuePair<int, bool>(0, false));
            shouldTrackNextIndentation = false;

            var tokens = new TokenList();

            currentState = defaultState;

            // Parse each line
            var lines = new List<string>(text.Split('\n'));

            // Add a blank line to ensure that we end with zero indentation
            lines.Add("");

            int lineNumber = 1;

            foreach (var line in lines)
            {
                tokens.AddRange(this.TokeniseLine(line, lineNumber));
                lineNumber++;
            }

            var endOfInput = new Token(TokenType.EndOfInput, lineNumber, 0);

            tokens.Add(endOfInput);

            return tokens;
        }
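
As a rough usage sketch (not part of the example above; the enclosing class name "Lexer" and the title/text values are placeholders I have made up), the whole-text tokeniser might be driven like this:

        // Hypothetical usage of Example #1's Tokenise(); "Lexer" is an assumed
        // class name, and the arguments are placeholder values.
        var lexer = new Lexer();
        var tokens = lexer.Tokenise("Start", "Hello there!\n    An indented line\n");

        foreach (var token in tokens)
        {
            // Every run of input ends with an EndOfInput token
            Console.WriteLine("{0} (line {1}, column {2})",
                token.type, token.lineNumber, token.columnNumber);
        }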
Example #2
        // Tokenise a single line, and also report on how indented this line is
        private TokenList TokeniseLine(string context, string input, out int lineIndentation, int lineNumber)
        {
            // The tokens we found on this line
            var tokens = new TokenList();

            // Replace tabs with four spaces
            input = input.Replace("\t", "    ");

            bool freeTextMode = false;

            // Find any whitespace at the start of a line
            var initialIndentRule = new Regex("^\\s+");

            // If there's whitespace at the start of the line, this line is indented
            if (initialIndentRule.IsMatch(input))
            {
                // Record how indented this line is
                lineIndentation = initialIndentRule.Match(input).Length;
            }
            else
            {
                // There's no whitespace at the start of the line,
                // so this line's indentation level is zero.
                lineIndentation = 0;
            }

            // Keeps track of how much of the line we have left to parse
            int columnNumber = lineIndentation;

            // Are we at the start of the line? i.e. do we disregard token rules
            // that can't appear at the start of a line?
            bool startOfLine = true;

            // While we have text left to parse in this line...
            while (columnNumber < input.Length)
            {
                // Keep track of whether we successfully found a rule to parse the next token
                var matched = false;

                // Check each rule to see if it matches
                foreach (var tokenRule in tokenRules)
                {
                    // Is the next chunk of text a token?
                    if (tokenRule.regex != null)
                    {
                        // Attempt to match this
                        var match = tokenRule.regex.Match(input, columnNumber);

                        // Bail out if this either failed to match, or matched
                        // further out in the text
                        if (match.Success == false || match.Index > columnNumber)
                        {
                            continue;
                        }

                        // Bail out if we're at the start of the line and this
                        // rule isn't allowed to begin a line
                        if (tokenRule.canBeginLine == false && startOfLine == true)
                        {
                            continue;
                        }

                        // Bail out if this match was zero-length
                        if (match.Length == 0)
                        {
                            continue;
                        }

                        switch (tokenRule.freeTextMode)
                        {
                        case TokenRule.FreeTextMode.Begins:
                            freeTextMode = true;
                            break;

                        case TokenRule.FreeTextMode.Ends:
                            freeTextMode = false;
                            break;

                        case TokenRule.FreeTextMode.DoNotMatch:
                            if (freeTextMode == true)
                            {
                                // Do not match, UNLESS we should make an exception
                                if (tokenRule.freeTextModeException == null ||
                                    tokenRule.freeTextModeException(tokens) == false)
                                {
                                    continue;
                                }
                            }
                            break;
                        }

                        // Record the token only if we care
                        // about it (i.e. it's not whitespace)
                        if (tokenRule.discard == false)
                        {
                            Token token;

                            // Store the text this rule matched as
                            // the token's value
                            if (match.Captures.Count > 0)
                            {
                                token = new Token(tokenRule.type, match.Captures[0].Value);
                            }
                            else
                            {
                                token = new Token(tokenRule.type);
                            }

                            // If this was a string, lop off the quotes at the start and
                            // end, and un-escape the quotes and slashes
                            if (token.type == TokenType.String)
                            {
                                string processedString = token.value as string;
                                processedString = processedString.Substring(1, processedString.Length - 2);

                                processedString = processedString.Replace("\\\\", "\\");
                                processedString = processedString.Replace("\\\"", "\"");
                                token.value     = processedString;
                            }

                            // Record where the token was found
                            token.lineNumber   = lineNumber;
                            token.columnNumber = columnNumber;
                            token.context      = context;

                            // Add it to the token stream
                            tokens.Add(token);

                            // If this is a token that 'resets' the fact
                            // that we're at the start of the line (like '->' does),
                            // then record that; otherwise, record that we're past
                            // the start of the line and are now allowed to start
                            // matching additional types of tokens
                            if (tokenRule.resetsLine)
                            {
                                startOfLine = true;
                            }
                            else
                            {
                                startOfLine = false;
                            }
                        }

                        // We've advanced through the string
                        columnNumber += match.Length;

                        // Record that we successfully found a type for this token
                        matched = true;

                        // We've matched a token type, stop trying to
                        // match it against others
                        break;
                    }
                }

                if (matched == false)
                {
                    // We've exhausted the list of possible token types, so we've
                    // failed to interpret this string! Bail out!

                    throw new InvalidOperationException("Failed to interpret token " + input);
                }
            }

            // Merge multiple runs of text on the same line
            var tokensToReturn = new TokenList();

            foreach (var token in tokens)
            {
                Token lastToken = null;

                // Did we previously add a token?
                if (tokensToReturn.Count > 0)
                {
                    lastToken = tokensToReturn[tokensToReturn.Count - 1];
                }

                // Was the last token in tokensToReturn a Text, and
                // is this token also a Text?
                if (lastToken != null &&
                    lastToken.type == TokenType.Text &&
                    token.type == TokenType.Text)
                {
                    // Merge the texts!
                    var str = (string)tokensToReturn[tokensToReturn.Count - 1].value;
                    str            += (string)token.value;
                    lastToken.value = str;
                }
                else
                {
                    tokensToReturn.Add(token);
                }
            }

            // Attach text between << and >> to the << token
            for (int i = 0; i < tokensToReturn.Count; i++)
            {
                if (i == tokensToReturn.Count - 1)
                {
                    // don't bother checking if we're the last token in the line
                    continue;
                }
                var startToken = tokensToReturn[i];
                if (startToken.type == TokenType.BeginCommand)
                {
                    int startIndex = tokensToReturn[i + 1].columnNumber;
                    int endIndex   = -1;
                    // Find the next >> token
                    for (int j = i; j < tokensToReturn.Count; j++)
                    {
                        var endToken = tokensToReturn[j];
                        if (endToken.type == TokenType.EndCommand)
                        {
                            endIndex = endToken.columnNumber;
                            break;
                        }
                    }

                    if (endIndex != -1)
                    {
                        var text = input.Substring(startIndex, endIndex - startIndex);
                        startToken.associatedRawText = text;
                    }
                }
            }

            // Return the list of tokens we found
            return tokensToReturn;
        }
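
Example #2 consults a tokenRules collection whose entries are tried in order. The fields it reads (regex, type, canBeginLine, resetsLine, discard, freeTextMode and freeTextModeException) suggest a rule shape roughly like the sketch below; this is inferred purely from the usage above, and the real TokenRule class may differ:

        // Hypothetical shape of a token rule, reconstructed from how TokeniseLine
        // uses it; field names match the usage, defaults are guesses.
        class TokenRule
        {
            public enum FreeTextMode
            {
                Allowed,      // assumed default: unaffected by free-text mode
                Begins,       // matching this token turns free-text mode on
                Ends,         // matching this token turns free-text mode off
                DoNotMatch    // skipped while free-text mode is on, unless excepted
            }

            public TokenType type;                          // kind of token this rule produces
            public Regex regex;                             // pattern tried at the current column
            public bool canBeginLine = true;                // may this token start a line?
            public bool resetsLine;                         // does matching it re-arm start-of-line rules?
            public bool discard;                            // true for tokens we don't record (e.g. whitespace)
            public FreeTextMode freeTextMode = FreeTextMode.Allowed;
            public Func<TokenList, bool> freeTextModeException; // optional escape hatch in free-text mode
        }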
Example #3
        // Given an input string, parse it and return the list of tokens
        public TokenList Tokenise(string context, string input)
        {
            // The total collection of all tokens in this input
            var tokens = new TokenList();

            // Start by chopping up the input into lines
            var lines = input.Split(new char[] { '\n', '\r' }, StringSplitOptions.None);

            // Keep track of which column each new indent started
            var indentLevels = new Stack<int>();

            // Start at indent 0
            indentLevels.Push(0);

            var lineNum = 0;

            foreach (var line in lines)
            {
                int newIndentLevel;

                // Get the tokens, plus the indentation level of this line
                var lineTokens = TokeniseLine(context, line, out newIndentLevel, lineNum);

                if (newIndentLevel > indentLevels.Peek())
                {
                    // We are now more indented than the last indent.
                    // Emit a "indent" token, and push this new indent onto the stack.
                    var indent = new Token(TokenType.Indent);
                    indent.lineNumber = lineNum;
                    indent.context    = context;
                    tokens.Add(indent);
                    indentLevels.Push(newIndentLevel);
                }
                else if (newIndentLevel < indentLevels.Peek())
                {
                    // We are less indented than the last indent.
                    // We may have dedented more than a single indent level, though, so
                    // check this against all indent levels we know about

                    while (newIndentLevel < indentLevels.Peek())
                    {
                        // We've gone down an indent, holy crap, dedent it!
                        var dedent = new Token(TokenType.Dedent);
                        dedent.lineNumber = lineNum;
                        tokens.Add(dedent);
                        dedent.context = context;
                        indentLevels.Pop();
                    }
                }

                // Add the list of tokens that were in this line
                tokens.AddRange(lineTokens);

                // Update line number
                lineNum++;
            }


            // Emit dedents for any indentation levels that are still open
            // (i.e. we reached the end of the file while still indented).
            // (We stop at the second-last one because we pushed 'indent 0' at the start,
            // and popping that would emit an unbalanced dedent token.)
            while (indentLevels.Count > 1)
            {
                indentLevels.Pop();
                var dedent = new Token(TokenType.Dedent);
                dedent.lineNumber = lineNum;
                dedent.context    = context;
                tokens.Add(dedent);
            }

            // Finish up with an ending token
            tokens.Add(new Token(TokenType.EndOfInput));

            // yay we're done
            return tokens;
        }
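
The indent and dedent bookkeeping in Example #3 works like Python's significant whitespace: the indent columns currently open live on a stack, growing indentation emits a single Indent token, and shrinking indentation pops the stack and emits one Dedent per popped level, with any levels still open at the end of the file closed off before EndOfInput. As a hedged illustration (the per-line tokens are elided because they depend entirely on the tokenRules in play, and "Start" is a made-up context name), a small input produces a stream shaped roughly like this:

        // Illustrative only: where Indent/Dedent tokens appear for a
        // hypothetical three-level input.
        var source = "first line\n" +
                     "    nested line\n" +
                     "        deeper line\n" +
                     "back at the left margin\n";

        var tokens = Tokenise("Start", source);

        // Expected shape of the stream (line tokens elided):
        //   first line               -> ...line tokens...
        //       nested line          -> Indent, ...line tokens...
        //           deeper line      -> Indent, ...line tokens...
        //   back at the left margin  -> Dedent, Dedent, ...line tokens...
        //   (end of input)           -> EndOfInput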