Example #1
        private static TokenSet TokenizeDelimited(Token source)
        {
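            //Parallel arrays: entry i pairs a start delimiter with its end
            //delimiter, the escape sequence that does not count as a match,
            //and the token type to assign once the token is closed.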
            string[]    startDelimiters = { "/*", "--", "'", "[" };
            string[]    endDelimiters   = { "*/", "\n", "'", "]" };
            string[]    noMatches       = { null, null, "''", null };
            TokenType[] tokenTypes      = { TokenType.Comment, TokenType.Comment, TokenType.StringValue, TokenType.Identifier };

            string startDelimiter = "START";       //placeholder; replaced once a start delimiter matches

            int[]     startFound   = new int[startDelimiters.Length];
            string    endDelimiter = "END";        //placeholder; replaced once a start delimiter matches
            string    noMatch      = null;
            TokenType tokenType    = TokenType.Unknown;

            for (int i = 0; i < startDelimiters.Length; i++)
            {
                startFound[i] = source.Value.IndexOf(startDelimiters[i]);
            }

            TokenSet tokens       = new TokenSet();
            int      offset       = 0;
            int      skipped      = 0;
            int      start        = 0;
            bool     tokenStarted = false;

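            //Scan the source, alternating between searching for the next start
            //delimiter and searching for the matching end delimiter.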
            while (start < source.Value.Length)
            {
                int found = -1;
                if (tokenStarted)
                {
                    found = source.Value.IndexOf(endDelimiter, start + skipped);
                }
                else
                {
                    for (int i = 0; i < startDelimiters.Length; i++)
                    {
                        if (startFound[i] != -1 && startFound[i] < start)
                        {
                            startFound[i] = source.Value.IndexOf(startDelimiters[i], start);
                        }
                    }

                    //each of these constructs escapes everything inside it, so match whichever one starts first
                    for (int i = 0; i < startDelimiters.Length; i++)
                    {
                        int justFound = startFound[i];
                        if (justFound >= 0 && (found < 0 || justFound < found))
                        {
                            found          = justFound;
                            startDelimiter = startDelimiters[i];
                            endDelimiter   = endDelimiters[i];
                            noMatch        = noMatches[i];
                            tokenType      = tokenTypes[i];
                        }
                    }
                }

                string    token;
                TokenType addingType;
                if (found < 0)
                {
                    if (endDelimiter == "\n")
                    {
                        //a line comment may end at end-of-input, so the closing newline is optional
                        addingType = tokenStarted ? tokenType : TokenType.Unknown;
                        if (tokenStarted)
                        {
                            start--;           //back up one so the token keeps the full start delimiter
                        }
                        tokenStarted = false;
                    }
                    else
                    {
                        addingType = TokenType.Unknown;
                    }
                    token = source.Value.Substring(start);
                    start = source.Value.Length;
                }
                else if (tokenStarted)
                {
                    //don't split on escape sequences (e.g. '' inside a string) that belong in the middle of the token
                    int skipLength = ShouldSkip(source, found, noMatch);
                    if (skipLength > 0)
                    {
                        skipped = found - start + skipLength;
                        continue;
                    }

                    token = source.Value.Substring(start - 1,
                                                   found - start + 1 + endDelimiter.Length);
                    addingType   = tokenType;
                    start        = found + endDelimiter.Length;
                    tokenStarted = false;

                    //do some validation
                    if (!token.StartsWith(startDelimiter) || !token.EndsWith(endDelimiter))
                    {
                        throw new ApplicationException("Bad tokenizing!");
                    }
                }
                else
                {
                    token        = source.Value.Substring(start, found - start);
                    addingType   = TokenType.Unknown;
                    start        = found + 1;
                    tokenStarted = true;
                }
                offset = source.Value.IndexOf(token, offset);
                tokens.Add(new Token(token, addingType, source.StartIndex + offset));
                offset += token.Length;               //don't find the same text twice if it's repeated
                skipped = 0;
            }
            if (tokenStarted)
            {
                throw new ApplicationException("Tokenizer error, found \"" + startDelimiter
                                               + "\" without \"" + endDelimiter + "\".");
            }

            return(tokens);
        }
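
For illustration, the sketch below shows what TokenizeDelimited returns for a small input. It is not part of the original source: the sample SQL string is made up, the helper must live in the same class because TokenizeDelimited is private, and the expected output assumes ShouldSkip returns the length of the escape sequence (here the doubled quote '') so the scan resumes past it.

        private static void DemoTokenizeDelimited()
        {
            //hypothetical input mixing a string with an escaped quote, a
            //bracketed identifier, and a line comment
            Token seed = new Token("SELECT 'a''b' FROM [t] --done\n", TokenType.Unknown, 0);

            foreach (Token token in TokenizeDelimited(seed))
            {
                //expected: Unknown "SELECT ", StringValue "'a''b'", Unknown " FROM ",
                //Identifier "[t]", Unknown " ", Comment "--done\n"
                Console.WriteLine(token.Type + ": " + token.Value);
            }
        }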
Example #2
        /// <summary>
        /// Tokenizes the specified script.
        /// </summary>
        /// <param name="script">The SQL script to tokenize.</param>
        /// <returns>The categorized tokens, arranged into a parse tree.</returns>
        public static TokenSet Tokenize(string script)
        {
            parserFile = -1;

            Token    seed  = new Token(script, TokenType.Unknown, 0);
            TokenSet start = new TokenSet();

            start.Add(seed);
            DisplayHTMLParseStep(start, "Starting set", true);

            //pull out tokens for comments, strings, escaped names, etc
            TokenSet escapedTokens = TokenizeDelimited(seed);

            //pull apart everything else on whitespace
            TokenSet whitespaceTokens = new TokenSet();

            foreach (Token token in escapedTokens)
            {
                if (token.Type == TokenType.Unknown)
                {
                    //operators may appear without surrounding whitespace, so pad them with spaces before splitting
                    string spread = token.Value;
                    foreach (char op in (OPERATORS + ",.();").ToCharArray())
                    {
                        spread = spread.Replace(op.ToString(), " " + op + " ");
                    }
                    int offset = 0;
                    foreach (string piece in spread.Split(" \t\r\n".ToCharArray()))
                    {
                        if (piece.Length == 0)
                        {
                            continue;
                        }

                        offset = token.Value.IndexOf(piece, offset);
                        whitespaceTokens.Add(new Token(piece, TokenType.Unknown, token.StartIndex + offset));
                        offset += piece.Length;                        //don't find the same text twice if it's repeated
                    }
                }
                else
                {
                    whitespaceTokens.Add(token);
                }
            }

            //remove bogus tokens
            TokenSet        finalTokens = new TokenSet();
            TokenEnumerator enumerator  = whitespaceTokens.GetEnumerator();

            while (enumerator.MoveNext())
            {
                //empty tokens
                if (enumerator.Current.Value == "")
                {
                    continue;
                }

                //drop the N prefix that marks Unicode string literals (N'...')
                if (enumerator.Current.Value == "N" && enumerator.Next != null && enumerator.Next.Type == TokenType.StringValue)
                {
                    continue;
                }

                finalTokens.Add(enumerator.Current);
            }
            DisplayHTMLParseStep(finalTokens, "After empty tokens removed", false);

            //comments gum things up
            RemoveComments(finalTokens);
            DisplayHTMLParseStep(finalTokens, "After comments removed", false);

            //Categorization
            IdentifySpecialTokens(finalTokens);
            DisplayHTMLParseStep(finalTokens, "After identifying special tokens", false);

            IdentifyRemainingTokens(finalTokens);
            DisplayHTMLParseStep(finalTokens, "After identifying remaining tokens", false);

            //associate the tokens with each other
            finalTokens = CreateTree(finalTokens);
            DisplayHTMLParseStep(finalTokens, "After tree creation", false);

            return(finalTokens);
        }
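
A call site might look like the following minimal sketch; the SQL statement is hypothetical, and since CreateTree has already run, the foreach only walks the top level of the resulting token tree.

        public static void PrintTokens()
        {
            //hypothetical input; comments are stripped and the rest categorized
            TokenSet tokens = Tokenize("SELECT Name FROM [Users] WHERE Id = 1 --first\n");

            foreach (Token token in tokens)
            {
                Console.WriteLine(token.StartIndex + "\t" + token.Type + "\t" + token.Value);
            }
        }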