Beispiel #1
0
 /// <summary>
 /// Adds the current character to the in-progress token value and then completes the token.
 /// </summary>
 /// <param name="state">Tokenizer state holding the current character and the in-progress token.</param>
 private static void AppendCharAndCompleteToken(TokenizationState state)
 {
     state.ConsumeCurrentCharacterIntoToken();
     // NOTE(review): the meaning of the second argument (false) is defined by CompleteToken,
     // which is not visible here - confirm against its signature.
     CompleteToken(state, false);
 }
Beispiel #2
0
        /// <summary>
        /// Starts a new token from the current, still-unprocessed character of <paramref name="state"/>.
        /// The character is either handed straight to <c>SaveCurrentCharToNewToken</c> (parentheses,
        /// comma, semicolon, colon, plain operators) or it selects an in-progress tokenization type
        /// that later characters will refine.
        /// </summary>
        /// <param name="state">
        /// Tokenizer state. No tokenization type may be in progress and an unprocessed current
        /// character must be available.
        /// </param>
        /// <exception cref="Exception">
        /// A tokenization type is already in progress, or there is no unprocessed current character.
        /// </exception>
        private static void ProcessOrOpenToken(TokenizationState state)
        {
            // Preconditions: nothing may be in progress, and there must be a character to inspect.
            if (state.CurrentTokenizationType != null)
            {
                throw new Exception("Cannot start a new Token: existing Tokenization Type is not null");
            }
            if (!state.HasUnprocessedCurrentCharacter)
            {
                throw new Exception("Cannot start a new Token: no (outstanding) current character specified!");
            }

            // Every new token starts from an empty value buffer.
            state.CurrentTokenValue.Length = 0;

            var opener = state.CurrentChar;

            // The predicate checks and the literal-character switches below are kept in the
            // same relative order as the classification has always been evaluated in.
            if (IsWhitespace(opener))
            {
                OpenTokenKeepingOpener(state, SqlTokenizationType.WhiteSpace);
                return;
            }

            switch (opener)
            {
                // Opener is withheld for now; the tokenizer loop re-adds it once the
                // following character shows what the token really is (for 'N' it is
                // dropped entirely when an N-string follows).
                case '-': OpenTokenSwallowingOpener(state, SqlTokenizationType.SingleHyphen); return;
                case '$': OpenTokenSwallowingOpener(state, SqlTokenizationType.SingleDollar); return;
                case '/': OpenTokenSwallowingOpener(state, SqlTokenizationType.SingleSlash);  return;
                case 'N': OpenTokenSwallowingOpener(state, SqlTokenizationType.SingleN);      return;
                case '.': OpenTokenSwallowingOpener(state, SqlTokenizationType.SinglePeriod); return;
                case '0': OpenTokenSwallowingOpener(state, SqlTokenizationType.SingleZero);   return;

                // Opening delimiters of "contained" tokens are deliberately not stored in the value.
                case '\'': OpenTokenSwallowingOpener(state, SqlTokenizationType.String);            return;
                case '"':  OpenTokenSwallowingOpener(state, SqlTokenizationType.QuotedString);      return;
                case '[':  OpenTokenSwallowingOpener(state, SqlTokenizationType.BracketQuotedName); return;

                // Single-character tokens.
                case '(': SaveCurrentCharToNewToken(state, SqlTokenType.OpenParens);  return;
                case ')': SaveCurrentCharToNewToken(state, SqlTokenType.CloseParens); return;
                case ',': SaveCurrentCharToNewToken(state, SqlTokenType.Comma);       return;

                // A non-zero digit starts a number and is kept in the value.
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                case '8':
                case '9':
                    OpenTokenKeepingOpener(state, SqlTokenizationType.Number);
                    return;
            }

            if (IsCurrencyPrefix(opener))
            {
                OpenTokenKeepingOpener(state, SqlTokenizationType.MonetaryValue);
                return;
            }

            switch (opener)
            {
                case ';': SaveCurrentCharToNewToken(state, SqlTokenType.Semicolon); return;
                case ':': SaveCurrentCharToNewToken(state, SqlTokenType.Colon);     return;

                // Possible first character of a compound operator: withheld, re-added later.
                case '*': OpenTokenSwallowingOpener(state, SqlTokenizationType.SingleAsterisk);    return;
                case '=': OpenTokenSwallowingOpener(state, SqlTokenizationType.SingleEquals);      return;
                case '<': OpenTokenSwallowingOpener(state, SqlTokenizationType.SingleLT);          return;
                case '>': OpenTokenSwallowingOpener(state, SqlTokenizationType.SingleGT);          return;
                case '!': OpenTokenSwallowingOpener(state, SqlTokenizationType.SingleExclamation); return;
                case '|': OpenTokenSwallowingOpener(state, SqlTokenizationType.SinglePipe);        return;
            }

            if (IsCompoundableOperatorCharacter(opener))
            {
                OpenTokenKeepingOpener(state, SqlTokenizationType.SingleOtherCompoundableOperator);
            }
            else if (IsOperatorCharacter(opener))
            {
                SaveCurrentCharToNewToken(state, SqlTokenType.OtherOperator);
            }
            else
            {
                // Anything unclassified becomes the start of a generic node.
                OpenTokenKeepingOpener(state, SqlTokenizationType.OtherNode);
            }
        }

        // Records the tokenization type and marks the current character as processed
        // WITHOUT adding it to the token value.
        private static void OpenTokenSwallowingOpener(TokenizationState state, SqlTokenizationType tokenizationType)
        {
            state.CurrentTokenizationType        = tokenizationType;
            state.HasUnprocessedCurrentCharacter = false;
        }

        // Records the tokenization type and consumes the current character into the token value.
        private static void OpenTokenKeepingOpener(TokenizationState state, SqlTokenizationType tokenizationType)
        {
            state.CurrentTokenizationType = tokenizationType;
            state.ConsumeCurrentCharacterIntoToken();
        }
Beispiel #3
0
        /// <summary>
        /// Splits the given SQL text into tokens using a character-driven state machine.
        /// The machine's state is <c>state.CurrentTokenizationType</c>: when it is null the current
        /// character opens a new token (via <c>ProcessOrOpenToken</c>); otherwise the current character
        /// either extends, re-classifies or terminates the token in progress.
        /// </summary>
        /// <param name="inputSQL">SQL text to tokenize; passed to the <c>TokenizationState</c> constructor.</param>
        /// <param name="requestedMarkerPosition">
        /// Passed through to the <c>TokenizationState</c> constructor; not otherwise used in this method.
        /// </param>
        /// <returns>
        /// The token container of the tokenization state. Its <c>HasUnfinishedToken</c> flag is set when
        /// the input ends inside a block comment, string, N-string, quoted string or bracket-quoted name.
        /// </returns>
        /// <exception cref="Exception">The in-progress tokenization type is not handled by the state machine.</exception>
        public ITokenList TokenizeSQL(string inputSQL, long?requestedMarkerPosition)
        {
            var state = new TokenizationState(inputSQL, requestedMarkerPosition);

            // Prime the loop with the first character; the loop runs while a character is pending.
            state.ReadNextCharacter();
            while (state.HasUnprocessedCurrentCharacter)
            {
                // No token in progress: the current character opens one (or is saved as a
                // single-character token straight away), then move on to the next character.
                if (state.CurrentTokenizationType == null)
                {
                    ProcessOrOpenToken(state);
                    state.ReadNextCharacter();
                    continue;
                }

                // A token is in progress: decide what the current character means for it.
                // NOTE(review): CompleteTokenAndProcessNext is defined elsewhere; from its use here it
                // presumably finishes the token and then handles the current character as the start
                // of the next one - confirm.
                switch (state.CurrentTokenizationType.Value)
                {
                // Whitespace run: keep accumulating until the first non-whitespace character.
                case SqlTokenizationType.WhiteSpace:
                    if (IsWhitespace(state.CurrentChar))
                    {
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    else
                    {
                        CompleteTokenAndProcessNext(state);
                    }
                    break;

                // The '.' was withheld when the token was opened and is re-added here.
                // Followed by a digit it becomes a decimal value; otherwise it stands alone.
                case SqlTokenizationType.SinglePeriod:
                    if (state.CurrentChar >= '0' && state.CurrentChar <= '9')
                    {
                        state.CurrentTokenizationType = SqlTokenizationType.DecimalValue;
                        state.CurrentTokenValue.Append('.');
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    else
                    {
                        state.CurrentTokenValue.Append('.');
                        CompleteTokenAndProcessNext(state);
                    }
                    break;

                // The '0' was withheld when the token was opened and is re-added here.
                // "0x"/"0X" starts a binary value, another digit continues a number,
                // '.' starts a decimal value; anything else completes the token as just "0".
                case SqlTokenizationType.SingleZero:
                    if (state.CurrentChar == 'x' || state.CurrentChar == 'X')
                    {
                        state.CurrentTokenizationType = SqlTokenizationType.BinaryValue;
                        state.CurrentTokenValue.Append('0');
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    else if (state.CurrentChar >= '0' && state.CurrentChar <= '9')
                    {
                        state.CurrentTokenizationType = SqlTokenizationType.Number;
                        state.CurrentTokenValue.Append('0');
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    else if (state.CurrentChar == '.')
                    {
                        state.CurrentTokenizationType = SqlTokenizationType.DecimalValue;
                        state.CurrentTokenValue.Append('0');
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    else
                    {
                        state.CurrentTokenValue.Append('0');
                        CompleteTokenAndProcessNext(state);
                    }
                    break;

                // Integer digits: 'e'/'E' switches to a float, '.' to a decimal, digits continue.
                case SqlTokenizationType.Number:
                    if (state.CurrentChar == 'e' || state.CurrentChar == 'E')
                    {
                        state.CurrentTokenizationType = SqlTokenizationType.FloatValue;
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    else if (state.CurrentChar == '.')
                    {
                        state.CurrentTokenizationType = SqlTokenizationType.DecimalValue;
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    else if (state.CurrentChar >= '0' && state.CurrentChar <= '9')
                    {
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    else
                    {
                        CompleteTokenAndProcessNext(state);
                    }
                    break;

                // Decimal value: 'e'/'E' switches to a float, digits continue.
                case SqlTokenizationType.DecimalValue:
                    if (state.CurrentChar == 'e' || state.CurrentChar == 'E')
                    {
                        state.CurrentTokenizationType = SqlTokenizationType.FloatValue;
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    else if (state.CurrentChar >= '0' && state.CurrentChar <= '9')
                    {
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    else
                    {
                        CompleteTokenAndProcessNext(state);
                    }
                    break;

                // Float value: digits continue; a '+' or '-' sign is accepted only directly
                // after the exponent marker (i.e. while the upper-cased value ends in "E").
                case SqlTokenizationType.FloatValue:
                    if (state.CurrentChar >= '0' && state.CurrentChar <= '9')
                    {
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    else if ((state.CurrentChar == '-' || state.CurrentChar == '+') && state.CurrentTokenValue.ToString().ToUpper().EndsWith("E"))
                    {
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    else
                    {
                        CompleteTokenAndProcessNext(state);
                    }
                    break;

                // Binary ("0x...") value: hexadecimal digits of either case continue the token.
                case SqlTokenizationType.BinaryValue:
                    if ((state.CurrentChar >= '0' && state.CurrentChar <= '9') ||
                        (state.CurrentChar >= 'A' && state.CurrentChar <= 'F') ||
                        (state.CurrentChar >= 'a' && state.CurrentChar <= 'f')
                        )
                    {
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    else
                    {
                        CompleteTokenAndProcessNext(state);
                    }
                    break;

                // The '$' was withheld when the token was opened and is re-added here.
                // Followed by an ASCII letter it becomes a pseudo-name, otherwise a monetary value.
                // Note that the current character is consumed into the token in both cases,
                // whatever that character is.
                case SqlTokenizationType.SingleDollar:
                    state.CurrentTokenValue.Append('$');

                    if ((state.CurrentChar >= 'A' && state.CurrentChar <= 'Z') ||
                        (state.CurrentChar >= 'a' && state.CurrentChar <= 'z')
                        )
                    {
                        state.CurrentTokenizationType = SqlTokenizationType.PseudoName;
                    }
                    else
                    {
                        state.CurrentTokenizationType = SqlTokenizationType.MonetaryValue;
                    }

                    state.ConsumeCurrentCharacterIntoToken();
                    break;

                // Monetary value: digits continue; '-' is accepted only as the second character
                // (value length 1); '.' is accepted only while the value contains no '.' yet.
                case SqlTokenizationType.MonetaryValue:
                    if (state.CurrentChar >= '0' && state.CurrentChar <= '9')
                    {
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    else if (state.CurrentChar == '-' && state.CurrentTokenValue.Length == 1)
                    {
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    else if (state.CurrentChar == '.' && !state.CurrentTokenValue.ToString().Contains("."))
                    {
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    else
                    {
                        CompleteTokenAndProcessNext(state);
                    }
                    break;

                // A '-' was withheld when the token was opened.
                // "--" starts a single-line comment (neither hyphen is stored), "-=" is completed
                // as a two-character operator, anything else completes "-" on its own.
                case SqlTokenizationType.SingleHyphen:
                    if (state.CurrentChar == '-')
                    {
                        state.CurrentTokenizationType        = SqlTokenizationType.SingleLineComment;
                        state.HasUnprocessedCurrentCharacter = false;     //DISCARDING the hyphen because of weird standard
                    }
                    else if (state.CurrentChar == '=')
                    {
                        state.CurrentTokenizationType = SqlTokenizationType.OtherOperator;
                        state.CurrentTokenValue.Append('-');
                        AppendCharAndCompleteToken(state);
                    }
                    else
                    {
                        state.CurrentTokenizationType = SqlTokenizationType.OtherOperator;
                        state.CurrentTokenValue.Append('-');
                        CompleteTokenAndProcessNext(state);
                    }
                    break;

                // A '/' was withheld when the token was opened.
                // "/*" starts a block comment (nesting depth is incremented), "//" starts a
                // C-style single-line comment, "/=" is completed as a two-character operator,
                // anything else completes "/" on its own.
                case SqlTokenizationType.SingleSlash:
                    if (state.CurrentChar == '*')
                    {
                        state.CurrentTokenizationType        = SqlTokenizationType.BlockComment;
                        state.HasUnprocessedCurrentCharacter = false;     //DISCARDING the asterisk because of weird standard
                        state.CommentNesting++;
                    }
                    else if (state.CurrentChar == '/')
                    {
                        state.CurrentTokenizationType        = SqlTokenizationType.SingleLineCommentCStyle;
                        state.HasUnprocessedCurrentCharacter = false;     //DISCARDING the slash because of weird standard
                    }
                    else if (state.CurrentChar == '=')
                    {
                        state.CurrentTokenizationType = SqlTokenizationType.OtherOperator;
                        state.CurrentTokenValue.Append('/');
                        AppendCharAndCompleteToken(state);
                    }
                    else
                    {
                        state.CurrentTokenizationType = SqlTokenizationType.OtherOperator;
                        state.CurrentTokenValue.Append('/');
                        CompleteTokenAndProcessNext(state);
                    }
                    break;

                // Single-line comments run until a CR (13) or LF (10). The line-break character(s)
                // are appended to the comment token before it is completed.
                case SqlTokenizationType.SingleLineComment:
                case SqlTokenizationType.SingleLineCommentCStyle:
                    if (state.CurrentChar == (char)13 || state.CurrentChar == (char)10)
                    {
                        int nextCharInt = state.InputReader.Peek();
                        // CR immediately followed by LF: take the CR now and read the LF so that
                        // both end up in the token.
                        if (state.CurrentChar == (char)13 && nextCharInt == 10)
                        {
                            state.ConsumeCurrentCharacterIntoToken();
                            state.ReadNextCharacter();
                        }
                        AppendCharAndCompleteToken(state);
                    }
                    else
                    {
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    break;

                // Block comments may nest; state.CommentNesting tracks the depth.
                case SqlTokenizationType.BlockComment:
                    if (state.CurrentChar == '*')
                    {
                        if (state.InputReader.Peek() == (int)'/')
                        {
                            // "*/" closes one nesting level.
                            state.CommentNesting--;
                            if (state.CommentNesting > 0)
                            {
                                // Still inside an outer comment: keep the inner "*/" as part of the value.
                                state.ConsumeCurrentCharacterIntoToken();
                                state.ReadNextCharacter();
                                state.ConsumeCurrentCharacterIntoToken();
                            }
                            else
                            {
                                // Outermost comment closed: neither the '*' nor the '/' is stored.
                                state.HasUnprocessedCurrentCharacter = false;     //discarding the asterisk
                                state.ReadNextCharacter();
                                //TODO: DANGER DANGER why do "contained" token types have this inconsistent handling where the delimiters are not in the value???
                                SwallowOutstandingCharacterAndCompleteToken(state);
                            }
                        }
                        else
                        {
                            state.ConsumeCurrentCharacterIntoToken();
                        }
                    }
                    else
                    {
                        if (state.CurrentChar == '/' && state.InputReader.Peek() == (int)'*')
                        {
                            // Nested "/*": keep both characters in the value and go one level deeper.
                            state.ConsumeCurrentCharacterIntoToken();
                            state.ReadNextCharacter();
                            state.ConsumeCurrentCharacterIntoToken();
                            state.CommentNesting++;
                        }
                        else
                        {
                            state.ConsumeCurrentCharacterIntoToken();
                        }
                    }
                    break;

                // Generic nodes and pseudo-names end at the first character that
                // IsNonWordCharacter reports as a non-word character.
                case SqlTokenizationType.OtherNode:
                case SqlTokenizationType.PseudoName:
                    if (IsNonWordCharacter(state.CurrentChar))
                    {
                        CompleteTokenAndProcessNext(state);
                    }
                    else
                    {
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    break;

                // An 'N' was withheld when the token was opened.
                // "N'" starts an N-string (neither the N nor the apostrophe is stored).
                case SqlTokenizationType.SingleN:
                    if (state.CurrentChar == '\'')
                    {
                        state.CurrentTokenizationType        = SqlTokenizationType.NString;
                        state.HasUnprocessedCurrentCharacter = false;     //DISCARDING the apostrophe because of weird standard
                    }
                    else
                    {
                        if (IsNonWordCharacter(state.CurrentChar))
                        {
                            // NOTE(review): the 'N' is not re-added to the value on this path;
                            // presumably token completion supplies it for SingleN - confirm.
                            CompleteTokenAndProcessNext(state);
                        }
                        else
                        {
                            // The 'N' was just the first letter of an ordinary word.
                            state.CurrentTokenizationType = SqlTokenizationType.OtherNode;
                            state.CurrentTokenValue.Append('N');
                            state.ConsumeCurrentCharacterIntoToken();
                        }
                    }
                    break;

                // Apostrophe-delimited strings: a doubled apostrophe is an escaped apostrophe,
                // a single one ends the string.
                case SqlTokenizationType.NString:
                case SqlTokenizationType.String:
                    if (state.CurrentChar == '\'')
                    {
                        if (state.InputReader.Peek() == (int)'\'')
                        {
                            //add the character (once)
                            state.ConsumeCurrentCharacterIntoToken();

                            //throw away the second character... because (for some reason?) we're storing the "effective value" rather than the raw token...
                            state.DiscardNextCharacter();
                        }
                        else
                        {
                            //TODO: DANGER DANGER why do "contained" token types have this inconsistent handling where the delimiters are not in the value???
                            SwallowOutstandingCharacterAndCompleteToken(state);
                        }
                    }
                    else
                    {
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    break;

                // Double-quote-delimited strings: same escaping scheme, with '"' as the delimiter.
                case SqlTokenizationType.QuotedString:
                    if (state.CurrentChar == '"')
                    {
                        if (state.InputReader.Peek() == (int)'"')
                        {
                            //add the character (once)
                            state.ConsumeCurrentCharacterIntoToken();

                            //throw away the second character... because (for some reason?) we're storing the "effective value" rather than the raw token...
                            state.DiscardNextCharacter();
                        }
                        else
                        {
                            //TODO: DANGER DANGER why do "contained" token types have this inconsistent handling where the delimiters are not in the value???
                            SwallowOutstandingCharacterAndCompleteToken(state);
                        }
                    }
                    else
                    {
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    break;

                // Bracket-quoted names: "]]" is an escaped ']', a single ']' ends the name.
                case SqlTokenizationType.BracketQuotedName:
                    if (state.CurrentChar == ']')
                    {
                        if (state.InputReader.Peek() == (int)']')
                        {
                            //add the character (once)
                            state.ConsumeCurrentCharacterIntoToken();

                            //throw away the second character... because (for some reason?) we're storing the "effective value" rather than the raw token...
                            state.DiscardNextCharacter();
                        }
                        else
                        {
                            //TODO: DANGER DANGER why do "contained" token types have this inconsistent handling where the delimiters are not in the value???
                            SwallowOutstandingCharacterAndCompleteToken(state);
                        }
                    }
                    else
                    {
                        state.ConsumeCurrentCharacterIntoToken();
                    }
                    break;

                // '<' was withheld on open and is re-added here; "<=", "<>" and "<<" are
                // completed as two-character operators.
                case SqlTokenizationType.SingleLT:
                    state.CurrentTokenValue.Append('<');
                    state.CurrentTokenizationType = SqlTokenizationType.OtherOperator;
                    if (state.CurrentChar == '=' || state.CurrentChar == '>' || state.CurrentChar == '<')
                    {
                        AppendCharAndCompleteToken(state);
                    }
                    else
                    {
                        CompleteTokenAndProcessNext(state);
                    }
                    break;

                // '>' was withheld on open and is re-added here; ">=" and ">>" are
                // completed as two-character operators.
                case SqlTokenizationType.SingleGT:
                    state.CurrentTokenValue.Append('>');
                    state.CurrentTokenizationType = SqlTokenizationType.OtherOperator;
                    if (state.CurrentChar == '=' || state.CurrentChar == '>')
                    {
                        AppendCharAndCompleteToken(state);
                    }
                    else
                    {
                        CompleteTokenAndProcessNext(state);
                    }
                    break;

                // '*' was withheld on open and is re-added here; "*=" is completed as an operator.
                // A lone '*' is completed with the tokenization type still SingleAsterisk.
                case SqlTokenizationType.SingleAsterisk:
                    state.CurrentTokenValue.Append('*');
                    if (state.CurrentChar == '=')
                    {
                        state.CurrentTokenizationType = SqlTokenizationType.OtherOperator;
                        AppendCharAndCompleteToken(state);
                    }
                    else
                    {
                        CompleteTokenAndProcessNext(state);
                    }
                    break;

                // The operator character is already in the value (it was consumed on open);
                // a following '=' makes it a two-character operator.
                case SqlTokenizationType.SingleOtherCompoundableOperator:
                    state.CurrentTokenizationType = SqlTokenizationType.OtherOperator;
                    if (state.CurrentChar == '=')
                    {
                        AppendCharAndCompleteToken(state);
                    }
                    else
                    {
                        CompleteTokenAndProcessNext(state);
                    }
                    break;

                // '|' was withheld on open and is re-added here; "|=" and "||" are
                // completed as two-character operators.
                case SqlTokenizationType.SinglePipe:
                    state.CurrentTokenizationType = SqlTokenizationType.OtherOperator;
                    state.CurrentTokenValue.Append('|');
                    if (state.CurrentChar == '=' || state.CurrentChar == '|')
                    {
                        AppendCharAndCompleteToken(state);
                    }
                    else
                    {
                        CompleteTokenAndProcessNext(state);
                    }
                    break;

                // '=' was withheld on open and is re-added here; "==" is completed as one token.
                // The tokenization type stays SingleEquals in both branches.
                case SqlTokenizationType.SingleEquals:
                    state.CurrentTokenValue.Append('=');
                    if (state.CurrentChar == '=')
                    {
                        AppendCharAndCompleteToken(state);
                    }
                    else
                    {
                        CompleteTokenAndProcessNext(state);
                    }
                    break;

                // '!' was withheld on open and is re-added here; "!=", "!<" and "!>" are operators,
                // a lone '!' is completed as a generic node.
                case SqlTokenizationType.SingleExclamation:
                    state.CurrentTokenValue.Append('!');
                    if (state.CurrentChar == '=' || state.CurrentChar == '<' || state.CurrentChar == '>')
                    {
                        state.CurrentTokenizationType = SqlTokenizationType.OtherOperator;
                        AppendCharAndCompleteToken(state);
                    }
                    else
                    {
                        state.CurrentTokenizationType = SqlTokenizationType.OtherNode;
                        CompleteTokenAndProcessNext(state);
                    }
                    break;

                default:
                    throw new Exception("In-progress node unrecognized!");
                }

                state.ReadNextCharacter();
            }


            // Input exhausted while a token is still in progress.
            if (state.CurrentTokenizationType != null)
            {
                // Delimited ("contained") token types that never saw their closing delimiter
                // are reported to the caller through HasUnfinishedToken.
                if (state.CurrentTokenizationType.Value == SqlTokenizationType.BlockComment ||
                    state.CurrentTokenizationType.Value == SqlTokenizationType.String ||
                    state.CurrentTokenizationType.Value == SqlTokenizationType.NString ||
                    state.CurrentTokenizationType.Value == SqlTokenizationType.QuotedString ||
                    state.CurrentTokenizationType.Value == SqlTokenizationType.BracketQuotedName
                    )
                {
                    state.TokenContainer.HasUnfinishedToken = true;
                }

                // Complete whatever is pending so the trailing token is not lost.
                SwallowOutstandingCharacterAndCompleteToken(state);
            }

            return(state.TokenContainer);
        }