/// <summary>
/// Tokenizes the specified script.
/// </summary>
/// <remarks>
/// Steps, in order: delimited tokens are extracted by <c>TokenizeDelimited</c>;
/// every token still typed <see cref="TokenType.Unknown"/> is split on whitespace
/// and around each operator/punctuation character; empty tokens and the "N" marker
/// that directly precedes a string value are dropped; comments are removed; tokens
/// are categorized; and finally <c>CreateTree</c> associates them with each other.
/// <c>DisplayHTMLParseStep</c> is called after each step with the current set.
/// </remarks>
/// <param name="script">The script.</param>
/// <returns>The token set returned by <c>CreateTree</c>.</returns>
public static TokenSet Tokenize(string script)
{
    parserFile = -1;

    Token seed = new Token(script, TokenType.Unknown, 0);
    TokenSet start = new TokenSet();
    start.Add(seed);
    DisplayHTMLParseStep(start, "Starting set", true);

    //pull out tokens for comments, strings, escaped names, etc
    TokenSet escapedTokens = TokenizeDelimited(seed);

    //pull apart everything else on whitespace
    //(both character sets are loop-invariant, so build them once)
    char[] punctuation = (OPERATORS + ",.();").ToCharArray();
    char[] whitespace = " \t\r\n".ToCharArray();

    TokenSet whitespaceTokens = new TokenSet();
    foreach (Token token in escapedTokens)
    {
        if (token.Type != TokenType.Unknown)
        {
            //already identified by TokenizeDelimited - pass through untouched
            whitespaceTokens.Add(token);
            continue;
        }

        //make life easier by creating optional whitespace
        string spread = token.Value;
        foreach (char op in punctuation)
        {
            spread = spread.Replace(op.ToString(), " " + op + " ");
        }

        int offset = 0;
        foreach (string piece in spread.Split(whitespace))
        {
            if (piece.Length == 0)
            {
                continue;
            }

            //Ordinal: every piece is a literal substring of token.Value, so an
            //ordinal search always finds it. A culture-sensitive search can report
            //-1 or a wrong index for some characters, which would corrupt the
            //computed start index and could make the next search throw.
            offset = token.Value.IndexOf(piece, offset, StringComparison.Ordinal);
            whitespaceTokens.Add(new Token(piece, TokenType.Unknown, token.StartIndex + offset));
            offset += piece.Length; //don't find the same text twice if it's repeated
        }
    }

    //remove bogus tokens
    TokenSet finalTokens = new TokenSet();
    TokenEnumerator enumerator = whitespaceTokens.GetEnumerator();
    while (enumerator.MoveNext())
    {
        //empty tokens
        if (enumerator.Current.Value == "")
        {
            continue;
        }

        //bogus unicode string markings
        if (enumerator.Current.Value == "N"
            && enumerator.Next != null
            && enumerator.Next.Type == TokenType.StringValue)
        {
            continue;
        }

        finalTokens.Add(enumerator.Current);
    }
    DisplayHTMLParseStep(finalTokens, "After empty tokens removed", false);

    //comments gum things up
    RemoveComments(finalTokens);
    DisplayHTMLParseStep(finalTokens, "After comments removed", false);

    //Categorization
    IdentifySpecialTokens(finalTokens);
    DisplayHTMLParseStep(finalTokens, "After identifying special tokens", false);
    IdentifyRemainingTokens(finalTokens);
    DisplayHTMLParseStep(finalTokens, "After identifying remaining tokens", false);

    //associate the tokens with each other
    finalTokens = CreateTree(finalTokens);
    DisplayHTMLParseStep(finalTokens, "After tree creation", false);

    return finalTokens;
}
/// <summary>
/// Nests operands underneath their operator tokens, recursing into child sets.
/// </summary>
/// <remarks>
/// For each childless operator token, the preceding token (unless the operator is
/// "!" or "~") and the following token(s) are moved from <paramref name="tokens"/>
/// into the operator's <c>Children</c>. Following tokens keep being absorbed for
/// as long as the token just absorbed is itself an operator.
/// </remarks>
/// <param name="tokens">The token set to restructure in place.</param>
private static void CreateTree_Operator(TokenSet tokens)
{
    TokenEnumerator cursor = tokens.GetEnumerator();
    while (cursor.MoveNext())
    {
        Token op = cursor.Current;

        //anything that is not a fresh (childless) operator just gets its children processed
        bool isFreshOperator = op.Type == TokenType.Operator && op.Children.Count == 0;
        if (!isFreshOperator)
        {
            CreateTree_Operator(op.Children);
            continue;
        }

        //"!" and "~" never take a left-hand operand
        Token left = null;
        if (cursor.Previous != null && !(op.Value == "!" || op.Value == "~"))
        {
            left = cursor.Previous;
        }
        Token right = cursor.Next;

        if (left != null)
        {
            //don't bury keywords in the tree
            if (left.Type == TokenType.Keyword || left.Type == TokenType.Separator)
            {
                CreateTree_Operator(op.Children);
                continue;
            }

            //comments, keywords and string values are not taken as a left operand
            bool rejected = left.Type == TokenType.Comment
                || left.Type == TokenType.Keyword
                || left.Type == TokenType.StringValue;
            if (rejected)
            {
                left = null;
            }
            else
            {
                //only add here - removing it now would disturb the pickup of the right operand
                op.Children.Add(left);
            }
        }

        //absorb the right operand; keep going while what was absorbed is itself an operator
        while (right != null)
        {
            op.Children.Add(right);
            cursor.RemoveNext();
            right = right.Type == TokenType.Operator ? cursor.Next : null;
        }

        //now it is safe to detach the left operand from the flat list
        if (left != null)
        {
            cursor.RemovePrevious();
        }

        //make a tree of those children too
        CreateTree_Operator(op.Children);
    }
}
/// <summary>
/// Assigns a <see cref="TokenType"/> to tokens that can be recognized by their
/// text alone, then repairs misidentified tokens and merges adjacent tokens.
/// </summary>
/// <remarks>
/// Three passes are made over <paramref name="tokens"/>:
/// 1. every token still typed <see cref="TokenType.Unknown"/> is classified as an
///    operator, variable, dot, semicolon, group begin/end, case statement,
///    keyword, separator or quote where its text matches;
/// 2. "begin" followed by a "tran..." keyword is re-typed as a keyword, and a
///    group-begin token sitting between two quote tokens becomes a string value;
/// 3. walking backwards, adjacent operators are merged into one token and
///    dot-separated names are merged into a single token.
/// All text matching is culture-invariant/ordinal so results do not depend on the
/// current thread culture (e.g. "IF" must match "if" under a Turkish culture).
/// </remarks>
/// <param name="tokens">The token set to classify in place.</param>
private static void IdentifySpecialTokens(TokenSet tokens)
{
    //identify the tokens
    foreach (Token token in tokens)
    {
        //only work on unidentified tokens
        if (token.Type != TokenType.Unknown)
        {
            continue;
        }

        //identify the operators (ordinal so ignorable characters can't match)
        if (token.Value.Length == 1 && OPERATORS.IndexOf(token.Value, StringComparison.Ordinal) > -1)
        {
            token.Type = TokenType.Operator;
            continue;
        }

        //identify variables
        if (token.Value.StartsWith("@", StringComparison.Ordinal))
        {
            token.Type = TokenType.Variable;
            continue;
        }

        //pull other types
        switch (token.Value.ToLowerInvariant())
        {
            case ".":
                token.Type = TokenType.Dot;
                break;

            case ";":
                token.Type = TokenType.Semicolon;
                break;

            case "begin":
            case "(":
                token.Type = TokenType.GroupBegin;
                break;

            case "case":
                token.Type = TokenType.CaseStatement;
                break;

            case "end":
            case ")":
                token.Type = TokenType.GroupEnd;
                break;

            case "add":
            case "alter":
            case "and":
            case "clustered":
            case "collate":
            case "constraint":
            case "create":
            case "default":
            case "drop":
            case "else":
            case "exists":
            case "for":
            case "from":
            case "function":
            case "go":
            case "identity":
            case "if":
            case "in":
            case "index":
            case "is":
            case "key":
            case "nocheck":
            case "nonclustered":
            case "not":
            case "null":
            case "on":
            case "or":
            case "proc":
            case "procedure":
            case "primary":
            case "select":
            case "table":
            case "tran":
            case "transaction":
            case "trigger":
            case "then":
            case "unique":
            case "view":
            case "when":
            case "where":
            case "with":
                token.Type = TokenType.Keyword;
                break;

            case ",":
                token.Type = TokenType.Separator;
                break;

            case "'":
            case "\"":
                token.Type = TokenType.Quote;
                break;
        }
    }

    //fix misidentified tokens
    TokenEnumerator enumerator = tokens.GetEnumerator();
    while (enumerator.MoveNext())
    {
        Token previous = enumerator.Previous;
        Token current = enumerator.Current;
        Token next = enumerator.Next;

        //both fix-ups need a group-begin token with something after it; without
        //the null check a trailing "begin" or "(" dereferenced a null next token
        if (current.Type != TokenType.GroupBegin || next == null)
        {
            continue;
        }

        if (next.Type == TokenType.Keyword
            && string.Equals(current.Value, "begin", StringComparison.OrdinalIgnoreCase)
            && next.Value.StartsWith("tran", StringComparison.OrdinalIgnoreCase))
        {
            //"begin tran[saction]" is a statement, not the start of a block
            current.Type = TokenType.Keyword;
        }
        else if (previous != null
            && previous.Type == TokenType.Quote
            && next.Type == TokenType.Quote)
        {
            //a group-begin token between two quotes is really a string value
            current.Type = TokenType.StringValue;
        }
    }

    //coalesce operators and things with dots
    enumerator = tokens.GetEnumerator();
    enumerator.MoveLast();
    while (enumerator.MovePrevious())
    {
        Token previous = enumerator.Previous;
        Token current = enumerator.Current;
        Token next = enumerator.Next;

        //do the coalesce but don't screw up +5 * -2
        if (next != null
            && current.Type == TokenType.Operator
            && next.Type == TokenType.Operator
            && next.Value != "-"
            && next.Value != "+")
        {
            current.Value = current.Value + next.Value;
            enumerator.RemoveNext();
            continue;
        }

        if (current.Type == TokenType.Dot
            && previous != null
            && next != null
            && (previous.Type == TokenType.Unknown || previous.Type == TokenType.Identifier || previous.Type == TokenType.Dot)
            && (next.Type == TokenType.Unknown || next.Type == TokenType.Identifier || next.Type == TokenType.Dot))
        {
            //fold "left . right" into the dot token, which takes over the left start index
            current.StartIndex = previous.StartIndex;
            current.Value = previous.FlattenTree() + "." + next.FlattenTree();
            enumerator.RemovePrevious();
            enumerator.RemoveNext();
            continue;
        }
    }
}
/// <summary>
/// Moves the contents of each grouping construct underneath its opening token.
/// </summary>
/// <remarks>
/// Starting at every <see cref="TokenType.GroupBegin"/> token, following tokens are
/// removed from <paramref name="tokens"/> and added as children of the innermost
/// open group until the matching <see cref="TokenType.GroupEnd"/> is consumed.
/// A group (or case statement) that directly follows a childless identifier or
/// unknown token is placed under that token instead.
/// </remarks>
/// <param name="tokens">The token set to restructure in place.</param>
/// <exception cref="ApplicationException">
/// The tokens run out before every open group has been closed.
/// </exception>
private static void CreateTree_Grouping(TokenSet tokens)
{
    TokenEnumerator cursor = tokens.GetEnumerator();
    while (cursor.MoveNext())
    {
        Token opener = cursor.Current;

        //only a group opener starts any work
        if (opener.Type != TokenType.GroupBegin)
        {
            continue;
        }

        //stack of groups that are currently open, innermost on top
        Stack<Token> openGroups = new Stack<Token>();
        openGroups.Push(opener);

        //push the group under its predecessor for functions...
        Token predecessor = cursor.Previous;
        if (predecessor != null
            && predecessor.Children.Count == 0
            && (predecessor.Type == TokenType.Identifier || predecessor.Type == TokenType.Unknown))
        {
            predecessor.Children.Add(opener);
            cursor.RemoveCurrent();
        }

        //pull in all children until every open group has been closed
        while (openGroups.Count > 0)
        {
            cursor.MoveNext();

            Token child = cursor.IsValid ? cursor.Current : null;
            if (child == null)
            {
                throw new ApplicationException("Unclosed " + openGroups.Peek().Value);
            }
            cursor.RemoveCurrent();

            Token owner = openGroups.Peek();
            bool opensGroup = child.Type == TokenType.GroupBegin || child.Type == TokenType.CaseStatement;

            Token tail = null;
            if (owner.Children.Count > 0)
            {
                tail = owner.Children.Last;
            }

            bool tailHostsGroup = tail != null
                && tail.Children.Count == 0
                && (tail.Type == TokenType.Identifier || tail.Type == TokenType.Unknown)
                && opensGroup;

            if (tailHostsGroup)
            {
                //push the group under its predecessor for functions...
                tail.Children.Add(child);
            }
            else
            {
                owner.Children.Add(child);
            }

            if (opensGroup)
            {
                openGroups.Push(child);
            }
            else if (child.Type == TokenType.GroupEnd)
            {
                openGroups.Pop();
            }
        }
    }
}