/// <summary>
/// Splits <paramref name="source"/> into delimited tokens (block comments, line
/// comments, string literals, bracketed identifiers) plus the Unknown runs of text
/// between them. Parallel arrays below pair each start delimiter with its end
/// delimiter, its "ignore this inside the token" sequence (e.g. '' inside a
/// string), and the TokenType to emit.
/// </summary>
/// <param name="source">The token whose Value is scanned; StartIndex offsets the emitted tokens.</param>
/// <returns>A TokenSet covering all of source.Value, in order.</returns>
/// <exception cref="ApplicationException">Thrown on internal validation failure or an unterminated delimiter.</exception>
private static TokenSet TokenizeDelimited(Token source) {
    string[] startDelimiters = { "/*", "--", "'", "[" };
    string[] endDelimiters = { "*/", "\n", "'", "]" };
    //sequence that does NOT end the token even though it contains the end delimiter ('' escapes ' in SQL strings)
    string[] noMatches = { null, null, "''", null };
    TokenType[] tokenTypes = { TokenType.Comment, TokenType.Comment, TokenType.StringValue, TokenType.Identifier };
    //placeholder values; overwritten before use when a delimiter is actually found
    string startDelimiter = "START";
    int[] startFound = new int[startDelimiters.Length];
    string endDelimiter = "END";
    string noMatch = null;
    TokenType tokenType = TokenType.Unknown;
    //prime the per-delimiter search cache with each delimiter's first occurrence (-1 = absent)
    for (int i = 0; i < startDelimiters.Length; i++) {
        startFound[i] = source.Value.IndexOf(startDelimiters[i]);
    }
    TokenSet tokens = new TokenSet();
    int offset = 0;       //running position used to compute each emitted token's absolute StartIndex
    int skipped = 0;      //extra distance to skip past a noMatch escape when re-searching for the end delimiter
    int start = 0;        //scan position within source.Value
    bool tokenStarted = false; //true while we are inside a delimited token looking for its end
    while (start < source.Value.Length) {
        int found = -1;
        if (tokenStarted) {
            //inside a delimited token: only its end delimiter matters
            //NOTE(review): search begins at start + skipped where start = (delimiter pos) + 1,
            //so for the two-char "/*" the search window overlaps the delimiter's own '*';
            //input like "/*/" appears to match "*/" at that overlap — confirm intended.
            found = source.Value.IndexOf(endDelimiter, start + skipped);
        } else {
            //refresh any cached positions that the scan has already passed
            for (int i = 0; i < startDelimiters.Length; i++) {
                if (startFound[i] != -1 && startFound[i] < start) {
                    startFound[i] = source.Value.IndexOf(startDelimiters[i], start);
                }
            }
            //These things all escape whatever is found within them, so find the first one
            for (int i = 0; i < startDelimiters.Length; i++) {
                int justFound = startFound[i];
                if (justFound >= 0 && (found < 0 || justFound < found)) {
                    found = justFound;
                    startDelimiter = startDelimiters[i];
                    endDelimiter = endDelimiters[i];
                    noMatch = noMatches[i];
                    tokenType = tokenTypes[i];
                }
            }
        }
        string token;
        TokenType addingType;
        if (found < 0) {
            //nothing (more) found: the rest of the text is one final token
            if (endDelimiter == "\n") {
                //newlines are actually optional
                addingType = tokenStarted ? tokenType : TokenType.Unknown;
                tokenStarted = false;
            } else {
                addingType = TokenType.Unknown;
            }
            token = source.Value.Substring(start, source.Value.Length - start);
            start = source.Value.Length;
        } else if (tokenStarted) {
            //don't split on things we should leave in the middle
            int skipLength = ShouldSkip(source, found, noMatch);
            if (skipLength > 0) {
                //the match was an escape (e.g. '' inside a string): remember how far to
                //jump on the retry; `continue` re-runs the end-delimiter search
                skipped = found - start + skipLength;
                continue;
            }
            //start - 1 backs up onto the start delimiter's first char (start was advanced
            //by exactly 1 past it below), so the token includes both delimiters
            token = source.Value.Substring(start - 1, found - start + 1 + endDelimiter.Length);
            addingType = tokenType;
            start = found + endDelimiter.Length;
            tokenStarted = false;
            //do some validation
            if (!token.StartsWith(startDelimiter) || !token.EndsWith(endDelimiter)) {
                throw new ApplicationException("Bad tokenizing!");
            }
        } else {
            //plain text up to the next start delimiter becomes an Unknown token,
            //then we step one char into the delimiter and switch to end-seeking mode
            token = source.Value.Substring(start, found - start);
            addingType = TokenType.Unknown;
            start = found + 1;
            tokenStarted = true;
        }
        //locate this token's absolute position (offset advances so repeats resolve correctly)
        offset = source.Value.IndexOf(token, offset);
        tokens.Add(new Token(token, addingType, source.StartIndex + offset));
        offset += token.Length;
        //don't find the same text twice if it's repeated
        skipped = 0;
    }
    if (tokenStarted) {
        throw new ApplicationException("Tokenizer error, found \"" + startDelimiter + "\" without \"" + endDelimiter + "\".");
    }
    return(tokens);
}
/// <summary>
/// Tokenizes the specified script: extracts delimited tokens (comments, strings,
/// bracketed identifiers), splits the remainder on whitespace and operators,
/// filters out noise tokens, classifies what is left, and builds the token tree.
/// </summary>
/// <param name="script">The script.</param>
/// <returns>The fully categorized token tree for the script.</returns>
public static TokenSet Tokenize(string script) {
    parserFile = -1;

    //wrap the whole script in a single seed token at position 0
    Token seedToken = new Token(script, TokenType.Unknown, 0);
    TokenSet seedSet = new TokenSet();
    seedSet.Add(seedToken);
    DisplayHTMLParseStep(seedSet, "Starting set", true);

    //pull out tokens for comments, strings, escaped names, etc
    TokenSet delimited = TokenizeDelimited(seedToken);

    //break each remaining Unknown run apart on whitespace and punctuation
    TokenSet split = new TokenSet();
    foreach (Token current in delimited) {
        if (current.Type != TokenType.Unknown) {
            //already-classified tokens pass through untouched
            split.Add(current);
            continue;
        }
        //surround every operator/punctuation char with spaces so it splits out on its own
        string padded = current.Value;
        foreach (char symbol in (OPERATORS + ",.();").ToCharArray()) {
            padded = padded.Replace(symbol.ToString(), " " + symbol + " ");
        }
        int searchFrom = 0;
        foreach (string piece in padded.Split(" \t\r\n".ToCharArray())) {
            if (piece.Length == 0) {
                continue;
            }
            //find the piece in the ORIGINAL (unpadded) text to get its true offset
            searchFrom = current.Value.IndexOf(piece, searchFrom);
            split.Add(new Token(piece, TokenType.Unknown, current.StartIndex + searchFrom));
            searchFrom += piece.Length; //advance so a repeated piece isn't matched at the same spot
        }
    }

    //drop empty tokens and the bogus N marker that prefixes unicode string literals
    TokenSet finalTokens = new TokenSet();
    TokenEnumerator walker = split.GetEnumerator();
    while (walker.MoveNext()) {
        bool isEmpty = walker.Current.Value == "";
        bool isUnicodeMarker = walker.Current.Value == "N"
            && walker.Next != null
            && walker.Next.Type == TokenType.StringValue;
        if (!isEmpty && !isUnicodeMarker) {
            finalTokens.Add(walker.Current);
        }
    }
    DisplayHTMLParseStep(finalTokens, "After empty tokens removed", false);

    //comments gum things up
    RemoveComments(finalTokens);
    DisplayHTMLParseStep(finalTokens, "After comments removed", false);

    //Categorization
    IdentifySpecialTokens(finalTokens);
    DisplayHTMLParseStep(finalTokens, "After identifying special tokens", false);

    IdentifyRemainingTokens(finalTokens);
    DisplayHTMLParseStep(finalTokens, "After identifying remaining tokens", false);

    //associate the tokens with each other
    finalTokens = CreateTree(finalTokens);
    DisplayHTMLParseStep(finalTokens, "After tree creation", false);

    return finalTokens;
}