public static Token GetNextToken(Token token) { if (token == null) { throw new ArgumentNullException("token"); } return GetNextToken(token.Chars, token.CharsLength, token.EndIndex, token.EndState); }
public static Token GetNextToken(char[] chars, int length, int startIndex, int startState) { if (chars == null) { throw new ArgumentNullException("chars"); } if (startIndex >= length) { return null; } int state = startState; bool inScript = ((startState & HtmlTokenizerStates.ScriptState) != 0); int scriptState = (inScript ? HtmlTokenizerStates.ScriptState : 0); bool inStyle = ((startState & HtmlTokenizerStates.StyleState) != 0); int styleState = (inStyle ? HtmlTokenizerStates.StyleState : 0); bool hasRunAt = ((startState & HtmlTokenizerStates.RunAtState) != 0); int runAtState = (hasRunAt ? HtmlTokenizerStates.RunAtState : 0); bool hasRunAtServer = ((startState & HtmlTokenizerStates.RunAtServerState) != 0); int runAtServerState = (hasRunAtServer ? HtmlTokenizerStates.RunAtServerState : 0); int index = startIndex; int tokenStart = startIndex; // inclusive int tokenEnd = startIndex; // exclusive Token token = null; while ((token == null) && (index < length)) { char c = chars[index]; switch (state & 0xFF) { case HtmlTokenizerStates.Text: if (c == '<') { state = HtmlTokenizerStates.StartTag; tokenEnd = index; token = new Token(Token.TextToken, state, tokenStart, tokenEnd, chars, length); } break; case HtmlTokenizerStates.StartTag: if (c == '<') { if ((index + 1 < length) && (chars[index + 1] == '%')) { // Include the open bracket in a server-side script token state = HtmlTokenizerStates.ServerSideScript | scriptState | styleState; tokenStart = index; } else { state = HtmlTokenizerStates.ExpTag | scriptState | styleState; tokenEnd = index + 1; token = new Token(Token.OpenBracket, state, tokenStart, tokenEnd, chars, length); } } else { state = HtmlTokenizerStates.Error; } break; case HtmlTokenizerStates.ExpTag: if (c == '/') { state = HtmlTokenizerStates.ForwardSlash | scriptState | styleState; tokenEnd = index; token = new Token(Token.Empty, state, tokenStart, tokenEnd, chars, length); } else if (c == '!') { state = HtmlTokenizerStates.BeginCommentTag1 | scriptState | styleState; tokenStart = index; } else if (c == '%') { state = HtmlTokenizerStates.ServerSideScript; tokenStart = index; } else if (IsWordChar(c)) { // If we get a word char, go to the in tag state state = HtmlTokenizerStates.InTagName | scriptState | styleState; tokenStart = index; } else { state = HtmlTokenizerStates.Error; } break; case HtmlTokenizerStates.ServerSideScript: int endServerSideScriptIndex = IndexOf(chars, index, length, "%>"); if (endServerSideScriptIndex > -1) { state = HtmlTokenizerStates.Text; // Include the percent and close bracket in the server side script tokenEnd = endServerSideScriptIndex + 2; token = new Token(Token.InlineServerScript, state, tokenStart, tokenEnd, chars, length); } else { index = length; tokenEnd = index; } break; case HtmlTokenizerStates.ForwardSlash: if (c == '/') { state = HtmlTokenizerStates.ExpTagAfterSlash | scriptState | styleState; tokenEnd = index + 1; token = new Token(Token.ForwardSlash, state, tokenStart, tokenEnd, chars, length); } else { state = HtmlTokenizerStates.Error; } break; case HtmlTokenizerStates.ExpTagAfterSlash: if (IsWordChar(c)) { // If we get a word char, go to the in tag state state = HtmlTokenizerStates.InTagName | scriptState | styleState; tokenStart = index; } else { state = HtmlTokenizerStates.Error; } break; case HtmlTokenizerStates.InTagName: if (IsWhitespace(c)) { // If we hit whitespace, return a token state = HtmlTokenizerStates.ExpAttr; tokenEnd = index; string tagName = new String(chars, tokenStart, tokenEnd - tokenStart); if (tagName.ToLower().Equals("script")) { if (!inScript) { state |= HtmlTokenizerStates.ScriptState; } } else if (tagName.ToLower().Equals("style")) { if (!inStyle) { state |= HtmlTokenizerStates.StyleState; } } token = new Token(Token.TagName, state, tokenStart, tokenEnd, chars, length); } else if (c == '>') { state = HtmlTokenizerStates.EndTag; tokenEnd = index; string tagName = new String(chars, tokenStart, tokenEnd - tokenStart); if (tagName.ToLower().Equals("script")) { if (!inScript) { state |= HtmlTokenizerStates.ScriptState; } } else if (tagName.ToLower().Equals("style")) { if (!inStyle) { state |= HtmlTokenizerStates.StyleState; } } token = new Token(Token.TagName, state, tokenStart, tokenEnd, chars, length); } else if (IsWordChar(c)) { // Keep traversing if we get a word char } else if (c == '/') { state = HtmlTokenizerStates.SelfTerminating; tokenEnd = index; string tagName = new String(chars, tokenStart, tokenEnd - tokenStart); if (tagName.ToLower().Equals("script")) { if (!inScript) { state |= HtmlTokenizerStates.ScriptState; } } else if (tagName.ToLower().Equals("style")) { if (!inStyle) { state |= HtmlTokenizerStates.StyleState; } } token = new Token(Token.TagName, state, tokenStart, tokenEnd, chars, length); } else { state = HtmlTokenizerStates.Error; } break; case HtmlTokenizerStates.BeginCommentTag1: if (c == '-') { state = HtmlTokenizerStates.BeginCommentTag2; } else if (IsWordChar(c)) { // This will allow the tokenizer to recognize xml directives as normal tags state = HtmlTokenizerStates.XmlDirective; } else { state = HtmlTokenizerStates.Error; } break; case HtmlTokenizerStates.BeginCommentTag2: if (c == '-') { state = HtmlTokenizerStates.InCommentTag; } else { state = HtmlTokenizerStates.Error; } break; case HtmlTokenizerStates.InCommentTag: if (c == '-') { state = HtmlTokenizerStates.EndCommentTag1; } break; case HtmlTokenizerStates.EndCommentTag1: if (c == '-') { state = HtmlTokenizerStates.EndCommentTag2; } else { state = HtmlTokenizerStates.InCommentTag; } break; case HtmlTokenizerStates.EndCommentTag2: if (Char.IsWhiteSpace(c)) { } else if (c == '>') { state = HtmlTokenizerStates.EndTag; tokenEnd = index; token = new Token(Token.Comment, state, tokenStart, tokenEnd, chars, length); } else { state = HtmlTokenizerStates.InCommentTag; } break; case HtmlTokenizerStates.XmlDirective: if (c == '>') { state = HtmlTokenizerStates.EndTag; tokenEnd = index; token = new Token(Token.XmlDirective, state, tokenStart, tokenEnd, chars, length); } break; case HtmlTokenizerStates.ExpAttr: if (IsWordChar(c)) { state = HtmlTokenizerStates.InAttr | scriptState | styleState | runAtServerState; tokenEnd = index; token = new Token(Token.Whitespace, state, tokenStart, tokenEnd, chars, length); } else if (c == '>') { state = HtmlTokenizerStates.EndTag | scriptState | styleState | runAtServerState; tokenEnd = index; token = new Token(Token.Whitespace, state, tokenStart, tokenEnd, chars, length); } else if (c == '/') { state = HtmlTokenizerStates.SelfTerminating | scriptState | styleState; tokenEnd = index; token = new Token(Token.Whitespace, state, tokenStart, tokenEnd, chars, length); } else if (IsWhitespace(c)) { } else { state = HtmlTokenizerStates.Error; } break; case HtmlTokenizerStates.InAttr: if (IsWhitespace(c)) { // If we hit whitespace, return a token state = HtmlTokenizerStates.ExpEquals | scriptState | styleState | runAtServerState; tokenEnd = index; if (inScript) { // Check if this is a runat="server" script block if (new String(chars, tokenStart, tokenEnd - tokenStart).ToLower() == "runat") { state |= HtmlTokenizerStates.RunAtState; } } token = new Token(Token.AttrName, state, tokenStart, tokenEnd, chars, length); } else if (c == '=') { state = HtmlTokenizerStates.ExpEquals | scriptState | styleState | runAtServerState; tokenEnd = index; if (inScript) { // Check if this is a runat="server" script block if (new String(chars, tokenStart, tokenEnd - tokenStart).ToLower() == "runat") { state |= HtmlTokenizerStates.RunAtState; } } token = new Token(Token.AttrName, state, tokenStart, tokenEnd, chars, length); } else if (c == '>') { state = HtmlTokenizerStates.EndTag | scriptState | styleState | runAtServerState; tokenEnd = index; token = new Token(Token.AttrName, state, tokenStart, tokenEnd, chars, length); } else if (c == '/') { state = HtmlTokenizerStates.SelfTerminating | scriptState | styleState; tokenEnd = index; token = new Token(Token.AttrName, state, tokenStart, tokenEnd, chars, length); } else if (IsWordChar(c)) { // Keep traversing if we get a word char } else { state = HtmlTokenizerStates.Error; } break; case HtmlTokenizerStates.ExpEquals: if (c == '=') { state = HtmlTokenizerStates.ExpAttrVal | scriptState | styleState | runAtState | runAtServerState; tokenStart = index; tokenEnd = index + 1; token = new Token(Token.EqualsChar, state, tokenStart, tokenEnd, chars, length); } else if (c == '>') { state = HtmlTokenizerStates.EndTag | scriptState | styleState | runAtServerState; tokenEnd = index; token = new Token(Token.Whitespace, state, tokenStart, tokenEnd, chars, length); } else if (c == '/') { state = HtmlTokenizerStates.SelfTerminating; tokenEnd = index; token = new Token(Token.Whitespace, state, tokenStart, tokenEnd, chars, length); } else if (IsWordChar(c)) { state = HtmlTokenizerStates.InAttr | scriptState | styleState | runAtServerState; tokenEnd = index; token = new Token(Token.Whitespace, state, tokenStart, tokenEnd, chars, length); } else if (IsWhitespace(c)) { } else { state = HtmlTokenizerStates.Error; } break; case HtmlTokenizerStates.EqualsChar: if (c == '=') { state = HtmlTokenizerStates.ExpAttrVal | scriptState | styleState | runAtState | runAtServerState; tokenEnd = index + 1; token = new Token(Token.EqualsChar, state, tokenStart, tokenEnd, chars, length); } else { state = HtmlTokenizerStates.Error; } break; case HtmlTokenizerStates.ExpAttrVal: if (c == '\'') { state = HtmlTokenizerStates.BeginSingleQuote | scriptState | styleState | runAtState | runAtServerState; tokenEnd = index; token = new Token(Token.Whitespace, state, tokenStart, tokenEnd, chars, length); } else if (c == '\"') { state = HtmlTokenizerStates.BeginDoubleQuote | scriptState | styleState | runAtState | runAtServerState; tokenEnd = index; token = new Token(Token.Whitespace, state, tokenStart, tokenEnd, chars, length); } else if (IsWordChar(c)) { state = HtmlTokenizerStates.InAttrVal | scriptState | styleState | runAtState | runAtServerState; tokenEnd = index; token = new Token(Token.Whitespace, state, tokenStart, tokenEnd, chars, length); } else if (IsWhitespace(c)) { } else { state = HtmlTokenizerStates.Error; } break; case HtmlTokenizerStates.BeginDoubleQuote: if (c == '\"') { state = HtmlTokenizerStates.InDoubleQuoteAttrVal | scriptState | styleState | runAtState | runAtServerState; tokenEnd = index + 1; token = new Token(Token.DoubleQuote, state, tokenStart, tokenEnd, chars, length); } else { state = HtmlTokenizerStates.Error; } break; case HtmlTokenizerStates.InDoubleQuoteAttrVal: if (c == '\"') { state = HtmlTokenizerStates.EndDoubleQuote | scriptState | styleState | runAtServerState; tokenEnd = index; if ((hasRunAt) && (new String(chars, tokenStart, tokenEnd - tokenStart).ToLower() == "server")) { state |= HtmlTokenizerStates.RunAtServerState; } token = new Token(Token.AttrVal, state, tokenStart, tokenEnd, chars, length); } break; case HtmlTokenizerStates.EndDoubleQuote: if (c == '\"') { state = HtmlTokenizerStates.ExpAttr | scriptState | styleState | runAtServerState; tokenEnd = index + 1; token = new Token(Token.DoubleQuote, state, tokenStart, tokenEnd, chars, length); } else { state = HtmlTokenizerStates.Error; } break; case HtmlTokenizerStates.BeginSingleQuote: if (c == '\'') { state = HtmlTokenizerStates.InSingleQuoteAttrVal | scriptState | styleState | runAtState | runAtServerState; tokenEnd = index + 1; token = new Token(Token.SingleQuote, state, tokenStart, tokenEnd, chars, length); } else { state = HtmlTokenizerStates.Error; } break; case HtmlTokenizerStates.InSingleQuoteAttrVal: if (c == '\'') { state = HtmlTokenizerStates.EndSingleQuote | scriptState | styleState | runAtServerState; tokenEnd = index; if ((hasRunAt) && (new String(chars, tokenStart, tokenEnd - tokenStart).ToLower() == "server")) { state |= HtmlTokenizerStates.RunAtServerState; } token = new Token(Token.AttrVal, state, tokenStart, tokenEnd, chars, length); } break; case HtmlTokenizerStates.EndSingleQuote: if (c == '\'') { state = HtmlTokenizerStates.ExpAttr | scriptState | styleState | runAtServerState; tokenEnd = index + 1; token = new Token(Token.SingleQuote, state, tokenStart, tokenEnd, chars, length); } else { state = HtmlTokenizerStates.Error; } break; case HtmlTokenizerStates.InAttrVal: if (IsWhitespace(c)) { state = HtmlTokenizerStates.ExpAttr | scriptState | styleState | runAtServerState; tokenEnd = index; if ((hasRunAt) && (new String(chars, tokenStart, tokenEnd - tokenStart).ToLower() == "server")) { state |= HtmlTokenizerStates.RunAtServerState; } token = new Token(Token.AttrVal, state, tokenStart, tokenEnd, chars, length); } else if (c == '>') { state = HtmlTokenizerStates.EndTag | scriptState | styleState | runAtServerState; tokenEnd = index; if ((hasRunAt) && (new String(chars, tokenStart, tokenEnd - tokenStart).ToLower() == "server")) { state |= HtmlTokenizerStates.RunAtServerState; } token = new Token(Token.AttrVal, state, tokenStart, tokenEnd, chars, length); } else if (c == '/') { // This check fixes a bug when there's a forward slash in an attrval (since Trident likes to remove // double quotes from our attrvals if (((index + 1) < length) && (chars[index + 1] == '>')) { state = HtmlTokenizerStates.SelfTerminating | scriptState | styleState | runAtServerState; tokenEnd = index; if ((hasRunAt) && (new String(chars, tokenStart, tokenEnd - tokenStart).ToLower() == "server")) { state |= HtmlTokenizerStates.RunAtServerState; } token = new Token(Token.AttrVal, state, tokenStart, tokenEnd, chars, length); } } break; case HtmlTokenizerStates.SelfTerminating: if ((c == '/') && (index + 1 < length) && (chars[index + 1] == '>')) { state = HtmlTokenizerStates.Text; tokenEnd = index + 2; token = new Token(Token.SelfTerminating, state, tokenStart, tokenEnd, chars, length); } else { state = HtmlTokenizerStates.Error; } break; case HtmlTokenizerStates.EndTag: if (c == '>') { if (inScript) { state = HtmlTokenizerStates.Script | scriptState | styleState | runAtServerState; } else if (inStyle) { state = HtmlTokenizerStates.Style | scriptState | styleState; } else { state = HtmlTokenizerStates.Text; } tokenEnd = index + 1; token = new Token(Token.CloseBracket, state, tokenStart, tokenEnd, chars, length); } else { state = HtmlTokenizerStates.Error; } break; case HtmlTokenizerStates.Script: int endScriptIndex = IndexOf(chars, index, length, "</script>"); if (endScriptIndex > -1) { state = HtmlTokenizerStates.StartTag | scriptState | styleState | runAtServerState; tokenEnd = endScriptIndex; if (hasRunAtServer) { token = new Token(Token.ServerScriptBlock, state, tokenStart, tokenEnd, chars, length); } else { token = new Token(Token.ClientScriptBlock, state, tokenStart, tokenEnd, chars, length); } } else { index = length - 1; tokenEnd = index; } break; case HtmlTokenizerStates.Style: int endStyleIndex = IndexOf(chars, index, length, "</style>"); if (endStyleIndex > -1) { state = HtmlTokenizerStates.StartTag | scriptState | styleState; tokenEnd = endStyleIndex; token = new Token(Token.Style, state, tokenStart, tokenEnd, chars, length); } else { index = length - 1; tokenEnd = index; } break; case HtmlTokenizerStates.Error: if (c == '>') { state = HtmlTokenizerStates.EndTag; tokenEnd = index; token = new Token(Token.Error, state, tokenStart, tokenEnd, chars, length); } break; } index++; } if ((index >= length) && (token == null)) { int tokenType; // Some tokens can span multiple lines, so return a token if we haven't found one yet switch (state & 0xFF) { case HtmlTokenizerStates.Text: tokenType = Token.TextToken; break; case HtmlTokenizerStates.Script: if (hasRunAtServer) { tokenType = Token.ServerScriptBlock; } else { tokenType = Token.ClientScriptBlock; } break; case HtmlTokenizerStates.Style: tokenType = Token.Style; break; case HtmlTokenizerStates.ServerSideScript: tokenType = Token.InlineServerScript; break; case HtmlTokenizerStates.BeginCommentTag1: case HtmlTokenizerStates.BeginCommentTag2: case HtmlTokenizerStates.InCommentTag: case HtmlTokenizerStates.EndCommentTag1: case HtmlTokenizerStates.EndCommentTag2: tokenType = Token.Comment; break; default: tokenType = Token.Error; state = HtmlTokenizerStates.Error; break; } tokenEnd = index; token = new Token(tokenType, state, tokenStart, tokenEnd, chars, length); } return token; }