/// <summary>
/// Rewinds the per-tag scanning state so this iterator can be reused to look for the next
/// sibling instead of allocating a new iterator. The parent link is not touched.
/// </summary>
public void Reset()
{
    // Back to the initial tokenizer state.
    TokenizerState = TokenizerState.Default;

    // Any literal text found from now on begins at the current position.
    HtmlStart = Pos;

    // Forget the insertion mode and the element belonging to the previous tag.
    InsertionMode = InsertionMode.Default;
    Element = null;
}
/// <summary>
/// Builds a text node for the HTML between HtmlStart (the end of the previous tag) and the
/// current position. When WrapLiterals is set, the node is wrapped in a new "span" element.
/// </summary>
///
/// <param name="factory">
/// The HTML factory whose Html text is read.
/// </param>
/// <param name="literal">
/// [out] The node that was created (or its span wrapper); null when there is no text
/// between HtmlStart and the current position.
/// </param>
///
/// <returns>
/// true only when a node was created and this iterator has no parent, in which case the
/// tokenizer state is set to Finished and the caller receives the node. false when there
/// is no text, or when the node was appended to the parent's element (this iterator is then
/// Reset).
/// </returns>
public bool TryGetLiteral(HtmlElementFactory factory, out IDomObject literal)
{
    // No characters between the last tag end and here: nothing to create.
    if (Pos <= HtmlStart)
    {
        literal = null;
        return false;
    }

    // Choose the node type from the insertion mode that was active for this text.
    InsertionMode activeMode = InsertionMode;
    DomText textNode;
    if (activeMode == InsertionMode.Invalid)
    {
        textNode = new DomInvalidElement();
    }
    else if (activeMode == InsertionMode.Text)
    {
        // Text mode applies to a single literal only; it is switched off before the node is built.
        InsertionMode = InsertionMode.Default;
        textNode = new DomInnerText();
    }
    else
    {
        textNode = new DomText();
    }

    literal = textNode;
    literal.NodeValue = HtmlData.HtmlDecode(factory.Html.SubstringBetween(HtmlStart, Pos));

    if (WrapLiterals)
    {
        DomElement span = DomElement.Create("span");
        span.AppendChildUnsafe(literal);
        literal = span;
    }

    // Top level: hand the node back to the caller and stop.
    if (Parent == null)
    {
        TokenizerState = TokenizerState.Finished;
        return true;
    }

    // Nested: attach to the parent's element and get ready for the next sibling.
    ((DomElement)Parent.Element).AppendChildUnsafe(literal);
    Reset();
    return false;
}
/// <summary>
/// Feeds one character to the current processor of an ordered sequence of token processors.
/// When the current processor reports Success the sequence advances: if that was the last
/// processor the whole sequence succeeds; otherwise the next processor is reset and is given
/// the same character.
/// </summary>
/// <param name="c">The character being processed.</param>
/// <param name="fullExpression">The complete input; passed through to the processors.</param>
/// <param name="currentIndex">Index of <paramref name="c"/> in the input.</param>
/// <returns>
/// Success once the last processor has succeeded, Fail when the active processor fails,
/// Valid in every other case (including a Success reported by a processor that was started
/// on this very character).
/// </returns>
public TokenizerState ProcessChar(char c, string fullExpression, int currentIndex)
{
    TokenizerState outcome = _tokenProcessors[_current].ProcessChar(c, fullExpression, currentIndex);

    if (outcome == TokenizerState.Success)
    {
        _current += 1;
        if (_current == _tokenProcessors.Length)
        {
            // The last processor in the sequence has completed.
            return outcome;
        }

        // Record where the next part starts, relative to the first accepted character,
        // then let the next processor look at the same character.
        _startIndexes[_current] = currentIndex - _firstIndex;
        _tokenProcessors[_current].ResetState();
        outcome = _tokenProcessors[_current].ProcessChar(c, fullExpression, currentIndex);
    }

    if (outcome == TokenizerState.Fail)
    {
        return outcome;
    }

    // Remember the index of the first character accepted by the first processor.
    bool firstAccepted = _current == 0 && _firstIndex < 0;
    if (firstAccepted)
    {
        _firstIndex = currentIndex;
    }

    return TokenizerState.Valid;
}
/// <summary>
/// Reports whether the input may end while the tokenizer is in <paramref name="state"/>.
/// </summary>
/// <param name="token">Not read or modified by this method.</param>
/// <param name="state">The tokenizer state at the end of the input.</param>
/// <returns>
/// true for Indentation, Identifier, Number and NewLineCR; false for Begin and String.
/// </returns>
/// <exception cref="ArgumentException">The state is not one of the six listed above.</exception>
private bool TryFillTokenIfValidAtInputEnd(Token token, TokenizerState state)
{
    switch (state)
    {
        // States in which the input is allowed to stop.
        case TokenizerState.Indentation:
        case TokenizerState.Identifier:
        case TokenizerState.Number:
        case TokenizerState.NewLineCR:
            return true;

        // States in which the input is not allowed to stop.
        case TokenizerState.Begin:
        case TokenizerState.String:
            return false;

        default:
            throw new ArgumentException($"Unexpected state for Tokenizer: '{state}'");
    }
}
/// <summary>
/// Creates the partial exception used when the tokenizer finds itself in an invalid state.
/// The message contains the position, the offending character (in parentheses) and the state.
/// </summary>
/// <param name="c">The character at the failing position.</param>
/// <param name="index">The failing position.</param>
/// <param name="state">The state the tokenizer was in.</param>
/// <returns>The result of MakePartial for a new <see cref="TokenException"/>.</returns>
public static PartialExceptionWithContext<TokenException> IllegalStateAt(char c, int index, TokenizerState state)
{
    // Placeholder {1} is the index and {0} the character, so the text reads "position 12(x)".
    var failure = new TokenException(
        String.Format(
            "Tokenizer got invalid state at position {1}({0}). Tokenizer was in state {2}",
            c,
            index,
            state));

    return MakePartial(failure);
}
/// <summary>
/// Completes the token currently being built: stamps it with <paramref name="tokenType"/>,
/// adds it to the token list, and starts a new empty token of type Unknown with the
/// tokenizer back in its Default state.
/// </summary>
/// <param name="tokenType">The type to assign to the completed token.</param>
private void EndToken(TokenType tokenType)
{
    // Finish and store the token in progress.
    _currentToken.TokenType = tokenType;
    _tokens.Add(_currentToken);

    // Start over with a blank token.
    _currentToken = new Token(TokenType.Unknown, string.Empty);
    _currentTokenizerState = TokenizerState.Default;
}
/// <summary>
/// Creates a tokenizer that reads from <paramref name="reader"/>. The position starts at
/// line 1, character 1, and the tokenizer is in the NotStarted state.
/// </summary>
/// <param name="reader">The source of characters; must not be null.</param>
/// <exception cref="System.ArgumentNullException"><paramref name="reader"/> is null.</exception>
public XamlTokenizer(TextReader reader)
{
    // Reject a missing reader here instead of failing at the first read.
    if (reader == null)
    {
        throw new System.ArgumentNullException("reader");
    }

    this.reader = reader;
    lineNumber = 1;
    charPosition = 1;
    state = TokenizerState.NotStarted;
}
/// <summary>
/// Sends the contents of the string buffer to one of two destinations, chosen by testing
/// <paramref name="returnState"/> against DATA_AND_RCDATA_MASK: appended to the long string
/// buffer when no masked bit is set, emitted otherwise.
/// </summary>
/// <param name="returnState">The state the tokenizer will return to.</param>
void EmitOrAppendStrBuf(TokenizerState returnState)
{
    // NOTE(review): an earlier, commented-out form of this test compared with "!= 0" in front
    // of the append branch; confirm the mask value matches the comparison used here.
    bool anyMaskedBitSet = ((byte)returnState & DATA_AND_RCDATA_MASK) != 0;

    if (anyMaskedBitSet)
    {
        EmitStrBuf();
    }
    else
    {
        AppendStrBufToLongStrBuf();
    }
}
/// <summary>
/// Builds an action that pops the top of a state stack and then pushes
/// <paramref name="state"/> onto it <paramref name="pushCount"/> times.
/// </summary>
/// <param name="state">The state to push.</param>
/// <param name="pushCount">How many copies to push; must be greater than zero.</param>
/// <returns>The stack-rewriting action.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="pushCount"/> is zero or negative.</exception>
internal static Action<Stack<TokenizerState>> ReplaceState(TokenizerState state, short pushCount)
{
    if (pushCount <= 0)
    {
        throw new ArgumentOutOfRangeException("pushCount", pushCount, "Must be a positive number.");
    }

    return stack =>
    {
        stack.Pop();

        int remaining = pushCount;
        while (remaining > 0)
        {
            stack.Push(state);
            remaining--;
        }
    };
}
/// <summary>
/// Sets _state from the two classification checks: Literal() wins, then Seperator(),
/// otherwise the state is Normal. Seperator() is only called when Literal() returned false.
/// </summary>
private void DetermineState()
{
    if (Literal())
    {
        _state = TokenizerState.Literal;
        return;
    }

    _state = Seperator() ? TokenizerState.Seperator : TokenizerState.Normal;
}
/// <summary>
/// Passes one character to the wrapped token processor until the processor first reports
/// anything other than Valid. From then on every call returns Fail without consulting the
/// processor.
/// </summary>
/// <param name="c">The character being processed.</param>
/// <param name="fullExpression">The complete input; passed through to the processor.</param>
/// <param name="currentIndex">Index of <paramref name="c"/> in the input.</param>
/// <returns>The processor's result, or Fail once this feed has been invalidated.</returns>
public TokenizerState Feed(char c, string fullExpression, int currentIndex)
{
    if (_stillValid)
    {
        TokenizerState result = _tokenProcessor.ProcessChar(c, fullExpression, currentIndex);

        // Success and Fail both end this feed; only Valid keeps it alive.
        if (result != TokenizerState.Valid)
        {
            _stillValid = false;
        }

        return result;
    }

    return TokenizerState.Fail;
}
/// <summary>
/// Closes out this element and yields whatever has become complete at the top of the
/// hierarchy: first the pending literal text (when TryGetLiteral returns true), then the
/// parent's element when the parent itself has no parent. When there is a parent it is
/// reset to the current position and this iterator is marked Finished.
/// </summary>
///
/// <param name="factory">
/// The HTML factory to operate against.
/// </param>
///
/// <returns>
/// A lazily evaluated sequence of the objects that can be yielded; it may be empty. Because
/// this is an iterator, none of the work happens until the sequence is enumerated.
/// </returns>
public IEnumerable<IDomObject> CloseElement(HtmlElementFactory factory)
{
    IDomObject pendingText;
    if (TryGetLiteral(factory, out pendingText))
    {
        yield return pendingText;
    }

    // Nothing further to close when there is no parent.
    if (Parent == null)
    {
        yield break;
    }

    // A parent without a parent of its own is a top-level element and can be yielded.
    if (Parent.Parent == null)
    {
        yield return Parent.Element;
    }

    Parent.Reset(Pos);
    TokenizerState = TokenizerState.Finished;
}
/// <summary>
/// Maps a tokenizer state that is not valid in the calling context to the matching parser
/// exception.
/// </summary>
/// <param name="tokenizerState">The state the tokenizer was left in.</param>
/// <param name="batchSource">The batch source handed to the exception.</param>
/// <returns>
/// An <see cref="UnclosedStringLiteralException"/> for either string state, an
/// <see cref="UnclosedBlockCommentException"/> for the block-comment state, and a plain
/// <see cref="ParserException"/> naming the state otherwise.
/// </returns>
internal static ParserException CreateInvalidTokenizerStateException(
    TokenizerState tokenizerState,
    IBatchSource batchSource)
{
    bool insideString = tokenizerState == TokenizerState.SingleQuoteString
                        || tokenizerState == TokenizerState.DoubleQuoteString;
    if (insideString)
    {
        return new UnclosedStringLiteralException(tokenizerState, batchSource);
    }

    if (tokenizerState == TokenizerState.BlockComment)
    {
        return new UnclosedBlockCommentException(tokenizerState, batchSource);
    }

    return new ParserException($"Unexpected state {tokenizerState}", batchSource);
}
/// <summary>
/// Tries to parse a connection string of the form <c>name=value;name=value</c> into a new
/// instance of <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">The type to fill; it must have a public parameterless constructor.</typeparam>
/// <param name="connectionString">The connection string.</param>
/// <param name="args">The tokenizer arguments (case sensitivity, handling of unknown items).</param>
/// <param name="to">
/// [out] The instance taken from the tokenizer state on success; null on failure.
/// </param>
/// <returns>
/// true when every item was either evaluated or skipped; false when an item does not split
/// into exactly a name and a value, or names an unknown token, while
/// <c>args.SkipUnknownNamedItems</c> is off.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="connectionString"/> or <paramref name="args"/> is null.
/// </exception>
public static bool TryParseConnectionString<T>(string connectionString, TokenizerArgs args, out T to)
    where T : class, new()
{
    if (connectionString == null)
    {
        throw new ArgumentNullException("connectionString");
    }

    if (args == null)
    {
        throw new ArgumentNullException("args");
    }

    to = null;

    // Loop-invariant inputs of GetWords, created once instead of once per item.
    string[] quotePairs = new string[] { "\"\"", "\'\'" };
    char[] itemSeparators = ";".ToCharArray();
    char[] nameValueSeparators = "=".ToCharArray();

    using (TokenizerState<T> state = NewState<T>(args))
    {
        IList<string> groups = GetWords(connectionString, quotePairs, '\0', EscapeMode.DoubleItem, itemSeparators);

        foreach (string group in groups)
        {
            IList<string> parts = GetWords(group, quotePairs, '\0', EscapeMode.DoubleItem, nameValueSeparators);

            TokenItem token;
            if ((parts.Count == 2) && state.Definition.TryGetToken(parts[0], args.CaseSensitive, out token))
            {
                token.Evaluate(parts[1], state);
                continue;
            }

            // Malformed items and unknown names are handled alike: skipped when allowed,
            // otherwise the whole parse fails.
            if (!args.SkipUnknownNamedItems)
            {
                return false;
            }
        }

        // TODO: Parse connectionstring using definition
        to = state.Instance;
        return true;
    }
}
/// <summary>
/// Produces the end-of-file error text for the state the tokenizer stopped in.
/// </summary>
/// <param name="state">The tokenizer state at end of file.</param>
/// <returns>
/// A specific "unclosed ..." message for the block-comment and the two string states, and a
/// generic message containing the state name for anything else.
/// </returns>
private static string FormatTokenizerStateError(TokenizerState state)
{
    if (state == TokenizerState.BlockComment)
    {
        return "Unclosed block comment at end of file";
    }

    if (state == TokenizerState.DoubleQuoteString)
    {
        return "Unclosed double-quote string at end of file";
    }

    if (state == TokenizerState.SingleQuoteString)
    {
        return "Unclosed single-quote string at end of file";
    }

    return $"Unexpected state at end of file (should not be an error): {state}";
}
/// <summary>
/// Feeds one character to every alternative token processor and combines their results.
/// All processors are always called, whatever the earlier ones returned.
/// </summary>
/// <param name="c">The character being processed.</param>
/// <param name="fullExpression">The complete input; passed through to the processors.</param>
/// <param name="currentIndex">Index of <paramref name="c"/> in the input.</param>
/// <returns>
/// Success if at least one processor reported Success; otherwise Valid if at least one
/// reported Valid; otherwise Fail.
/// </returns>
public TokenizerState ProcessChar(char c, string fullExpression, int currentIndex)
{
    bool anySucceeded = false;
    bool anyStillValid = false;

    foreach (ITokenProcessor alternative in _tokenProcessors)
    {
        switch (alternative.ProcessChar(c, fullExpression, currentIndex))
        {
            case TokenizerState.Success:
                anySucceeded = true;
                break;

            case TokenizerState.Valid:
                anyStillValid = true;
                break;
        }
    }

    if (anySucceeded)
    {
        return TokenizerState.Success;
    }

    return anyStillValid ? TokenizerState.Valid : TokenizerState.Fail;
}
/// <summary>
/// Tries to fill a new instance of <typeparamref name="T"/> from the entries of a
/// <see cref="NameValueCollection"/>, treating each key as a token name.
/// </summary>
/// <typeparam name="T">The type to fill; it must have a public parameterless constructor.</typeparam>
/// <param name="collection">The names and values to evaluate.</param>
/// <param name="args">The tokenizer arguments (case sensitivity, handling of unknown items).</param>
/// <param name="to">[out] The instance taken from the tokenizer state on success; null on failure.</param>
/// <returns>
/// true when every entry was evaluated or skipped; false at the first unknown key when
/// <c>args.SkipUnknownNamedItems</c> is off.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="collection"/> or <paramref name="args"/> is null.
/// </exception>
public static bool TryParseNameValueCollection<T>(NameValueCollection collection, TokenizerArgs args, out T to)
    where T : class, new()
{
    if (collection == null)
    {
        throw new ArgumentNullException("collection");
    }

    if (args == null)
    {
        throw new ArgumentNullException("args");
    }

    to = null;

    using (TokenizerState<T> state = NewState<T>(args))
    {
        int index = 0;
        while (index < collection.Count)
        {
            TokenItem item;
            bool known = state.Definition.TryGetToken(collection.Keys[index], args.CaseSensitive, out item);

            if (known)
            {
                item.Evaluate(collection[index], state);
            }
            else if (!args.SkipUnknownNamedItems)
            {
                // Unknown names are fatal unless the caller asked to skip them.
                return false;
            }

            index++;
        }

        to = state.Instance;
        return true;
    }
}
/// <summary>
/// Tries to fill a new instance of <typeparamref name="T"/> from the entries of a string
/// dictionary, treating each key as a token name.
/// </summary>
/// <typeparam name="T">The type to fill; it must have a public parameterless constructor.</typeparam>
/// <param name="collection">The names and values to evaluate.</param>
/// <param name="args">The tokenizer arguments (case sensitivity, handling of unknown items).</param>
/// <param name="to">[out] The instance taken from the tokenizer state on success; null on failure.</param>
/// <returns>
/// true when every entry was evaluated or skipped; false at the first unknown key when
/// <c>args.SkipUnknownNamedItems</c> is off.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="collection"/> or <paramref name="args"/> is null.
/// </exception>
public static bool TryParseNameValueCollection<T>(IDictionary<string, string> collection, TokenizerArgs args, out T to)
    where T : class, new()
{
    if (collection == null)
    {
        throw new ArgumentNullException("collection");
    }

    if (args == null)
    {
        throw new ArgumentNullException("args");
    }

    to = null;

    using (TokenizerState<T> state = NewState<T>(args))
    {
        foreach (KeyValuePair<string, string> entry in collection)
        {
            TokenItem item;
            bool known = state.Definition.TryGetToken(entry.Key, args.CaseSensitive, out item);

            if (known)
            {
                item.Evaluate(entry.Value, state);
            }
            else if (!args.SkipUnknownNamedItems)
            {
                // Unknown names are fatal unless the caller asked to skip them.
                return false;
            }
        }

        to = state.Instance;
        return true;
    }
}
/// <summary>
/// Lazily splits <c>value</c>, starting at <c>position</c>, into Key and Value tokens.
/// '=' ends a key and starts its value, '&amp;' ends a key/value pair and starts the next key,
/// and '/' ends tokenization. Every Key token is followed by a Value token; when a key has
/// no '=' the Value token is created with a count of 0.
/// </summary>
/// <remarks>
/// <c>position</c> and <c>currentState</c> are not declared in this method, so enumerating
/// the result advances state held outside it. Because this is an iterator, tokens produced
/// before a malformed spot are yielded before the exception is thrown.
/// NOTE(review): the starting value of <c>currentState</c> is set elsewhere; presumably
/// ReadyToReadKey - confirm.
/// </remarks>
/// <returns>The tokens in input order.</returns>
/// <exception cref="FormatException">
/// '=' or '&amp;' appears where a key should start, '=' appears inside a value, or the input
/// ends where a key should start.
/// </exception>
/// <exception cref="NotSupportedException"><c>currentState</c> holds a state not handled here.</exception>
public IEnumerable <Token> GetTokens()
{
    // Nothing left to read: produce an empty sequence.
    if (position >= value.Length)
    {
        yield break;
    }

    // Number of characters accumulated for the token currently being read.
    int readCount = 0;
    bool readCompleted = false;
    string errorMessage = null;

    while (!readCompleted)
    {
        switch (currentState)
        {
            // Expecting the first character of a key.
            case TokenizerState.ReadyToReadKey:
            {
                if (position >= value.Length)
                {
                    // The input ended where a key had to start.
                    errorMessage = "Unexpected string end in '{0}' state.".FormatInvariant(currentState);
                    currentState = TokenizerState.Error;
                    break;
                }
                char currentChar = value[position];
                switch (currentChar)
                {
                    case '=':
                    case '&':
                        // A key cannot be empty.
                        errorMessage = "Unexpected character '{0}' in '{1}' state.".FormatInvariant(currentChar, currentState);
                        currentState = TokenizerState.Error;
                        break;
                    case '/':
                        currentState = TokenizerState.Finish;
                        break;
                    default:
                        // First character of a key.
                        readCount++;
                        currentState = TokenizerState.ReadKey;
                        break;
                }
                break;
            }

            // Inside a key.
            case TokenizerState.ReadKey:
            {
                if (position >= value.Length)
                {
                    // End of input inside a key: emit the key with a zero-count value.
                    // presumably CreateToken's second argument is the token length - TODO confirm
                    yield return(CreateToken(TokenType.Key, readCount));
                    yield return(CreateToken(TokenType.Value, 0));
                    readCount = 0;
                    currentState = TokenizerState.Finish;
                    break;
                }
                char currentChar = value[position];
                switch (currentChar)
                {
                    case '=':
                        // Key complete; its value follows.
                        yield return(CreateToken(TokenType.Key, readCount));
                        readCount = 0;
                        currentState = TokenizerState.ReadValue;
                        break;
                    case '&':
                        // Key without '=': pair it with a zero-count value and start the next key.
                        yield return(CreateToken(TokenType.Key, readCount));
                        yield return(CreateToken(TokenType.Value, 0));
                        readCount = 0;
                        currentState = TokenizerState.ReadyToReadKey;
                        break;
                    case '/':
                        // Key without '=' followed by the terminator.
                        yield return(CreateToken(TokenType.Key, readCount));
                        yield return(CreateToken(TokenType.Value, 0));
                        readCount = 0;
                        currentState = TokenizerState.Finish;
                        break;
                    default:
                        readCount++;
                        break;
                }
                break;
            }

            // Inside a value (may have a count of 0, e.g. directly after '=').
            case TokenizerState.ReadValue:
            {
                if (position >= value.Length)
                {
                    yield return(CreateToken(TokenType.Value, readCount));
                    readCount = 0;
                    currentState = TokenizerState.Finish;
                    break;
                }
                char currentChar = value[position];
                switch (currentChar)
                {
                    case '=':
                        // A second '=' inside one pair is rejected.
                        errorMessage = "Unexpected character '{0}' in '{1}' state.".FormatInvariant(currentChar, currentState);
                        currentState = TokenizerState.Error;
                        break;
                    case '&':
                        yield return(CreateToken(TokenType.Value, readCount));
                        readCount = 0;
                        currentState = TokenizerState.ReadyToReadKey;
                        break;
                    case '/':
                        yield return(CreateToken(TokenType.Value, readCount));
                        readCount = 0;
                        currentState = TokenizerState.Finish;
                        break;
                    default:
                        readCount++;
                        break;
                }
                break;
            }

            // Both terminal states end the loop; Error is turned into an exception below.
            case TokenizerState.Finish:
            case TokenizerState.Error:
                readCompleted = true;
                break;
            default:
                throw new NotSupportedException();
        }
        // Runs after every iteration, including the end-of-input iterations and the one that
        // handles Finish/Error, so position ends up past the last character that was examined.
        position++;
    }

    if (currentState == TokenizerState.Error)
    {
        throw new FormatException(errorMessage);
    }
}
/// <summary>
/// Reads characters from the underlying reader until a token can be returned.
/// </summary>
/// <remarks>
/// Only the NotStarted and EndOfStream states are handled. In NotStarted, '=', '&lt;' and '&gt;'
/// produce a Terminal token, the end of the input switches to EndOfStream, white space
/// switches to HeadingWhitespaces, and any other character is skipped. A state without
/// handling used to make this method loop forever without reading anything; it now throws.
/// </remarks>
/// <returns>
/// A Terminal token for '=', '&lt;' or '&gt;', or an EndOfStream token once the reader is
/// exhausted.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The tokenizer is in a state for which no handling is implemented.
/// </exception>
public XamlToken ReadNextToken()
{
    while (true)
    {
        switch (state)
        {
            case TokenizerState.EndOfStream:
            {
                return new XamlToken(XamlTokenType.EndOfStream, lineNumber, charPosition);
            }

            case TokenizerState.NotStarted:
            {
                var current = ReadNextChar();

                // -1 signals that the reader has no more characters.
                if (-1 == current)
                {
                    state = TokenizerState.EndOfStream;
                    continue;
                }

                if (Char.IsWhiteSpace((char) current))
                {
                    state = TokenizerState.HeadingWhitespaces;
                    continue;
                }

                switch (current)
                {
                    case '=':
                    case '<':
                    case '>':
                    {
                        state = TokenizerState.Terminal;
                        return new XamlToken(XamlTokenType.Terminal, ((char) current).ToString(), lineNumber, charPosition);
                    }
                }

                // Any other character is skipped; the loop reads the next one.
                break;
            }

            default:
            {
                // No case consumes input for this state, so looping again could never make
                // progress. Fail fast rather than spin.
                throw new InvalidOperationException(
                    "No token handling is implemented for tokenizer state " + state + ".");
            }
        }
    }
}
/// <summary>
/// Copies the tokenizer state of <paramref name="other"/> into this instance. Both character
/// buffers are copied (growing the local arrays when they are too small), scalar fields and
/// string identifiers are assigned directly, and the tag name, attribute name and attributes
/// are cloned. <c>shouldSuspend</c> is always cleared rather than copied.
/// </summary>
/// <param name="other">The tokenizer whose state is loaded.</param>
public void LoadState(Tokenizer other)
{
    // Short string buffer. Buffer.BlockCopy counts bytes, hence length << 1 (two bytes per char).
    strBufLen = other.strBufLen;
    if (strBuf.Length < strBufLen)
    {
        strBuf = new char[strBufLen];
    }
    Buffer.BlockCopy(other.strBuf, 0, strBuf, 0, strBufLen << 1);

    // Long string buffer, handled the same way.
    longStrBufLen = other.longStrBufLen;
    if (longStrBuf.Length < longStrBufLen)
    {
        longStrBuf = new char[longStrBufLen];
    }
    Buffer.BlockCopy(other.longStrBuf, 0, longStrBuf, 0, longStrBufLen << 1);

    // Plain field-by-field copies. The line number is deliberately not copied
    // (the original carries the note "line = 1; XXX line numbers").
    stateSave = other.stateSave;
    returnStateSave = other.returnStateSave;
    endTagExpectation = other.endTagExpectation;
    endTagExpectationAsArray = other.endTagExpectationAsArray;
    lastCR = other.lastCR;
    index = other.index;
    forceQuirks = other.forceQuirks;
    additional = other.additional;
    entCol = other.entCol;
    firstCharKey = other.firstCharKey;
    lo = other.lo;
    hi = other.hi;
    candidate = other.candidate;
    strBufMark = other.strBufMark;
    prevValue = other.prevValue;
    value = other.value;
    seenDigits = other.seenDigits;
    endTag = other.endTag;

    shouldSuspend = false;

    // Copied by reference; a null source simply yields null here.
    doctypeName = other.doctypeName;
    systemIdentifier = other.systemIdentifier;
    publicIdentifier = other.publicIdentifier;

    // These are cloned instead of shared; null stays null.
    tagName = other.tagName == null ? null : other.tagName.CloneElementName();
    attributeName = other.attributeName == null ? null : other.attributeName.CloneAttributeName();
    attributes = other.attributes == null ? null : other.attributes.CloneAttributes();
}
/// <summary>
/// Lazily converts a character stream into base tokens: a minus token for '-', a text token
/// for a double-quoted string, and content tokens for "(...)" and "[...]" groups.
/// </summary>
/// <remarks>
/// Outside of a quoted string or bracket group, white space and every other unrecognised
/// character are ignored. Inside a quoted string a backslash makes the following character
/// part of the text verbatim. Bracket groups do not nest: the first closing bracket ends the
/// group. Because this is an iterator, the end-of-input error is raised only once the
/// enumeration reaches the end of the characters.
/// </remarks>
/// <param name="charEnumer">The characters to tokenize; advanced by this method, not disposed.</param>
/// <returns>An enumerator over the tokens in input order.</returns>
/// <exception cref="UnexpectedEndOfInputError">
/// The input ends inside a quoted string, right after a backslash, or inside a bracket group.
/// </exception>
public IEnumerator<BaseToken> GetBaseTokens(IEnumerator<char> charEnumer)
{
    TokenizerState state = TokenizerState.ReadNothing;
    string currentToken = "";

    while (charEnumer.MoveNext())
    {
        char c = charEnumer.Current;
        switch (state)
        {
            case TokenizerState.ReadNothing:
            {
                if (c == '-')
                {
                    yield return new BaseToken(BaseTokenType.minus, "-");
                }
                else if (c == '"')
                {
                    state = TokenizerState.ReadQuote;
                }
                else if (c == '(')
                {
                    state = TokenizerState.ReadRound;
                }
                else if (c == '[')
                {
                    state = TokenizerState.ReadSquare;
                }

                // White space and any other character are skipped.
                break;
            }

            case TokenizerState.ReadQuote:
            {
                if (c == '\\')
                {
                    state = TokenizerState.ReadSlashInText;
                }
                else if (c == '"')
                {
                    yield return new BaseToken(BaseTokenType.text, currentToken);
                    currentToken = "";
                    state = TokenizerState.ReadNothing;
                }
                else
                {
                    currentToken += c;
                }
                break;
            }

            case TokenizerState.ReadSlashInText:
            {
                // The escaped character is taken literally, whatever it is.
                currentToken += c;
                state = TokenizerState.ReadQuote;
                break;
            }

            case TokenizerState.ReadRound:
            {
                if (c == ')')
                {
                    yield return new BaseToken(BaseTokenType.rndBrktContent, currentToken);
                    currentToken = "";
                    state = TokenizerState.ReadNothing;
                }
                else
                {
                    currentToken += c;
                }
                break;
            }

            case TokenizerState.ReadSquare:
            {
                if (c == ']')
                {
                    yield return new BaseToken(BaseTokenType.sqrBrktContent, currentToken);
                    currentToken = "";
                    state = TokenizerState.ReadNothing;
                }
                else
                {
                    currentToken += c;
                }
                break;
            }
        }
    }

    // The input is exhausted; any state other than ReadNothing means a construct was left open.
    switch (state)
    {
        case TokenizerState.ReadQuote:
            throw new UnexpectedEndOfInputError("Met end of input while reading text, expected (\")");

        case TokenizerState.ReadSlashInText:
            throw new UnexpectedEndOfInputError("Met end of input after reading \"\\\", expected symbol");

        case TokenizerState.ReadRound:
            throw new UnexpectedEndOfInputError("Met end of input while reading round bracket content, expected \")\"");

        case TokenizerState.ReadSquare:
            throw new UnexpectedEndOfInputError("Met end of input while reading square bracket content, expected \"]\"");

        case TokenizerState.ReadNothing:
            break;
    }
}
/// <summary>
/// Turns the numeric character reference accumulated in <c>value</c> into one or two UTF-16
/// code units and passes them to EmitOrAppendOne / EmitOrAppendTwo together with
/// <paramref name="returnState"/>. Problematic code points are reported through the Err*
/// helpers and, where the code does so below, replaced before being passed on.
/// </summary>
/// <param name="returnState">Passed unchanged to the EmitOrAppend* helpers.</param>
private void HandleNcrValue(TokenizerState returnState)
{
    /*
     * If one or more characters match the range, then take them all and
     * interpret the string of characters as a number (either hexadecimal or
     * decimal as appropriate).
     */
    // Values that fit in a single UTF-16 code unit.
    if (value <= 0xFFFF)
    {
        if (value >= 0x80 && value <= 0x9f)
        {
            /*
             * If that number is one of the numbers in the first column of
             * the following table, then this is a parse error.
             */
            ErrNcrInC1Range();
            /*
             * Find the row with that number in the first column, and return
             * a character token for the Unicode character given in the
             * second column of that row.
             */
            // The replacement is looked up in the WINDOWS_1252 table, indexed from 0x80.
            char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
            EmitOrAppendOne(val, returnState);
            // [NOCPP[
        }
        else if (value == 0xC && contentSpacePolicy != XmlViolationPolicy.Allow)
        {
            // Form feed, with a content-space policy other than Allow. If the policy is
            // neither AlterInfoset nor Fatal, nothing is emitted or appended for it.
            if (contentSpacePolicy == XmlViolationPolicy.AlterInfoset)
            {
                EmitOrAppendOne(SPACE, returnState);
            }
            else if (contentSpacePolicy == XmlViolationPolicy.Fatal)
            {
                Fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
            }
            // ]NOCPP]
        }
        else if (value == 0x0)
        {
            // A reference to NUL is replaced.
            ErrNcrZero();
            EmitOrAppendOne(REPLACEMENT_CHARACTER, returnState);
        }
        else if ((value & 0xF800) == 0xD800)
        {
            // 0xD800..0xDFFF (surrogate code points) are replaced.
            ErrNcrSurrogate();
            EmitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
        }
        else
        {
            /*
             * Otherwise, return a character token for the Unicode character
             * whose code point is that number.
             */
            char ch = (char)value;
            // [NOCPP[
            // Only the two calls that assign to ch can change the character that is output;
            // the other checks just report.
            if (value == 0x0D)
            {
                ErrNcrCr();
            }
            else if ((value <= 0x0008) || (value == 0x000B) || (value >= 0x000E && value <= 0x001F))
            {
                ch = ErrNcrControlChar(ch);
            }
            else if (value >= 0xFDD0 && value <= 0xFDEF)
            {
                ErrNcrUnassigned();
            }
            else if ((value & 0xFFFE) == 0xFFFE)
            {
                ch = ErrNcrNonCharacter(ch);
            }
            else if (value >= 0x007F && value <= 0x009F)
            {
                // 0x80..0x9F was already handled by the first branch above, so only 0x7F
                // can reach this test. This overload takes no argument and leaves ch as is.
                ErrNcrControlChar();
            }
            else
            {
                MaybeWarnPrivateUse(ch);
            }
            // ]NOCPP]
            bmpChar[0] = ch;
            EmitOrAppendOne(bmpChar, returnState);
        }
    }
    else if (value <= 0x10FFFF)
    {
        // Values above 0xFFFF are written as two code units into astralChar.
        // [NOCPP[
        MaybeWarnPrivateUseAstral();
        if ((value & 0xFFFE) == 0xFFFE)
        {
            ErrAstralNonCharacter(value);
        }
        // ]NOCPP]
        // presumably LEAD_OFFSET is 0xD800 - (0x10000 >> 10), giving the lead surrogate - TODO confirm
        astralChar[0] = (char)(LEAD_OFFSET + (value >> 10));
        astralChar[1] = (char)(0xDC00 + (value & 0x3FF));
        EmitOrAppendTwo(astralChar, returnState);
    }
    else
    {
        // Above 0x10FFFF: report and replace.
        ErrNcrOutOfRange();
        EmitOrAppendOne(REPLACEMENT_CHARACTER, returnState);
    }
}
/// <summary>
/// Sends the first character of <paramref name="val"/> to one of two destinations, chosen by
/// testing <paramref name="returnState"/> against DATA_AND_RCDATA_MASK: appended to the long
/// string buffer when no masked bit is set, reported to the token handler as character data
/// otherwise.
/// </summary>
/// <param name="val">Array whose element 0 is the character to output.</param>
/// <param name="returnState">The state the tokenizer will return to.</param>
private void EmitOrAppendOne(char[] val, TokenizerState returnState)
{
    // NOTE(review): an earlier, commented-out form of this test compared with "!= 0" in front
    // of the append branch; confirm the mask value matches the comparison used here.
    bool anyMaskedBitSet = ((byte)returnState & DATA_AND_RCDATA_MASK) != 0;

    if (anyMaskedBitSet)
    {
        TokenHandler.Characters(val, 0, 1);
    }
    else
    {
        AppendLongStrBuf(val[0]);
    }
}
void StateLoop3_ScriptData(TokenizerState state, TokenizerState returnState) { /* * Idioms used in this code: * * * Consuming the next input character * * To consume the next input character, the code does this: if (++pos == * endPos) { goto breakStateloop; } c = buf[pos]; * * * Staying in a state * * When there's a state that the tokenizer may stay in over multiple * input characters, the state has a wrapper |for(;;)| loop and staying * in the state continues the loop. * * * Switching to another state * * To switch to another state, the code sets the state variable to the * magic number of the new state. Then it either continues stateloop or * breaks out of the state's own wrapper loop if the target state is * right after the current state in source order. (This is a partial * workaround for Java's lack of goto.) * * * Reconsume support * * The spec sometimes says that an input character is reconsumed in * another state. If a state can ever be entered so that an input * character can be reconsumed in it, the state's code starts with an * |if (reconsume)| that sets reconsume to false and skips over the * normal code for consuming a new character. * * To reconsume the current character in another state, the code sets * |reconsume| to true and then switches to the other state. * * * Emitting character tokens * * This method emits character tokens lazily. Whenever a new range of * character tokens starts, the field cstart must be set to the start * index of the range. The flushChars() method must be called at the end * of a range to flush it. * * * U+0000 handling * * The various states have to handle the replacement of U+0000 with * U+FFFD. However, if U+0000 would be reconsumed in another state, the * replacement doesn't need to happen, because it's handled by the * reconsuming state. * * * LF handling * * Every state needs to increment the line number upon LF unless the LF * gets reconsumed by another state which increments the line number. 
* * * CR handling * * Every state needs to handle CR unless the CR gets reconsumed and is * handled by the reconsuming state. The CR needs to be handled as if it * were and LF, the lastCR field must be set to true and then this * method must return. The IO driver will then swallow the next * character if it is an LF to coalesce CRLF. */ /* * As there is no support for labeled loops in C#, instead of break <loop>; * the port uses goto break<loop>; and a label after the loop. * Instead of continue <loop>; it uses goto continue<loop>; and a label * at the beginning or end of the loop (which doesn't matter in for(;;) loops) */ /*stateloop:*/ for (; ; ) { //************* continueStateloop: //************* switch (state) { // XXX reorder point case TokenizerState.s06_SCRIPT_DATA: /*scriptdataloop:*/ { char c; while (reader.ReadNext(out c)) { switch (c) { case '<': /* * U+003C LESS-THAN SIGN (<) Switch to the * script data less-than sign state. */ FlushChars(); returnState = state; //state = Transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos); state = TokenizerState.s17_SCRIPT_DATA_LESS_THAN_SIGN; goto breakScriptdataloop; // FALL THRU continue // stateloop; case '\u0000': EmitReplacementCharacter(); continue; case '\r': EmitCarriageReturn(); goto breakStateloop; case '\n': default: /* * Anything else Emit the current input * character as a character token. Stay in the * script data state. */ continue; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakScriptdataloop: goto case TokenizerState.s17_SCRIPT_DATA_LESS_THAN_SIGN; } // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.s17_SCRIPT_DATA_LESS_THAN_SIGN: /*scriptdatalessthansignloop:*/ { char c; while (reader.ReadNext(out c)) { switch (c) { case '/': /* * U+002F SOLIDUS (/) Set the temporary buffer * to the empty string. Switch to the script * data end tag open state. 
*/ index = 0; ClearStrBuf(); //state = Transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); state = TokenizerState.NON_DATA_END_TAG_NAME; goto continueStateloop; case '!': TokenListener.Characters(LT_GT, 0, 1); reader.StartCollect(); //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos); state = TokenizerState.s20_SCRIPT_DATA_ESCAPE_START; goto breakScriptdatalessthansignloop; // FALL THRU // continue // stateloop; default: /* * Otherwise, emit a U+003C LESS-THAN SIGN * character token */ TokenListener.Characters(LT_GT, 0, 1); /* * and reconsume the current input character in * the data state. */ reader.StartCollect(); //state = Transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); state = TokenizerState.s06_SCRIPT_DATA; //reconsume = true; reader.StepBack(); goto continueStateloop; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakScriptdatalessthansignloop: goto case TokenizerState.s20_SCRIPT_DATA_ESCAPE_START; } // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.s20_SCRIPT_DATA_ESCAPE_START: /*scriptdataescapestartloop:*/ { char c; while (reader.ReadNext(out c)) { /* * Consume the next input character: */ switch (c) { case '-': /* * U+002D HYPHEN-MINUS (-) Emit a U+002D * HYPHEN-MINUS character token. Switch to the * script data escape start dash state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos); state = TokenizerState.s21_SCRIPT_DATA_ESCAPE_START_DASH; goto breakScriptdataescapestartloop; // FALL THRU // continue // stateloop; default: /* * Anything else Reconsume the current input * character in the script data state. 
*/ //state = Transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); state = TokenizerState.s06_SCRIPT_DATA; //reconsume = true; reader.StepBack(); goto continueStateloop; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakScriptdataescapestartloop: goto case TokenizerState.s21_SCRIPT_DATA_ESCAPE_START_DASH; } // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.s21_SCRIPT_DATA_ESCAPE_START_DASH: /*scriptdataescapestartdashloop:*/ { char c; while (reader.ReadNext(out c)) { switch (c) { case '-': /* * U+002D HYPHEN-MINUS (-) Emit a U+002D * HYPHEN-MINUS character token. Switch to the * script data escaped dash dash state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); state = TokenizerState.s24_SCRIPT_DATA_ESCAPED_DASH_DASH; goto breakScriptdataescapestartdashloop; // goto continueStateloop; default: /* * Anything else Reconsume the current input * character in the script data state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); state = TokenizerState.s06_SCRIPT_DATA; //reconsume = true; reader.StepBack(); goto continueStateloop; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakScriptdataescapestartdashloop: goto case TokenizerState.s24_SCRIPT_DATA_ESCAPED_DASH_DASH; } // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.s24_SCRIPT_DATA_ESCAPED_DASH_DASH: /*scriptdataescapeddashdashloop:*/ { char c; while (reader.ReadNext(out c)) { switch (c) { case '-': /* * U+002D HYPHEN-MINUS (-) Emit a U+002D * HYPHEN-MINUS character token. Stay in the * script data escaped dash dash state. */ continue; case '<': /* * U+003C LESS-THAN SIGN (<) Switch to the * script data escaped less-than sign state. 
*/ FlushChars(); //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); state = TokenizerState.s25_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit a U+003E * GREATER-THAN SIGN character token. Switch to * the script data state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); state = TokenizerState.s06_SCRIPT_DATA; goto continueStateloop; case '\u0000': EmitReplacementCharacter(); //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.s22_SCRIPT_DATA_ESCAPED; goto breakScriptdataescapeddashdashloop; case '\r': EmitCarriageReturn(); //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.s22_SCRIPT_DATA_ESCAPED; goto breakStateloop; case '\n': default: /* * Anything else Emit the current input * character as a character token. Switch to the * script data escaped state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.s22_SCRIPT_DATA_ESCAPED; goto breakScriptdataescapeddashdashloop; // goto continueStateloop; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakScriptdataescapeddashdashloop: goto case TokenizerState.s22_SCRIPT_DATA_ESCAPED; } // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.s22_SCRIPT_DATA_ESCAPED: /*scriptdataescapedloop:*/ { char c; while (reader.ReadNext(out c)) { switch (c) { case '-': /* * U+002D HYPHEN-MINUS (-) Emit a U+002D * HYPHEN-MINUS character token. Switch to the * script data escaped dash state. 
*/ //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos); state = TokenizerState.s23_SCRIPT_DATA_ESCAPED_DASH; goto breakScriptdataescapedloop; // FALL THRU // continue // stateloop; case '<': /* * U+003C LESS-THAN SIGN (<) Switch to the * script data escaped less-than sign state. */ FlushChars(); //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); state = TokenizerState.s25_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; goto continueStateloop; case '\u0000': EmitReplacementCharacter(); continue; case '\r': EmitCarriageReturn(); goto breakStateloop; case '\n': default: /* * Anything else Emit the current input * character as a character token. Stay in the * script data escaped state. */ continue; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakScriptdataescapedloop: goto case TokenizerState.s23_SCRIPT_DATA_ESCAPED_DASH; } // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.s23_SCRIPT_DATA_ESCAPED_DASH: /*scriptdataescapeddashloop:*/ { char c; while (reader.ReadNext(out c)) { switch (c) { case '-': /* * U+002D HYPHEN-MINUS (-) Emit a U+002D * HYPHEN-MINUS character token. Switch to the * script data escaped dash dash state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); state = TokenizerState.s24_SCRIPT_DATA_ESCAPED_DASH_DASH; goto continueStateloop; case '<': /* * U+003C LESS-THAN SIGN (<) Switch to the * script data escaped less-than sign state. 
*/ FlushChars(); //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); state = TokenizerState.s25_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; goto breakScriptdataescapeddashloop; // goto continueStateloop; case '\u0000': EmitReplacementCharacter(); //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.s22_SCRIPT_DATA_ESCAPED; goto continueStateloop; case '\r': EmitCarriageReturn(); //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.s22_SCRIPT_DATA_ESCAPED; goto breakStateloop; case '\n': default: /* * Anything else Emit the current input * character as a character token. Switch to the * script data escaped state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.s22_SCRIPT_DATA_ESCAPED; goto continueStateloop; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakScriptdataescapeddashloop: goto case TokenizerState.s25_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; } // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.s25_SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: /*scriptdataescapedlessthanloop:*/ { char c; while (reader.ReadNext(out c)) { switch (c) { case '/': /* * U+002F SOLIDUS (/) Set the temporary buffer * to the empty string. Switch to the script * data escaped end tag open state. */ index = 0; ClearStrBuf(); returnState = TokenizerState.s22_SCRIPT_DATA_ESCAPED; //state = Transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); state = TokenizerState.NON_DATA_END_TAG_NAME; goto continueStateloop; case 'S': case 's': /* * U+0041 LATIN CAPITAL LETTER A through to * U+005A LATIN CAPITAL LETTER Z Emit a U+003C * LESS-THAN SIGN character token and the * current input character as a character token. 
*/ TokenListener.Characters(LT_GT, 0, 1); reader.StartCollect(); index = 1; /* * Set the temporary buffer to the empty string. * Append the lowercase TokenizerState.version of the current * input character (add 0x0020 to the * character's code point) to the temporary * buffer. Switch to the script data double * escape start state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos); state = TokenizerState.s28_SCRIPT_DATA_DOUBLE_ESCAPE_START; goto breakScriptdataescapedlessthanloop; // goto continueStateloop; default: /* * Anything else Emit a U+003C LESS-THAN SIGN * character token and reconsume the current * input character in the script data escaped * state. */ TokenListener.Characters(LT_GT, 0, 1); reader.StartCollect(); //reconsume = true; reader.StepBack(); //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.s22_SCRIPT_DATA_ESCAPED; goto continueStateloop; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakScriptdataescapedlessthanloop: goto case TokenizerState.s28_SCRIPT_DATA_DOUBLE_ESCAPE_START; } // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.s28_SCRIPT_DATA_DOUBLE_ESCAPE_START: /*scriptdatadoubleescapestartloop:*/ { char c; while (reader.ReadNext(out c)) { Debug.Assert(index > 0); if (index < 6) { // SCRIPT_ARR.Length char folded = c; if (c >= 'A' && c <= 'Z') { //make it lower case folded += (char)0x20; } if (folded != Tokenizer.SCRIPT_ARR[index]) { //reconsume = true; reader.StepBack(); //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.s22_SCRIPT_DATA_ESCAPED; goto continueStateloop; } index++; continue; } switch (c) { case '\r': EmitCarriageReturn(); //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.s29_SCRIPT_DATA_DOUBLE_ESCAPED; goto breakStateloop; case '\n': case 
' ': case '\t': case '\u000C': case '/': case '>': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN * (>) Emit the current input character as a * character token. If the temporary buffer is * the string "script", then switch to the * script data double escaped state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.s29_SCRIPT_DATA_DOUBLE_ESCAPED; goto breakScriptdatadoubleescapestartloop; // goto continueStateloop; default: /* * Anything else Reconsume the current input * character in the script data escaped state. */ //reconsume = true; reader.StepBack(); //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.s22_SCRIPT_DATA_ESCAPED; goto continueStateloop; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakScriptdatadoubleescapestartloop: goto case TokenizerState.s29_SCRIPT_DATA_DOUBLE_ESCAPED; } // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.s29_SCRIPT_DATA_DOUBLE_ESCAPED: /*scriptdatadoubleescapedloop:*/ { char c; while (reader.ReadNext(out c)) { switch (c) { case '-': /* * U+002D HYPHEN-MINUS (-) Emit a U+002D * HYPHEN-MINUS character token. Switch to the * script data double escaped dash state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos); state = TokenizerState.s30_SCRIPT_DATA_DOUBLE_ESCAPED_DASH; goto breakScriptdatadoubleescapedloop; // FALL THRU // continue // stateloop; case '<': /* * U+003C LESS-THAN SIGN (<) Emit a U+003C * LESS-THAN SIGN character token. Switch to the * script data double escaped less-than sign * state. 
*/ //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); state = TokenizerState.s32_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; goto continueStateloop; case '\u0000': EmitReplacementCharacter(); continue; case '\r': EmitCarriageReturn(); goto breakStateloop; case '\n': default: /* * Anything else Emit the current input * character as a character token. Stay in the * script data double escaped state. */ continue; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakScriptdatadoubleescapedloop: goto case TokenizerState.s30_SCRIPT_DATA_DOUBLE_ESCAPED_DASH; } // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.s30_SCRIPT_DATA_DOUBLE_ESCAPED_DASH: /*scriptdatadoubleescapeddashloop:*/ { char c; while (reader.ReadNext(out c)) { switch (c) { case '-': /* * U+002D HYPHEN-MINUS (-) Emit a U+002D * HYPHEN-MINUS character token. Switch to the * script data double escaped dash dash state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos); state = TokenizerState.s31_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH; goto breakScriptdatadoubleescapeddashloop; // goto continueStateloop; case '<': /* * U+003C LESS-THAN SIGN (<) Emit a U+003C * LESS-THAN SIGN character token. Switch to the * script data double escaped less-than sign * state. 
*/ //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); state = TokenizerState.s32_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; goto continueStateloop; case '\u0000': EmitReplacementCharacter(); //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.s29_SCRIPT_DATA_DOUBLE_ESCAPED; goto continueStateloop; case '\r': EmitCarriageReturn(); //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.s29_SCRIPT_DATA_DOUBLE_ESCAPED; goto breakStateloop; case '\n': default: /* * Anything else Emit the current input * character as a character token. Switch to the * script data double escaped state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.s29_SCRIPT_DATA_DOUBLE_ESCAPED; goto continueStateloop; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakScriptdatadoubleescapeddashloop: goto case TokenizerState.s31_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH; } // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.s31_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: /*scriptdatadoubleescapeddashdashloop:*/ { char c; while (reader.ReadNext(out c)) { switch (c) { case '-': /* * U+002D HYPHEN-MINUS (-) Emit a U+002D * HYPHEN-MINUS character token. Stay in the * script data double escaped dash dash state. */ continue; case '<': /* * U+003C LESS-THAN SIGN (<) Emit a U+003C * LESS-THAN SIGN character token. Switch to the * script data double escaped less-than sign * state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); state = TokenizerState.s32_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; goto breakScriptdatadoubleescapeddashdashloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit a U+003E * GREATER-THAN SIGN character token. 
Switch to * the script data state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); state = TokenizerState.s06_SCRIPT_DATA; goto continueStateloop; case '\u0000': EmitReplacementCharacter(); //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.s29_SCRIPT_DATA_DOUBLE_ESCAPED; goto continueStateloop; case '\r': EmitCarriageReturn(); //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.s29_SCRIPT_DATA_DOUBLE_ESCAPED; goto breakStateloop; case '\n': default: /* * Anything else Emit the current input * character as a character token. Switch to the * script data double escaped state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.s29_SCRIPT_DATA_DOUBLE_ESCAPED; goto continueStateloop; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakScriptdatadoubleescapeddashdashloop: goto case TokenizerState.s32_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; } // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.s32_SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: /*scriptdatadoubleescapedlessthanloop:*/ { char c; while (reader.ReadNext(out c)) { switch (c) { case '/': /* * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS * character token. Set the temporary buffer to * the empty string. Switch to the script data * double escape end state. */ index = 0; //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos); state = TokenizerState.s33_SCRIPT_DATA_DOUBLE_ESCAPE_END; goto breakScriptdatadoubleescapedlessthanloop; default: /* * Anything else Reconsume the current input * character in the script data double escaped * state. 
*/ //reconsume = true; reader.StepBack(); //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.s29_SCRIPT_DATA_DOUBLE_ESCAPED; goto continueStateloop; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakScriptdatadoubleescapedlessthanloop: goto case TokenizerState.s33_SCRIPT_DATA_DOUBLE_ESCAPE_END; } // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.s33_SCRIPT_DATA_DOUBLE_ESCAPE_END: /*scriptdatadoubleescapeendloop:*/ { char c; while (reader.ReadNext(out c)) { if (index < 6) { // SCRIPT_ARR.Length char folded = c; if (c >= 'A' && c <= 'Z') { folded += (char)0x20; } if (folded != Tokenizer.SCRIPT_ARR[index]) { reader.StepBack(); //reconsume = true; //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.s29_SCRIPT_DATA_DOUBLE_ESCAPED; goto continueStateloop; } index++; continue; } switch (c) { case '\r': EmitCarriageReturn(); //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.s22_SCRIPT_DATA_ESCAPED; goto breakStateloop; case '\n': case ' ': case '\t': case '\u000C': case '/': case '>': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN * (>) Emit the current input character as a * character token. If the temporary buffer is * the string "script", then switch to the * script data escaped state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.s22_SCRIPT_DATA_ESCAPED; goto continueStateloop; default: /* * Reconsume the current input character in the * script data double escaped state. 
*/ //reconsume = true; reader.StepBack(); //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.s29_SCRIPT_DATA_DOUBLE_ESCAPED; goto continueStateloop; } } } //------------------------------------ //eof goto breakStateloop; // END HOTSPOT WORKAROUND } } // stateloop breakStateloop: //FlushChars(buf, pos); FlushChars(); /* * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; } */ // Save locals stateSave = state; returnStateSave = returnState; }
/// <summary>
/// Sends the first character of <paramref name="val"/> either to the token listener
/// or to the long string buffer, depending on <paramref name="returnState"/>.
/// </summary>
/// <param name="val">Buffer whose first element is the character to emit or append.</param>
/// <param name="returnState">
/// State whose numeric value is tested against <c>DATA_AND_RCDATA_MASK</c>
/// (presumably the bits marking the data / RCDATA states - confirm against the enum).
/// </param>
void EmitOrAppendOne(char[] val, TokenizerState returnState)
{
    bool maskBitsSet = ((byte)returnState & DATA_AND_RCDATA_MASK) != 0;

    if (maskBitsSet)
    {
        // At least one masked bit is set: report the character to the listener directly.
        TokenListener.Characters(val, 0, 1);
        return;
    }

    // No masked bit is set: accumulate the character in the long string buffer.
    AppendLongStrBuf(val[0]);
}
/// <summary>
/// Creates a new <see cref="UnclosedStringLiteralException"/>, forwarding both
/// arguments unchanged to the base-class constructor.
/// </summary>
/// <param name="tokenizerState">Tokenizer state handed to the base class.</param>
/// <param name="batchSource">Batch source handed to the base class.</param>
internal UnclosedStringLiteralException(TokenizerState tokenizerState, IBatchSource batchSource)
    : base(tokenizerState, batchSource)
{
    // Nothing to initialise here; the base class receives everything.
}
/// <summary>
/// Records <paramref name="tokenizerState"/> as the current tokenizer state and appends
/// <paramref name="character"/> to the text of the current token.
/// </summary>
/// <param name="tokenizerState">State stored in <c>_currentTokenizerState</c>.</param>
/// <param name="character">Text appended to the current token; defaults to the empty string.</param>
private void StartToken(TokenizerState tokenizerState, string character = "")
{
    _currentTokenizerState = tokenizerState;

    // Append to whatever text the current token already holds.
    _currentToken.Text = _currentToken.Text + character;
}
/// <summary>
/// Writes the token members of <paramref name="instance"/> to <paramref name="writer"/> in two passes:
/// first as XML attributes (members that only have tokens and exactly one value), then as
/// child elements (every member not handled by the first pass).
/// </summary>
/// <param name="writer">Destination XML writer.</param>
/// <param name="instance">Object whose members are written.</param>
/// <param name="args">Tokenizer arguments used to create the state and cloned for nested groups.</param>
/// <returns>
/// <c>true</c> when everything was written; <c>false</c> as soon as a group's
/// <c>TryWriteXml</c> reports failure (the element started for that group is not closed here).
/// </returns>
/// <exception cref="ArgumentNullException">Any of the three arguments is <c>null</c>.</exception>
internal static bool TryWrite(XmlWriter writer, T instance, TokenizerArgs args)
{
    if (writer == null)
    {
        throw new ArgumentNullException("writer");
    }

    if (instance == null)
    {
        throw new ArgumentNullException("instance");
    }

    if (args == null)
    {
        throw new ArgumentNullException("args");
    }

    // Members claimed by the attribute pass; the element pass skips them.
    Hashtable handledMembers = new Hashtable();

    using (TokenizerState<T> state = Tokenizer.NewState<T>(args, instance))
    {
        // Pass 1: members that can be expressed as a single attribute.
        foreach (TokenMember member in state.Definition.AllTokenMembers)
        {
            object[] memberValues = member.GetValues(state);

            // Members with both tokens and groups are left for the element pass.
            bool leftForElementPass = member.Tokens.Count > 0 && member.Groups.Count > 0;
            bool hasExactlyOneValue = memberValues != null && memberValues.Length == 1;

            if (leftForElementPass || !hasExactlyOneValue || member.Tokens.Count <= 0)
            {
                continue;
            }

            // The member is claimed even if no attribute ends up being written below.
            handledMembers[member] = member;

            object onlyValue = memberValues[0];
            if (onlyValue == null)
            {
                continue;
            }

            Type runtimeType = onlyValue.GetType();

            foreach (TokenItem token in member.Tokens)
            {
                bool usable = token.Name != null
                    && (token.ValueType == null || token.ValueType.IsAssignableFrom(runtimeType));

                if (usable)
                {
                    // Writing the same attribute twice makes the writer throw; that indicates a
                    // bug in the definition and should be resolved there.
                    writer.WriteAttributeString(token.Name, token.GetStringValue(onlyValue, state));
                    break;
                }
            }
        }

        // Pass 2: token groups and members with multiple values, written as elements.
        foreach (TokenMember member in state.Definition.AllTokenMembers)
        {
            if (handledMembers.Contains(member))
            {
                continue;
            }

            object[] memberValues = member.GetValues(state);
            if (memberValues == null || memberValues.Length == 0)
            {
                continue;
            }

            foreach (object item in memberValues)
            {
                if (item == null)
                {
                    continue;
                }

                Type runtimeType = item.GetType();
                bool writtenAsGroup = false;

                // Prefer the first group whose value type accepts this item.
                foreach (TokenGroupItem groupItem in member.Groups)
                {
                    if (groupItem.ValueType == null || groupItem.ValueType.IsAssignableFrom(runtimeType))
                    {
                        writer.WriteStartElement(groupItem.Name);

                        if (!groupItem.TryWriteXml(writer, args.Clone(state.Instance), item))
                        {
                            return false;
                        }

                        writer.WriteEndElement();
                        writtenAsGroup = true;
                        break;
                    }
                }

                if (writtenAsGroup)
                {
                    continue;
                }

                // No group matched: fall back to the first matching named token, as an element.
                foreach (TokenItem token in member.Tokens)
                {
                    bool usable = token.Name != null
                        && (token.ValueType == null || token.ValueType.IsAssignableFrom(runtimeType));

                    if (usable)
                    {
                        writer.WriteElementString(token.Name, token.GetStringValue(item, state));
                        break;
                    }
                }
            }
        }
    }

    return true;
}
/// <summary>
/// Builds an instance of <typeparamref name="T"/> from an XML node: attributes are evaluated as
/// tokens, child elements are parsed as token groups or, failing that, evaluated as tokens.
/// </summary>
/// <param name="element">Node to read from; a navigator is created on it.</param>
/// <param name="args">Tokenizer arguments (case sensitivity, handling of unknown names).</param>
/// <param name="to">Receives the parsed instance on success; <c>null</c> on failure.</param>
/// <returns>
/// <c>true</c> on success; <c>false</c> when an unknown attribute or element name is met while
/// <c>args.SkipUnknownNamedItems</c> is off, or when a group fails to parse.
/// </returns>
internal static bool TryParse(IXPathNavigable element, TokenizerArgs args, out T to)
{
    XPathNavigator nav = element.CreateNavigator();
    to = null;

    using (TokenizerState<T> state = Tokenizer.NewState<T>(args))
    {
        // Attributes: each one must map to a known token unless unknown names are skipped.
        if (nav.MoveToFirstAttribute())
        {
            do
            {
                TokenItem ti;

                if (!state.Definition.TryGetToken(nav.LocalName, args.CaseSensitive, out ti))
                {
                    if (args.SkipUnknownNamedItems)
                    {
                        // In a do/while, continue jumps to the loop condition (next attribute).
                        continue;
                    }

                    return false;
                }

                ti.Evaluate(nav.Value, state);
            }
            while (nav.MoveToNextAttribute());

            // Return from the attribute axis to the owning element.
            nav.MoveToParent();
        }

        // Child elements. MoveToChild(Element) goes straight to the first *element* child, so a
        // leading comment, text node or processing instruction is not mistaken for an unknown
        // named item; later siblings were already restricted to elements.
        if (nav.MoveToChild(XPathNodeType.Element))
        {
            do
            {
                string name = nav.LocalName;
                TokenGroupItem group;
                TokenItem ti;

                if (state.Definition.TryGetGroup(name, args.CaseSensitive, out group))
                {
                    object value;

                    if (!group.TryParseXml(nav, args.Clone(state.Instance), out value))
                    {
                        return false;
                    }

                    group.Member.SetValue(state, value);
                }
                else if (state.Definition.TryGetToken(name, args.CaseSensitive, out ti))
                {
                    // Tokens are also accepted in element form.
                    ti.Evaluate(nav.Value, state);
                }
                else if (!args.SkipUnknownNamedItems)
                {
                    return false;
                }
            }
            while (nav.MoveToNext(XPathNodeType.Element));
        }

        to = state.Instance;
        return true;
    }
}
/// <summary>
/// Splits <paramref name="buffer"/> into tokens: the symbols <c>( ) , *</c>, identifiers
/// (a letter or underscore followed by letters, digits or underscores) and single characters
/// for everything else. Whitespace other than line feeds is skipped; a line feed advances the
/// line via <c>NextLine()</c>.
/// </summary>
/// <param name="buffer">Text to tokenize.</param>
/// <returns>The token list collected by the <c>TokenizerOutput</c>.</returns>
public static List<Token> Tokenize(string buffer)
{
    TokenizerOutput output = new TokenizerOutput();
    TokenizerState cursor = new TokenizerState(buffer);

    while (!cursor.IsEndOfStream())
    {
        // Skip whitespace, but stop at '\n' so line tracking below sees it.
        while (!cursor.IsEndOfStream()
               && cursor.GetChar() != '\n'
               && char.IsWhiteSpace(cursor.GetChar()))
        {
            cursor.NextChar();
        }

        if (cursor.IsEndOfStream())
        {
            break;
        }

        char current = cursor.GetChar();

        if (current == '\n')
        {
            cursor.NextLine();
            cursor.NextChar();
        }
        else if (current == '(')
        {
            output.AddSymbol(TokenType.BraceBegin, current, cursor.CreateInfo());
            cursor.NextChar();
        }
        else if (current == ')')
        {
            output.AddSymbol(TokenType.BraceEnd, current, cursor.CreateInfo());
            cursor.NextChar();
        }
        else if (current == ',')
        {
            output.AddSymbol(TokenType.ArgumentSeparator, current, cursor.CreateInfo());
            cursor.NextChar();
        }
        else if (current == '*')
        {
            output.AddSymbol(TokenType.Pointer, current, cursor.CreateInfo());
            cursor.NextChar();
        }
        else if (char.IsLetter(current) || current == '_')
        {
            int identStart = cursor.BufferPos;

            // Consume the whole identifier.
            while (!cursor.IsEndOfStream()
                   && (char.IsLetterOrDigit(cursor.GetChar()) || cursor.GetChar() == '_'))
            {
                cursor.NextChar();
            }

            string identifier = buffer.Substring(identStart, cursor.BufferPos - identStart);

            // NOTE(review): CreateInfo() is called after the identifier has been consumed, whereas
            // the other branches call it before advancing - confirm this is intended.
            output.AddIdent(identifier, cursor.CreateInfo());
        }
        else
        {
            // Anything unrecognised is passed through as a single character.
            output.AddChar(current, cursor.CreateInfo());
            cursor.NextChar();
        }
    }

    return output.Tokens;
}
// ]NOCPP]

// For the token handler to call

/**
 * Stores the tokenizer state and, unless that state is s01_DATA, the element
 * name whose end tag is expected. This should only ever be used to put the
 * tokenizer into one of the states that have a special end tag expectation.
 *
 * @param specialTokenizerState
 *            the tokenizer state to set
 * @param endTagExpectation
 *            the expected end tag for transitioning back to normal
 */
public void SetStateAndEndTagExpectation(TokenizerState specialTokenizerState, [Local] String endTagExpectation)
{
    this.stateSave = specialTokenizerState;

    // The data state carries no end tag expectation, so only the state is stored.
    if (specialTokenizerState != TokenizerState.s01_DATA)
    {
        char[] expectedNameChars = endTagExpectation.ToCharArray();
        this.endTagExpectation = ElementName.ElementNameByBuffer(expectedNameChars);
        EndTagExpectationToArray();
    }
}
private int StateLoop(TokenizerState state, char c, int pos, char[] buf, bool reconsume, TokenizerState returnState, int endPos) { /* * Idioms used in this code: * * * Consuming the next input character * * To consume the next input character, the code does this: if (++pos == * endPos) { goto breakStateloop; } c = buf[pos]; * * * Staying in a state * * When there's a state that the tokenizer may stay in over multiple * input characters, the state has a wrapper |for(;;)| loop and staying * in the state continues the loop. * * * Switching to another state * * To switch to another state, the code sets the state variable to the * magic number of the new state. Then it either continues stateloop or * breaks out of the state's own wrapper loop if the target state is * right after the current state in source order. (This is a partial * workaround for Java's lack of goto.) * * * Reconsume support * * The spec sometimes says that an input character is reconsumed in * another state. If a state can ever be entered so that an input * character can be reconsumed in it, the state's code starts with an * |if (reconsume)| that sets reconsume to false and skips over the * normal code for consuming a new character. * * To reconsume the current character in another state, the code sets * |reconsume| to true and then switches to the other state. * * * Emitting character tokens * * This method emits character tokens lazily. Whenever a new range of * character tokens starts, the field cstart must be set to the start * index of the range. The flushChars() method must be called at the end * of a range to flush it. * * * U+0000 handling * * The various states have to handle the replacement of U+0000 with * U+FFFD. However, if U+0000 would be reconsumed in another state, the * replacement doesn't need to happen, because it's handled by the * reconsuming state. 
* * * LF handling * * Every state needs to increment the line number upon LF unless the LF * gets reconsumed by another state which increments the line number. * * * CR handling * * Every state needs to handle CR unless the CR gets reconsumed and is * handled by the reconsuming state. The CR needs to be handled as if it * were and LF, the lastCR field must be set to true and then this * method must return. The IO driver will then swallow the next * character if it is an LF to coalesce CRLF. */ /* * As there is no support for labeled loops in C#, instead of break <loop>; * the port uses goto break<loop>; and a label after the loop. * Instead of continue <loop>; it uses goto continue<loop>; and a label * at the beginning or end of the loop (which doesn't matter in for(;;) loops) */ /*stateloop:*/ for (; ; ) { continueStateloop: switch (state) { case TokenizerState.DATA: /*dataloop:*/ for (; ; ) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; } switch (c) { case '&': /* * U+0026 AMPERSAND (&) Switch to the character * reference in data state. */ FlushChars(buf, pos); ClearStrBufAndAppend(c); SetAdditionalAndRememberAmpersandLocation('\u0000'); returnState = state; //state = Transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); state = TokenizerState.CONSUME_CHARACTER_REFERENCE; goto continueStateloop; case '<': /* * U+003C LESS-THAN SIGN (<) Switch to the tag * open state. */ FlushChars(buf, pos); //state = Transition(state, Tokenizer.TAG_OPEN, reconsume, pos); state = TokenizerState.TAG_OPEN; goto breakDataloop; // FALL THROUGH continue // stateloop; case '\u0000': EmitReplacementCharacter(buf, pos); continue; case '\r': EmitCarriageReturn(buf, pos); goto breakStateloop; case '\n': SilentLineFeed(); goto default; default: /* * Anything else Emit the input character as a * character token. * * Stay in the data state. 
*/ continue; } } breakDataloop: goto case TokenizerState.TAG_OPEN; // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.TAG_OPEN: /*tagopenloop:*/ for (; ; ) { /* * The behavior of this state depends on the content * model flag. */ if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * If the content model flag is set to the PCDATA state * Consume the next input character: */ if (c >= 'A' && c <= 'Z') { /* * U+0041 LATIN CAPITAL LETTER A through to U+005A * LATIN CAPITAL LETTER Z Create a new start tag * token, */ endTag = false; /* * set its tag name to the lowercase TokenizerState.version of the * input character (add 0x0020 to the character's * code point), */ ClearStrBufAndAppend((char)(c + 0x20)); /* then switch to the tag name state. */ //state = Transition(state, Tokenizer.TAG_NAME, reconsume, pos); state = TokenizerState.TAG_NAME; /* * (Don't emit the token yet; further details will * be filled in before it is emitted.) */ goto breakTagopenloop; // goto continueStateloop; } else if (c >= 'a' && c <= 'z') { /* * U+0061 LATIN SMALL LETTER A through to U+007A * LATIN SMALL LETTER Z Create a new start tag * token, */ endTag = false; /* * set its tag name to the input character, */ ClearStrBufAndAppend(c); /* then switch to the tag name state. */ //state = Transition(state, Tokenizer.TAG_NAME, reconsume, pos); state = TokenizerState.TAG_NAME; /* * (Don't emit the token yet; further details will * be filled in before it is emitted.) */ goto breakTagopenloop; // goto continueStateloop; } switch (c) { case '!': /* * U+0021 EXCLAMATION MARK (!) Switch to the * markup declaration open state. */ //state = Transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos); state = TokenizerState.MARKUP_DECLARATION_OPEN; goto continueStateloop; case '/': /* * U+002F SOLIDUS (/) Switch to the close tag * open state. 
*/ //state = Transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos); state = TokenizerState.CLOSE_TAG_OPEN; goto continueStateloop; case '?': /* * U+003F QUESTION MARK (?) Parse error. */ ErrProcessingInstruction(); /* * Switch to the bogus comment state. */ ClearLongStrBufAndAppend(c); //state = Transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); state = TokenizerState.BOGUS_COMMENT; goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Parse error. */ ErrLtGt(); /* * Emit a U+003C LESS-THAN SIGN character token * and a U+003E GREATER-THAN SIGN character * token. */ TokenHandler.Characters(LT_GT, 0, 2); /* Switch to the data state. */ cstart = pos + 1; //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; default: /* * Anything else Parse error. */ ErrBadCharAfterLt(c); /* * Emit a U+003C LESS-THAN SIGN character token */ TokenHandler.Characters(LT_GT, 0, 1); /* * and reconsume the current input character in * the data state. */ cstart = pos; //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; reconsume = true; goto continueStateloop; } } breakTagopenloop: goto case TokenizerState.TAG_NAME; // FALL THROUGH DON'T REORDER case TokenizerState.TAG_NAME: /*tagnameloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '\r': SilentCarriageReturn(); StrBufToElementNameString(); //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.BEFORE_ATTRIBUTE_NAME; goto breakStateloop; case '\n': SilentLineFeed(); goto case ' '; case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE * Switch to the before attribute name state. 
*/ StrBufToElementNameString(); //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.BEFORE_ATTRIBUTE_NAME; goto breakTagnameloop; // goto continueStateloop; case '/': /* * U+002F SOLIDUS (/) Switch to the self-closing * start tag state. */ StrBufToElementNameString(); //state = Transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); state = TokenizerState.SELF_CLOSING_START_TAG; goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current * tag token. */ StrBufToElementNameString(); //state = Transition(state, EmitCurrentTagToken(false, pos), reconsume, pos); state = EmitCurrentTagToken(false, pos); if (shouldSuspend) { goto breakStateloop; } /* * Switch to the data state. */ goto continueStateloop; case '\u0000': c = '\uFFFD'; goto default; // fall thru default: if (c >= 'A' && c <= 'Z') { /* * U+0041 LATIN CAPITAL LETTER A through to * U+005A LATIN CAPITAL LETTER Z Append the * lowercase TokenizerState.version of the current input * character (add 0x0020 to the character's * code point) to the current tag token's * tag name. */ c += (char)0x20; } /* * Anything else Append the current input * character to the current tag token's tag * name. */ AppendStrBuf(c); /* * Stay in the tag name state. */ continue; } } breakTagnameloop: goto case TokenizerState.BEFORE_ATTRIBUTE_NAME; // FALLTHRU DON'T REORDER case TokenizerState.BEFORE_ATTRIBUTE_NAME: /*beforeattributenameloop:*/ for (; ; ) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; } /* * Consume the next input character: */ switch (c) { case '\r': SilentCarriageReturn(); goto breakStateloop; case '\n': SilentLineFeed(); // fall thru goto case ' '; case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay * in the before attribute name state. 
*/ continue; case '/': /* * U+002F SOLIDUS (/) Switch to the self-closing * start tag state. */ //state = Transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); state = TokenizerState.SELF_CLOSING_START_TAG; goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current * tag token. */ //state = Transition(state, EmitCurrentTagToken(false, pos), reconsume, pos); state = EmitCurrentTagToken(false, pos); if (shouldSuspend) { goto breakStateloop; } /* * Switch to the data state. */ goto continueStateloop; case '\u0000': c = '\uFFFD'; // fall thru goto case '\"'; case '\"': case '\'': case '<': case '=': /* * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS * SIGN (=) Parse error. */ ErrBadCharBeforeAttributeNameOrNull(c); /* * Treat it as per the "anything else" entry * below. */ goto default; default: /* * Anything else Start a new attribute in the * current tag token. */ if (c >= 'A' && c <= 'Z') { /* * U+0041 LATIN CAPITAL LETTER A through to * U+005A LATIN CAPITAL LETTER Z Set that * attribute's name to the lowercase TokenizerState.version * of the current input character (add * 0x0020 to the character's code point) */ c += (char)0x20; } /* * Set that attribute's name to the current * input character, */ ClearStrBufAndAppend(c); /* * and its value to the empty string. */ // Will do later. /* * Switch to the attribute name state. 
*/ //state = Transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.ATTRIBUTE_NAME; goto breakBeforeattributenameloop; // goto continueStateloop; } } breakBeforeattributenameloop: goto case TokenizerState.ATTRIBUTE_NAME; // FALLTHRU DON'T REORDER case TokenizerState.ATTRIBUTE_NAME: /*attributenameloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '\r': SilentCarriageReturn(); AttributeNameComplete(); //state = Transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.AFTER_ATTRIBUTE_NAME; goto breakStateloop; case '\n': SilentLineFeed(); goto case ' '; // fall thru case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE * Switch to the after attribute name state. */ AttributeNameComplete(); //state = Transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.AFTER_ATTRIBUTE_NAME; goto continueStateloop; case '/': /* * U+002F SOLIDUS (/) Switch to the self-closing * start tag state. */ AttributeNameComplete(); AddAttributeWithoutValue(); //state = Transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); state = TokenizerState.SELF_CLOSING_START_TAG; goto continueStateloop; case '=': /* * U+003D EQUALS SIGN (=) Switch to the before * attribute value state. */ AttributeNameComplete(); //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); state = TokenizerState.BEFORE_ATTRIBUTE_VALUE; goto breakAttributenameloop; // goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current * tag token. */ AttributeNameComplete(); AddAttributeWithoutValue(); //state = Transition(state, EmitCurrentTagToken(false, pos), reconsume, pos); state = EmitCurrentTagToken(false, pos); if (shouldSuspend) { goto breakStateloop; } /* * Switch to the data state. 
*/ goto continueStateloop; case '\u0000': c = '\uFFFD'; // fall thru goto case '\"'; case '\"': case '\'': case '<': /* * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE * (') U+003C LESS-THAN SIGN (<) Parse error. */ ErrQuoteOrLtInAttributeNameOrNull(c); /* * Treat it as per the "anything else" entry * below. */ goto default; default: if (c >= 'A' && c <= 'Z') { /* * U+0041 LATIN CAPITAL LETTER A through to * U+005A LATIN CAPITAL LETTER Z Append the * lowercase TokenizerState.version of the current input * character (add 0x0020 to the character's * code point) to the current attribute's * name. */ c += (char)0x20; } /* * Anything else Append the current input * character to the current attribute's name. */ AppendStrBuf(c); /* * Stay in the attribute name state. */ continue; } } breakAttributenameloop: goto case TokenizerState.BEFORE_ATTRIBUTE_VALUE; // FALLTHRU DON'T REORDER case TokenizerState.BEFORE_ATTRIBUTE_VALUE: /*beforeattributevalueloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '\r': SilentCarriageReturn(); goto breakStateloop; case '\n': SilentLineFeed(); goto case ' '; // fall thru case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay * in the before attribute value state. */ continue; case '"': /* * U+0022 QUOTATION MARK (") Switch to the * attribute value (double-quoted) state. */ ClearLongStrBuf(); //state = Transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos); state = TokenizerState.ATTRIBUTE_VALUE_DOUBLE_QUOTED; goto breakBeforeattributevalueloop; // goto continueStateloop; case '&': /* * U+0026 AMPERSAND (&) Switch to the attribute * value (unquoted) state and reconsume this * input character. 
*/ ClearLongStrBuf(); //state = Transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); state = TokenizerState.ATTRIBUTE_VALUE_UNQUOTED; NoteUnquotedAttributeValue(); reconsume = true; goto continueStateloop; case '\'': /* * U+0027 APOSTROPHE (') Switch to the attribute * value (single-quoted) state. */ ClearLongStrBuf(); //state = Transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos); state = TokenizerState.ATTRIBUTE_VALUE_SINGLE_QUOTED; goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Parse error. */ ErrAttributeValueMissing(); /* * Emit the current tag token. */ AddAttributeWithoutValue(); //state = Transition(state, EmitCurrentTagToken(false, pos), reconsume, pos); state = EmitCurrentTagToken(false, pos); if (shouldSuspend) { goto breakStateloop; } /* * Switch to the data state. */ goto continueStateloop; case '\u0000': c = '\uFFFD'; // fall thru goto case '<'; case '<': case '=': case '`': /* * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN * (=) U+0060 GRAVE ACCENT (`) */ ErrLtOrEqualsOrGraveInUnquotedAttributeOrNull(c); /* * Treat it as per the "anything else" entry * below. */ goto default; default: // [NOCPP[ ErrHtml4NonNameInUnquotedAttribute(c); // ]NOCPP] /* * Anything else Append the current input * character to the current attribute's value. */ ClearLongStrBufAndAppend(c); /* * Switch to the attribute value (unquoted) * state. 
*/ //state = Transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); state = TokenizerState.ATTRIBUTE_VALUE_UNQUOTED; NoteUnquotedAttributeValue(); goto continueStateloop; } } breakBeforeattributevalueloop: goto case TokenizerState.ATTRIBUTE_VALUE_DOUBLE_QUOTED; // FALLTHRU DON'T REORDER case TokenizerState.ATTRIBUTE_VALUE_DOUBLE_QUOTED: /*attributevaluedoublequotedloop:*/ for (; ; ) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; } /* * Consume the next input character: */ switch (c) { case '"': /* * U+0022 QUOTATION MARK (") Switch to the after * attribute value (quoted) state. */ AddAttributeWithValue(); //state = Transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); state = TokenizerState.AFTER_ATTRIBUTE_VALUE_QUOTED; goto breakAttributevaluedoublequotedloop; // goto continueStateloop; case '&': /* * U+0026 AMPERSAND (&) Switch to the character * reference in attribute value state, with the * additional allowed character being U+0022 * QUOTATION MARK ("). */ ClearStrBufAndAppend(c); SetAdditionalAndRememberAmpersandLocation('\"'); returnState = state; //state = Transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); state = TokenizerState.CONSUME_CHARACTER_REFERENCE; goto continueStateloop; case '\r': AppendLongStrBufCarriageReturn(); goto breakStateloop; case '\n': AppendLongStrBufLineFeed(); continue; case '\u0000': c = '\uFFFD'; // fall thru goto default; default: /* * Anything else Append the current input * character to the current attribute's value. */ AppendLongStrBuf(c); /* * Stay in the attribute value (double-quoted) * state. 
*/ continue; } } breakAttributevaluedoublequotedloop: goto case TokenizerState.AFTER_ATTRIBUTE_VALUE_QUOTED; // FALLTHRU DON'T REORDER case TokenizerState.AFTER_ATTRIBUTE_VALUE_QUOTED: /*afterattributevaluequotedloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '\r': SilentCarriageReturn(); //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.BEFORE_ATTRIBUTE_NAME; goto breakStateloop; case '\n': SilentLineFeed(); // fall thru goto case ' '; case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE * Switch to the before attribute name state. */ //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.BEFORE_ATTRIBUTE_NAME; goto continueStateloop; case '/': /* * U+002F SOLIDUS (/) Switch to the self-closing * start tag state. */ //state = Transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); state = TokenizerState.SELF_CLOSING_START_TAG; goto breakAfterattributevaluequotedloop; // goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current * tag token. */ //state = Transition(state, EmitCurrentTagToken(false, pos), reconsume, pos); state = EmitCurrentTagToken(false, pos); if (shouldSuspend) { goto breakStateloop; } /* * Switch to the data state. */ goto continueStateloop; default: /* * Anything else Parse error. */ ErrNoSpaceBetweenAttributes(); /* * Reconsume the character in the before * attribute name state. 
*/ //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.BEFORE_ATTRIBUTE_NAME; reconsume = true; goto continueStateloop; } } breakAfterattributevaluequotedloop: goto case TokenizerState.SELF_CLOSING_START_TAG; // FALLTHRU DON'T REORDER case TokenizerState.SELF_CLOSING_START_TAG: if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '>': /* * U+003E GREATER-THAN SIGN (>) Set the self-closing * flag of the current tag token. Emit the current * tag token. */ // [NOCPP[ ErrHtml4XmlVoidSyntax(); // ]NOCPP] //state = Transition(state, EmitCurrentTagToken(true, pos), reconsume, pos); state = EmitCurrentTagToken(true, pos); if (shouldSuspend) { goto breakStateloop; } /* * Switch to the data state. */ goto continueStateloop; default: /* Anything else Parse error. */ ErrSlashNotFollowedByGt(); /* * Reconsume the character in the before attribute * name state. */ //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.BEFORE_ATTRIBUTE_NAME; reconsume = true; goto continueStateloop; } // XXX reorder point case TokenizerState.ATTRIBUTE_VALUE_UNQUOTED: for (; ; ) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; } /* * Consume the next input character: */ switch (c) { case '\r': SilentCarriageReturn(); AddAttributeWithValue(); //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.BEFORE_ATTRIBUTE_NAME; goto breakStateloop; case '\n': SilentLineFeed(); // fall thru goto case ' '; case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE * Switch to the before attribute name state. 
*/ AddAttributeWithValue(); //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.BEFORE_ATTRIBUTE_NAME; goto continueStateloop; case '&': /* * U+0026 AMPERSAND (&) Switch to the character * reference in attribute value state, with the * additional allowed character being U+003E * GREATER-THAN SIGN (>) */ ClearStrBufAndAppend(c); SetAdditionalAndRememberAmpersandLocation('>'); returnState = state; //state = Transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); state = TokenizerState.CONSUME_CHARACTER_REFERENCE; goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current * tag token. */ AddAttributeWithValue(); //state = Transition(state, EmitCurrentTagToken(false, pos), reconsume, pos); state = EmitCurrentTagToken(false, pos); if (shouldSuspend) { goto breakStateloop; } /* * Switch to the data state. */ goto continueStateloop; case '\u0000': c = '\uFFFD'; goto case '<'; // fall thru case '<': case '\"': case '\'': case '=': case '`': /* * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error. */ ErrUnquotedAttributeValOrNull(c); /* * Treat it as per the "anything else" entry * below. */ // fall through goto default; default: // [NOCPP] ErrHtml4NonNameInUnquotedAttribute(c); // ]NOCPP] /* * Anything else Append the current input * character to the current attribute's value. */ AppendLongStrBuf(c); /* * Stay in the attribute value (unquoted) state. 
*/ continue; } } // XXX reorder point case TokenizerState.AFTER_ATTRIBUTE_NAME: for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '\r': SilentCarriageReturn(); goto breakStateloop; case '\n': SilentLineFeed(); // fall thru goto case ' '; case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay * in the after attribute name state. */ continue; case '/': /* * U+002F SOLIDUS (/) Switch to the self-closing * start tag state. */ AddAttributeWithoutValue(); //state = Transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); state = TokenizerState.SELF_CLOSING_START_TAG; goto continueStateloop; case '=': /* * U+003D EQUALS SIGN (=) Switch to the before * attribute value state. */ //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); state = TokenizerState.BEFORE_ATTRIBUTE_VALUE; goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current * tag token. */ AddAttributeWithoutValue(); //state = Transition(state, EmitCurrentTagToken(false, pos), reconsume, pos); state = EmitCurrentTagToken(false, pos); if (shouldSuspend) { goto breakStateloop; } /* * Switch to the data state. */ goto continueStateloop; case '\u0000': c = '\uFFFD'; goto case '\"'; // fall thru case '\"': case '\'': case '<': ErrQuoteOrLtInAttributeNameOrNull(c); /* * Treat it as per the "anything else" entry * below. */ goto default; default: AddAttributeWithoutValue(); /* * Anything else Start a new attribute in the * current tag token. 
*/ if (c >= 'A' && c <= 'Z') { /* * U+0041 LATIN CAPITAL LETTER A through to * U+005A LATIN CAPITAL LETTER Z Set that * attribute's name to the lowercase TokenizerState.version * of the current input character (add * 0x0020 to the character's code point) */ c += (char)0x20; } /* * Set that attribute's name to the current * input character, */ ClearStrBufAndAppend(c); /* * and its value to the empty string. */ // Will do later. /* * Switch to the attribute name state. */ //state = Transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.ATTRIBUTE_NAME; goto continueStateloop; } } // XXX reorder point case TokenizerState.MARKUP_DECLARATION_OPEN: /*markupdeclarationopenloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * If the next two characters are both U+002D * HYPHEN-MINUS characters (-), consume those two * characters, create a comment token whose data is the * empty string, and switch to the comment start state. * * Otherwise, if the next seven characters are an ASCII * case-insensitive match for the word "DOCTYPE", then * consume those characters and switch to the DOCTYPE * state. * * Otherwise, if the insertion mode is * "in foreign content" and the current node is not an * element in the HTML namespace and the next seven * characters are an case-sensitive match for the string * "[CDATA[" (the five uppercase TokenizerState.letters "CDATA" with a * U+005B LEFT SQUARE BRACKET character before and * after), then consume those characters and switch to * the CDATA section state. * * Otherwise, is is a parse error. Switch to the bogus * comment state. The next character that is consumed, * if any, is the first character that will be in the * comment. 
*/ switch (c) { case '-': ClearLongStrBufAndAppend(c); //state = Transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos); state = TokenizerState.MARKUP_DECLARATION_HYPHEN; goto breakMarkupdeclarationopenloop; // goto continueStateloop; case 'd': case 'D': ClearLongStrBufAndAppend(c); index = 0; //state = Transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos); state = TokenizerState.MARKUP_DECLARATION_OCTYPE; goto continueStateloop; case '[': if (TokenHandler.IsCDataSectionAllowed) { ClearLongStrBufAndAppend(c); index = 0; //state = Transition(state, Tokenizer.CDATA_START, reconsume, pos); state = TokenizerState.CDATA_START; goto continueStateloop; } else { // else fall through goto default; } default: ErrBogusComment(); ClearLongStrBuf(); //state = Transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); state = TokenizerState.BOGUS_COMMENT; reconsume = true; goto continueStateloop; } } breakMarkupdeclarationopenloop: goto case TokenizerState.MARKUP_DECLARATION_HYPHEN; // FALLTHRU DON'T REORDER case TokenizerState.MARKUP_DECLARATION_HYPHEN: /*markupdeclarationhyphenloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; switch (c) { case '\u0000': goto breakStateloop; case '-': ClearLongStrBuf(); //state = Transition(state, Tokenizer.COMMENT_START, reconsume, pos); state = TokenizerState.COMMENT_START; goto breakMarkupdeclarationhyphenloop; // goto continueStateloop; default: ErrBogusComment(); //state = Transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); state = TokenizerState.BOGUS_COMMENT; reconsume = true; goto continueStateloop; } } breakMarkupdeclarationhyphenloop: goto case TokenizerState.COMMENT_START; // FALLTHRU DON'T REORDER case TokenizerState.COMMENT_START: /*commentstartloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Comment start state * * * Consume the next input character: */ switch (c) { case '-': /* * U+002D HYPHEN-MINUS (-) Switch to the 
comment * start dash state. */ AppendLongStrBuf(c); //state = Transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos); state = TokenizerState.COMMENT_START_DASH; goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Parse error. */ ErrPrematureEndOfComment(); /* Emit the comment token. */ EmitComment(0, pos); /* * Switch to the data state. */ //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; case '\r': AppendLongStrBufCarriageReturn(); // state = Transition(state, Tokenizer.COMMENT, reconsume, pos); state = TokenizerState.COMMENT; goto breakStateloop; case '\n': AppendLongStrBufLineFeed(); //state = Transition(state, Tokenizer.COMMENT, reconsume, pos); state = TokenizerState.COMMENT; goto breakCommentstartloop; case '\u0000': c = '\uFFFD'; // fall thru goto default; default: /* * Anything else Append the input character to * the comment token's data. */ AppendLongStrBuf(c); /* * Switch to the comment state. */ //state = Transition(state, Tokenizer.COMMENT, reconsume, pos); state = TokenizerState.COMMENT; goto breakCommentstartloop; // goto continueStateloop; } } breakCommentstartloop: goto case TokenizerState.COMMENT; // FALLTHRU DON'T REORDER case TokenizerState.COMMENT: /*commentloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Comment state Consume the next input character: */ switch (c) { case '-': /* * U+002D HYPHEN-MINUS (-) Switch to the comment * end dash state */ AppendLongStrBuf(c); //state = Transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); state = TokenizerState.COMMENT_END_DASH; goto breakCommentloop; // goto continueStateloop; case '\r': AppendLongStrBufCarriageReturn(); goto breakStateloop; case '\n': AppendLongStrBufLineFeed(); continue; case '\u0000': c = '\uFFFD'; // fall thru goto default; default: /* * Anything else Append the input character to * the comment token's data. 
*/ AppendLongStrBuf(c); /* * Stay in the comment state. */ continue; } } breakCommentloop: goto case TokenizerState.COMMENT_END_DASH; // FALLTHRU DON'T REORDER case TokenizerState.COMMENT_END_DASH: /*commentenddashloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Comment end dash state Consume the next input * character: */ switch (c) { case '-': /* * U+002D HYPHEN-MINUS (-) Switch to the comment * end state */ AppendLongStrBuf(c); //state = Transition(state, Tokenizer.COMMENT_END, reconsume, pos); state = TokenizerState.COMMENT_END; goto breakCommentenddashloop; // goto continueStateloop; case '\r': AppendLongStrBufCarriageReturn(); //state = Transition(state, Tokenizer.COMMENT, reconsume, pos); state = TokenizerState.COMMENT; goto breakStateloop; case '\n': AppendLongStrBufLineFeed(); //state = Transition(state, Tokenizer.COMMENT, reconsume, pos); state = TokenizerState.COMMENT; goto continueStateloop; case '\u0000': c = '\uFFFD'; goto default; // fall thru default: /* * Anything else Append a U+002D HYPHEN-MINUS * (-) character and the input character to the * comment token's data. */ AppendLongStrBuf(c); /* * Switch to the comment state. */ //state = Transition(state, Tokenizer.COMMENT, reconsume, pos); state = TokenizerState.COMMENT; goto continueStateloop; } } breakCommentenddashloop: goto case TokenizerState.COMMENT_END; // FALLTHRU DON'T REORDER case TokenizerState.COMMENT_END: /*commentendloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Comment end dash state Consume the next input * character: */ switch (c) { case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the comment * token. */ EmitComment(2, pos); /* * Switch to the data state. */ //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; case '-': /* U+002D HYPHEN-MINUS (-) Parse error. */ /* * Append a U+002D HYPHEN-MINUS (-) character to * the comment token's data. 
*/ AdjustDoubleHyphenAndAppendToLongStrBufAndErr(c); /* * Stay in the comment end state. */ continue; case '\r': AdjustDoubleHyphenAndAppendToLongStrBufCarriageReturn(); //state = Transition(state, Tokenizer.COMMENT, reconsume, pos); state = TokenizerState.COMMENT; goto breakStateloop; case '\n': AdjustDoubleHyphenAndAppendToLongStrBufLineFeed(); //state = Transition(state, Tokenizer.COMMENT, reconsume, pos); state = TokenizerState.COMMENT; goto continueStateloop; case '!': ErrHyphenHyphenBang(); AppendLongStrBuf(c); //state = Transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos); state = TokenizerState.COMMENT_END_BANG; goto continueStateloop; case '\u0000': c = '\uFFFD'; // fall thru goto default; default: /* * Append two U+002D HYPHEN-MINUS (-) characters * and the input character to the comment * token's data. */ AdjustDoubleHyphenAndAppendToLongStrBufAndErr(c); /* * Switch to the comment state. */ //state = Transition(state, Tokenizer.COMMENT, reconsume, pos); state = TokenizerState.COMMENT; goto continueStateloop; } } // XXX reorder point case TokenizerState.COMMENT_END_BANG: for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Comment end bang state * * Consume the next input character: */ switch (c) { case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the comment * token. */ EmitComment(3, pos); /* * Switch to the data state. */ //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; case '-': /* * Append two U+002D HYPHEN-MINUS (-) characters * and a U+0021 EXCLAMATION MARK (!) character * to the comment token's data. */ AppendLongStrBuf(c); /* * Switch to the comment end dash state. 
*/ //state = Transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); state = TokenizerState.COMMENT_END_DASH; goto continueStateloop; case '\r': AppendLongStrBufCarriageReturn(); goto breakStateloop; case '\n': AppendLongStrBufLineFeed(); continue; case '\u0000': c = '\uFFFD'; // fall thru goto default; default: /* * Anything else Append two U+002D HYPHEN-MINUS * (-) characters, a U+0021 EXCLAMATION MARK (!) * character, and the input character to the * comment token's data. Switch to the comment * state. */ AppendLongStrBuf(c); /* * Switch to the comment state. */ //state = Transition(state, Tokenizer.COMMENT, reconsume, pos); state = TokenizerState.COMMENT; goto continueStateloop; } } // XXX reorder point case TokenizerState.COMMENT_START_DASH: if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Comment start dash state * * Consume the next input character: */ switch (c) { case '-': /* * U+002D HYPHEN-MINUS (-) Switch to the comment end * state */ AppendLongStrBuf(c); //state = Transition(state, Tokenizer.COMMENT_END, reconsume, pos); state = TokenizerState.COMMENT_END; goto continueStateloop; case '>': ErrPrematureEndOfComment(); /* Emit the comment token. */ EmitComment(1, pos); /* * Switch to the data state. */ //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; case '\r': AppendLongStrBufCarriageReturn(); //state = Transition(state, Tokenizer.COMMENT, reconsume, pos); state = TokenizerState.COMMENT; goto breakStateloop; case '\n': AppendLongStrBufLineFeed(); //state = Transition(state, Tokenizer.COMMENT, reconsume, pos); state = TokenizerState.COMMENT; goto continueStateloop; case '\u0000': c = '\uFFFD'; // fall thru goto default; default: /* * Append a U+002D HYPHEN-MINUS character (-) and * the current input character to the comment * token's data. */ AppendLongStrBuf(c); /* * Switch to the comment state. 
*/ //state = Transition(state, Tokenizer.COMMENT, reconsume, pos); state = TokenizerState.COMMENT; goto continueStateloop; } // XXX reorder point case TokenizerState.CDATA_START: for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; if (index < 6) { // CDATA_LSQB.Length if (c == Tokenizer.CDATA_LSQB[index]) { AppendLongStrBuf(c); } else { ErrBogusComment(); //state = Transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); state = TokenizerState.BOGUS_COMMENT; reconsume = true; goto continueStateloop; } index++; continue; } else { cstart = pos; // start coalescing //state = Transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); state = TokenizerState.CDATA_SECTION; reconsume = true; break; // FALL THROUGH goto continueStateloop; } } goto case TokenizerState.CDATA_SECTION; // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.CDATA_SECTION: /*cdatasectionloop:*/ for (; ; ) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; } switch (c) { case ']': FlushChars(buf, pos); //state = Transition(state, Tokenizer.CDATA_RSQB, reconsume, pos); state = TokenizerState.CDATA_RSQB; goto breakCdatasectionloop; // FALL THROUGH case '\u0000': EmitReplacementCharacter(buf, pos); continue; case '\r': EmitCarriageReturn(buf, pos); goto breakStateloop; case '\n': SilentLineFeed(); goto default; // fall thru default: continue; } } breakCdatasectionloop: goto case TokenizerState.CDATA_RSQB; // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.CDATA_RSQB: /*cdatarsqb:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; switch (c) { case ']': //state = Transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos); state = TokenizerState.CDATA_RSQB_RSQB; goto breakCdatarsqb; default: TokenHandler.Characters(Tokenizer.RSQB_RSQB, 0, 1); cstart = pos; //state = Transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); state 
= TokenizerState.CDATA_SECTION; reconsume = true; goto continueStateloop; } } breakCdatarsqb: goto case TokenizerState.CDATA_RSQB_RSQB; // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.CDATA_RSQB_RSQB: if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; switch (c) { case '>': cstart = pos + 1; //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; default: TokenHandler.Characters(Tokenizer.RSQB_RSQB, 0, 2); cstart = pos; //state = Transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); state = TokenizerState.CDATA_SECTION; reconsume = true; goto continueStateloop; } // XXX reorder point case TokenizerState.ATTRIBUTE_VALUE_SINGLE_QUOTED: /*attributevaluesinglequotedloop:*/ for (; ; ) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; } /* * Consume the next input character: */ switch (c) { case '\'': /* * U+0027 APOSTROPHE (') Switch to the after * attribute value (quoted) state. */ AddAttributeWithValue(); //state = Transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); state = TokenizerState.AFTER_ATTRIBUTE_VALUE_QUOTED; goto continueStateloop; case '&': /* * U+0026 AMPERSAND (&) Switch to the character * reference in attribute value state, with the * + additional allowed character being U+0027 * APOSTROPHE ('). */ ClearStrBufAndAppend(c); SetAdditionalAndRememberAmpersandLocation('\''); returnState = state; //state = Transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); state = TokenizerState.CONSUME_CHARACTER_REFERENCE; goto breakAttributevaluesinglequotedloop; // goto continueStateloop; case '\r': AppendLongStrBufCarriageReturn(); goto breakStateloop; case '\n': AppendLongStrBufLineFeed(); continue; case '\u0000': c = '\uFFFD'; goto default; // fall thru default: /* * Anything else Append the current input * character to the current attribute's value. 
*/ AppendLongStrBuf(c); /* * Stay in the attribute value (double-quoted) * state. */ continue; } } breakAttributevaluesinglequotedloop: goto case TokenizerState.CONSUME_CHARACTER_REFERENCE; // FALLTHRU DON'T REORDER case TokenizerState.CONSUME_CHARACTER_REFERENCE: if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; if (c == '\u0000') { goto breakStateloop; } /* * Unlike the definition is the spec, this state does not * return a value and never requires the caller to * backtrack. This state takes care of emitting characters * or appending to the current attribute value. It also * takes care of that in the case TokenizerState.when consuming the * character reference fails. */ /* * This section defines how to consume a character * reference. This definition is used when parsing character * references in text and in attributes. * * The behavior depends on the identity of the next * character (the one immediately after the U+0026 AMPERSAND * character): */ switch (c) { case ' ': case '\t': case '\n': case '\r': // we'll reconsume! case '\u000C': case '<': case '&': EmitOrAppendStrBuf(returnState); //if ((returnState & DATA_AND_RCDATA_MASK) == 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) != 0) { cstart = pos; } //state = Transition(state, returnState, reconsume, pos); state = returnState; reconsume = true; goto continueStateloop; case '#': /* * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER * SIGN. */ AppendStrBuf('#'); //state = Transition(state, Tokenizer.CONSUME_NCR, reconsume, pos); state = TokenizerState.CONSUME_NCR; goto continueStateloop; default: if (c == additional) { EmitOrAppendStrBuf(returnState); //state = Transition(state, returnState, reconsume, pos); state = returnState; reconsume = true; goto continueStateloop; } if (c >= 'a' && c <= 'z') { firstCharKey = c - 'a' + 26; } else if (c >= 'A' && c <= 'Z') { firstCharKey = c - 'A'; } else { // No match /* * If no match can be made, then this is a parse * error. 
*/ ErrNoNamedCharacterMatch(); EmitOrAppendStrBuf(returnState); //if ((returnState & DATA_AND_RCDATA_MASK) == 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) !=0) { cstart = pos; } //state = Transition(state, returnState, reconsume, pos); state = returnState; reconsume = true; goto continueStateloop; } // Didn't fail yet AppendStrBuf(c); //state = Transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos); state = TokenizerState.CHARACTER_REFERENCE_HILO_LOOKUP; // FALL THROUGH goto continueStateloop; break; } goto case TokenizerState.CHARACTER_REFERENCE_HILO_LOOKUP; // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.CHARACTER_REFERENCE_HILO_LOOKUP: { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; if (c == '\u0000') { goto breakStateloop; } /* * The data structure is as follows: * * HILO_ACCEL is a two-dimensional int array whose major * index corresponds to the second character of the * character reference (code point as index) and the * minor index corresponds to the first character of the * character reference (packed so that A-Z runs from 0 * to 25 and a-z runs from 26 to 51). This layout makes * it easier to use the sparseness of the data structure * to omit parts of it: The second dimension of the * table is null when no character reference starts with * the character corresponding to that row. * * The int value HILO_ACCEL (by these indeces) is zero * if there exists no character reference starting with * that two-letter prefix. Otherwise, the value is an * int that packs two shorts so that the higher short is * the index of the highest character reference name * with that prefix in NAMES and the lower short * corresponds to the index of the lowest character * reference name with that prefix. (It happens that the * first two character reference names share their * prefix so the packed int cannot be 0 by packing the * two shorts.) 
* * NAMES is an array of byte arrays where each byte * array encodes the name of a character references as * ASCII. The names omit the first two letters of the * name. (Since storing the first two letters would be * redundant with the data contained in HILO_ACCEL.) The * entries are lexically sorted. * * For a given index in NAMES, the same index in VALUES * contains the corresponding expansion as an array of * two UTF-16 code units (either the character and * U+0000 or a suggogate pair). */ int hilo = 0; if (c <= 'z') { int[] row = NamedCharactersAccel.HILO_ACCEL[c]; if (row != null) { hilo = row[firstCharKey]; } } if (hilo == 0) { /* * If no match can be made, then this is a parse * error. */ ErrNoNamedCharacterMatch(); EmitOrAppendStrBuf(returnState); //if ((returnState & DATA_AND_RCDATA_MASK) == 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) != 0) { cstart = pos; } //state = Transition(state, returnState, reconsume, pos); state = returnState; reconsume = true; goto continueStateloop; } // Didn't fail yet AppendStrBuf(c); lo = hilo & 0xFFFF; hi = hilo >> 16; entCol = -1; candidate = -1; strBufMark = 0; //state = Transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos); state = TokenizerState.CHARACTER_REFERENCE_TAIL; // FALL THROUGH goto continueStateloop; goto case TokenizerState.CHARACTER_REFERENCE_TAIL; } case TokenizerState.CHARACTER_REFERENCE_TAIL: /*outer:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; if (c == '\u0000') { goto breakStateloop; } entCol++; /* * Consume the maximum number of characters possible, * with the consumed characters matching one of the * identifiers in the first column of the named * character references table (in a case-sensitive * manner). 
*/ /*loloop:*/ for (; ; ) { if (hi < lo) { goto breakOuter; } if (entCol == NamedCharacters.NAMES[lo].Length) { candidate = lo; strBufMark = strBufLen; lo++; } else if (entCol > NamedCharacters.NAMES[lo].Length) { goto breakOuter; } else if (c > NamedCharacters.NAMES[lo][entCol]) { lo++; } else { goto breakLoloop; } } breakLoloop: /*hiloop:*/ for (; ; ) { if (hi < lo) { goto breakOuter; } if (entCol == NamedCharacters.NAMES[hi].Length) { goto breakHiloop; } if (entCol > NamedCharacters.NAMES[hi].Length) { goto breakOuter; } else if (c < NamedCharacters.NAMES[hi][entCol]) { hi--; } else { goto breakHiloop; } } breakHiloop: if (hi < lo) { goto breakOuter; } AppendStrBuf(c); continue; } breakOuter: if (candidate == -1) { // reconsume deals with CR, LF or nul /* * If no match can be made, then this is a parse error. */ ErrNoNamedCharacterMatch(); EmitOrAppendStrBuf(returnState); //if ((returnState & DATA_AND_RCDATA_MASK) == 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) != 0) { cstart = pos; } //state = Transition(state, returnState, reconsume, pos); state = returnState; reconsume = true; goto continueStateloop; } else { // c can't be CR, LF or nul if we got here string candidateName = NamedCharacters.NAMES[candidate]; if (candidateName.Length == 0 || candidateName[candidateName.Length - 1] != ';') { /* * If the last character matched is not a U+003B * SEMICOLON (;), there is a parse error. 
*/ //if ((returnState & DATA_AND_RCDATA_MASK) != 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) == 0) { /* * If the entity is being consumed as part of an * attribute, and the last character matched is * not a U+003B SEMICOLON (;), */ char ch; if (strBufMark == strBufLen) { ch = c; } else { // if (strBufOffset != -1) { // ch = buf[strBufOffset + strBufMark]; // } else { ch = strBuf[strBufMark]; // } } if (ch == '=' || (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) { /* * and the next character is either a U+003D * EQUALS SIGN character (=) or in the range * U+0030 DIGIT ZERO to U+0039 DIGIT NINE, * U+0041 LATIN CAPITAL LETTER A to U+005A * LATIN CAPITAL LETTER Z, or U+0061 LATIN * SMALL LETTER A to U+007A LATIN SMALL * LETTER Z, then, for historical reasons, * all the characters that were matched * after the U+0026 AMPERSAND (&) must be * unconsumed, and nothing is returned. */ ErrNoNamedCharacterMatch(); AppendStrBufToLongStrBuf(); //state = Transition(state, returnState, reconsume, pos); state = returnState; reconsume = true; goto continueStateloop; } } //if ((returnState & DATA_AND_RCDATA_MASK) != 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) == 0) { ErrUnescapedAmpersandInterpretedAsCharacterReference(); } else { ErrNotSemicolonTerminated(); } } /* * Otherwise, return a character token for the character * corresponding to the entity name (as given by the * second column of the named character references * table). */ char[] val = NamedCharacters.VALUES[candidate]; if ( // [NOCPP[ val.Length == 1 // ]NOCPP] // CPPONLY: val[1] == 0 ) { EmitOrAppendOne(val, returnState); } else { EmitOrAppendTwo(val, returnState); } // this is so complicated! 
if (strBufMark < strBufLen) { // if (strBufOffset != -1) { // if ((returnState & (~1)) != 0) { // for (int i = strBufMark; i < strBufLen; i++) { // appendLongStrBuf(buf[strBufOffset + i]); // } // } else { // tokenHandler.Characters(buf, strBufOffset // + strBufMark, strBufLen // - strBufMark); // } // } else { //if ((returnState & DATA_AND_RCDATA_MASK) != 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) == 0) { for (int i = strBufMark; i < strBufLen; i++) { AppendLongStrBuf(strBuf[i]); } } else { TokenHandler.Characters(strBuf, strBufMark, strBufLen - strBufMark); } // } } //if ((returnState & DATA_AND_RCDATA_MASK) == 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) != 0) { cstart = pos; } //state = Transition(state, returnState, reconsume, pos); state = returnState; reconsume = true; goto continueStateloop; /* * If the markup contains I'm ¬it; I tell you, the * entity is parsed as "not", as in, I'm ¬it; I tell * you. But if the markup was I'm ∉ I tell you, * the entity would be parsed as "notin;", resulting in * I'm ∉ I tell you. */ } // XXX reorder point case TokenizerState.CONSUME_NCR: if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; prevValue = -1; value = 0; seenDigits = false; /* * The behavior further depends on the character after the * U+0023 NUMBER SIGN: */ switch (c) { case 'x': case 'X': /* * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL * LETTER X Consume the X. * * Follow the steps below, but using the range of * characters U+0030 DIGIT ZERO through to U+0039 * DIGIT NINE, U+0061 LATIN SMALL LETTER A through * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL * LETTER F (in other words, 0-9, A-F, a-f). * * When it comes to interpreting the number, * interpret it as a hexadecimal number. 
*/ AppendStrBuf(c); //state = Transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos); state = TokenizerState.HEX_NCR_LOOP; goto continueStateloop; default: /* * Anything else Follow the steps below, but using * the range of characters U+0030 DIGIT ZERO through * to U+0039 DIGIT NINE (i.e. just 0-9). * * When it comes to interpreting the number, * interpret it as a decimal number. */ //state = Transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos); state = TokenizerState.DECIMAL_NRC_LOOP; reconsume = true; // FALL THROUGH goto continueStateloop; break; } // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER goto case TokenizerState.DECIMAL_NRC_LOOP; case TokenizerState.DECIMAL_NRC_LOOP: /*decimalloop:*/ for (; ; ) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; } // Deal with overflow gracefully if (value < prevValue) { value = 0x110000; // Value above Unicode range but // within int // range } prevValue = value; /* * Consume as many characters as match the range of * characters given above. 
*/ if (c >= '0' && c <= '9') { seenDigits = true; value *= 10; value += c - '0'; continue; } else if (c == ';') { if (seenDigits) { //if ((returnState & DATA_AND_RCDATA_MASK) == 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) != 0) { cstart = pos + 1; } //state = Transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); state = TokenizerState.HANDLE_NCR_VALUE; // FALL THROUGH goto continueStateloop; goto breakDecimalloop; } else { ErrNoDigitsInNCR(); AppendStrBuf(';'); EmitOrAppendStrBuf(returnState); //if ((returnState & DATA_AND_RCDATA_MASK) == 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) != 0) { cstart = pos + 1; } //state = Transition(state, returnState, reconsume, pos); state = returnState; goto continueStateloop; } } else { /* * If no characters match the range, then don't * consume any characters (and unconsume the U+0023 * NUMBER SIGN character and, if appropriate, the X * character). This is a parse error; nothing is * returned. * * Otherwise, if the next character is a U+003B * SEMICOLON, consume that too. If it isn't, there * is a parse error. 
*/ if (!seenDigits) { ErrNoDigitsInNCR(); EmitOrAppendStrBuf(returnState); //if ((returnState & DATA_AND_RCDATA_MASK) == 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) != 0) { cstart = pos; } //state = Transition(state, returnState, reconsume, pos); state = returnState; reconsume = true; goto continueStateloop; } else { ErrCharRefLacksSemicolon(); //if ((returnState & DATA_AND_RCDATA_MASK) == 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) != 0) { cstart = pos; } //state = Transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); state = TokenizerState.HANDLE_NCR_VALUE; reconsume = true; // FALL THROUGH goto continueStateloop; goto breakDecimalloop; } } } breakDecimalloop: goto case TokenizerState.HANDLE_NCR_VALUE; // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.HANDLE_NCR_VALUE: // WARNING previous state sets reconsume // XXX inline this case TokenizerState.if the method size can take it HandleNcrValue(returnState); //state = Transition(state, returnState, reconsume, pos); state = returnState; goto continueStateloop; // XXX reorder point case TokenizerState.HEX_NCR_LOOP: for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; // Deal with overflow gracefully if (value < prevValue) { value = 0x110000; // Value above Unicode range but // within int // range } prevValue = value; /* * Consume as many characters as match the range of * characters given above. 
*/ if (c >= '0' && c <= '9') { seenDigits = true; value *= 16; value += c - '0'; continue; } else if (c >= 'A' && c <= 'F') { seenDigits = true; value *= 16; value += c - 'A' + 10; continue; } else if (c >= 'a' && c <= 'f') { seenDigits = true; value *= 16; value += c - 'a' + 10; continue; } else if (c == ';') { if (seenDigits) { //if ((returnState & DATA_AND_RCDATA_MASK) == 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) != 0) { cstart = pos + 1; } //state = Transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); state = TokenizerState.HANDLE_NCR_VALUE; goto continueStateloop; } else { ErrNoDigitsInNCR(); AppendStrBuf(';'); EmitOrAppendStrBuf(returnState); //if ((returnState & DATA_AND_RCDATA_MASK) == 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) != 0) { cstart = pos + 1; } //state = Transition(state, returnState, reconsume, pos); state = returnState; goto continueStateloop; } } else { /* * If no characters match the range, then don't * consume any characters (and unconsume the U+0023 * NUMBER SIGN character and, if appropriate, the X * character). This is a parse error; nothing is * returned. * * Otherwise, if the next character is a U+003B * SEMICOLON, consume that too. If it isn't, there * is a parse error. 
*/ if (!seenDigits) { ErrNoDigitsInNCR(); EmitOrAppendStrBuf(returnState); //if ((returnState & DATA_AND_RCDATA_MASK) == 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) != 0) { cstart = pos; } //state = Transition(state, returnState, reconsume, pos); state = returnState; reconsume = true; goto continueStateloop; } else { ErrCharRefLacksSemicolon(); //if ((returnState & DATA_AND_RCDATA_MASK) == 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) != 0) { cstart = pos; } //state = Transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); state = TokenizerState.HANDLE_NCR_VALUE; reconsume = true; goto continueStateloop; } } } // XXX reorder point case TokenizerState.PLAINTEXT: /*plaintextloop:*/ for (; ; ) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; } switch (c) { case '\u0000': EmitPlaintextReplacementCharacter(buf, pos); continue; case '\r': EmitCarriageReturn(buf, pos); goto breakStateloop; case '\n': SilentLineFeed(); goto default; default: /* * Anything else Emit the current input * character as a character token. Stay in the * RAWTEXT state. */ continue; } } // XXX reorder point case TokenizerState.CLOSE_TAG_OPEN: if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Otherwise, if the content model flag is set to the PCDATA * state, or if the next few characters do match that tag * name, consume the next input character: */ switch (c) { case '>': /* U+003E GREATER-THAN SIGN (>) Parse error. */ ErrLtSlashGt(); /* * Switch to the data state. */ cstart = pos + 1; //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; case '\r': SilentCarriageReturn(); /* Anything else Parse error. */ ErrGarbageAfterLtSlash(); /* * Switch to the bogus comment state. 
*/ ClearLongStrBufAndAppend('\n'); //state = Transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); state = TokenizerState.BOGUS_COMMENT; goto breakStateloop; case '\n': SilentLineFeed(); /* Anything else Parse error. */ ErrGarbageAfterLtSlash(); /* * Switch to the bogus comment state. */ ClearLongStrBufAndAppend('\n'); //state = Transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); state = TokenizerState.BOGUS_COMMENT; goto continueStateloop; case '\u0000': c = '\uFFFD'; // fall thru goto default; default: if (c >= 'A' && c <= 'Z') { c += (char)0x20; } if (c >= 'a' && c <= 'z') { /* * U+0061 LATIN SMALL LETTER A through to U+007A * LATIN SMALL LETTER Z Create a new end tag * token, */ endTag = true; /* * set its tag name to the input character, */ ClearStrBufAndAppend(c); /* * then switch to the tag name state. (Don't * emit the token yet; further details will be * filled in before it is emitted.) */ //state = Transition(state, Tokenizer.TAG_NAME, reconsume, pos); state = TokenizerState.TAG_NAME; goto continueStateloop; } else { /* Anything else Parse error. */ ErrGarbageAfterLtSlash(); /* * Switch to the bogus comment state. */ ClearLongStrBufAndAppend(c); //state = Transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); state = TokenizerState.BOGUS_COMMENT; goto continueStateloop; } } // XXX reorder point case TokenizerState.RCDATA: /*rcdataloop:*/ for (; ; ) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; } switch (c) { case '&': /* * U+0026 AMPERSAND (&) Switch to the character * reference in RCDATA state. */ FlushChars(buf, pos); ClearStrBufAndAppend(c); additional = '\u0000'; returnState = state; //state = Transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); state = TokenizerState.CONSUME_CHARACTER_REFERENCE; goto continueStateloop; case '<': /* * U+003C LESS-THAN SIGN (<) Switch to the * RCDATA less-than sign state. 
*/ FlushChars(buf, pos); returnState = state; //state = Transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); state = TokenizerState.RAWTEXT_RCDATA_LESS_THAN_SIGN; goto continueStateloop; case '\u0000': EmitReplacementCharacter(buf, pos); continue; case '\r': EmitCarriageReturn(buf, pos); goto breakStateloop; case '\n': SilentLineFeed(); goto default; default: /* * Emit the current input character as a * character token. Stay in the RCDATA state. */ continue; } } // XXX reorder point case TokenizerState.RAWTEXT: /*rawtextloop:*/ for (; ; ) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; } switch (c) { case '<': /* * U+003C LESS-THAN SIGN (<) Switch to the * RAWTEXT less-than sign state. */ FlushChars(buf, pos); returnState = state; //state = Transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); state = TokenizerState.RAWTEXT_RCDATA_LESS_THAN_SIGN; goto breakRawtextloop; // FALL THRU goto continueStateloop; case '\u0000': EmitReplacementCharacter(buf, pos); continue; case '\r': EmitCarriageReturn(buf, pos); goto breakStateloop; case '\n': SilentLineFeed(); goto default; default: /* * Emit the current input character as a * character token. Stay in the RAWTEXT state. */ continue; } } breakRawtextloop: goto case TokenizerState.RAWTEXT_RCDATA_LESS_THAN_SIGN; // XXX fallthru don't reorder case TokenizerState.RAWTEXT_RCDATA_LESS_THAN_SIGN: /*rawtextrcdatalessthansignloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; switch (c) { case '/': /* * U+002F SOLIDUS (/) Set the temporary buffer * to the empty string. Switch to the script * data end tag open state. 
*/ index = 0; ClearStrBuf(); //state = Transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); state = TokenizerState.NON_DATA_END_TAG_NAME; goto breakRawtextrcdatalessthansignloop; // FALL THRU goto continueStateloop; default: /* * Otherwise, emit a U+003C LESS-THAN SIGN * character token */ TokenHandler.Characters(Tokenizer.LT_GT, 0, 1); /* * and reconsume the current input character in * the data state. */ cstart = pos; //state = Transition(state, returnState, reconsume, pos); state = returnState; reconsume = true; goto continueStateloop; } } breakRawtextrcdatalessthansignloop: goto case TokenizerState.NON_DATA_END_TAG_NAME; // XXX fall thru. don't reorder. case TokenizerState.NON_DATA_END_TAG_NAME: for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * ASSERT! when entering this state, set index to 0 and * call clearStrBuf() assert (contentModelElement != * null); Let's implement the above without lookahead. * strBuf is the 'temporary buffer'. */ if (index < endTagExpectationAsArray.Length) { char e = endTagExpectationAsArray[index]; char folded = c; if (c >= 'A' && c <= 'Z') { folded += (char)0x20; } if (folded != e) { // [NOCPP[ ErrHtml4LtSlashInRcdata(folded); // ]NOCPP] TokenHandler.Characters(Tokenizer.LT_SOLIDUS, 0, 2); EmitStrBuf(); cstart = pos; //state = Transition(state, returnState, reconsume, pos); state = returnState; reconsume = true; goto continueStateloop; } AppendStrBuf(c); index++; continue; } else { endTag = true; // XXX replace contentModelElement with different // type tagName = endTagExpectation; switch (c) { case '\r': SilentCarriageReturn(); //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.BEFORE_ATTRIBUTE_NAME; goto breakStateloop; case '\n': SilentLineFeed(); goto case ' '; // fall thru case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE * FEED (LF) U+000C FORM FEED (FF) U+0020 * SPACE If the current end tag token is an * 
appropriate end tag token, then switch to * the before attribute name state. */ //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.BEFORE_ATTRIBUTE_NAME; goto continueStateloop; case '/': /* * U+002F SOLIDUS (/) If the current end tag * token is an appropriate end tag token, * then switch to the self-closing start tag * state. */ //state = Transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); state = TokenizerState.SELF_CLOSING_START_TAG; goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) If the * current end tag token is an appropriate * end tag token, then emit the current tag * token and switch to the data state. */ //state = Transition(state, EmitCurrentTagToken(false, pos), reconsume, pos); state = EmitCurrentTagToken(false, pos); if (shouldSuspend) { goto breakStateloop; } goto continueStateloop; default: /* * Emit a U+003C LESS-THAN SIGN character * token, a U+002F SOLIDUS character token, * a character token for each of the * characters in the temporary buffer (in * the order they were added to the buffer), * and reconsume the current input character * in the RAWTEXT state. */ // [NOCPP[ ErrWarnLtSlashInRcdata(); // ]NOCPP] TokenHandler.Characters(LT_SOLIDUS, 0, 2); EmitStrBuf(); if (c == '\u0000') { EmitReplacementCharacter(buf, pos); } else { cstart = pos; // don't drop the // character } //state = Transition(state, returnState, reconsume, pos); state = returnState; goto continueStateloop; } } } // XXX reorder point // BEGIN HOTSPOT WORKAROUND case TokenizerState.BOGUS_COMMENT: /*boguscommentloop:*/ for (; ; ) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; } /* * Consume every character up to and including the first * U+003E GREATER-THAN SIGN character (>) or the end of * the file (EOF), whichever comes first. 
Emit a comment * token whose data is the concatenation of all the * characters starting from and including the character * that caused the state machine to switch into the * bogus comment state, up to and including the * character immediately before the last consumed * character (i.e. up to the character just before the * U+003E or EOF character). (If the comment was started * by the end of the file (EOF), the token is empty.) * * Switch to the data state. * * If the end of the file was reached, reconsume the EOF * character. */ switch (c) { case '>': EmitComment(0, pos); //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; case '-': AppendLongStrBuf(c); //state = Transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos); state = TokenizerState.BOGUS_COMMENT_HYPHEN; goto breakBoguscommentloop; case '\r': AppendLongStrBufCarriageReturn(); goto breakStateloop; case '\n': AppendLongStrBufLineFeed(); continue; case '\u0000': c = '\uFFFD'; // fall thru goto default; default: AppendLongStrBuf(c); continue; } } breakBoguscommentloop: goto case TokenizerState.BOGUS_COMMENT_HYPHEN; // FALLTHRU DON'T REORDER case TokenizerState.BOGUS_COMMENT_HYPHEN: /*boguscommenthyphenloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; switch (c) { case '>': // [NOCPP[ MaybeAppendSpaceToBogusComment(); // ]NOCPP] EmitComment(0, pos); //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; case '-': AppendSecondHyphenToBogusComment(); goto continueBoguscommenthyphenloop; case '\r': AppendLongStrBufCarriageReturn(); //state = Transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); state = TokenizerState.BOGUS_COMMENT; goto breakStateloop; case '\n': AppendLongStrBufLineFeed(); //state = Transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); state = TokenizerState.BOGUS_COMMENT; goto continueStateloop; case '\u0000': c = '\uFFFD'; 
// fall thru goto default; default: AppendLongStrBuf(c); //state = Transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); state = TokenizerState.BOGUS_COMMENT; goto continueStateloop; } continueBoguscommenthyphenloop: continue; } // XXX reorder point case TokenizerState.SCRIPT_DATA: /*scriptdataloop:*/ for (; ; ) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; } switch (c) { case '<': /* * U+003C LESS-THAN SIGN (<) Switch to the * script data less-than sign state. */ FlushChars(buf, pos); returnState = state; //state = Transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos); state = TokenizerState.SCRIPT_DATA_LESS_THAN_SIGN; goto breakScriptdataloop; // FALL THRU continue // stateloop; case '\u0000': EmitReplacementCharacter(buf, pos); continue; case '\r': EmitCarriageReturn(buf, pos); goto breakStateloop; case '\n': SilentLineFeed(); goto default; default: /* * Anything else Emit the current input * character as a character token. Stay in the * script data state. */ continue; } } breakScriptdataloop: goto case TokenizerState.SCRIPT_DATA_LESS_THAN_SIGN; // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.SCRIPT_DATA_LESS_THAN_SIGN: /*scriptdatalessthansignloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; switch (c) { case '/': /* * U+002F SOLIDUS (/) Set the temporary buffer * to the empty string. Switch to the script * data end tag open state. 
*/ index = 0; ClearStrBuf(); //state = Transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); state = TokenizerState.NON_DATA_END_TAG_NAME; goto continueStateloop; case '!': TokenHandler.Characters(LT_GT, 0, 1); cstart = pos; //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos); state = TokenizerState.SCRIPT_DATA_ESCAPE_START; goto breakScriptdatalessthansignloop; // FALL THRU // continue // stateloop; default: /* * Otherwise, emit a U+003C LESS-THAN SIGN * character token */ TokenHandler.Characters(LT_GT, 0, 1); /* * and reconsume the current input character in * the data state. */ cstart = pos; //state = Transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); state = TokenizerState.SCRIPT_DATA; reconsume = true; goto continueStateloop; } } breakScriptdatalessthansignloop: goto case TokenizerState.SCRIPT_DATA_ESCAPE_START; // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.SCRIPT_DATA_ESCAPE_START: /*scriptdataescapestartloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '-': /* * U+002D HYPHEN-MINUS (-) Emit a U+002D * HYPHEN-MINUS character token. Switch to the * script data escape start dash state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos); state = TokenizerState.SCRIPT_DATA_ESCAPE_START_DASH; goto breakScriptdataescapestartloop; // FALL THRU // continue // stateloop; default: /* * Anything else Reconsume the current input * character in the script data state. 
*/ //state = Transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); state = TokenizerState.SCRIPT_DATA; reconsume = true; goto continueStateloop; } } breakScriptdataescapestartloop: goto case TokenizerState.SCRIPT_DATA_ESCAPE_START_DASH; // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.SCRIPT_DATA_ESCAPE_START_DASH: /*scriptdataescapestartdashloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '-': /* * U+002D HYPHEN-MINUS (-) Emit a U+002D * HYPHEN-MINUS character token. Switch to the * script data escaped dash dash state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); state = TokenizerState.SCRIPT_DATA_ESCAPED_DASH_DASH; goto breakScriptdataescapestartdashloop; // goto continueStateloop; default: /* * Anything else Reconsume the current input * character in the script data state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); state = TokenizerState.SCRIPT_DATA; reconsume = true; goto continueStateloop; } } breakScriptdataescapestartdashloop: goto case TokenizerState.SCRIPT_DATA_ESCAPED_DASH_DASH; // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.SCRIPT_DATA_ESCAPED_DASH_DASH: /*scriptdataescapeddashdashloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '-': /* * U+002D HYPHEN-MINUS (-) Emit a U+002D * HYPHEN-MINUS character token. Stay in the * script data escaped dash dash state. */ continue; case '<': /* * U+003C LESS-THAN SIGN (<) Switch to the * script data escaped less-than sign state. 
*/ FlushChars(buf, pos); //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); state = TokenizerState.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit a U+003E * GREATER-THAN SIGN character token. Switch to * the script data state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); state = TokenizerState.SCRIPT_DATA; goto continueStateloop; case '\u0000': EmitReplacementCharacter(buf, pos); //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_ESCAPED; goto breakScriptdataescapeddashdashloop; case '\r': EmitCarriageReturn(buf, pos); //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_ESCAPED; goto breakStateloop; case '\n': SilentLineFeed(); goto default; default: /* * Anything else Emit the current input * character as a character token. Switch to the * script data escaped state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_ESCAPED; goto breakScriptdataescapeddashdashloop; // goto continueStateloop; } } breakScriptdataescapeddashdashloop: goto case TokenizerState.SCRIPT_DATA_ESCAPED; // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.SCRIPT_DATA_ESCAPED: /*scriptdataescapedloop:*/ for (; ; ) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; } /* * Consume the next input character: */ switch (c) { case '-': /* * U+002D HYPHEN-MINUS (-) Emit a U+002D * HYPHEN-MINUS character token. Switch to the * script data escaped dash state. 
*/ //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos); state = TokenizerState.SCRIPT_DATA_ESCAPED_DASH; goto breakScriptdataescapedloop; // FALL THRU // continue // stateloop; case '<': /* * U+003C LESS-THAN SIGN (<) Switch to the * script data escaped less-than sign state. */ FlushChars(buf, pos); //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); state = TokenizerState.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; goto continueStateloop; case '\u0000': EmitReplacementCharacter(buf, pos); continue; case '\r': EmitCarriageReturn(buf, pos); goto breakStateloop; case '\n': SilentLineFeed(); goto default; default: /* * Anything else Emit the current input * character as a character token. Stay in the * script data escaped state. */ continue; } } breakScriptdataescapedloop: goto case TokenizerState.SCRIPT_DATA_ESCAPED_DASH; // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.SCRIPT_DATA_ESCAPED_DASH: /*scriptdataescapeddashloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '-': /* * U+002D HYPHEN-MINUS (-) Emit a U+002D * HYPHEN-MINUS character token. Switch to the * script data escaped dash dash state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); state = TokenizerState.SCRIPT_DATA_ESCAPED_DASH_DASH; goto continueStateloop; case '<': /* * U+003C LESS-THAN SIGN (<) Switch to the * script data escaped less-than sign state. 
*/ FlushChars(buf, pos); //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); state = TokenizerState.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; goto breakScriptdataescapeddashloop; // goto continueStateloop; case '\u0000': EmitReplacementCharacter(buf, pos); //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_ESCAPED; goto continueStateloop; case '\r': EmitCarriageReturn(buf, pos); //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_ESCAPED; goto breakStateloop; case '\n': SilentLineFeed(); goto default; default: /* * Anything else Emit the current input * character as a character token. Switch to the * script data escaped state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_ESCAPED; goto continueStateloop; } } breakScriptdataescapeddashloop: goto case TokenizerState.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: /*scriptdataescapedlessthanloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '/': /* * U+002F SOLIDUS (/) Set the temporary buffer * to the empty string. Switch to the script * data escaped end tag open state. */ index = 0; ClearStrBuf(); returnState = TokenizerState.SCRIPT_DATA_ESCAPED; //state = Transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); state = TokenizerState.NON_DATA_END_TAG_NAME; goto continueStateloop; case 'S': case 's': /* * U+0041 LATIN CAPITAL LETTER A through to * U+005A LATIN CAPITAL LETTER Z Emit a U+003C * LESS-THAN SIGN character token and the * current input character as a character token. */ TokenHandler.Characters(LT_GT, 0, 1); cstart = pos; index = 1; /* * Set the temporary buffer to the empty string. 
* Append the lowercase TokenizerState.version of the current * input character (add 0x0020 to the * character's code point) to the temporary * buffer. Switch to the script data double * escape start state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos); state = TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPE_START; goto breakScriptdataescapedlessthanloop; // goto continueStateloop; default: /* * Anything else Emit a U+003C LESS-THAN SIGN * character token and reconsume the current * input character in the script data escaped * state. */ TokenHandler.Characters(LT_GT, 0, 1); cstart = pos; reconsume = true; //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_ESCAPED; goto continueStateloop; } } breakScriptdataescapedlessthanloop: goto case TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPE_START; // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPE_START: /*scriptdatadoubleescapestartloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; Debug.Assert(index > 0); if (index < 6) { // SCRIPT_ARR.Length char folded = c; if (c >= 'A' && c <= 'Z') { folded += (char)0x20; } if (folded != Tokenizer.SCRIPT_ARR[index]) { reconsume = true; //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_ESCAPED; goto continueStateloop; } index++; continue; } switch (c) { case '\r': EmitCarriageReturn(buf, pos); //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED; goto breakStateloop; case '\n': SilentLineFeed(); goto case ' '; case ' ': case '\t': case '\u000C': case '/': case '>': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN * (>) Emit the current input character as a * character token. 
If the temporary buffer is * the string "script", then switch to the * script data double escaped state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED; goto breakScriptdatadoubleescapestartloop; // goto continueStateloop; default: /* * Anything else Reconsume the current input * character in the script data escaped state. */ reconsume = true; //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_ESCAPED; goto continueStateloop; } } breakScriptdatadoubleescapestartloop: goto case TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED; // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED: /*scriptdatadoubleescapedloop:*/ for (; ; ) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; } /* * Consume the next input character: */ switch (c) { case '-': /* * U+002D HYPHEN-MINUS (-) Emit a U+002D * HYPHEN-MINUS character token. Switch to the * script data double escaped dash state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos); state = TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED_DASH; goto breakScriptdatadoubleescapedloop; // FALL THRU // continue // stateloop; case '<': /* * U+003C LESS-THAN SIGN (<) Emit a U+003C * LESS-THAN SIGN character token. Switch to the * script data double escaped less-than sign * state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); state = TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; goto continueStateloop; case '\u0000': EmitReplacementCharacter(buf, pos); continue; case '\r': EmitCarriageReturn(buf, pos); goto breakStateloop; case '\n': SilentLineFeed(); goto default; default: /* * Anything else Emit the current input * character as a character token. 
Stay in the * script data double escaped state. */ continue; } } breakScriptdatadoubleescapedloop: goto case TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED_DASH; // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED_DASH: /*scriptdatadoubleescapeddashloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '-': /* * U+002D HYPHEN-MINUS (-) Emit a U+002D * HYPHEN-MINUS character token. Switch to the * script data double escaped dash dash state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos); state = TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH; goto breakScriptdatadoubleescapeddashloop; // goto continueStateloop; case '<': /* * U+003C LESS-THAN SIGN (<) Emit a U+003C * LESS-THAN SIGN character token. Switch to the * script data double escaped less-than sign * state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); state = TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; goto continueStateloop; case '\u0000': EmitReplacementCharacter(buf, pos); //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED; goto continueStateloop; case '\r': EmitCarriageReturn(buf, pos); //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED; goto breakStateloop; case '\n': SilentLineFeed(); goto default; default: /* * Anything else Emit the current input * character as a character token. Switch to the * script data double escaped state. 
*/ //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED; goto continueStateloop; } } breakScriptdatadoubleescapeddashloop: goto case TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH; // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: /*scriptdatadoubleescapeddashdashloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '-': /* * U+002D HYPHEN-MINUS (-) Emit a U+002D * HYPHEN-MINUS character token. Stay in the * script data double escaped dash dash state. */ continue; case '<': /* * U+003C LESS-THAN SIGN (<) Emit a U+003C * LESS-THAN SIGN character token. Switch to the * script data double escaped less-than sign * state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); state = TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; goto breakScriptdatadoubleescapeddashdashloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit a U+003E * GREATER-THAN SIGN character token. Switch to * the script data state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); state = TokenizerState.SCRIPT_DATA; goto continueStateloop; case '\u0000': EmitReplacementCharacter(buf, pos); //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED; goto continueStateloop; case '\r': EmitCarriageReturn(buf, pos); //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED; goto breakStateloop; case '\n': SilentLineFeed(); goto default; default: /* * Anything else Emit the current input * character as a character token. Switch to the * script data double escaped state. 
*/ //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED; goto continueStateloop; } } breakScriptdatadoubleescapeddashdashloop: goto case TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: /*scriptdatadoubleescapedlessthanloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '/': /* * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS * character token. Set the temporary buffer to * the empty string. Switch to the script data * double escape end state. */ index = 0; //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos); state = TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPE_END; goto breakScriptdatadoubleescapedlessthanloop; default: /* * Anything else Reconsume the current input * character in the script data double escaped * state. 
*/ reconsume = true; //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED; goto continueStateloop; } } breakScriptdatadoubleescapedlessthanloop: goto case TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPE_END; // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPE_END: /*scriptdatadoubleescapeendloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; if (index < 6) { // SCRIPT_ARR.Length char folded = c; if (c >= 'A' && c <= 'Z') { folded += (char)0x20; } if (folded != Tokenizer.SCRIPT_ARR[index]) { reconsume = true; //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED; goto continueStateloop; } index++; continue; } switch (c) { case '\r': EmitCarriageReturn(buf, pos); //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_ESCAPED; goto breakStateloop; case '\n': SilentLineFeed(); goto case ' '; case ' ': case '\t': case '\u000C': case '/': case '>': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN * (>) Emit the current input character as a * character token. If the temporary buffer is * the string "script", then switch to the * script data escaped state. */ //state = Transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_ESCAPED; goto continueStateloop; default: /* * Reconsume the current input character in the * script data double escaped state. 
*/ reconsume = true; //state = Transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); state = TokenizerState.SCRIPT_DATA_DOUBLE_ESCAPED; goto continueStateloop; } } // XXX reorder point case TokenizerState.MARKUP_DECLARATION_OCTYPE: /*markupdeclarationdoctypeloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; if (index < 6) { // OCTYPE.Length char folded = c; if (c >= 'A' && c <= 'Z') { folded += (char)0x20; } if (folded == Tokenizer.OCTYPE[index]) { AppendLongStrBuf(c); } else { ErrBogusComment(); //state = Transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); state = TokenizerState.BOGUS_COMMENT; reconsume = true; goto continueStateloop; } index++; continue; } else { // state = Transition(state, Tokenizer.DOCTYPE, reconsume, pos); state = TokenizerState.DOCTYPE; reconsume = true; goto breakMarkupdeclarationdoctypeloop; // goto continueStateloop; } } breakMarkupdeclarationdoctypeloop: goto case TokenizerState.DOCTYPE; // FALLTHRU DON'T REORDER case TokenizerState.DOCTYPE: /*doctypeloop:*/ for (; ; ) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; } InitDoctypeFields(); /* * Consume the next input character: */ switch (c) { case '\r': SilentCarriageReturn(); //state = Transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); state = TokenizerState.BEFORE_DOCTYPE_NAME; goto breakStateloop; case '\n': SilentLineFeed(); goto case ' '; // fall thru case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE * Switch to the before DOCTYPE name state. */ //state = Transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); state = TokenizerState.BEFORE_DOCTYPE_NAME; goto breakDoctypeloop; // goto continueStateloop; default: /* * Anything else Parse error. */ ErrMissingSpaceBeforeDoctypeName(); /* * Reconsume the current character in the before * DOCTYPE name state. 
*/ //state = Transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); state = TokenizerState.BEFORE_DOCTYPE_NAME; reconsume = true; goto breakDoctypeloop; // goto continueStateloop; } } breakDoctypeloop: goto case TokenizerState.BEFORE_DOCTYPE_NAME; // FALLTHRU DON'T REORDER case TokenizerState.BEFORE_DOCTYPE_NAME: /*beforedoctypenameloop:*/ for (; ; ) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; } /* * Consume the next input character: */ switch (c) { case '\r': SilentCarriageReturn(); goto breakStateloop; case '\n': SilentLineFeed(); // fall thru goto case ' '; case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay * in the before DOCTYPE name state. */ continue; case '>': /* * U+003E GREATER-THAN SIGN (>) Parse error. */ ErrNamelessDoctype(); /* * Create a new DOCTYPE token. Set its * force-quirks flag to on. */ forceQuirks = true; /* * Emit the token. */ EmitDoctypeToken(pos); /* * Switch to the data state. */ //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; case '\u0000': c = '\uFFFD'; // fall thru goto default; default: if (c >= 'A' && c <= 'Z') { /* * U+0041 LATIN CAPITAL LETTER A through to * U+005A LATIN CAPITAL LETTER Z Create a * new DOCTYPE token. Set the token's name * to the lowercase TokenizerState.version of the input * character (add 0x0020 to the character's * code point). */ c += (char)0x20; } /* Anything else Create a new DOCTYPE token. */ /* * Set the token's name name to the current * input character. */ ClearStrBufAndAppend(c); /* * Switch to the DOCTYPE name state. 
*/ //state = Transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos); state = TokenizerState.DOCTYPE_NAME; goto breakBeforedoctypenameloop; // goto continueStateloop; } } breakBeforedoctypenameloop: goto case TokenizerState.DOCTYPE_NAME; // FALLTHRU DON'T REORDER case TokenizerState.DOCTYPE_NAME: /*doctypenameloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '\r': SilentCarriageReturn(); StrBufToDoctypeName(); //state = Transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); state = TokenizerState.AFTER_DOCTYPE_NAME; goto breakStateloop; case '\n': SilentLineFeed(); // fall thru goto case ' '; case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE * Switch to the after DOCTYPE name state. */ StrBufToDoctypeName(); //state = Transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); state = TokenizerState.AFTER_DOCTYPE_NAME; goto breakDoctypenameloop; // goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current * DOCTYPE token. */ StrBufToDoctypeName(); EmitDoctypeToken(pos); /* * Switch to the data state. */ //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; case '\u0000': c = '\uFFFD'; // fall thru goto default; default: /* * U+0041 LATIN CAPITAL LETTER A through to * U+005A LATIN CAPITAL LETTER Z Append the * lowercase TokenizerState.version of the input character (add * 0x0020 to the character's code point) to the * current DOCTYPE token's name. */ if (c >= 'A' && c <= 'Z') { c += (char)0x0020; } /* * Anything else Append the current input * character to the current DOCTYPE token's * name. */ AppendStrBuf(c); /* * Stay in the DOCTYPE name state. 
*/ continue; } } breakDoctypenameloop: goto case TokenizerState.AFTER_DOCTYPE_NAME; // FALLTHRU DON'T REORDER case TokenizerState.AFTER_DOCTYPE_NAME: /*afterdoctypenameloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '\r': SilentCarriageReturn(); goto breakStateloop; case '\n': SilentLineFeed(); // fall thru goto case ' '; case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay * in the after DOCTYPE name state. */ continue; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current * DOCTYPE token. */ EmitDoctypeToken(pos); /* * Switch to the data state. */ //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; case 'p': case 'P': index = 0; //state = Transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos); state = TokenizerState.DOCTYPE_UBLIC; goto breakAfterdoctypenameloop; // goto continueStateloop; case 's': case 'S': index = 0; //state = Transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos); state = TokenizerState.DOCTYPE_YSTEM; goto continueStateloop; default: /* * Otherwise, this is the parse error. */ BogusDoctype(); /* * Set the DOCTYPE token's force-quirks flag to * on. */ // done by bogusDoctype(); /* * Switch to the bogus DOCTYPE state. */ //state = Transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); state = TokenizerState.BOGUS_DOCTYPE; goto continueStateloop; } } breakAfterdoctypenameloop: goto case TokenizerState.DOCTYPE_UBLIC; // FALLTHRU DON'T REORDER case TokenizerState.DOCTYPE_UBLIC: /*doctypeublicloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * If the six characters starting from the current input * character are an ASCII case-insensitive match for the * word "PUBLIC", then consume those characters and * switch to the before DOCTYPE public identifier state. 
*/ if (index < 5) { // UBLIC.Length char folded = c; if (c >= 'A' && c <= 'Z') { folded += (char)0x20; } if (folded != Tokenizer.UBLIC[index]) { BogusDoctype(); // forceQuirks = true; //state = Transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); state = TokenizerState.BOGUS_DOCTYPE; reconsume = true; goto continueStateloop; } index++; continue; } else { //state = Transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos); state = TokenizerState.AFTER_DOCTYPE_PUBLIC_KEYWORD; reconsume = true; goto breakDoctypeublicloop; // goto continueStateloop; } } breakDoctypeublicloop: goto case TokenizerState.AFTER_DOCTYPE_PUBLIC_KEYWORD; // FALLTHRU DON'T REORDER case TokenizerState.AFTER_DOCTYPE_PUBLIC_KEYWORD: /*afterdoctypepublickeywordloop:*/ for (; ; ) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; } /* * Consume the next input character: */ switch (c) { case '\r': SilentCarriageReturn(); //state = Transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); state = TokenizerState.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER; goto breakStateloop; case '\n': SilentLineFeed(); // fall thru goto case ' '; case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE * Switch to the before DOCTYPE public * identifier state. */ //state = Transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); state = TokenizerState.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER; goto breakAfterdoctypepublickeywordloop; // FALL THROUGH continue stateloop case '"': /* * U+0022 QUOTATION MARK (") Parse Error. */ ErrNoSpaceBetweenDoctypePublicKeywordAndQuote(); /* * Set the DOCTYPE token's public identifier to * the empty string (not missing), */ ClearLongStrBuf(); /* * then switch to the DOCTYPE public identifier * (double-quoted) state. 
*/ //state = Transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); state = TokenizerState.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED; goto continueStateloop; case '\'': /* * U+0027 APOSTROPHE (') Parse Error. */ ErrNoSpaceBetweenDoctypePublicKeywordAndQuote(); /* * Set the DOCTYPE token's public identifier to * the empty string (not missing), */ ClearLongStrBuf(); /* * then switch to the DOCTYPE public identifier * (single-quoted) state. */ //state = Transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); state = TokenizerState.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED; goto continueStateloop; case '>': /* U+003E GREATER-THAN SIGN (>) Parse error. */ ErrExpectedPublicId(); /* * Set the DOCTYPE token's force-quirks flag to * on. */ forceQuirks = true; /* * Emit that DOCTYPE token. */ EmitDoctypeToken(pos); /* * Switch to the data state. */ //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; default: BogusDoctype(); /* * Set the DOCTYPE token's force-quirks flag to * on. */ // done by bogusDoctype(); /* * Switch to the bogus DOCTYPE state. */ //state = Transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); state = TokenizerState.BOGUS_DOCTYPE; goto continueStateloop; } } breakAfterdoctypepublickeywordloop: goto case TokenizerState.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER; // FALLTHRU DON'T REORDER case TokenizerState.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: /*beforedoctypepublicidentifierloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '\r': SilentCarriageReturn(); goto breakStateloop; case '\n': SilentLineFeed(); // fall thru goto case ' '; case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay * in the before DOCTYPE public identifier * state. 
*/ continue; case '"': /* * U+0022 QUOTATION MARK (") Set the DOCTYPE * token's public identifier to the empty string * (not missing), */ ClearLongStrBuf(); /* * then switch to the DOCTYPE public identifier * (double-quoted) state. */ //state = Transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); state = TokenizerState.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED; goto breakBeforedoctypepublicidentifierloop; // goto continueStateloop; case '\'': /* * U+0027 APOSTROPHE (') Set the DOCTYPE token's * public identifier to the empty string (not * missing), */ ClearLongStrBuf(); /* * then switch to the DOCTYPE public identifier * (single-quoted) state. */ //state = Transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); state = TokenizerState.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED; goto continueStateloop; case '>': /* U+003E GREATER-THAN SIGN (>) Parse error. */ ErrExpectedPublicId(); /* * Set the DOCTYPE token's force-quirks flag to * on. */ forceQuirks = true; /* * Emit that DOCTYPE token. */ EmitDoctypeToken(pos); /* * Switch to the data state. */ //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; default: BogusDoctype(); /* * Set the DOCTYPE token's force-quirks flag to * on. */ // done by bogusDoctype(); /* * Switch to the bogus DOCTYPE state. */ //state = Transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); state = TokenizerState.BOGUS_DOCTYPE; goto continueStateloop; } } breakBeforedoctypepublicidentifierloop: goto case TokenizerState.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED; // FALLTHRU DON'T REORDER case TokenizerState.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: /*doctypepublicidentifierdoublequotedloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '"': /* * U+0022 QUOTATION MARK (") Switch to the after * DOCTYPE public identifier state. 
*/ publicIdentifier = LongStrBufToString(); //state = Transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); state = TokenizerState.AFTER_DOCTYPE_PUBLIC_IDENTIFIER; goto breakDoctypepublicidentifierdoublequotedloop; // goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Parse error. */ ErrGtInPublicId(); /* * Set the DOCTYPE token's force-quirks flag to * on. */ forceQuirks = true; /* * Emit that DOCTYPE token. */ publicIdentifier = LongStrBufToString(); EmitDoctypeToken(pos); /* * Switch to the data state. */ //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; case '\r': AppendLongStrBufCarriageReturn(); goto breakStateloop; case '\n': AppendLongStrBufLineFeed(); continue; case '\u0000': c = '\uFFFD'; // fall thru goto default; default: /* * Anything else Append the current input * character to the current DOCTYPE token's * public identifier. */ AppendLongStrBuf(c); /* * Stay in the DOCTYPE public identifier * (double-quoted) state. */ continue; } } breakDoctypepublicidentifierdoublequotedloop: goto case TokenizerState.AFTER_DOCTYPE_PUBLIC_IDENTIFIER; // FALLTHRU DON'T REORDER case TokenizerState.AFTER_DOCTYPE_PUBLIC_IDENTIFIER: /*afterdoctypepublicidentifierloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '\r': SilentCarriageReturn(); //state = Transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); state = TokenizerState.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS; goto breakStateloop; case '\n': SilentLineFeed(); // fall thru goto case ' '; case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE * Switch to the between DOCTYPE public and * system identifiers state. 
*/ //state = Transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); state = TokenizerState.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS; goto breakAfterdoctypepublicidentifierloop; // goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current * DOCTYPE token. */ EmitDoctypeToken(pos); /* * Switch to the data state. */ //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; case '"': /* * U+0022 QUOTATION MARK (") Parse error. */ ErrNoSpaceBetweenPublicAndSystemIds(); /* * Set the DOCTYPE token's system identifier to * the empty string (not missing), */ ClearLongStrBuf(); /* * then switch to the DOCTYPE system identifier * (double-quoted) state. */ //state = Transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); state = TokenizerState.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED; goto continueStateloop; case '\'': /* * U+0027 APOSTROPHE (') Parse error. */ ErrNoSpaceBetweenPublicAndSystemIds(); /* * Set the DOCTYPE token's system identifier to * the empty string (not missing), */ ClearLongStrBuf(); /* * then switch to the DOCTYPE system identifier * (single-quoted) state. */ //state = Transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); state = TokenizerState.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED; goto continueStateloop; default: BogusDoctype(); /* * Set the DOCTYPE token's force-quirks flag to * on. */ // done by bogusDoctype(); /* * Switch to the bogus DOCTYPE state. 
*/ //state = Transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); state = TokenizerState.BOGUS_DOCTYPE; goto continueStateloop; } } breakAfterdoctypepublicidentifierloop: goto case TokenizerState.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS; // FALLTHRU DON'T REORDER case TokenizerState.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: /*betweendoctypepublicandsystemidentifiersloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '\r': SilentCarriageReturn(); goto breakStateloop; case '\n': SilentLineFeed(); goto case ' '; // fall thru case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay * in the between DOCTYPE public and system * identifiers state. */ continue; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current * DOCTYPE token. */ EmitDoctypeToken(pos); /* * Switch to the data state. */ //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; case '"': /* * U+0022 QUOTATION MARK (") Set the DOCTYPE * token's system identifier to the empty string * (not missing), */ ClearLongStrBuf(); /* * then switch to the DOCTYPE system identifier * (double-quoted) state. */ //state = Transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); state = TokenizerState.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED; goto breakBetweendoctypepublicandsystemidentifiersloop; // goto continueStateloop; case '\'': /* * U+0027 APOSTROPHE (') Set the DOCTYPE token's * system identifier to the empty string (not * missing), */ ClearLongStrBuf(); /* * then switch to the DOCTYPE system identifier * (single-quoted) state. 
*/ //state = Transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); state = TokenizerState.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED; goto continueStateloop; default: BogusDoctype(); /* * Set the DOCTYPE token's force-quirks flag to * on. */ // done by bogusDoctype(); /* * Switch to the bogus DOCTYPE state. */ //state = Transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); state = TokenizerState.BOGUS_DOCTYPE; goto continueStateloop; } } breakBetweendoctypepublicandsystemidentifiersloop: goto case TokenizerState.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED; // FALLTHRU DON'T REORDER case TokenizerState.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: /*doctypesystemidentifierdoublequotedloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '"': /* * U+0022 QUOTATION MARK (") Switch to the after * DOCTYPE system identifier state. */ systemIdentifier = LongStrBufToString(); //state = Transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); state = TokenizerState.AFTER_DOCTYPE_SYSTEM_IDENTIFIER; goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Parse error. */ ErrGtInSystemId(); /* * Set the DOCTYPE token's force-quirks flag to * on. */ forceQuirks = true; /* * Emit that DOCTYPE token. */ systemIdentifier = LongStrBufToString(); EmitDoctypeToken(pos); /* * Switch to the data state. */ //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; case '\r': AppendLongStrBufCarriageReturn(); goto breakStateloop; case '\n': AppendLongStrBufLineFeed(); continue; case '\u0000': c = '\uFFFD'; // fall thru goto default; default: /* * Anything else Append the current input * character to the current DOCTYPE token's * system identifier. */ AppendLongStrBuf(c); /* * Stay in the DOCTYPE system identifier * (double-quoted) state. 
*/ continue; } } // next 2 lines were unreachable; commented out //breakDoctypesystemidentifierdoublequotedloop: // goto case TokenizerState.AFTER_DOCTYPE_SYSTEM_IDENTIFIER; // FALLTHRU DON'T REORDER case TokenizerState.AFTER_DOCTYPE_SYSTEM_IDENTIFIER: /*afterdoctypesystemidentifierloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '\r': SilentCarriageReturn(); goto breakStateloop; case '\n': SilentLineFeed(); goto case ' '; // fall thru case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay * in the after DOCTYPE system identifier state. */ continue; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current * DOCTYPE token. */ EmitDoctypeToken(pos); /* * Switch to the data state. */ //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; default: /* * Switch to the bogus DOCTYPE state. (This does * not set the DOCTYPE token's force-quirks flag * to on.) */ BogusDoctypeWithoutQuirks(); //state = Transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); state = TokenizerState.BOGUS_DOCTYPE; goto breakAfterdoctypesystemidentifierloop; // goto continueStateloop; } } breakAfterdoctypesystemidentifierloop: goto case TokenizerState.BOGUS_DOCTYPE; // FALLTHRU DON'T REORDER case TokenizerState.BOGUS_DOCTYPE: for (; ; ) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; } /* * Consume the next input character: */ switch (c) { case '>': /* * U+003E GREATER-THAN SIGN (>) Emit that * DOCTYPE token. */ EmitDoctypeToken(pos); /* * Switch to the data state. 
*/ //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; case '\r': SilentCarriageReturn(); goto breakStateloop; case '\n': SilentLineFeed(); // fall thru goto default; default: /* * Anything else Stay in the bogus DOCTYPE * state. */ continue; } } // XXX reorder point case TokenizerState.DOCTYPE_YSTEM: /*doctypeystemloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Otherwise, if the six characters starting from the * current input character are an ASCII case-insensitive * match for the word "SYSTEM", then consume those * characters and switch to the before DOCTYPE system * identifier state. */ if (index < 5) { // YSTEM.Length char folded = c; if (c >= 'A' && c <= 'Z') { folded += (char)0x20; } if (folded != YSTEM[index]) { BogusDoctype(); //state = Transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); state = TokenizerState.BOGUS_DOCTYPE; reconsume = true; goto continueStateloop; } index++; goto continueStateloop; } else { //state = Transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos); state = TokenizerState.AFTER_DOCTYPE_SYSTEM_KEYWORD; reconsume = true; goto breakDoctypeystemloop; // goto continueStateloop; } } breakDoctypeystemloop: goto case TokenizerState.AFTER_DOCTYPE_SYSTEM_KEYWORD; // FALLTHRU DON'T REORDER case TokenizerState.AFTER_DOCTYPE_SYSTEM_KEYWORD: /*afterdoctypesystemkeywordloop:*/ for (; ; ) { if (reconsume) { reconsume = false; } else { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; } /* * Consume the next input character: */ switch (c) { case '\r': SilentCarriageReturn(); //state = Transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); state = TokenizerState.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER; goto breakStateloop; case '\n': SilentLineFeed(); // fall thru goto case ' '; case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 
SPACE * Switch to the before DOCTYPE public * identifier state. */ //state = Transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); state = TokenizerState.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER; goto breakAfterdoctypesystemkeywordloop; // FALL THROUGH continue stateloop case '"': /* * U+0022 QUOTATION MARK (") Parse Error. */ ErrNoSpaceBetweenDoctypeSystemKeywordAndQuote(); /* * Set the DOCTYPE token's system identifier to * the empty string (not missing), */ ClearLongStrBuf(); /* * then switch to the DOCTYPE public identifier * (double-quoted) state. */ //state = Transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); state = TokenizerState.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED; goto continueStateloop; case '\'': /* * U+0027 APOSTROPHE (') Parse Error. */ ErrNoSpaceBetweenDoctypeSystemKeywordAndQuote(); /* * Set the DOCTYPE token's public identifier to * the empty string (not missing), */ ClearLongStrBuf(); /* * then switch to the DOCTYPE public identifier * (single-quoted) state. */ //state = Transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); state = TokenizerState.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED; goto continueStateloop; case '>': /* U+003E GREATER-THAN SIGN (>) Parse error. */ ErrExpectedPublicId(); /* * Set the DOCTYPE token's force-quirks flag to * on. */ forceQuirks = true; /* * Emit that DOCTYPE token. */ EmitDoctypeToken(pos); /* * Switch to the data state. */ //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; default: BogusDoctype(); /* * Set the DOCTYPE token's force-quirks flag to * on. */ // done by bogusDoctype(); /* * Switch to the bogus DOCTYPE state. 
*/ //state = Transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); state = TokenizerState.BOGUS_DOCTYPE; goto continueStateloop; } } breakAfterdoctypesystemkeywordloop: goto case TokenizerState.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER; // FALLTHRU DON'T REORDER case TokenizerState.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: /*beforedoctypesystemidentifierloop:*/ for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '\r': SilentCarriageReturn(); goto breakStateloop; case '\n': SilentLineFeed(); // fall thru goto case ' '; case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay * in the before DOCTYPE system identifier * state. */ continue; case '"': /* * U+0022 QUOTATION MARK (") Set the DOCTYPE * token's system identifier to the empty string * (not missing), */ ClearLongStrBuf(); /* * then switch to the DOCTYPE system identifier * (double-quoted) state. */ //state = Transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); state = TokenizerState.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED; goto continueStateloop; case '\'': /* * U+0027 APOSTROPHE (') Set the DOCTYPE token's * system identifier to the empty string (not * missing), */ ClearLongStrBuf(); /* * then switch to the DOCTYPE system identifier * (single-quoted) state. */ //state = Transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); state = TokenizerState.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED; goto breakBeforedoctypesystemidentifierloop; // goto continueStateloop; case '>': /* U+003E GREATER-THAN SIGN (>) Parse error. */ ErrExpectedSystemId(); /* * Set the DOCTYPE token's force-quirks flag to * on. */ forceQuirks = true; /* * Emit that DOCTYPE token. */ EmitDoctypeToken(pos); /* * Switch to the data state. 
*/ //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; default: BogusDoctype(); /* * Set the DOCTYPE token's force-quirks flag to * on. */ // done by bogusDoctype(); /* * Switch to the bogus DOCTYPE state. */ //state = Transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); state = TokenizerState.BOGUS_DOCTYPE; goto continueStateloop; } } breakBeforedoctypesystemidentifierloop: goto case TokenizerState.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED; // FALLTHRU DON'T REORDER case TokenizerState.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '\'': /* * U+0027 APOSTROPHE (') Switch to the after * DOCTYPE system identifier state. */ systemIdentifier = LongStrBufToString(); //state = Transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); state = TokenizerState.AFTER_DOCTYPE_SYSTEM_IDENTIFIER; goto continueStateloop; case '>': ErrGtInSystemId(); /* * Set the DOCTYPE token's force-quirks flag to * on. */ forceQuirks = true; /* * Emit that DOCTYPE token. */ systemIdentifier = LongStrBufToString(); EmitDoctypeToken(pos); /* * Switch to the data state. */ //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; case '\r': AppendLongStrBufCarriageReturn(); goto breakStateloop; case '\n': AppendLongStrBufLineFeed(); continue; case '\u0000': c = '\uFFFD'; // fall thru goto default; default: /* * Anything else Append the current input * character to the current DOCTYPE token's * system identifier. */ AppendLongStrBuf(c); /* * Stay in the DOCTYPE system identifier * (double-quoted) state. 
*/ continue; } } // XXX reorder point case TokenizerState.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: for (; ; ) { if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; /* * Consume the next input character: */ switch (c) { case '\'': /* * U+0027 APOSTROPHE (') Switch to the after * DOCTYPE public identifier state. */ publicIdentifier = LongStrBufToString(); //state = Transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); state = TokenizerState.AFTER_DOCTYPE_PUBLIC_IDENTIFIER; goto continueStateloop; case '>': ErrGtInPublicId(); /* * Set the DOCTYPE token's force-quirks flag to * on. */ forceQuirks = true; /* * Emit that DOCTYPE token. */ publicIdentifier = LongStrBufToString(); EmitDoctypeToken(pos); /* * Switch to the data state. */ //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.DATA; goto continueStateloop; case '\r': AppendLongStrBufCarriageReturn(); goto breakStateloop; case '\n': AppendLongStrBufLineFeed(); continue; case '\u0000': c = '\uFFFD'; // fall thru goto default; default: /* * Anything else Append the current input * character to the current DOCTYPE token's * public identifier. */ AppendLongStrBuf(c); /* * Stay in the DOCTYPE public identifier * (single-quoted) state. 
*/ continue; } } // XXX reorder point case TokenizerState.PROCESSING_INSTRUCTION: //processinginstructionloop: for (;;) { if (++pos == endPos) { break; } c = buf[pos]; switch (c) { case '?': //state = Transition(state,Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK,reconsume, pos); state = TokenizerState.PROCESSING_INSTRUCTION_QUESTION_MARK; break; // continue stateloop; default: continue; } } //breakProcessingInstructionLoop: break; case TokenizerState.PROCESSING_INSTRUCTION_QUESTION_MARK: if (++pos == endPos) { goto breakStateloop; } c = buf[pos]; switch (c) { case '>': //state = Transition(state, Tokenizer.DATA,reconsume, pos); state = TokenizerState.DATA; continue; default: //state = Transition(state,Tokenizer.PROCESSING_INSTRUCTION,reconsume, pos); state = TokenizerState.PROCESSING_INSTRUCTION; continue; } // END HOTSPOT WORKAROUND } } // stateloop breakStateloop: FlushChars(buf, pos); /* * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; } */ // Save locals stateSave = state; returnStateSave = returnState; return pos; }
/// <summary>
/// Builds a text node from the HTML between HtmlStart and Pos. When WrapLiterals is set, the
/// node is placed inside a newly created span element. If this iterator has a Parent, the
/// result is appended to the parent's element and the iterator is Reset(); otherwise the
/// tokenizer is marked Finished and the result is left for the caller to consume.
/// </summary>
///
/// <param name="factory">
/// The HTML factory that supplies the text: a token from the bound document's index when
/// factory.IsBound, otherwise a decoded substring of factory.Html.
/// </param>
/// <param name="literal">
/// [out] The created node (or its span wrapper). Null when Pos is not beyond HtmlStart.
/// </param>
///
/// <returns>
/// true only when a node was created and there is no Parent to receive it; false when there is
/// no pending text, and also when the node was appended to the parent.
/// </returns>
public bool TryGetLiteral(HtmlElementFactory factory, out IDomObject literal)
{
    literal = null;

    // Nothing accumulated since the end of the last tag.
    if (Pos <= HtmlStart)
    {
        return false;
    }

    // Pick the concrete text node type from the insertion mode (read once).
    DomText textNode;
    InsertionMode mode = InsertionMode;
    if (mode == InsertionMode.Invalid)
    {
        textNode = new DomInvalidElement();
    }
    else if (mode == InsertionMode.Text)
    {
        // Text mode is cleared back to Default as soon as it has been used.
        InsertionMode = InsertionMode.Default;
        textNode = new DomInnerText();
    }
    else
    {
        textNode = new DomText();
    }
    literal = textNode;

    if (!factory.IsBound)
    {
        // Unbound: copy the raw text out of the source HTML and decode it.
        string rawText = factory.Html.SubstringBetween(HtmlStart, Pos);
        literal.NodeValue = HtmlData.HtmlDecode(rawText);
    }
    else
    {
        // Bound: reference the text through the document's index.
        textNode.SetTextIndex(
            factory.Document,
            factory.Document.DocumentIndex.TokenizeString(HtmlStart, Pos - HtmlStart));
    }

    if (WrapLiterals)
    {
        DomElement span = DomElement.Create("span");
        span.ChildNodesInternal.AddAlways(literal);
        literal = span;
    }

    if (Parent == null)
    {
        // Top level: hand the node back to the caller.
        TokenizerState = TokenizerState.Finished;
        return true;
    }

    // Nested: attach to the parent and get ready for the next sibling.
    DomElement parentElement = (DomElement)Parent.Element;
    parentElement.ChildNodesInternal.AddAlways(literal);
    Reset();
    return false;
}
/// <summary>
/// Routes the first two characters of <paramref name="val"/> either to the long string
/// buffer or straight to the token listener, depending on <paramref name="returnState"/>.
/// </summary>
/// <param name="val">Character array; only indexes 0 and 1 are used.</param>
/// <param name="returnState">State whose bits are tested against DATA_AND_RCDATA_MASK.</param>
void EmitOrAppendTwo(char[] val, TokenizerState returnState)
{
    // TODO(review): an earlier version tested "(returnState & DATA_AND_RCDATA_MASK) != 0"
    // for the append branch; confirm which polarity is correct for this port's mask and
    // enum values. The current behaviour (append when the masked value is zero) is kept.
    bool anyMaskedBitSet = ((byte)returnState & DATA_AND_RCDATA_MASK) != 0;
    if (anyMaskedBitSet)
    {
        TokenListener.Characters(val, 0, 2);
        return;
    }

    AppendLongStrBuf(val[0]);
    AppendLongStrBuf(val[1]);
}
/// <summary>
/// Emits the tag currently being built to the token handler as a start or end tag, then
/// clears the tag name and attributes.
/// </summary>
/// <param name="selfClosing">Whether the tag was written with a trailing solidus.</param>
/// <param name="pos">Position of the current character; cstart is set to pos + 1.</param>
/// <returns>
/// The value of stateSave after the handler has run. It is set to DATA before the handler is
/// called and re-read afterwards.
/// </returns>
private TokenizerState EmitCurrentTagToken(bool selfClosing, int pos)
{
    cstart = pos + 1;
    MaybeErrSlashInEndTag(selfClosing);
    stateSave = TokenizerState.DATA;

    // Never hand a null attribute holder to the handler.
    HtmlAttributes attrs = attributes;
    if (attrs == null)
    {
        attrs = HtmlAttributes.EMPTY_ATTRIBUTES;
    }

    if (!endTag)
    {
        TokenHandler.StartTag(tagName, attrs, selfClosing);
    }
    else
    {
        /*
         * When an end tag token is emitted, the content model flag must be
         * switched to the PCDATA state.
         */
        MaybeErrAttributesOnEndTag(attrs);
        TokenHandler.EndTag(tagName);
    }

    tagName = null;
    ResetAttributes();

    /*
     * The token handler may have called SetStateAndEndTagExpectation and
     * changed stateSave since the start of this method, so return the field
     * rather than a constant.
     */
    return stateSave;
}
/// <summary>
/// Creates a new <see cref="UnclosedBlockCommentException"/>, forwarding both arguments
/// unchanged to the base class constructor.
/// </summary>
/// <param name="tokenizerState">The tokenizer state handed to the base constructor.</param>
/// <param name="batchSource">The batch source handed to the base constructor.</param>
internal UnclosedBlockCommentException(TokenizerState tokenizerState, IBatchSource batchSource)
    : base(tokenizerState, batchSource)
{
}
// ]NOCPP]

// For the token handler to call
/**
 * Stores the given tokenizer state and, unless that state is DATA, resolves
 * the expected end tag name from a string and caches it as a char array.
 *
 * Intended for switching the tokenizer into one of the states that have a
 * special end tag expectation.
 *
 * @param specialTokenizerState
 *            the tokenizer state to set
 * @param endTagExpectation
 *            the expected end tag for transitioning back to normal; not
 *            read when the state is DATA
 */
public void SetStateAndEndTagExpectation(TokenizerState specialTokenizerState, [Local] String endTagExpectation)
{
    stateSave = specialTokenizerState;

    // DATA has no end tag expectation, so only the state is recorded for it.
    if (specialTokenizerState != TokenizerState.DATA)
    {
        char[] nameChars = endTagExpectation.ToCharArray();
        this.endTagExpectation = ElementName.ElementNameByBuffer(nameChars, 0, nameChars.Length);
        EndTagExpectationToArray();
    }
}
/// <summary>
/// Splits the text of <c>original</c> into tokens and yields them in order.
/// </summary>
/// <remarks>
/// Rules, as implemented below:
/// <list type="bullet">
/// <item>Whitespace separates tokens and is never yielded.</item>
/// <item>A comma outside quotes/parentheses ends the current token and is yielded as its own ",".</item>
/// <item>Text from '(' or '[' up to the matching ')' or ']' (nesting counted) stays in one token.</item>
/// <item>Text from a single quote up to the closing single quote stays in one token; a backslash
/// or a doubled quote escapes the following character.</item>
/// </list>
/// The start offset of each token is recorded when the token begins. Previously it was derived
/// by assuming exactly one separator character after every token, which produced wrong (or
/// out-of-range) slices for leading whitespace, runs of whitespace, a comma met while skipping
/// whitespace, or a token immediately following ')' or ','.
/// </remarks>
/// <returns>An enumerator over the tokens, each a <c>SqlString</c>.</returns>
/// <exception cref="InvalidExpressionException">The tokenizer reached an unknown state.</exception>
IEnumerator<SqlString> IEnumerable<SqlString>.GetEnumerator()
{
    TokenizerState state = TokenizerState.WhiteSpace;
    int parenthesisCount = 0;
    bool escapeQuote = false;
    int tokenStart = 0;
    int tokenLength = 0;
    string originalString = original.ToString();

    for (int i = 0; i < originalString.Length; i++)
    {
        char ch = originalString[i];
        switch (state)
        {
            case TokenizerState.WhiteSpace:
                // tokenLength is always 0 here; a new token (if any) starts at i.
                if (ch == '\'')
                {
                    state = TokenizerState.Quoted;
                    tokenStart = i;
                    tokenLength = 1;
                }
                else if (ch == ',')
                {
                    yield return new SqlString(",");
                }
                else if (ch == '(' || ch == '[')
                {
                    state = TokenizerState.InParenthesis;
                    tokenStart = i;
                    tokenLength = 1;
                    parenthesisCount = 1;
                }
                else if (!char.IsWhiteSpace(ch))
                {
                    state = TokenizerState.Token;
                    tokenStart = i;
                    tokenLength = 1;
                }
                break;

            case TokenizerState.Quoted:
                if (escapeQuote)
                {
                    // Character following an escape is taken literally.
                    escapeQuote = false;
                    tokenLength += 1;
                }
                // handle escaping of ' by using '' or \'
                else if (ch == '\\' || (ch == '\'' && i + 1 < originalString.Length && originalString[i + 1] == '\''))
                {
                    escapeQuote = true;
                    tokenLength += 1;
                }
                else if (ch == '\'')
                {
                    // NOTE(review): the closing quote is not counted, so the yielded token
                    // keeps its opening quote but not its closing one. Left unchanged because
                    // callers may rely on it - confirm whether this is intended.
                    yield return original.Substring(tokenStart, tokenLength);
                    tokenLength = 0;
                    state = TokenizerState.WhiteSpace;
                }
                else
                {
                    tokenLength += 1;
                }
                break;

            case TokenizerState.InParenthesis:
                if (ch == ')' || ch == ']')
                {
                    tokenLength += 1;
                    parenthesisCount -= 1;
                    if (parenthesisCount == 0)
                    {
                        yield return original.Substring(tokenStart, tokenLength);
                        tokenLength = 0;
                        state = TokenizerState.WhiteSpace;
                    }
                }
                else if (ch == '(' || ch == '[')
                {
                    tokenLength += 1;
                    parenthesisCount += 1;
                }
                else
                {
                    tokenLength += 1;
                }
                break;

            case TokenizerState.Token:
                if (char.IsWhiteSpace(ch))
                {
                    yield return original.Substring(tokenStart, tokenLength);
                    tokenLength = 0;
                    state = TokenizerState.WhiteSpace;
                }
                else if (ch == ',') // stop current token, and send the , as well
                {
                    yield return original.Substring(tokenStart, tokenLength);
                    yield return new SqlString(",");
                    tokenLength = 0;
                    state = TokenizerState.WhiteSpace;
                }
                else if (ch == '(' || ch == '[')
                {
                    // The bracketed part is glued onto the token already in progress.
                    state = TokenizerState.InParenthesis;
                    parenthesisCount = 1;
                    tokenLength += 1;
                }
                else if (ch == '\'')
                {
                    // Likewise, a quoted part continues the token already in progress.
                    state = TokenizerState.Quoted;
                    tokenLength += 1;
                }
                else
                {
                    tokenLength += 1;
                }
                break;

            default:
                throw new InvalidExpressionException("Could not understand the string " + original);
        }
    }

    // Flush whatever was still being collected when the input ended.
    if (tokenLength > 0)
    {
        yield return original.Substring(tokenStart, tokenLength);
    }
}
/**
 * Stores the given tokenizer state together with an already-resolved end
 * tag element name, then refreshes the cached char-array form of that name.
 *
 * Intended for switching the tokenizer into one of the states that have a
 * special end tag expectation. Unlike the string overload, this one does
 * not special-case the DATA state.
 *
 * @param specialTokenizerState
 *            the tokenizer state to set
 * @param endTagExpectation
 *            the expected end tag for transitioning back to normal
 */
public void SetStateAndEndTagExpectation(TokenizerState specialTokenizerState, ElementName endTagExpectation)
{
    this.endTagExpectation = endTagExpectation;
    stateSave = specialTokenizerState;

    // Must run after the field above is assigned.
    EndTagExpectationToArray();
}
/// <summary>
/// Runs the tokenizer state machine for the CDATA, PLAINTEXT, RCDATA, RAWTEXT,
/// non-data end tag name and processing-instruction states. Characters are pulled from
/// <c>reader</c>; on exit the current state and return state are stored in
/// <c>stateSave</c> and <c>returnStateSave</c>.
/// </summary>
/// <param name="state">The state to start in.</param>
/// <param name="returnState">The state to return to after a nested state completes.</param>
void StateLoop3_RawText_CData_RcRef(TokenizerState state, TokenizerState returnState)
{
    /*
     * Idioms used in this code:
     * 
     * 
     * Consuming the next input character
     * 
     * The original code consumed the next input character like this:
     * if (++pos == endPos) { goto breakStateloop; } c = buf[pos];
     * In this method the same step is written as reader.ReadNext(out c),
     * with goto breakStateloop when it returns false.
     * 
     * 
     * Staying in a state
     * 
     * When there's a state that the tokenizer may stay in over multiple
     * input characters, the state has a wrapper loop and staying
     * in the state continues the loop.
     * 
     * 
     * Switching to another state
     * 
     * To switch to another state, the code sets the state variable to the
     * magic number of the new state. Then it either continues stateloop or
     * breaks out of the state's own wrapper loop if the target state is
     * right after the current state in source order. (This is a partial
     * workaround for Java's lack of goto.)
     * 
     * 
     * Reconsume support
     * 
     * The spec sometimes says that an input character is reconsumed in
     * another state. The original code used a |reconsume| flag for this;
     * in this method the flag assignments are commented out and
     * reader.StepBack() is called instead before switching state.
     * 
     * 
     * Emitting character tokens
     * 
     * This method emits character tokens lazily. Whenever a new range of
     * character tokens starts, the start of the range must be marked
     * (reader.StartCollect() here; the field cstart in the original). The
     * FlushChars() method must be called at the end of a range to flush it.
     * 
     * 
     * U+0000 handling
     * 
     * The various states have to handle the replacement of U+0000 with
     * U+FFFD. However, if U+0000 would be reconsumed in another state, the
     * replacement doesn't need to happen, because it's handled by the
     * reconsuming state.
     * 
     * 
     * LF handling
     * 
     * Every state needs to increment the line number upon LF unless the LF
     * gets reconsumed by another state which increments the line number.
     * 
     * 
     * CR handling
     * 
     * Every state needs to handle CR unless the CR gets reconsumed and is
     * handled by the reconsuming state. The CR needs to be handled as if it
     * were an LF, the lastCR field must be set to true and then this
     * method must return. The IO driver will then swallow the next
     * character if it is an LF to coalesce CRLF.
     */

    /*
     * As there is no support for labeled loops in C#, instead of break <loop>;
     * the port uses goto break<loop>; and a label after the loop.
     * Instead of continue <loop>; it uses goto continue<loop>; and a label
     * at the beginning or end of the loop (which doesn't matter in for(;;) loops)
     */

    // NOTE(review): the switch below has no default case. If state is set to a value
    // that has no case here (for example s01_DATA, s44_BOGUS_COMMENT or
    // CONSUME_CHARACTER_REFERENCE, all of which are assigned below), the for(;;) loop
    // re-enters the switch without consuming input. Confirm how the caller is meant
    // to dispatch such states.
    /*stateloop:*/
    for (; ; )
    {
        //*************
        continueStateloop:
        //*************
        switch (state)
        {
            // XXX reorder point
            case TokenizerState.CDATA_START:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        // Match the input against CDATA_LSQB one character at a time.
                        if (index < 6)
                        { // CDATA_LSQB.Length
                            if (c == Tokenizer.CDATA_LSQB[index])
                            {
                                AppendLongStrBuf(c);
                            }
                            else
                            {
                                ErrBogusComment();
                                //state = Transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos);
                                state = TokenizerState.s44_BOGUS_COMMENT;
                                //reconsume = true;
                                reader.StepBack();
                                goto continueStateloop;
                            }
                            index++;
                            continue;
                        }
                        else
                        {
                            reader.StartCollect(); // start coalescing
                            //state = Transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
                            state = TokenizerState.s68_CDATA_SECTION;
                            //reconsume = true;
                            reader.StepBack();
                            goto case TokenizerState.s68_CDATA_SECTION;
                            //break; // FALL THROUGH goto continueStateloop;
                        }
                    }
                    //-------------------------------
                    //eof
                    goto breakStateloop;
                    //------------------------------------
                }
            // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER
            case TokenizerState.s68_CDATA_SECTION:
                /*cdatasectionloop:*/
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case ']':
                                FlushChars();
                                //state = Transition(state, Tokenizer.CDATA_RSQB, reconsume, pos);
                                state = TokenizerState.CDATA_RSQB;
                                goto breakCdatasectionloop; // FALL THROUGH
                            case '\u0000':
                                EmitReplacementCharacter();
                                continue;
                            case '\r':
                                EmitCarriageReturn();
                                goto breakStateloop;
                            case '\n':
                            default:
                                continue;
                        }
                    }
                    // Reader exhausted while still inside the CDATA section.
                    goto breakStateloop;
                    //------------------------------------
                    breakCdatasectionloop:
                    goto case TokenizerState.CDATA_RSQB;
                }
            // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER
            case TokenizerState.CDATA_RSQB:
                /*cdatarsqb:*/
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case ']':
                                //state = Transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos);
                                state = TokenizerState.CDATA_RSQB_RSQB;
                                goto breakCdatarsqb;
                            default:
                                // Emits one character from RSQB_RSQB, then goes back to
                                // the CDATA section state on the current character.
                                TokenListener.Characters(Tokenizer.RSQB_RSQB, 0, 1);
                                reader.StartCollect();
                                //state = Transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
                                state = TokenizerState.s68_CDATA_SECTION;
                                //reconsume = true;
                                reader.StepBack();
                                goto continueStateloop;
                        }
                    }
                    //-------------------------------
                    //eof
                    goto breakStateloop;
                    //------------------------------------
                    breakCdatarsqb:
                    goto case TokenizerState.CDATA_RSQB_RSQB;
                }
            // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER
            case TokenizerState.CDATA_RSQB_RSQB:
                {
                    char c;
                    if (!reader.ReadNext(out c))
                    {
                        goto breakStateloop;
                    }
                    switch (c)
                    {
                        case '>':
                            //cstart = pos + 1;
                            reader.SkipOneAndStartCollect();
                            //state = Transition(state, Tokenizer.DATA, reconsume, pos);
                            state = TokenizerState.s01_DATA;
                            goto continueStateloop;
                        default:
                            // Emits two characters from RSQB_RSQB, then goes back to
                            // the CDATA section state on the current character.
                            TokenListener.Characters(Tokenizer.RSQB_RSQB, 0, 2);
                            reader.StartCollect();
                            //state = Transition(state, Tokenizer.CDATA_SECTION, reconsume, pos);
                            state = TokenizerState.s68_CDATA_SECTION;
                            reader.StepBack();
                            //reconsume = true;
                            goto continueStateloop;
                    }
                }
            // XXX reorder point
            case TokenizerState.s07_PLAINTEXT:
                /*plaintextloop:*/
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '\u0000':
                                EmitPlaintextReplacementCharacter();
                                continue;
                            case '\r':
                                EmitCarriageReturn();
                                goto breakStateloop;
                            case '\n':
                            default:
                                /*
                                 * Anything else Emit the current input
                                 * character as a character token. Stay in the
                                 * PLAINTEXT state.
                                 */
                                continue;
                        }
                    }
                    //------------------------------------
                    //eof
                    goto breakStateloop;
                }
            // XXX reorder point
            case TokenizerState.s03_RCDATA:
                /*rcdataloop:*/
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '&':
                                /*
                                 * U+0026 AMPERSAND (&) Switch to the character
                                 * reference in RCDATA state.
                                 */
                                //FlushChars(buf, pos);
                                FlushChars();
                                ClearStrBufAndAppend(c);
                                additional = '\u0000';
                                returnState = state;
                                //state = Transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos);
                                state = TokenizerState.CONSUME_CHARACTER_REFERENCE;
                                goto continueStateloop;
                            case '<':
                                /*
                                 * U+003C LESS-THAN SIGN (<) Switch to the
                                 * RCDATA less-than sign state.
                                 */
                                //FlushChars(buf, pos);
                                FlushChars();
                                returnState = state;
                                //state = Transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
                                state = TokenizerState.s11_RAWTEXT_RCDATA_LESS_THAN_SIGN;
                                goto continueStateloop;
                            case '\u0000':
                                EmitReplacementCharacter();
                                continue;
                            case '\r':
                                EmitCarriageReturn();
                                goto breakStateloop;
                            case '\n':
                            default:
                                /*
                                 * Emit the current input character as a
                                 * character token. Stay in the RCDATA state.
                                 */
                                continue;
                        }
                    }
                    //------------------------------------
                    //eof
                    goto breakStateloop;
                }
            // XXX reorder point
            case TokenizerState.s05_RAWTEXT:
                /*rawtextloop:*/
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '<':
                                /*
                                 * U+003C LESS-THAN SIGN (<) Switch to the
                                 * RAWTEXT less-than sign state.
                                 */
                                FlushChars();
                                returnState = state;
                                //state = Transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos);
                                state = TokenizerState.s11_RAWTEXT_RCDATA_LESS_THAN_SIGN;
                                goto breakRawtextloop;
                            // FALL THRU goto continueStateloop;
                            case '\u0000':
                                EmitReplacementCharacter();
                                continue;
                            case '\r':
                                EmitCarriageReturn();
                                goto breakStateloop;
                            case '\n':
                            default:
                                /*
                                 * Emit the current input character as a
                                 * character token. Stay in the RAWTEXT state.
                                 */
                                continue;
                        }
                    }
                    //------------------------------------
                    //eof
                    goto breakStateloop;
                    //------------------------------------
                    breakRawtextloop:
                    goto case TokenizerState.s11_RAWTEXT_RCDATA_LESS_THAN_SIGN;
                }
            // XXX fallthru don't reorder
            case TokenizerState.s11_RAWTEXT_RCDATA_LESS_THAN_SIGN:
                /*rawtextrcdatalessthansignloop:*/
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '/':
                                /*
                                 * U+002F SOLIDUS (/) Set the temporary buffer
                                 * to the empty string. Switch to the script
                                 * data end tag open state.
                                 */
                                index = 0;
                                ClearStrBuf();
                                //state = Transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos);
                                state = TokenizerState.NON_DATA_END_TAG_NAME;
                                goto breakRawtextrcdatalessthansignloop;
                            // FALL THRU goto continueStateloop;
                            default:
                                /*
                                 * Otherwise, emit a U+003C LESS-THAN SIGN
                                 * character token
                                 */
                                TokenListener.Characters(Tokenizer.LT_GT, 0, 1);
                                /*
                                 * and reconsume the current input character in
                                 * the data state.
                                 */
                                reader.StartCollect();
                                //state = Transition(state, returnState, reconsume, pos);
                                state = returnState;
                                //reconsume = true;
                                reader.StepBack();
                                goto continueStateloop;
                        }
                    }
                    //------------------------------------
                    //eof
                    goto breakStateloop;
                    //------------------------------------
                    breakRawtextrcdatalessthansignloop:
                    goto case TokenizerState.NON_DATA_END_TAG_NAME;
                }
            // XXX fall thru. don't reorder.
            case TokenizerState.NON_DATA_END_TAG_NAME:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        /*
                         * ASSERT! when entering this state, set index to 0 and
                         * call clearStrBuf() assert (contentModelElement !=
                         * null); Let's implement the above without lookahead.
                         * strBuf is the 'temporary buffer'.
                         */
                        if (index < endTagExpectationAsArray.Length)
                        {
                            // Still matching the expected end tag name, comparing
                            // with ASCII upper-case letters folded to lower case.
                            char e = endTagExpectationAsArray[index];
                            char folded = c;
                            if (c >= 'A' && c <= 'Z')
                            {
                                folded += (char)0x20;
                            }
                            if (folded != e)
                            {
                                ErrHtml4LtSlashInRcdata(folded);
                                TokenListener.Characters(Tokenizer.LT_SOLIDUS, 0, 2);
                                EmitStrBuf();
                                reader.StartCollect();
                                //state = Transition(state, returnState, reconsume, pos);
                                state = returnState;
                                //reconsume = true;
                                reader.StepBack();
                                goto continueStateloop;
                            }
                            AppendStrBuf(c);
                            index++;
                            continue;
                        }
                        else
                        {
                            // The whole expected name matched; c decides what follows it.
                            endTag = true;
                            // XXX replace contentModelElement with different
                            // type
                            tagName = endTagExpectation;
                            switch (c)
                            {
                                case '\r':
                                    SilentCarriageReturn();
                                    //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
                                    state = TokenizerState.s34_BEFORE_ATTRIBUTE_NAME;
                                    goto breakStateloop;
                                case '\n':
                                case ' ':
                                case '\t':
                                case '\u000C':
                                    /*
                                     * U+0009 CHARACTER TABULATION U+000A LINE
                                     * FEED (LF) U+000C FORM FEED (FF) U+0020
                                     * SPACE If the current end tag token is an
                                     * appropriate end tag token, then switch to
                                     * the before attribute name state.
                                     */
                                    //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos);
                                    state = TokenizerState.s34_BEFORE_ATTRIBUTE_NAME;
                                    goto continueStateloop;
                                case '/':
                                    /*
                                     * U+002F SOLIDUS (/) If the current end tag
                                     * token is an appropriate end tag token,
                                     * then switch to the self-closing start tag
                                     * state.
                                     */
                                    //state = Transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos);
                                    state = TokenizerState.s43_SELF_CLOSING_START_TAG;
                                    goto continueStateloop;
                                case '>':
                                    /*
                                     * U+003E GREATER-THAN SIGN (>) If the
                                     * current end tag token is an appropriate
                                     * end tag token, then emit the current tag
                                     * token and switch to the data state.
                                     */
                                    //state = Transition(state, EmitCurrentTagToken(false, pos), reconsume, pos);
                                    state = EmitCurrentTagToken(false);
                                    if (shouldSuspend)
                                    {
                                        goto breakStateloop;
                                    }
                                    goto continueStateloop;
                                default:
                                    /*
                                     * Emit a U+003C LESS-THAN SIGN character
                                     * token, a U+002F SOLIDUS character token,
                                     * a character token for each of the
                                     * characters in the temporary buffer (in
                                     * the order they were added to the buffer),
                                     * and reconsume the current input character
                                     * in the RAWTEXT state.
                                     */
                                    // [NOCPP[
                                    ErrWarnLtSlashInRcdata();
                                    // ]NOCPP]
                                    TokenListener.Characters(LT_SOLIDUS, 0, 2);
                                    EmitStrBuf();
                                    if (c == '\u0000')
                                    {
                                        EmitReplacementCharacter();
                                    }
                                    else
                                    {
                                        reader.StartCollect(); // don't drop the
                                        // character
                                    }
                                    //state = Transition(state, returnState, reconsume, pos);
                                    state = returnState;
                                    goto continueStateloop;
                            }
                        }
                    }
                    //------------------------------------
                    //eof
                    goto breakStateloop;
                }
            case TokenizerState.PROCESSING_INSTRUCTION:
                //processinginstructionloop:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '?':
                                //state = Transition(state,Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK,reconsume, pos);
                                state = TokenizerState.PROCESSING_INSTRUCTION_QUESTION_MARK;
                                // NOTE(review): this break leaves only the inner switch, so the
                                // enclosing while loop keeps reading characters here instead of
                                // moving on to the PROCESSING_INSTRUCTION_QUESTION_MARK case.
                                // The trailing comment suggests "continue stateloop" was intended;
                                // confirm and replace with goto continueStateloop if so.
                                break;
                            // continue stateloop;
                            default:
                                continue;
                        }
                    }
                    //------------------------------------
                    //eof
                    goto breakStateloop;
                }
            //breakProcessingInstructionLoop:
            case TokenizerState.PROCESSING_INSTRUCTION_QUESTION_MARK:
                {
                    char c;
                    if (!reader.ReadNext(out c))
                    {
                        goto breakStateloop;
                    }
                    switch (c)
                    {
                        case '>':
                            //state = Transition(state, Tokenizer.DATA,reconsume, pos);
                            state = TokenizerState.s01_DATA;
                            // continue here targets the enclosing for(;;) state loop.
                            continue;
                        default:
                            //state = Transition(state,Tokenizer.PROCESSING_INSTRUCTION,reconsume, pos);
                            state = TokenizerState.PROCESSING_INSTRUCTION;
                            continue;
                    }
                }
                // END HOTSPOT WORKAROUND
        }
    }
    // stateloop
    breakStateloop:
    //FlushChars(buf, pos);
    FlushChars();
    /*
     * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; }
     */
    // Save locals
    stateSave = state;
    returnStateSave = returnState;
}
/// <summary>
/// Puts the tokenizer back into the DATA state: buffer lengths, character-reference
/// bookkeeping and per-tag flags are cleared, the doctype fields are re-initialised, and
/// the current tag name and attribute name are dropped. The attribute holder is dropped
/// only when newAttributesEachTime is set. shouldSuspend is intentionally left untouched.
/// </summary>
public void ResetToDataState()
{
    // Core state.
    stateSave = TokenizerState.DATA;
    // line = 1; XXX line numbers
    lastCR = false;
    index = 0;
    endTag = false;
    forceQuirks = false;
    additional = '\u0000';

    // Buffers: only the lengths and the mark are reset.
    strBufLen = 0;
    strBufMark = 0;
    longStrBufLen = 0;

    // Character reference bookkeeping.
    entCol = -1;
    firstCharKey = -1;
    lo = 0;
    hi = 0; // will always be overwritten before use anyway
    candidate = -1;
    prevValue = -1;
    value = 0;
    seenDigits = false;

    // Removed J. Treworgy 12/7/2012 - this should remain true so the parser can choose to abort
    //shouldSuspend = false;

    InitDoctypeFields();

    tagName = null;
    attributeName = null;

    // [NOCPP[
    if (newAttributesEachTime)
    {
        // ]NOCPP]
        attributes = null;
        // [NOCPP[
    }
    // ]NOCPP]
}
/// <summary>
/// Runs the DOCTYPE-related part of the tokenizer state machine (plus the
/// processing-instruction states) until the reader runs out of input, a
/// carriage return is met, or the machine moves to a state that this loop
/// does not handle. On exit the pending characters are flushed and the
/// current state is written to <c>stateSave</c> / <c>returnStateSave</c>.
/// </summary>
/// <param name="state">State to start in.</param>
/// <param name="returnState">
/// Return state; this loop never changes it and only writes it back to
/// <c>returnStateSave</c> on exit.
/// </param>
void StateLoop3_DocType(TokenizerState state, TokenizerState returnState)
{
    /*
     * Idioms used in this code:
     *
     * Consuming the next input character
     *   Each state pulls characters with reader.ReadNext(out c); when it
     *   returns false the input is exhausted and the state loop is left.
     *
     * Staying in a state
     *   A state that may consume several characters wraps its body in a
     *   while loop; "continue" inside it stays in that state.
     *
     * Switching to another state
     *   The state variable is set to the new state. The code then either
     *   jumps to continueStateloop (re-dispatch through the switch) or, when
     *   the target is the next case in source order, jumps to a local break
     *   label that does "goto case" on the target.
     *
     * Reconsuming a character
     *   reader.StepBack() is called before switching state so that the
     *   target state reads the same character again.
     *
     * U+0000 handling
     *   States that store the character replace U+0000 with U+FFFD first.
     *
     * CR handling
     *   Every state handles CR itself and then leaves the state loop. Per
     *   the upstream notes, the IO driver then swallows a following LF so
     *   that CRLF is coalesced.
     *
     * C# has no labeled loops, so "break <loop>" / "continue <loop>" from
     * the Java original are expressed as goto break<loop> / goto
     * continue<loop> with matching labels.
     */
    for (; ; )
    {
    continueStateloop:
        switch (state)
        {
            // XXX reorder point
            case TokenizerState.MARKUP_DECLARATION_OCTYPE:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        if (index < 6)
                        {
                            // Still matching "octype" (ASCII case-insensitive).
                            char folded = c;
                            if (c >= 'A' && c <= 'Z')
                            {
                                folded += (char)0x20;
                            }
                            if (folded == Tokenizer.OCTYPE[index])
                            {
                                AppendLongStrBuf(c);
                            }
                            else
                            {
                                // Mismatch: reconsume in the bogus comment state.
                                ErrBogusComment();
                                state = TokenizerState.s44_BOGUS_COMMENT;
                                reader.StepBack();
                                goto continueStateloop;
                            }
                            index++;
                            continue;
                        }
                        else
                        {
                            // Whole keyword matched: reconsume in the DOCTYPE state.
                            state = TokenizerState.s52_DOCTYPE;
                            reader.StepBack();
                            goto breakMarkupdeclarationdoctypeloop;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakMarkupdeclarationdoctypeloop:
                    goto case TokenizerState.s52_DOCTYPE;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.s52_DOCTYPE:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        InitDoctypeFields();
                        switch (c)
                        {
                            case '\r':
                                SilentCarriageReturn();
                                state = TokenizerState.s53_BEFORE_DOCTYPE_NAME;
                                goto breakStateloop;
                            case '\n':
                            case ' ':
                            case '\t':
                            case '\u000C':
                                // Whitespace: switch to the before DOCTYPE name state.
                                state = TokenizerState.s53_BEFORE_DOCTYPE_NAME;
                                goto breakDoctypeloop;
                            default:
                                // Anything else: parse error, reconsume in the
                                // before DOCTYPE name state.
                                ErrMissingSpaceBeforeDoctypeName();
                                state = TokenizerState.s53_BEFORE_DOCTYPE_NAME;
                                reader.StepBack();
                                goto breakDoctypeloop;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakDoctypeloop:
                    goto case TokenizerState.s53_BEFORE_DOCTYPE_NAME;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.s53_BEFORE_DOCTYPE_NAME:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '\r':
                                SilentCarriageReturn();
                                goto breakStateloop;
                            case '\n':
                            case ' ':
                            case '\t':
                            case '\u000C':
                                // Whitespace: stay in this state.
                                continue;
                            case '>':
                                // Parse error: nameless DOCTYPE. Force quirks,
                                // emit the token, switch to the data state.
                                ErrNamelessDoctype();
                                forceQuirks = true;
                                EmitDoctypeToken();
                                state = TokenizerState.s01_DATA;
                                goto continueStateloop;
                            case '\u0000':
                                c = '\uFFFD';
                                goto default;
                            default:
                                if (c >= 'A' && c <= 'Z')
                                {
                                    // Use the lowercase version of the input
                                    // character (add 0x0020 to its code point).
                                    c += (char)0x20;
                                }
                                // Start the DOCTYPE name with this character and
                                // switch to the DOCTYPE name state.
                                ClearStrBufAndAppend(c);
                                state = TokenizerState.s54_DOCTYPE_NAME;
                                goto breakBeforedoctypenameloop;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakBeforedoctypenameloop:
                    goto case TokenizerState.s54_DOCTYPE_NAME;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.s54_DOCTYPE_NAME:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '\r':
                                SilentCarriageReturn();
                                StrBufToDoctypeName();
                                state = TokenizerState.s55_AFTER_DOCTYPE_NAME;
                                goto breakStateloop;
                            case '\n':
                            case ' ':
                            case '\t':
                            case '\u000C':
                                // Whitespace ends the name: switch to the after
                                // DOCTYPE name state.
                                StrBufToDoctypeName();
                                state = TokenizerState.s55_AFTER_DOCTYPE_NAME;
                                goto breakDoctypenameloop;
                            case '>':
                                // Emit the current DOCTYPE token and switch to
                                // the data state.
                                StrBufToDoctypeName();
                                EmitDoctypeToken();
                                state = TokenizerState.s01_DATA;
                                goto continueStateloop;
                            case '\u0000':
                                c = '\uFFFD';
                                goto default;
                            default:
                                if (c >= 'A' && c <= 'Z')
                                {
                                    // Append the lowercase version of the input
                                    // character (add 0x0020 to its code point).
                                    c += (char)0x0020;
                                }
                                // Append to the name and stay in this state.
                                AppendStrBuf(c);
                                continue;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakDoctypenameloop:
                    goto case TokenizerState.s55_AFTER_DOCTYPE_NAME;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.s55_AFTER_DOCTYPE_NAME:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '\r':
                                SilentCarriageReturn();
                                goto breakStateloop;
                            case '\n':
                            case ' ':
                            case '\t':
                            case '\u000C':
                                // Whitespace: stay in this state.
                                continue;
                            case '>':
                                // Emit the current DOCTYPE token and switch to
                                // the data state.
                                EmitDoctypeToken();
                                state = TokenizerState.s01_DATA;
                                goto continueStateloop;
                            case 'p':
                            case 'P':
                                // Possible "PUBLIC" keyword; match the rest.
                                index = 0;
                                state = TokenizerState.DOCTYPE_UBLIC;
                                goto breakAfterdoctypenameloop;
                            case 's':
                            case 'S':
                                // Possible "SYSTEM" keyword; match the rest.
                                index = 0;
                                state = TokenizerState.DOCTYPE_YSTEM;
                                goto continueStateloop;
                            default:
                                // Parse error: switch to the bogus DOCTYPE state.
                                // Per the original comment, BogusDoctype() also
                                // turns force-quirks on.
                                BogusDoctype();
                                state = TokenizerState.s67_BOGUS_DOCTYPE;
                                goto continueStateloop;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakAfterdoctypenameloop:
                    goto case TokenizerState.DOCTYPE_UBLIC;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.DOCTYPE_UBLIC:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        // Match the remaining five characters of "PUBLIC"
                        // (ASCII case-insensitive).
                        if (index < 5)
                        {
                            char folded = c;
                            if (c >= 'A' && c <= 'Z')
                            {
                                folded += (char)0x20;
                            }
                            if (folded != Tokenizer.UBLIC[index])
                            {
                                // Mismatch: reconsume in the bogus DOCTYPE state.
                                BogusDoctype();
                                state = TokenizerState.s67_BOGUS_DOCTYPE;
                                reader.StepBack();
                                goto continueStateloop;
                            }
                            index++;
                            continue;
                        }
                        else
                        {
                            // Keyword matched: reconsume in the after DOCTYPE
                            // public keyword state.
                            state = TokenizerState.s56_AFTER_DOCTYPE_PUBLIC_KEYWORD;
                            reader.StepBack();
                            goto breakDoctypeublicloop;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakDoctypeublicloop:
                    goto case TokenizerState.s56_AFTER_DOCTYPE_PUBLIC_KEYWORD;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.s56_AFTER_DOCTYPE_PUBLIC_KEYWORD:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '\r':
                                SilentCarriageReturn();
                                state = TokenizerState.s57_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER;
                                goto breakStateloop;
                            case '\n':
                            case ' ':
                            case '\t':
                            case '\u000C':
                                // Whitespace: switch to the before DOCTYPE public
                                // identifier state.
                                state = TokenizerState.s57_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER;
                                goto breakAfterdoctypepublickeywordloop;
                            case '"':
                                // Parse error (no space before the quote). Start
                                // an empty public identifier, double-quoted.
                                ErrNoSpaceBetweenDoctypePublicKeywordAndQuote();
                                ClearLongStrBuf();
                                state = TokenizerState.s58_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED;
                                goto continueStateloop;
                            case '\'':
                                // Parse error (no space before the quote). Start
                                // an empty public identifier, single-quoted.
                                ErrNoSpaceBetweenDoctypePublicKeywordAndQuote();
                                ClearLongStrBuf();
                                state = TokenizerState.s59_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED;
                                goto continueStateloop;
                            case '>':
                                // Parse error: public identifier missing. Force
                                // quirks, emit, switch to the data state.
                                ErrExpectedPublicId();
                                forceQuirks = true;
                                EmitDoctypeToken();
                                state = TokenizerState.s01_DATA;
                                goto continueStateloop;
                            default:
                                // Parse error: switch to the bogus DOCTYPE state.
                                BogusDoctype();
                                state = TokenizerState.s67_BOGUS_DOCTYPE;
                                goto continueStateloop;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakAfterdoctypepublickeywordloop:
                    goto case TokenizerState.s57_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.s57_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '\r':
                                SilentCarriageReturn();
                                goto breakStateloop;
                            case '\n':
                            case ' ':
                            case '\t':
                            case '\u000C':
                                // Whitespace: stay in this state.
                                continue;
                            case '"':
                                // Start an empty public identifier, double-quoted.
                                ClearLongStrBuf();
                                state = TokenizerState.s58_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED;
                                goto breakBeforedoctypepublicidentifierloop;
                            case '\'':
                                // Start an empty public identifier, single-quoted.
                                ClearLongStrBuf();
                                state = TokenizerState.s59_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED;
                                goto continueStateloop;
                            case '>':
                                // Parse error: public identifier missing. Force
                                // quirks, emit, switch to the data state.
                                ErrExpectedPublicId();
                                forceQuirks = true;
                                EmitDoctypeToken();
                                state = TokenizerState.s01_DATA;
                                goto continueStateloop;
                            default:
                                // Parse error: switch to the bogus DOCTYPE state.
                                BogusDoctype();
                                state = TokenizerState.s67_BOGUS_DOCTYPE;
                                goto continueStateloop;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakBeforedoctypepublicidentifierloop:
                    goto case TokenizerState.s58_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.s58_DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '"':
                                // Closing quote: store the identifier and switch
                                // to the after DOCTYPE public identifier state.
                                publicIdentifier = LongStrBufToString();
                                state = TokenizerState.s60_AFTER_DOCTYPE_PUBLIC_IDENTIFIER;
                                goto breakDoctypepublicidentifierdoublequotedloop;
                            case '>':
                                // Parse error: '>' inside the identifier. Force
                                // quirks, emit, switch to the data state.
                                ErrGtInPublicId();
                                forceQuirks = true;
                                publicIdentifier = LongStrBufToString();
                                EmitDoctypeToken();
                                state = TokenizerState.s01_DATA;
                                goto continueStateloop;
                            case '\r':
                                AppendLongStrBufCarriageReturn();
                                goto breakStateloop;
                            case '\n':
                                AppendLongStrBufLineFeed();
                                continue;
                            case '\u0000':
                                c = '\uFFFD';
                                goto default;
                            default:
                                // Append to the public identifier and stay here.
                                AppendLongStrBuf(c);
                                continue;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakDoctypepublicidentifierdoublequotedloop:
                    goto case TokenizerState.s60_AFTER_DOCTYPE_PUBLIC_IDENTIFIER;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.s60_AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '\r':
                                SilentCarriageReturn();
                                state = TokenizerState.s61_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS;
                                goto breakStateloop;
                            case '\n':
                            case ' ':
                            case '\t':
                            case '\u000C':
                                // Whitespace: switch to the between DOCTYPE public
                                // and system identifiers state.
                                state = TokenizerState.s61_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS;
                                goto breakAfterdoctypepublicidentifierloop;
                            case '>':
                                // Emit the current DOCTYPE token and switch to
                                // the data state.
                                EmitDoctypeToken();
                                state = TokenizerState.s01_DATA;
                                goto continueStateloop;
                            case '"':
                                // Parse error (no space between identifiers).
                                // Start an empty system identifier, double-quoted.
                                ErrNoSpaceBetweenPublicAndSystemIds();
                                ClearLongStrBuf();
                                state = TokenizerState.s64_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
                                goto continueStateloop;
                            case '\'':
                                // Parse error (no space between identifiers).
                                // Start an empty system identifier, single-quoted.
                                ErrNoSpaceBetweenPublicAndSystemIds();
                                ClearLongStrBuf();
                                state = TokenizerState.s65_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED;
                                goto continueStateloop;
                            default:
                                // Parse error: switch to the bogus DOCTYPE state.
                                BogusDoctype();
                                state = TokenizerState.s67_BOGUS_DOCTYPE;
                                goto continueStateloop;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakAfterdoctypepublicidentifierloop:
                    goto case TokenizerState.s61_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.s61_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '\r':
                                SilentCarriageReturn();
                                goto breakStateloop;
                            case '\n':
                            case ' ':
                            case '\t':
                            case '\u000C':
                                // Whitespace: stay in this state.
                                continue;
                            case '>':
                                // Emit the current DOCTYPE token and switch to
                                // the data state.
                                EmitDoctypeToken();
                                state = TokenizerState.s01_DATA;
                                goto continueStateloop;
                            case '"':
                                // Start an empty system identifier, double-quoted.
                                ClearLongStrBuf();
                                state = TokenizerState.s64_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
                                goto breakBetweendoctypepublicandsystemidentifiersloop;
                            case '\'':
                                // Start an empty system identifier, single-quoted.
                                ClearLongStrBuf();
                                state = TokenizerState.s65_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED;
                                goto continueStateloop;
                            default:
                                // Parse error: switch to the bogus DOCTYPE state.
                                BogusDoctype();
                                state = TokenizerState.s67_BOGUS_DOCTYPE;
                                goto continueStateloop;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakBetweendoctypepublicandsystemidentifiersloop:
                    goto case TokenizerState.s64_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.s64_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '"':
                                // Closing quote: store the identifier and switch
                                // to the after DOCTYPE system identifier state.
                                systemIdentifier = LongStrBufToString();
                                state = TokenizerState.s66_AFTER_DOCTYPE_SYSTEM_IDENTIFIER;
                                goto continueStateloop;
                            case '>':
                                // Parse error: '>' inside the identifier. Force
                                // quirks, emit, switch to the data state.
                                ErrGtInSystemId();
                                forceQuirks = true;
                                systemIdentifier = LongStrBufToString();
                                EmitDoctypeToken();
                                state = TokenizerState.s01_DATA;
                                goto continueStateloop;
                            case '\r':
                                AppendLongStrBufCarriageReturn();
                                goto breakStateloop;
                            case '\n':
                                AppendLongStrBufLineFeed();
                                continue;
                            case '\u0000':
                                c = '\uFFFD';
                                goto default;
                            default:
                                // Append to the system identifier and stay here.
                                AppendLongStrBuf(c);
                                continue;
                        }
                    }
                    // eof
                    goto breakStateloop;
                }
            case TokenizerState.s66_AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '\r':
                                SilentCarriageReturn();
                                goto breakStateloop;
                            case '\n':
                            case ' ':
                            case '\t':
                            case '\u000C':
                                // Whitespace: stay in this state.
                                continue;
                            case '>':
                                // Emit the current DOCTYPE token and switch to
                                // the data state.
                                EmitDoctypeToken();
                                state = TokenizerState.s01_DATA;
                                goto continueStateloop;
                            default:
                                // Switch to the bogus DOCTYPE state. Per the
                                // original comment this does not turn the
                                // force-quirks flag on.
                                BogusDoctypeWithoutQuirks();
                                state = TokenizerState.s67_BOGUS_DOCTYPE;
                                goto breakAfterdoctypesystemidentifierloop;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakAfterdoctypesystemidentifierloop:
                    goto case TokenizerState.s67_BOGUS_DOCTYPE;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.s67_BOGUS_DOCTYPE:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '>':
                                // Emit the DOCTYPE token and switch to the data
                                // state.
                                EmitDoctypeToken();
                                state = TokenizerState.s01_DATA;
                                goto continueStateloop;
                            case '\r':
                                SilentCarriageReturn();
                                goto breakStateloop;
                            case '\n':
                            default:
                                // Anything else: stay in the bogus DOCTYPE state.
                                continue;
                        }
                    }
                    // eof
                    goto breakStateloop;
                }
            // XXX reorder point
            case TokenizerState.DOCTYPE_YSTEM:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        // Match the remaining five characters of "SYSTEM"
                        // (ASCII case-insensitive).
                        if (index < 5)
                        {
                            char folded = c;
                            if (c >= 'A' && c <= 'Z')
                            {
                                folded += (char)0x20;
                            }
                            if (folded != YSTEM[index])
                            {
                                // Mismatch: reconsume in the bogus DOCTYPE state.
                                BogusDoctype();
                                state = TokenizerState.s67_BOGUS_DOCTYPE;
                                reader.StepBack();
                                goto continueStateloop;
                            }
                            index++;
                            // Re-dispatch; the state is unchanged, so this case
                            // is entered again for the next character.
                            goto continueStateloop;
                        }
                        else
                        {
                            // Keyword matched: reconsume in the after DOCTYPE
                            // system keyword state.
                            state = TokenizerState.s62_AFTER_DOCTYPE_SYSTEM_KEYWORD;
                            reader.StepBack();
                            goto breakDoctypeystemloop;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakDoctypeystemloop:
                    goto case TokenizerState.s62_AFTER_DOCTYPE_SYSTEM_KEYWORD;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.s62_AFTER_DOCTYPE_SYSTEM_KEYWORD:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '\r':
                                SilentCarriageReturn();
                                state = TokenizerState.s63_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER;
                                goto breakStateloop;
                            case '\n':
                            case ' ':
                            case '\t':
                            case '\u000C':
                                // Whitespace: switch to the before DOCTYPE system
                                // identifier state.
                                state = TokenizerState.s63_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER;
                                goto breakAfterdoctypesystemkeywordloop;
                            case '"':
                                // Parse error (no space before the quote). Start
                                // an empty system identifier, double-quoted.
                                ErrNoSpaceBetweenDoctypeSystemKeywordAndQuote();
                                ClearLongStrBuf();
                                state = TokenizerState.s64_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
                                goto continueStateloop;
                            case '\'':
                                // Parse error (no space before the quote). Start
                                // an empty system identifier, single-quoted.
                                ErrNoSpaceBetweenDoctypeSystemKeywordAndQuote();
                                ClearLongStrBuf();
                                state = TokenizerState.s65_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED;
                                goto continueStateloop;
                            case '>':
                                // Parse error: after the SYSTEM keyword it is the
                                // system identifier that is missing, so report
                                // that, matching the before DOCTYPE system
                                // identifier state. Then force quirks, emit,
                                // switch to the data state.
                                ErrExpectedSystemId();
                                forceQuirks = true;
                                EmitDoctypeToken();
                                state = TokenizerState.s01_DATA;
                                goto continueStateloop;
                            default:
                                // Parse error: switch to the bogus DOCTYPE state.
                                BogusDoctype();
                                state = TokenizerState.s67_BOGUS_DOCTYPE;
                                goto continueStateloop;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakAfterdoctypesystemkeywordloop:
                    goto case TokenizerState.s63_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.s63_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '\r':
                                SilentCarriageReturn();
                                goto breakStateloop;
                            case '\n':
                            case ' ':
                            case '\t':
                            case '\u000C':
                                // Whitespace: stay in this state.
                                continue;
                            case '"':
                                // Start an empty system identifier, double-quoted.
                                ClearLongStrBuf();
                                state = TokenizerState.s64_DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED;
                                goto continueStateloop;
                            case '\'':
                                // Start an empty system identifier, single-quoted.
                                ClearLongStrBuf();
                                state = TokenizerState.s65_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED;
                                goto breakBeforedoctypesystemidentifierloop;
                            case '>':
                                // Parse error: system identifier missing. Force
                                // quirks, emit, switch to the data state.
                                ErrExpectedSystemId();
                                forceQuirks = true;
                                EmitDoctypeToken();
                                state = TokenizerState.s01_DATA;
                                goto continueStateloop;
                            default:
                                // Parse error: switch to the bogus DOCTYPE state.
                                BogusDoctype();
                                state = TokenizerState.s67_BOGUS_DOCTYPE;
                                goto continueStateloop;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakBeforedoctypesystemidentifierloop:
                    goto case TokenizerState.s65_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.s65_DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '\'':
                                // Closing quote: store the identifier and switch
                                // to the after DOCTYPE system identifier state.
                                systemIdentifier = LongStrBufToString();
                                state = TokenizerState.s66_AFTER_DOCTYPE_SYSTEM_IDENTIFIER;
                                goto continueStateloop;
                            case '>':
                                // Parse error: '>' inside the identifier. Force
                                // quirks, emit, switch to the data state.
                                ErrGtInSystemId();
                                forceQuirks = true;
                                systemIdentifier = LongStrBufToString();
                                EmitDoctypeToken();
                                state = TokenizerState.s01_DATA;
                                goto continueStateloop;
                            case '\r':
                                AppendLongStrBufCarriageReturn();
                                goto breakStateloop;
                            case '\n':
                                AppendLongStrBufLineFeed();
                                continue;
                            case '\u0000':
                                c = '\uFFFD';
                                goto default;
                            default:
                                // Append to the system identifier and stay in the
                                // (single-quoted) state.
                                AppendLongStrBuf(c);
                                continue;
                        }
                    }
                    // eof
                    goto breakStateloop;
                }
            // XXX reorder point
            case TokenizerState.s59_DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '\'':
                                // Closing quote: store the identifier and switch
                                // to the after DOCTYPE public identifier state.
                                publicIdentifier = LongStrBufToString();
                                state = TokenizerState.s60_AFTER_DOCTYPE_PUBLIC_IDENTIFIER;
                                goto continueStateloop;
                            case '>':
                                // Parse error: '>' inside the identifier. Force
                                // quirks, emit, switch to the data state.
                                ErrGtInPublicId();
                                forceQuirks = true;
                                publicIdentifier = LongStrBufToString();
                                EmitDoctypeToken();
                                state = TokenizerState.s01_DATA;
                                goto continueStateloop;
                            case '\r':
                                AppendLongStrBufCarriageReturn();
                                goto breakStateloop;
                            case '\n':
                                AppendLongStrBufLineFeed();
                                continue;
                            case '\u0000':
                                c = '\uFFFD';
                                goto default;
                            default:
                                // Append to the public identifier and stay in the
                                // (single-quoted) state.
                                AppendLongStrBuf(c);
                                continue;
                        }
                    }
                    // eof
                    goto breakStateloop;
                }
            // XXX reorder point
            case TokenizerState.PROCESSING_INSTRUCTION:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '?':
                                state = TokenizerState.PROCESSING_INSTRUCTION_QUESTION_MARK;
                                // A plain "break" would only leave this switch and
                                // keep the while loop consuming input; jump out to
                                // the state loop so the new state actually runs.
                                goto continueStateloop;
                            default:
                                continue;
                        }
                    }
                    // eof
                    goto breakStateloop;
                }
            case TokenizerState.PROCESSING_INSTRUCTION_QUESTION_MARK:
                {
                    char c;
                    if (!reader.ReadNext(out c))
                    {
                        goto breakStateloop;
                    }
                    // Not inside a while loop here, so "continue" re-enters the
                    // outer state loop.
                    switch (c)
                    {
                        case '>':
                            state = TokenizerState.s01_DATA;
                            continue;
                        default:
                            state = TokenizerState.PROCESSING_INSTRUCTION;
                            continue;
                    }
                }
            // END HOTSPOT WORKAROUND
            default:
                // The machine moved to a state this loop has no case for (for
                // example the data state). Without this branch the for(;;)
                // would spin forever without reading input. Leave the loop so
                // the state is saved below.
                // NOTE(review): presumably the caller re-dispatches on
                // stateSave - confirm against the call site.
                goto breakStateloop;
        }
    } // stateloop

breakStateloop:
    FlushChars();
    // Save locals
    stateSave = state;
    returnStateSave = returnState;
}
/// <summary>
/// Parses a list of command-line style arguments into a new instance of T.
/// </summary>
/// <param name="items">The raw arguments. The list itself is not modified; a copy is parsed.</param>
/// <param name="args">Parser settings. On failure <c>ErrorMessage</c> is set for most error paths.</param>
/// <param name="to">Receives the populated instance on success; <c>null</c> on failure.</param>
/// <returns><c>true</c> when all arguments were accepted and the state reports itself complete.</returns>
/// <exception cref="ArgumentNullException"><paramref name="items"/> or <paramref name="args"/> is <c>null</c>.</exception>
internal static bool TryParse(IList<string> items, TokenizerArgs args, out T to)
{
    if (items == null)
    {
        throw new ArgumentNullException("items");
    }
    else if (args == null)
    {
        throw new ArgumentNullException("args");
    }

    // Work on a copy: response-file expansion inserts extra words into this list while iterating.
    List<string> cArgs = new List<string>(items);

    using (TokenizerState<T> state = Tokenizer.NewState<T>(args))
    {
        TokenizerDefinition definition = state.Definition;
        to = null;

        // Once set, every remaining argument is treated as a placed (positional) argument.
        bool atEnd = false;

        // Characters that end an argument name when direct values are allowed.
        char[] checkChars = args.PlusMinSuffixArguments
            ? new char[] { args.ArgumentValueSeparator, '+', '-' }
            : new char[] { args.ArgumentValueSeparator };

        int nPlaced = 0;

        for (int i = 0; i < cArgs.Count; i++)
        {
            string a = cArgs[i];

            if (!atEnd && (a.Length > 1) && args.CommandLineChars.Contains(a[0]))
            {
                bool twoStart = a[0] == a[1];

                if (a.Length == 2 && twoStart)
                {
                    // A doubled command-line character on its own (e.g. "--") ends named-argument parsing.
                    if (!definition.HasPlacedArguments)
                    {
                        args.ErrorMessage = TokenizerMessages.NoPlacedArgumentsDefined;
                        return false;
                    }

                    atEnd = true;

                    // The marker is not a value: skip it instead of falling through
                    // and evaluating it as the next placed argument.
                    continue;
                }

                int aFrom = twoStart ? 2 : 1;
                int aTo = args.AllowDirectArgs ? a.IndexOfAny(checkChars, aFrom) : -1;
                string item = (aTo > 0) ? a.Substring(aFrom, aTo - aFrom) : a.Substring(aFrom);
                TokenItem token;

                if (definition.TryGetToken(item, args.CaseSensitive, out token))
                {
                    if (token.RequiresValue)
                    {
                        // The value is the next argument on the command line.
                        if (i + 1 < cArgs.Count)
                        {
                            token.Evaluate(cArgs[++i], state);
                        }
                        else
                        {
                            args.ErrorMessage = TokenizerMessages.RequiredArgumentValueIsMissing;
                            return false;
                        }
                    }
                    else
                    {
                        token.Evaluate(null, state);
                    }

                    continue;
                }

                // Look for a shorter argument: try ever shorter prefixes of the name and
                // hand the remainder to the token as a direct value.
                for (int ii = item.Length - 1; ii > 0; ii--)
                {
                    if (definition.TryGetToken(item.Substring(0, ii), args.CaseSensitive, out token)
                        && token.AllowDirectValue(item.Substring(ii), state))
                    {
                        token.EvaluateDirect(item.Substring(ii), state);
                        break;
                    }
                    else
                    {
                        token = null;
                    }
                }

                if (token == null)
                {
                    args.ErrorMessage = string.Format(CultureInfo.InvariantCulture, TokenizerMessages.UnknownArgumentX, a);
                    return false;
                }

                if (token.RequiresValue)
                {
                    if (i < cArgs.Count - 1)
                    {
                        // NOTE(review): preserved from the original, which advanced the index here
                        // without ever using the argument it stepped over; together with the loop
                        // increment this skips the following argument. Confirm this is intended.
                        i++;
                    }
                    else
                    {
                        args.ErrorMessage = string.Format(CultureInfo.InvariantCulture, TokenizerMessages.ValueExpectedForArgumentX, a);
                        return false;
                    }
                }

                continue;
            }
            else if (!atEnd && args.AllowResponseFile && a.Length > 1 && a[0] == '@')
            {
                // "@file": splice the words of every non-comment line of the file
                // into the argument list right after the current position.
                string file = a.Substring(1);

                if (!File.Exists(file))
                {
                    args.ErrorMessage = string.Format(CultureInfo.InvariantCulture, TokenizerMessages.ResponseFileXNotFound, file);
                    return false;
                }

                using (StreamReader sr = File.OpenText(file))
                {
                    string line;
                    int n = i + 1;

                    while (null != (line = sr.ReadLine()))
                    {
                        line = line.TrimStart();

                        // Lines of at most one character and lines starting with '#' are ignored.
                        if (line.Length > 1)
                        {
                            if (line[0] != '#')
                            {
                                foreach (string word in Tokenizer.GetCommandlineWords(line))
                                {
                                    cArgs.Insert(n++, word);
                                }
                            }
                        }
                    }
                }

                continue;
            }
            else if (!args.AllowNamedBetweenPlaced)
            {
                // First placed argument seen: no more named arguments are accepted after it.
                atEnd = true;
            }

            if (state.Definition.HasPlacedArguments)
            {
                if (nPlaced < state.Definition.PlacedItems.Count)
                {
                    state.Definition.PlacedItems[nPlaced].Evaluate(cArgs[i], state);
                    nPlaced++;
                }
                else if (state.Definition.RestToken != null)
                {
                    state.Definition.RestToken.Evaluate(cArgs[i], state);
                }
                else
                {
                    args.ErrorMessage = string.Format(CultureInfo.InvariantCulture, TokenizerMessages.UnknownArgumentX, cArgs[i]);
                    return false;
                }
            }
        }

        if (!state.IsComplete)
        {
            return false;
        }

        to = state.Instance;
        return true;
    }
}
/// <summary>
/// Creates a rule for a single tokenizer state by forwarding a one-element
/// state array to the constructor overload that takes several states.
/// </summary>
/// <param name="state">The state in which the rule applies.</param>
/// <param name="applicableData">The data items the rule applies to.</param>
/// <param name="type">The token type associated with the rule.</param>
/// <param name="stateChange">Action invoked with the tokenizer state stack.</param>
public TokenizerRule(
    TokenizerState state,
    IEnumerable<string> applicableData,
    TokenType type,
    Action<Stack<TokenizerState>> stateChange)
    : this(new TokenizerState[] { state }, applicableData, type, stateChange)
{
}
/// <summary>
/// Reads the next token from the input.
/// </summary>
/// <returns>
/// The next token, or <c>null</c> when the input is exhausted or no further
/// positional value could be read.
/// </returns>
public Token ReadNextToken()
{
    // Each pass either returns a token or changes the tokenizer state and retries.
    while (true)
    {
        if (position == input.Length)
        {
            return null;
        }

        // The escape marker switches the tokenizer to positional-only mode; the marker itself is discarded.
        if (state != TokenizerState.PositionalValues && input[position] == EscapeMarker)
        {
            ReadNextString();
            state = TokenizerState.PositionalValues;
        }

        if (state == TokenizerState.PositionalValues)
        {
            // NOTE(review): the position is taken after the string has been read,
            // unlike the other branches which capture it before reading - confirm intended.
            var positionalText = ReadNextString();
            var positionalLocation = GetCurrentPosition();
            return new PositionalArgumentToken(positionalText, positionalLocation);
        }

        var start = GetCurrentPosition();
        var current = ReadChar();

        if (state == TokenizerState.ShortName)
        {
            if (current == Tokenizer.EndOfString)
            {
                // End of the short-name group: go back to normal mode and read again.
                state = TokenizerState.Normal;
                continue;
            }

            return new ShortNameToken(current, start);
        }

        if (current == FlagCharacter)
        {
            if (PeekChar() == FlagCharacter)
            {
                // Second flag character was only peeked; consume it before reading the long name.
                ReadChar();
                var longName = ReadUntilChar(Tokenizer.EndOfString, AssignmentOperator);
                return new LongNameToken(longName, start);
            }

            // Single flag character: the following characters are short names.
            state = TokenizerState.ShortName;
            continue;
        }

        var text = ReadNextString();
        if (text == null)
        {
            return null;
        }

        return new PositionalArgumentToken(text, start);
    }
}
/// <summary>
/// Creates a rule for a single tokenizer state and a single data item by
/// forwarding one-element arrays to the constructor overload that takes several of each.
/// </summary>
/// <param name="state">The state in which the rule applies.</param>
/// <param name="applicableData">The data item the rule applies to.</param>
/// <param name="type">The token type associated with the rule.</param>
/// <param name="stateChange">Action invoked with the tokenizer state stack.</param>
public TokenizerRule(
    TokenizerState state,
    string applicableData,
    TokenType type,
    Action<Stack<TokenizerState>> stateChange)
    : this(new TokenizerState[] { state }, new string[] { applicableData }, type, stateChange)
{
}
/// <summary>
/// Runs the tokenizer state machine for the markup-declaration-open, comment,
/// CDATA-section and bogus-comment states, starting in <paramref name="state"/>.
/// </summary>
/// <param name="state">The state to start in.</param>
/// <param name="returnState">The return state; it is only saved back on exit by this method.</param>
/// <remarks>
/// Idioms used in this method (a port of a Java state loop that used labeled break/continue):
/// <list type="bullet">
/// <item>Consuming: <c>reader.ReadNext(out c)</c> reads the next character; when it returns
/// false (end of available input) the method leaves through <c>breakStateloop</c>.</item>
/// <item>Staying in a state: <c>continue</c> inside the state's own <c>while</c> loop.</item>
/// <item>Switching state: assign <c>state</c>, then either <c>goto continueStateloop</c> to
/// dispatch again, or jump to the state's <c>break...loop</c> label, which does a
/// <c>goto case</c> to the state that follows in source order.</item>
/// <item>Reconsuming: <c>reader.StepBack()</c> is called where the Java original set
/// <c>reconsume = true</c>.</item>
/// <item>U+0000 is replaced with U+FFFD before being appended.</item>
/// <item>CR: the state handles the CR and the method then returns; per the original notes the
/// IO driver is expected to swallow a directly following LF.</item>
/// </list>
/// On exit pending characters are flushed and the locals are stored in
/// <c>stateSave</c> / <c>returnStateSave</c>.
/// </remarks>
void StateLoop3_Comment(TokenizerState state, TokenizerState returnState)
{
    /*stateloop:*/
    for (; ; )
    {
    continueStateloop:
        switch (state)
        {
            case TokenizerState.s45_MARKUP_DECLARATION_OPEN:
                /*markupdeclarationopenloop:*/
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        /*
                         * "--" starts a comment, "DOCTYPE" (ASCII case-insensitive)
                         * starts a doctype, "[CDATA[" starts a CDATA section when the
                         * listener allows it. Anything else is a parse error and
                         * switches to the bogus comment state, where the offending
                         * character becomes the first character of the comment.
                         */
                        switch (c)
                        {
                            case '-':
                                ClearLongStrBufAndAppend(c);
                                state = TokenizerState.MARKUP_DECLARATION_HYPHEN;
                                goto breakMarkupdeclarationopenloop;
                            case 'd':
                            case 'D':
                                ClearLongStrBufAndAppend(c);
                                index = 0;
                                state = TokenizerState.MARKUP_DECLARATION_OCTYPE;
                                goto continueStateloop;
                            case '[':
                                if (TokenListener.IsCDataSectionAllowed)
                                {
                                    ClearLongStrBufAndAppend(c);
                                    index = 0;
                                    state = TokenizerState.CDATA_START;
                                    goto continueStateloop;
                                }
                                else
                                {
                                    goto default;
                                }
                            default:
                                ErrBogusComment();
                                ClearLongStrBuf();
                                state = TokenizerState.s44_BOGUS_COMMENT;
                                // Reconsume the character in the bogus comment state.
                                reader.StepBack();
                                goto continueStateloop;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakMarkupdeclarationopenloop:
                    goto case TokenizerState.MARKUP_DECLARATION_HYPHEN;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.MARKUP_DECLARATION_HYPHEN:
                /*markupdeclarationhyphenloop:*/
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '\u0000':
                                goto breakStateloop;
                            case '-':
                                ClearLongStrBuf();
                                state = TokenizerState.s46_COMMENT_START;
                                goto breakMarkupdeclarationhyphenloop;
                            default:
                                ErrBogusComment();
                                state = TokenizerState.s44_BOGUS_COMMENT;
                                // Reconsume the character in the bogus comment state.
                                reader.StepBack();
                                goto continueStateloop;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakMarkupdeclarationhyphenloop:
                    goto case TokenizerState.s46_COMMENT_START;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.s46_COMMENT_START:
                /*commentstartloop:*/
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '-':
                                /* Switch to the comment start dash state. */
                                AppendLongStrBuf(c);
                                state = TokenizerState.s47_COMMENT_START_DASH;
                                goto continueStateloop;
                            case '>':
                                /* Parse error. Emit the comment token and switch to the data state. */
                                ErrPrematureEndOfComment();
                                EmitComment(0);
                                state = TokenizerState.s01_DATA;
                                goto continueStateloop;
                            case '\r':
                                AppendLongStrBufCarriageReturn();
                                state = TokenizerState.s48_COMMENT;
                                goto breakStateloop;
                            case '\n':
                                AppendLongStrBufLineFeed();
                                state = TokenizerState.s48_COMMENT;
                                goto breakCommentstartloop;
                            case '\u0000':
                                c = '\uFFFD';
                                goto default;
                            default:
                                /* Append the character to the comment data and switch to the comment state. */
                                AppendLongStrBuf(c);
                                state = TokenizerState.s48_COMMENT;
                                goto breakCommentstartloop;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakCommentstartloop:
                    goto case TokenizerState.s48_COMMENT;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.s48_COMMENT:
                /*commentloop:*/
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '-':
                                /* Switch to the comment end dash state. */
                                AppendLongStrBuf(c);
                                state = TokenizerState.s49_COMMENT_END_DASH;
                                goto breakCommentloop;
                            case '\r':
                                AppendLongStrBufCarriageReturn();
                                goto breakStateloop;
                            case '\n':
                                AppendLongStrBufLineFeed();
                                continue;
                            case '\u0000':
                                c = '\uFFFD';
                                goto default;
                            default:
                                /* Append the character and stay in the comment state. */
                                AppendLongStrBuf(c);
                                continue;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakCommentloop:
                    goto case TokenizerState.s49_COMMENT_END_DASH;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.s49_COMMENT_END_DASH:
                /*commentenddashloop:*/
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '-':
                                /* Switch to the comment end state. */
                                AppendLongStrBuf(c);
                                state = TokenizerState.s50_COMMENT_END;
                                goto breakCommentenddashloop;
                            case '\r':
                                AppendLongStrBufCarriageReturn();
                                state = TokenizerState.s48_COMMENT;
                                goto breakStateloop;
                            case '\n':
                                AppendLongStrBufLineFeed();
                                state = TokenizerState.s48_COMMENT;
                                goto continueStateloop;
                            case '\u0000':
                                c = '\uFFFD';
                                goto default;
                            default:
                                /* Append the character and switch back to the comment state. */
                                AppendLongStrBuf(c);
                                state = TokenizerState.s48_COMMENT;
                                goto continueStateloop;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakCommentenddashloop:
                    goto case TokenizerState.s50_COMMENT_END;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.s50_COMMENT_END:
                /*commentendloop:*/
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '>':
                                /* Emit the comment token and switch to the data state. */
                                EmitComment(2);
                                state = TokenizerState.s01_DATA;
                                goto continueStateloop;
                            case '-':
                                /* Parse error. Append a hyphen and stay in the comment end state. */
                                AdjustDoubleHyphenAndAppendToLongStrBufAndErr(c);
                                continue;
                            case '\r':
                                AdjustDoubleHyphenAndAppendToLongStrBufCarriageReturn();
                                state = TokenizerState.s48_COMMENT;
                                goto breakStateloop;
                            case '\n':
                                AdjustDoubleHyphenAndAppendToLongStrBufLineFeed();
                                state = TokenizerState.s48_COMMENT;
                                goto continueStateloop;
                            case '!':
                                ErrHyphenHyphenBang();
                                AppendLongStrBuf(c);
                                state = TokenizerState.s51_COMMENT_END_BANG;
                                goto continueStateloop;
                            case '\u0000':
                                c = '\uFFFD';
                                goto default;
                            default:
                                /* Append the character and switch back to the comment state. */
                                AdjustDoubleHyphenAndAppendToLongStrBufAndErr(c);
                                state = TokenizerState.s48_COMMENT;
                                goto continueStateloop;
                        }
                    }
                    // eof
                    goto breakStateloop;
                }
            // XXX reorder point
            case TokenizerState.s51_COMMENT_END_BANG:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '>':
                                /* Emit the comment token and switch to the data state. */
                                EmitComment(3);
                                state = TokenizerState.s01_DATA;
                                goto continueStateloop;
                            case '-':
                                /* Switch to the comment end dash state. */
                                AppendLongStrBuf(c);
                                state = TokenizerState.s49_COMMENT_END_DASH;
                                goto continueStateloop;
                            case '\r':
                                AppendLongStrBufCarriageReturn();
                                goto breakStateloop;
                            case '\n':
                                AppendLongStrBufLineFeed();
                                continue;
                            case '\u0000':
                                c = '\uFFFD';
                                goto default;
                            default:
                                /* Append the character and switch back to the comment state. */
                                AppendLongStrBuf(c);
                                state = TokenizerState.s48_COMMENT;
                                goto continueStateloop;
                        }
                    }
                    // eof
                    goto breakStateloop;
                }
            // XXX reorder point
            case TokenizerState.s47_COMMENT_START_DASH:
                {
                    char c;
                    if (!reader.ReadNext(out c))
                    {
                        // eof
                        goto breakStateloop;
                    }
                    switch (c)
                    {
                        case '-':
                            /* Switch to the comment end state. */
                            AppendLongStrBuf(c);
                            state = TokenizerState.s50_COMMENT_END;
                            goto continueStateloop;
                        case '>':
                            /* Parse error. Emit the comment token and switch to the data state. */
                            ErrPrematureEndOfComment();
                            EmitComment(1);
                            state = TokenizerState.s01_DATA;
                            goto continueStateloop;
                        case '\r':
                            AppendLongStrBufCarriageReturn();
                            state = TokenizerState.s48_COMMENT;
                            goto breakStateloop;
                        case '\n':
                            AppendLongStrBufLineFeed();
                            state = TokenizerState.s48_COMMENT;
                            goto continueStateloop;
                        case '\u0000':
                            c = '\uFFFD';
                            goto default;
                        default:
                            /* Append the character and switch to the comment state. */
                            AppendLongStrBuf(c);
                            state = TokenizerState.s48_COMMENT;
                            goto continueStateloop;
                    }
                }
            // XXX reorder point
            case TokenizerState.CDATA_START:
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        if (index < 6)
                        { // CDATA_LSQB.Length
                            if (c == Tokenizer.CDATA_LSQB[index])
                            {
                                AppendLongStrBuf(c);
                            }
                            else
                            {
                                ErrBogusComment();
                                state = TokenizerState.s44_BOGUS_COMMENT;
                                // Reconsume the character in the bogus comment state.
                                reader.StepBack();
                                goto continueStateloop;
                            }
                            index++;
                            continue;
                        }
                        else
                        {
                            reader.StartCollect(); // start coalescing
                            state = TokenizerState.s68_CDATA_SECTION;
                            // Reconsume the character in the CDATA section state.
                            reader.StepBack();
                            goto case TokenizerState.s68_CDATA_SECTION;
                        }
                    }
                    // eof
                    goto breakStateloop;
                }
            // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
            case TokenizerState.s68_CDATA_SECTION:
                /*cdatasectionloop:*/
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case ']':
                                FlushChars();
                                state = TokenizerState.CDATA_RSQB;
                                goto breakCdatasectionloop;
                            case '\u0000':
                                EmitReplacementCharacter();
                                continue;
                            case '\r':
                                EmitCarriageReturn();
                                goto breakStateloop;
                            case '\n':
                            default:
                                continue;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakCdatasectionloop:
                    goto case TokenizerState.CDATA_RSQB;
                }
            // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
            case TokenizerState.CDATA_RSQB:
                /*cdatarsqb:*/
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case ']':
                                state = TokenizerState.CDATA_RSQB_RSQB;
                                goto breakCdatarsqb;
                            default:
                                // A single "]" was ordinary CDATA content: emit it and resume the section.
                                TokenListener.Characters(Tokenizer.RSQB_RSQB, 0, 1);
                                reader.StartCollect();
                                state = TokenizerState.s68_CDATA_SECTION;
                                // Reconsume the character in the CDATA section state.
                                reader.StepBack();
                                goto continueStateloop;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakCdatarsqb:
                    goto case TokenizerState.CDATA_RSQB_RSQB;
                }
            // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER
            case TokenizerState.CDATA_RSQB_RSQB:
                {
                    char c;
                    if (!reader.ReadNext(out c))
                    {
                        // eof
                        goto breakStateloop;
                    }
                    switch (c)
                    {
                        case ']':
                            // Third (or later) "]" in a row: emit one "]" (logically the first
                            // one) and stay in this state, because the last two characters
                            // seen are still "]]". Without this, "]]]>" would never close
                            // the CDATA section.
                            TokenListener.Characters(Tokenizer.RSQB_RSQB, 0, 1);
                            goto continueStateloop;
                        case '>':
                            // "]]>" closes the section; character collection restarts after the ">".
                            reader.SkipOneAndStartCollect();
                            state = TokenizerState.s01_DATA;
                            goto continueStateloop;
                        default:
                            // "]]" was ordinary CDATA content: emit both and resume the section.
                            TokenListener.Characters(Tokenizer.RSQB_RSQB, 0, 2);
                            reader.StartCollect();
                            state = TokenizerState.s68_CDATA_SECTION;
                            // Reconsume the character in the CDATA section state.
                            reader.StepBack();
                            goto continueStateloop;
                    }
                }
            // XXX reorder point
            // BEGIN HOTSPOT WORKAROUND
            case TokenizerState.s44_BOGUS_COMMENT:
                /*boguscommentloop:*/
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        /*
                         * Consume every character up to and including the first ">"
                         * (or end of input), emit the collected characters as a
                         * comment token, then switch to the data state.
                         */
                        switch (c)
                        {
                            case '>':
                                EmitComment(0);
                                state = TokenizerState.s01_DATA;
                                goto continueStateloop;
                            case '-':
                                AppendLongStrBuf(c);
                                state = TokenizerState.BOGUS_COMMENT_HYPHEN;
                                goto breakBoguscommentloop;
                            case '\r':
                                AppendLongStrBufCarriageReturn();
                                goto breakStateloop;
                            case '\n':
                                AppendLongStrBufLineFeed();
                                continue;
                            case '\u0000':
                                c = '\uFFFD';
                                goto default;
                            default:
                                AppendLongStrBuf(c);
                                continue;
                        }
                    }
                    // eof
                    goto breakStateloop;
                breakBoguscommentloop:
                    goto case TokenizerState.BOGUS_COMMENT_HYPHEN;
                }
            // FALLTHRU DON'T REORDER
            case TokenizerState.BOGUS_COMMENT_HYPHEN:
                /*boguscommenthyphenloop:*/
                {
                    char c;
                    while (reader.ReadNext(out c))
                    {
                        switch (c)
                        {
                            case '>':
                                // [NOCPP[
                                MaybeAppendSpaceToBogusComment();
                                // ]NOCPP]
                                EmitComment(0);
                                state = TokenizerState.s01_DATA;
                                goto continueStateloop;
                            case '-':
                                AppendSecondHyphenToBogusComment();
                                goto continueBoguscommenthyphenloop;
                            case '\r':
                                AppendLongStrBufCarriageReturn();
                                state = TokenizerState.s44_BOGUS_COMMENT;
                                goto breakStateloop;
                            case '\n':
                                AppendLongStrBufLineFeed();
                                state = TokenizerState.s44_BOGUS_COMMENT;
                                goto continueStateloop;
                            case '\u0000':
                                c = '\uFFFD';
                                goto default;
                            default:
                                AppendLongStrBuf(c);
                                state = TokenizerState.s44_BOGUS_COMMENT;
                                goto continueStateloop;
                        }
                    continueBoguscommenthyphenloop:
                        continue;
                    }
                    // eof
                    goto breakStateloop;
                }
            // END HOTSPOT WORKAROUND
            default:
                // The state is not one handled by this method (for example the data state
                // after a comment has been emitted). Leave the loop so the state is saved,
                // instead of spinning forever on a switch that matches nothing.
                // NOTE(review): assumes the caller dispatches again based on stateSave - confirm.
                goto breakStateloop;
        }
    } // stateloop

breakStateloop:
    FlushChars();

    // Save locals
    stateSave = state;
    returnStateSave = returnState;
}
/// <summary>
/// Splits JSON-like text into tokens: quoted strings (quotes included), bare words
/// (numbers, unquoted strings, nulls, booleans) and single syntax characters.
/// Whitespace is dropped, as are <c>/* ... */</c> comments when
/// <paramref name="allowComments"/> is set.
/// </summary>
/// <param name="text">The text to tokenize.</param>
/// <param name="allowComments">Whether <c>/* ... */</c> comments are recognised and skipped.</param>
/// <returns>The tokens in source order, each carrying its 1-based line and column.</returns>
/// <exception cref="JsonParserException">A string or comment is still open at the end of the text.</exception>
private static Token[] Init(string text, bool allowComments)
{
    // Sentinel newline: guarantees a trailing word is terminated, so that any state
    // other than NONE at the end means a string or comment was left open.
    text += "\n";
    int length = text.Length;

    // Line and column (both 1-based) of every character offset.
    int[] lineOf = new int[length];
    int[] columnOf = new int[length];
    int currentLine = 1;
    int currentColumn = 1;
    for (int offset = 0; offset < length; ++offset)
    {
        lineOf[offset] = currentLine;
        columnOf[offset] = currentColumn;
        if (text[offset] == '\n')
        {
            currentLine++;
            currentColumn = 1;
        }
        else
        {
            currentColumn++;
        }
    }

    List<Token> result = new List<Token>();
    TokenizerState mode = TokenizerState.NONE;
    int start = 0;
    char quote = '\0';

    for (int pos = 0; pos < length; ++pos)
    {
        char ch = text[pos];

        // Characters that may start or continue a bare word.
        bool isWordChar = (ch >= '0' && ch <= '9')
            || (ch >= 'a' && ch <= 'z')
            || (ch >= 'A' && ch <= 'Z')
            || ch == '_' || ch == '.' || ch == '-';

        if (mode == TokenizerState.NONE)
        {
            if (ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t')
            {
                // skip whitespace
            }
            else if (ch == '"' || ch == '\'')
            {
                quote = ch;
                start = pos;
                mode = TokenizerState.STRING;
            }
            else if (ch == '/')
            {
                if (allowComments && pos + 1 < length && text[pos + 1] == '*')
                {
                    mode = TokenizerState.COMMENT;
                    pos++; // step over the '*' so that "/*/" is not a self-closing comment
                }
                else
                {
                    // Emit it as a token and let the parser report the error.
                    result.Add(new Token("/", lineOf[pos], columnOf[pos]));
                }
            }
            else if (isWordChar)
            {
                mode = TokenizerState.WORD;
                start = pos;
            }
            else
            {
                // Either a JSON syntax character (bracket, comma, colon) or an error.
                result.Add(new Token(ch.ToString(), lineOf[pos], columnOf[pos]));
            }
        }
        else if (mode == TokenizerState.COMMENT)
        {
            if (ch == '*' && pos + 1 < length && text[pos + 1] == '/')
            {
                ++pos; // skip the slash
                mode = TokenizerState.NONE;
            }
        }
        else if (mode == TokenizerState.STRING)
        {
            if (ch == quote)
            {
                // Closing quote: the token text includes both quotes.
                result.Add(new Token(text.Substring(start, pos - start + 1), lineOf[start], columnOf[start]));
                mode = TokenizerState.NONE;
            }
            else if (ch == '\\')
            {
                pos++; // skip the escaped character
            }
        }
        else if (mode == TokenizerState.WORD)
        {
            if (!isWordChar)
            {
                result.Add(new Token(text.Substring(start, pos - start), lineOf[start], columnOf[start]));
                --pos; // look at the terminating character again in the NONE state
                mode = TokenizerState.NONE;
            }
        }
    }

    if (mode == TokenizerState.NONE)
    {
        return result.ToArray();
    }

    if (mode == TokenizerState.COMMENT)
    {
        throw new JsonParserException("Unexpected EOF detected. A comment seems to be left unclosed.");
    }

    throw new JsonParserException("Unexpected EOF detected. A string seems to be left unclosed.");
}
/// <summary>
/// Gets the next token.
/// </summary>
/// <remarks>
/// A token ends where a comment starts after pending text, where a block comment ends,
/// or at the end of the line. String literals are kept inside the surrounding text token.
/// A comment state still open when the line ends is kept in <c>State</c>; a line comment
/// state is reset by the call that finds nothing left to read.
/// </remarks>
/// <returns>
/// A <see cref="Token" /> or <c>null</c> at end of string.
/// </returns>
/// <exception cref="InvalidOperationException">
/// A block comment terminator is found while not inside a block comment.
/// </exception>
public Token GetNextToken()
{
    this.valueBuilder.Clear();

    // State that classifies the token being built. It starts as the state carried in
    // from the previous call and is updated when this token itself begins with a comment.
    var tokenState = this.State;

    // Position at entry. While the position is unchanged nothing has been consumed
    // for this token yet (assumes Consume always advances the position).
    var startPosition = this.position;

    while (this.position < this.lineChars.Length)
    {
        // Peek current position.
        var s1 = new string(this.lineChars[this.position], 1);

        // Peek 2 character string at current position (1 character at the end of the line).
        var s2 = this.position + 1 < this.lineChars.Length
                     ? s1 + new string(this.lineChars[this.position + 1], 1)
                     : s1;

        switch (this.State)
        {
            case TokenizerState.None:

                // Look for start of string literal
                switch (s1)
                {
                    case SingleQuote:
                        this.State = TokenizerState.SingleQuoteString;
                        this.Consume();
                        continue;

                    case DoubleQuote:
                        this.State = TokenizerState.DoubleQuoteString;
                        this.Consume();
                        continue;
                }

                // Look for start/end of comments
                switch (s2)
                {
                    case LineComment:
                        this.State = TokenizerState.LineComment;

                        if (this.position == startPosition)
                        {
                            // No text is pending, so this token is the comment itself:
                            // classify it as such and continue parsing the comment.
                            tokenState = this.State;
                            continue;
                        }

                        // Text is pending: emit it first, the comment follows on the next call.
                        break;

                    case BlockCommentStart:
                        this.State = TokenizerState.BlockComment;

                        if (this.position == startPosition)
                        {
                            // No text is pending, so this token is the comment itself:
                            // classify it as such and continue parsing the comment.
                            tokenState = this.State;
                            continue;
                        }

                        // Text is pending: emit it first, the comment follows on the next call.
                        break;

                    case BlockCommentEnd:
                        throw new InvalidOperationException("Found BlockCommentEnd when not within a block comment");

                    default:
                        this.Consume();
                        continue;
                }

                break;

            case TokenizerState.BlockComment:

                // Look for block comment end
                if (s2 == BlockCommentEnd)
                {
                    this.State = TokenizerState.None;
                    this.Consume(2);
                }
                else
                {
                    this.Consume();
                    continue;
                }

                break;

            case TokenizerState.LineComment:

                // While in a line comment, consume till end of string
                this.Consume();
                continue;

            case TokenizerState.SingleQuoteString:

                if (s2 == QuoteQuote)
                {
                    // QuoteQuote within single quote string literal is part of the string
                    this.Consume(2);
                    continue;
                }

                if (s1 == SingleQuote)
                {
                    // End of string literal
                    this.State = TokenizerState.None;
                }

                this.Consume();
                continue;

            case TokenizerState.DoubleQuoteString:

                if (s1 == DoubleQuote)
                {
                    // End of string literal
                    this.State = TokenizerState.None;
                }

                this.Consume();
                continue;
        }

        // If we get here, the token is ready to be emitted
        break;
    }

    var value = this.valueBuilder.ToString();

    if (value.Length == 0)
    {
        // End of line
        if (this.State == TokenizerState.LineComment)
        {
            // Thus end of line comment
            this.State = TokenizerState.None;
        }

        return null;
    }

    if (tokenState == TokenizerState.BlockComment || tokenState == TokenizerState.LineComment)
    {
        return new Token(TokenType.Comment, value);
    }

    return new Token(TokenType.Text, value);
}
void StateLoop3_Tag(TokenizerState state, TokenizerState returnState) { /* * Idioms used in this code: * * * Consuming the next input character * * To consume the next input character, the code does this: if (++pos == * endPos) { goto breakStateloop; } c = buf[pos]; * * * Staying in a state * * When there's a state that the tokenizer may stay in over multiple * input characters, the state has a wrapper |for(;;)| loop and staying * in the state continues the loop. * * * Switching to another state * * To switch to another state, the code sets the state variable to the * magic number of the new state. Then it either continues stateloop or * breaks out of the state's own wrapper loop if the target state is * right after the current state in source order. (This is a partial * workaround for Java's lack of goto.) * * * Reconsume support * * The spec sometimes says that an input character is reconsumed in * another state. If a state can ever be entered so that an input * character can be reconsumed in it, the state's code starts with an * |if (reconsume)| that sets reconsume to false and skips over the * normal code for consuming a new character. * * To reconsume the current character in another state, the code sets * |reconsume| to true and then switches to the other state. * * * Emitting character tokens * * This method emits character tokens lazily. Whenever a new range of * character tokens starts, the field cstart must be set to the start * index of the range. The flushChars() method must be called at the end * of a range to flush it. * * * U+0000 handling * * The various states have to handle the replacement of U+0000 with * U+FFFD. However, if U+0000 would be reconsumed in another state, the * replacement doesn't need to happen, because it's handled by the * reconsuming state. * * * LF handling * * Every state needs to increment the line number upon LF unless the LF * gets reconsumed by another state which increments the line number. 
* * * CR handling * * Every state needs to handle CR unless the CR gets reconsumed and is * handled by the reconsuming state. The CR needs to be handled as if it * were and LF, the lastCR field must be set to true and then this * method must return. The IO driver will then swallow the next * character if it is an LF to coalesce CRLF. */ /* * As there is no support for labeled loops in C#, instead of break <loop>; * the port uses goto break<loop>; and a label after the loop. * Instead of continue <loop>; it uses goto continue<loop>; and a label * at the beginning or end of the loop (which doesn't matter in for(;;) loops) */ /*stateloop:*/ for (; ; ) { //************* continueStateloop: //************* switch (state) { case TokenizerState.s01_DATA: /*dataloop:*/ { char c; while (reader.ReadNext(out c)) { switch (c) { case '&': /* * U+0026 AMPERSAND (&) Switch to the character * reference in data state. */ FlushChars(); ClearStrBufAndAppend(c); SetAdditionalAndRememberAmpersandLocation('\u0000'); returnState = state; //state = Transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); state = TokenizerState.CONSUME_CHARACTER_REFERENCE; goto continueStateloop; case '<': /* * U+003C LESS-THAN SIGN (<) Switch to the tag * open state. */ FlushChars(); //state = Transition(state, Tokenizer.TAG_OPEN, reconsume, pos); state = TokenizerState.s08_TAG_OPEN; goto breakDataloop; // FALL THROUGH continue // stateloop; case '\u0000': EmitReplacementCharacter(); continue; case '\r': EmitCarriageReturn(); goto breakStateloop; case '\n': default: /* * Anything else Emit the input character as a * character token. * * Stay in the data state. 
*/ continue; } } //------------------------------------ //eof goto breakStateloop; //------------ breakDataloop: goto case TokenizerState.s08_TAG_OPEN; //------------ } // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.s08_TAG_OPEN: /*tagopenloop:*/ { char c; while (reader.ReadNext(out c)) { /* * The behavior of this state depends on the content * model flag. */ /* * If the content model flag is set to the PCDATA state * Consume the next input character: */ if (c >= 'A' && c <= 'Z') { /* * U+0041 LATIN CAPITAL LETTER A through to U+005A * LATIN CAPITAL LETTER Z Create a new start tag * token, */ endTag = false; /* * set its tag name to the lowercase TokenizerState.version of the * input character (add 0x0020 to the character's * code point), */ ClearStrBufAndAppend((char)(c + 0x20)); /* then switch to the tag name state. */ //state = Transition(state, Tokenizer.TAG_NAME, reconsume, pos); state = TokenizerState.s10_TAG_NAME; /* * (Don't emit the token yet; further details will * be filled in before it is emitted.) */ goto breakTagopenloop; // goto continueStateloop; } else if (c >= 'a' && c <= 'z') { /* * U+0061 LATIN SMALL LETTER A through to U+007A * LATIN SMALL LETTER Z Create a new start tag * token, */ endTag = false; /* * set its tag name to the input character, */ ClearStrBufAndAppend(c); /* then switch to the tag name state. */ //state = Transition(state, Tokenizer.TAG_NAME, reconsume, pos); state = TokenizerState.s10_TAG_NAME; /* * (Don't emit the token yet; further details will * be filled in before it is emitted.) */ goto breakTagopenloop; // goto continueStateloop; } switch (c) { case '!': /* * U+0021 EXCLAMATION MARK (!) Switch to the * markup declaration open state. */ //state = Transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos); state = TokenizerState.s45_MARKUP_DECLARATION_OPEN; goto continueStateloop; case '/': /* * U+002F SOLIDUS (/) Switch to the close tag * open state. 
*/ //state = Transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos); state = TokenizerState.s09_CLOSE_TAG_OPEN; goto continueStateloop; case '?': /* * U+003F QUESTION MARK (?) Parse error. */ ErrProcessingInstruction(); /* * Switch to the bogus comment state. */ ClearLongStrBufAndAppend(c); //state = Transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); state = TokenizerState.s44_BOGUS_COMMENT; goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Parse error. */ ErrLtGt(); /* * Emit a U+003C LESS-THAN SIGN character token * and a U+003E GREATER-THAN SIGN character * token. */ TokenListener.Characters(LT_GT, 0, 2); /* Switch to the data state. */ //cstart = pos + 1; reader.SkipOneAndStartCollect(); //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.s01_DATA; goto continueStateloop; default: /* * Anything else Parse error. */ ErrBadCharAfterLt(c); /* * Emit a U+003C LESS-THAN SIGN character token */ TokenListener.Characters(LT_GT, 0, 1); /* * and reconsume the current input character in * the data state. 
*/ reader.StartCollect(); //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.s01_DATA; //reconsume = true; reader.StepBack(); goto continueStateloop; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakTagopenloop: goto case TokenizerState.s10_TAG_NAME; } // FALL THROUGH DON'T REORDER case TokenizerState.s10_TAG_NAME: /*tagnameloop:*/ { char c; while (reader.ReadNext(out c)) { /* * Consume the next input character: */ switch (c) { case '\r': SilentCarriageReturn(); StrBufToElementNameString(); //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.s34_BEFORE_ATTRIBUTE_NAME; goto breakStateloop; case '\n': case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE * Switch to the before attribute name state. */ StrBufToElementNameString(); //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.s34_BEFORE_ATTRIBUTE_NAME; goto breakTagnameloop; // goto continueStateloop; case '/': /* * U+002F SOLIDUS (/) Switch to the self-closing * start tag state. */ StrBufToElementNameString(); //state = Transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); state = TokenizerState.s43_SELF_CLOSING_START_TAG; goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current * tag token. */ StrBufToElementNameString(); //state = Transition(state, EmitCurrentTagToken(false, pos), reconsume, pos); state = EmitCurrentTagToken(false); if (shouldSuspend) { goto breakStateloop; } /* * Switch to the data state. 
*/ goto continueStateloop; case '\u0000': c = '\uFFFD'; goto default; // fall thru default: if (c >= 'A' && c <= 'Z') { /* * U+0041 LATIN CAPITAL LETTER A through to * U+005A LATIN CAPITAL LETTER Z Append the * lowercase TokenizerState.version of the current input * character (add 0x0020 to the character's * code point) to the current tag token's * tag name. */ c += (char)0x20; } /* * Anything else Append the current input * character to the current tag token's tag * name. */ AppendStrBuf(c); /* * Stay in the tag name state. */ continue; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakTagnameloop: goto case TokenizerState.s34_BEFORE_ATTRIBUTE_NAME; } // FALLTHRU DON'T REORDER case TokenizerState.s34_BEFORE_ATTRIBUTE_NAME: /*beforeattributenameloop:*/ { char c; while (reader.ReadNext(out c)) { switch (c) { case '\r': SilentCarriageReturn(); goto breakStateloop; case '\n': case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay * in the before attribute name state. */ continue; case '/': /* * U+002F SOLIDUS (/) Switch to the self-closing * start tag state. */ //state = Transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); state = TokenizerState.s43_SELF_CLOSING_START_TAG; goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current * tag token. */ //state = Transition(state, EmitCurrentTagToken(false, pos), reconsume, pos); state = EmitCurrentTagToken(false); if (shouldSuspend) { goto breakStateloop; } /* * Switch to the data state. */ goto continueStateloop; case '\u0000': c = '\uFFFD'; // fall thru goto case '\"'; case '\"': case '\'': case '<': case '=': /* * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS * SIGN (=) Parse error. */ ErrBadCharBeforeAttributeNameOrNull(c); /* * Treat it as per the "anything else" entry * below. 
*/ goto default; default: /* * Anything else Start a new attribute in the * current tag token. */ if (c >= 'A' && c <= 'Z') { /* * U+0041 LATIN CAPITAL LETTER A through to * U+005A LATIN CAPITAL LETTER Z Set that * attribute's name to the lowercase TokenizerState.version * of the current input character (add * 0x0020 to the character's code point) */ c += (char)0x20; } /* * Set that attribute's name to the current * input character, */ ClearStrBufAndAppend(c); /* * and its value to the empty string. */ // Will do later. /* * Switch to the attribute name state. */ //state = Transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.s35_ATTRIBUTE_NAME; goto breakBeforeattributenameloop; // goto continueStateloop; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakBeforeattributenameloop: goto case TokenizerState.s35_ATTRIBUTE_NAME; } // FALLTHRU DON'T REORDER case TokenizerState.s35_ATTRIBUTE_NAME: /*attributenameloop:*/ { char c; while (reader.ReadNext(out c)) { /* * Consume the next input character: */ switch (c) { case '\r': SilentCarriageReturn(); AttributeNameComplete(); //state = Transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.s36_AFTER_ATTRIBUTE_NAME; goto breakStateloop; case '\n': case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE * Switch to the after attribute name state. */ AttributeNameComplete(); //state = Transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.s36_AFTER_ATTRIBUTE_NAME; goto continueStateloop; case '/': /* * U+002F SOLIDUS (/) Switch to the self-closing * start tag state. 
*/ AttributeNameComplete(); AddAttributeWithoutValue(); //state = Transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); state = TokenizerState.s43_SELF_CLOSING_START_TAG; goto continueStateloop; case '=': /* * U+003D EQUALS SIGN (=) Switch to the before * attribute value state. */ AttributeNameComplete(); //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); state = TokenizerState.s37_BEFORE_ATTRIBUTE_VALUE; goto breakAttributenameloop; // goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current * tag token. */ AttributeNameComplete(); AddAttributeWithoutValue(); //state = Transition(state, EmitCurrentTagToken(false, pos), reconsume, pos); state = EmitCurrentTagToken(false); if (shouldSuspend) { goto breakStateloop; } /* * Switch to the data state. */ goto continueStateloop; case '\u0000': c = '\uFFFD'; // fall thru goto case '\"'; case '\"': case '\'': case '<': /* * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE * (') U+003C LESS-THAN SIGN (<) Parse error. */ ErrQuoteOrLtInAttributeNameOrNull(c); /* * Treat it as per the "anything else" entry * below. */ goto default; default: if (c >= 'A' && c <= 'Z') { /* * U+0041 LATIN CAPITAL LETTER A through to * U+005A LATIN CAPITAL LETTER Z Append the * lowercase TokenizerState.version of the current input * character (add 0x0020 to the character's * code point) to the current attribute's * name. */ c += (char)0x20; } /* * Anything else Append the current input * character to the current attribute's name. */ AppendStrBuf(c); /* * Stay in the attribute name state. 
*/ continue; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakAttributenameloop: goto case TokenizerState.s37_BEFORE_ATTRIBUTE_VALUE; } // FALLTHRU DON'T REORDER case TokenizerState.s37_BEFORE_ATTRIBUTE_VALUE: /*beforeattributevalueloop:*/ { char c; while (reader.ReadNext(out c)) { switch (c) { case '\r': SilentCarriageReturn(); goto breakStateloop; case '\n': case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay * in the before attribute value state. */ continue; case '"': /* * U+0022 QUOTATION MARK (") Switch to the * attribute value (double-quoted) state. */ ClearLongStrBuf(); //state = Transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos); state = TokenizerState.s38_ATTRIBUTE_VALUE_DOUBLE_QUOTED; goto breakBeforeattributevalueloop; // goto continueStateloop; case '&': /* * U+0026 AMPERSAND (&) Switch to the attribute * value (unquoted) state and reconsume this * input character. */ ClearLongStrBuf(); //state = Transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); state = TokenizerState.s40_ATTRIBUTE_VALUE_UNQUOTED; NoteUnquotedAttributeValue(); //reconsume = true; reader.StepBack(); goto continueStateloop; case '\'': /* * U+0027 APOSTROPHE (') Switch to the attribute * value (single-quoted) state. */ ClearLongStrBuf(); //state = Transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos); state = TokenizerState.s39_ATTRIBUTE_VALUE_SINGLE_QUOTED; goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Parse error. */ ErrAttributeValueMissing(); /* * Emit the current tag token. */ AddAttributeWithoutValue(); //state = Transition(state, EmitCurrentTagToken(false, pos), reconsume, pos); state = EmitCurrentTagToken(false); if (shouldSuspend) { goto breakStateloop; } /* * Switch to the data state. 
*/ goto continueStateloop; case '\u0000': c = '\uFFFD'; // fall thru goto case '<'; case '<': case '=': case '`': /* * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN * (=) U+0060 GRAVE ACCENT (`) */ ErrLtOrEqualsOrGraveInUnquotedAttributeOrNull(c); /* * Treat it as per the "anything else" entry * below. */ goto default; default: // [NOCPP[ ErrHtml4NonNameInUnquotedAttribute(c); // ]NOCPP] /* * Anything else Append the current input * character to the current attribute's value. */ ClearLongStrBufAndAppend(c); /* * Switch to the attribute value (unquoted) * state. */ //state = Transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); state = TokenizerState.s40_ATTRIBUTE_VALUE_UNQUOTED; NoteUnquotedAttributeValue(); goto continueStateloop; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakBeforeattributevalueloop: goto case TokenizerState.s38_ATTRIBUTE_VALUE_DOUBLE_QUOTED; } // FALLTHRU DON'T REORDER case TokenizerState.s38_ATTRIBUTE_VALUE_DOUBLE_QUOTED: /*attributevaluedoublequotedloop:*/ { char c; while (reader.ReadNext(out c)) { switch (c) { case '"': /* * U+0022 QUOTATION MARK (") Switch to the after * attribute value (quoted) state. */ AddAttributeWithValue(); //state = Transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); state = TokenizerState.s42__AFTER_ATTRIBUTE_VALUE_QUOTED; goto breakAttributevaluedoublequotedloop; // goto continueStateloop; case '&': /* * U+0026 AMPERSAND (&) Switch to the character * reference in attribute value state, with the * additional allowed character being U+0022 * QUOTATION MARK ("). 
*/ ClearStrBufAndAppend(c); SetAdditionalAndRememberAmpersandLocation('\"'); returnState = state; //state = Transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); state = TokenizerState.CONSUME_CHARACTER_REFERENCE; goto continueStateloop; case '\r': AppendLongStrBufCarriageReturn(); goto breakStateloop; case '\n': AppendLongStrBufLineFeed(); continue; case '\u0000': c = '\uFFFD'; // fall thru goto default; default: /* * Anything else Append the current input * character to the current attribute's value. */ AppendLongStrBuf(c); /* * Stay in the attribute value (double-quoted) * state. */ continue; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakAttributevaluedoublequotedloop: goto case TokenizerState.s42__AFTER_ATTRIBUTE_VALUE_QUOTED; } // FALLTHRU DON'T REORDER case TokenizerState.s42__AFTER_ATTRIBUTE_VALUE_QUOTED: /*afterattributevaluequotedloop:*/ { char c; while (reader.ReadNext(out c)) { switch (c) { case '\r': SilentCarriageReturn(); //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.s34_BEFORE_ATTRIBUTE_NAME; goto breakStateloop; case '\n': case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE * Switch to the before attribute name state. */ //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.s34_BEFORE_ATTRIBUTE_NAME; goto continueStateloop; case '/': /* * U+002F SOLIDUS (/) Switch to the self-closing * start tag state. */ //state = Transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); state = TokenizerState.s43_SELF_CLOSING_START_TAG; goto breakAfterattributevaluequotedloop; // goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current * tag token. 
*/ //state = Transition(state, EmitCurrentTagToken(false, pos), reconsume, pos); state = EmitCurrentTagToken(false); if (shouldSuspend) { goto breakStateloop; } /* * Switch to the data state. */ goto continueStateloop; default: /* * Anything else Parse error. */ ErrNoSpaceBetweenAttributes(); /* * Reconsume the character in the before * attribute name state. */ //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.s34_BEFORE_ATTRIBUTE_NAME; //reconsume = true; reader.StepBack(); goto continueStateloop; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakAfterattributevaluequotedloop: goto case TokenizerState.s43_SELF_CLOSING_START_TAG; } // FALLTHRU DON'T REORDER case TokenizerState.s43_SELF_CLOSING_START_TAG: { char c; if (!reader.ReadNext(out c)) { goto breakStateloop; } //--------------------------------- /* * Consume the next input character: */ switch (c) { case '>': /* * U+003E GREATER-THAN SIGN (>) Set the self-closing * flag of the current tag token. Emit the current * tag token. */ // [NOCPP[ ErrHtml4XmlVoidSyntax(); // ]NOCPP] //state = Transition(state, EmitCurrentTagToken(true, pos), reconsume, pos); state = EmitCurrentTagToken(true); if (shouldSuspend) { goto breakStateloop; } /* * Switch to the data state. */ goto continueStateloop; default: /* Anything else Parse error. */ ErrSlashNotFollowedByGt(); /* * Reconsume the character in the before attribute * name state. 
*/ //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.s34_BEFORE_ATTRIBUTE_NAME; reader.StepBack(); //reconsume = true; goto continueStateloop; } } // XXX reorder point case TokenizerState.s40_ATTRIBUTE_VALUE_UNQUOTED: { char c; while (reader.ReadNext(out c)) { switch (c) { case '\r': SilentCarriageReturn(); AddAttributeWithValue(); //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.s34_BEFORE_ATTRIBUTE_NAME; goto breakStateloop; case '\n': case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE * Switch to the before attribute name state. */ AddAttributeWithValue(); //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.s34_BEFORE_ATTRIBUTE_NAME; goto continueStateloop; case '&': /* * U+0026 AMPERSAND (&) Switch to the character * reference in attribute value state, with the * additional allowed character being U+003E * GREATER-THAN SIGN (>) */ ClearStrBufAndAppend(c); SetAdditionalAndRememberAmpersandLocation('>'); returnState = state; //state = Transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); state = TokenizerState.CONSUME_CHARACTER_REFERENCE; goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current * tag token. */ AddAttributeWithValue(); //state = Transition(state, EmitCurrentTagToken(false, pos), reconsume, pos); state = EmitCurrentTagToken(false); if (shouldSuspend) { goto breakStateloop; } /* * Switch to the data state. */ goto continueStateloop; case '\u0000': c = '\uFFFD'; goto case '<'; // fall thru case '<': case '\"': case '\'': case '=': case '`': /* * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error. */ ErrUnquotedAttributeValOrNull(c); /* * Treat it as per the "anything else" entry * below. 
*/ // fall through goto default; default: // [NOCPP] ErrHtml4NonNameInUnquotedAttribute(c); // ]NOCPP] /* * Anything else Append the current input * character to the current attribute's value. */ AppendLongStrBuf(c); /* * Stay in the attribute value (unquoted) state. */ continue; } } //------------------------------- goto breakStateloop; } // XXX reorder point case TokenizerState.s36_AFTER_ATTRIBUTE_NAME: { char c; while (reader.ReadNext(out c)) { switch (c) { case '\r': SilentCarriageReturn(); goto breakStateloop; case '\n': case ' ': case '\t': case '\u000C': /* * U+0009 CHARACTER TABULATION U+000A LINE FEED * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay * in the after attribute name state. */ continue; case '/': /* * U+002F SOLIDUS (/) Switch to the self-closing * start tag state. */ AddAttributeWithoutValue(); //state = Transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); state = TokenizerState.s43_SELF_CLOSING_START_TAG; goto continueStateloop; case '=': /* * U+003D EQUALS SIGN (=) Switch to the before * attribute value state. */ //state = Transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); state = TokenizerState.s37_BEFORE_ATTRIBUTE_VALUE; goto continueStateloop; case '>': /* * U+003E GREATER-THAN SIGN (>) Emit the current * tag token. */ AddAttributeWithoutValue(); //state = Transition(state, EmitCurrentTagToken(false, pos), reconsume, pos); state = EmitCurrentTagToken(false); if (shouldSuspend) { goto breakStateloop; } /* * Switch to the data state. */ goto continueStateloop; case '\u0000': c = '\uFFFD'; goto case '\"'; // fall thru case '\"': case '\'': case '<': ErrQuoteOrLtInAttributeNameOrNull(c); /* * Treat it as per the "anything else" entry * below. */ goto default; default: AddAttributeWithoutValue(); /* * Anything else Start a new attribute in the * current tag token. 
*/ if (c >= 'A' && c <= 'Z') { /* * U+0041 LATIN CAPITAL LETTER A through to * U+005A LATIN CAPITAL LETTER Z Set that * attribute's name to the lowercase TokenizerState.version * of the current input character (add * 0x0020 to the character's code point) */ c += (char)0x20; } /* * Set that attribute's name to the current * input character, */ ClearStrBufAndAppend(c); /* * and its value to the empty string. */ // Will do later. /* * Switch to the attribute name state. */ //state = Transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); state = TokenizerState.s35_ATTRIBUTE_NAME; goto continueStateloop; } } //------------------------------- //eof goto breakStateloop; } // XXX reorder point case TokenizerState.s39_ATTRIBUTE_VALUE_SINGLE_QUOTED: /*attributevaluesinglequotedloop:*/ { char c; while (reader.ReadNext(out c)) { /* * Consume the next input character: */ switch (c) { case '\'': /* * U+0027 APOSTROPHE (') Switch to the after * attribute value (quoted) state. */ AddAttributeWithValue(); //state = Transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); state = TokenizerState.s42__AFTER_ATTRIBUTE_VALUE_QUOTED; goto continueStateloop; case '&': /* * U+0026 AMPERSAND (&) Switch to the character * reference in attribute value state, with the * + additional allowed character being U+0027 * APOSTROPHE ('). */ ClearStrBufAndAppend(c); SetAdditionalAndRememberAmpersandLocation('\''); returnState = state; //state = Transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); state = TokenizerState.CONSUME_CHARACTER_REFERENCE; goto breakAttributevaluesinglequotedloop; // goto continueStateloop; case '\r': AppendLongStrBufCarriageReturn(); goto breakStateloop; case '\n': AppendLongStrBufLineFeed(); continue; case '\u0000': c = '\uFFFD'; goto default; // fall thru default: /* * Anything else Append the current input * character to the current attribute's value. 
*/ AppendLongStrBuf(c); /* * Stay in the attribute value (double-quoted) * state. */ continue; } } //------------------------------------ //eof goto breakStateloop; //------------------------------------ breakAttributevaluesinglequotedloop: goto case TokenizerState.CONSUME_CHARACTER_REFERENCE; } // FALLTHRU DON'T REORDER case TokenizerState.CONSUME_CHARACTER_REFERENCE: { char c; if (!reader.ReadNext(out c)) { //------------------------------------ //eof goto breakStateloop; } if (c == '\u0000') { goto breakStateloop; } /* * Unlike the definition is the spec, this state does not * return a value and never requires the caller to * backtrack. This state takes care of emitting characters * or appending to the current attribute value. It also * takes care of that in the case TokenizerState.when consuming the * character reference fails. */ /* * This section defines how to consume a character * reference. This definition is used when parsing character * references in text and in attributes. * * The behavior depends on the identity of the next * character (the one immediately after the U+0026 AMPERSAND * character): */ switch (c) { case ' ': case '\t': case '\n': case '\r': // we'll reconsume! case '\u000C': case '<': case '&': EmitOrAppendStrBuf(returnState); //if ((returnState & DATA_AND_RCDATA_MASK) == 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) != 0) { reader.StartCollect(); } //state = Transition(state, returnState, reconsume, pos); state = returnState; //reconsume = true; reader.StepBack(); goto continueStateloop; case '#': /* * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER * SIGN. 
*/ AppendStrBuf('#'); //state = Transition(state, Tokenizer.CONSUME_NCR, reconsume, pos); state = TokenizerState.CONSUME_NCR; goto continueStateloop; default: if (c == additional) { EmitOrAppendStrBuf(returnState); //state = Transition(state, returnState, reconsume, pos); state = returnState; //reconsume = true; reader.StepBack(); goto continueStateloop; } if (c >= 'a' && c <= 'z') { firstCharKey = c - 'a' + 26; } else if (c >= 'A' && c <= 'Z') { firstCharKey = c - 'A'; } else { // No match /* * If no match can be made, then this is a parse * error. */ ErrNoNamedCharacterMatch(); EmitOrAppendStrBuf(returnState); //if ((returnState & DATA_AND_RCDATA_MASK) == 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) != 0) { reader.StartCollect(); } //state = Transition(state, returnState, reconsume, pos); state = returnState; //reconsume = true; reader.StepBack(); goto continueStateloop; } // Didn't fail yet AppendStrBuf(c); //state = Transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos); state = TokenizerState.CHARACTER_REFERENCE_HILO_LOOKUP; // FALL THROUGH goto continueStateloop; break; } //------------------------------------ goto case TokenizerState.CHARACTER_REFERENCE_HILO_LOOKUP; } // WARNING FALLTHRU case TokenizerState.TRANSITION: DON'T REORDER case TokenizerState.CHARACTER_REFERENCE_HILO_LOOKUP: { char c; if (reader.ReadNext(out c)) { //------------------------------------ //eof goto breakStateloop; } if (c == '\u0000') { goto breakStateloop; } /* * The data structure is as follows: * * HILO_ACCEL is a two-dimensional int array whose major * index corresponds to the second character of the * character reference (code point as index) and the * minor index corresponds to the first character of the * character reference (packed so that A-Z runs from 0 * to 25 and a-z runs from 26 to 51). 
This layout makes * it easier to use the sparseness of the data structure * to omit parts of it: The second dimension of the * table is null when no character reference starts with * the character corresponding to that row. * * The int value HILO_ACCEL (by these indeces) is zero * if there exists no character reference starting with * that two-letter prefix. Otherwise, the value is an * int that packs two shorts so that the higher short is * the index of the highest character reference name * with that prefix in NAMES and the lower short * corresponds to the index of the lowest character * reference name with that prefix. (It happens that the * first two character reference names share their * prefix so the packed int cannot be 0 by packing the * two shorts.) * * NAMES is an array of byte arrays where each byte * array encodes the name of a character references as * ASCII. The names omit the first two letters of the * name. (Since storing the first two letters would be * redundant with the data contained in HILO_ACCEL.) The * entries are lexically sorted. * * For a given index in NAMES, the same index in VALUES * contains the corresponding expansion as an array of * two UTF-16 code units (either the character and * U+0000 or a suggogate pair). */ int hilo = 0; if (c <= 'z') { int[] row = NamedCharactersAccel.HILO_ACCEL[c]; if (row != null) { hilo = row[firstCharKey]; } } if (hilo == 0) { /* * If no match can be made, then this is a parse * error. 
*/ ErrNoNamedCharacterMatch(); EmitOrAppendStrBuf(returnState); //if ((returnState & DATA_AND_RCDATA_MASK) == 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) != 0) { reader.StartCollect(); } //state = Transition(state, returnState, reconsume, pos); state = returnState; //reconsume = true; reader.StepBack(); goto continueStateloop; } // Didn't fail yet AppendStrBuf(c); lo = hilo & 0xFFFF; hi = hilo >> 16; entCol = -1; candidate = -1; strBufMark = 0; //state = Transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos); state = TokenizerState.CHARACTER_REFERENCE_TAIL; // FALL THROUGH goto continueStateloop; goto case TokenizerState.CHARACTER_REFERENCE_TAIL; } case TokenizerState.CHARACTER_REFERENCE_TAIL: /*outer:*/ { char c; while (reader.ReadNext(out c)) { if (c == '\u0000') { goto breakStateloop; } entCol++; /* * Consume the maximum number of characters possible, * with the consumed characters matching one of the * identifiers in the first column of the named * character references table (in a case-sensitive * manner). */ /*loloop:*/ for (; ; ) { if (hi < lo) { goto breakOuter; } if (entCol == NamedCharacters.NAMES[lo].Length) { candidate = lo; strBufMark = this.strBuffer.Length; lo++; } else if (entCol > NamedCharacters.NAMES[lo].Length) { goto breakOuter; } else if (c > NamedCharacters.NAMES[lo][entCol]) { lo++; } else { goto breakLoloop; } } breakLoloop: /*hiloop:*/ for (; ; ) { if (hi < lo) { goto breakOuter; } if (entCol == NamedCharacters.NAMES[hi].Length) { goto breakHiloop; } if (entCol > NamedCharacters.NAMES[hi].Length) { goto breakOuter; } else if (c < NamedCharacters.NAMES[hi][entCol]) { hi--; } else { goto breakHiloop; } } breakHiloop: if (hi < lo) { goto breakOuter; } AppendStrBuf(c); continue; } breakOuter: if (candidate == -1) { // reconsume deals with CR, LF or nul /* * If no match can be made, then this is a parse error. 
*/ ErrNoNamedCharacterMatch(); EmitOrAppendStrBuf(returnState); //if ((returnState & DATA_AND_RCDATA_MASK) == 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) != 0) { reader.StartCollect(); } //state = Transition(state, returnState, reconsume, pos); state = returnState; //reconsume = true; reader.StepBack(); goto continueStateloop; } else { // c can't be CR, LF or nul if we got here string candidateName = NamedCharacters.NAMES[candidate]; if (candidateName.Length == 0 || candidateName[candidateName.Length - 1] != ';') { /* * If the last character matched is not a U+003B * SEMICOLON (;), there is a parse error. */ //if ((returnState & DATA_AND_RCDATA_MASK) != 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) == 0) { /* * If the entity is being consumed as part of an * attribute, and the last character matched is * not a U+003B SEMICOLON (;), */ char ch; if (strBufMark == this.strBuffer.Length) { ch = c; } else { // if (strBufOffset != -1) { // ch = buf[strBufOffset + strBufMark]; // } else { ch = this.strBuffer[strBufMark]; // } } if (ch == '=' || (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) { /* * and the next character is either a U+003D * EQUALS SIGN character (=) or in the range * U+0030 DIGIT ZERO to U+0039 DIGIT NINE, * U+0041 LATIN CAPITAL LETTER A to U+005A * LATIN CAPITAL LETTER Z, or U+0061 LATIN * SMALL LETTER A to U+007A LATIN SMALL * LETTER Z, then, for historical reasons, * all the characters that were matched * after the U+0026 AMPERSAND (&) must be * unconsumed, and nothing is returned. 
*/ ErrNoNamedCharacterMatch(); AppendStrBufToLongStrBuf(); //state = Transition(state, returnState, reconsume, pos); state = returnState; //reconsume = true; reader.StepBack(); goto continueStateloop; } } //if ((returnState & DATA_AND_RCDATA_MASK) != 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) == 0) { ErrUnescapedAmpersandInterpretedAsCharacterReference(); } else { ErrNotSemicolonTerminated(); } } /* * Otherwise, return a character token for the character * corresponding to the entity name (as given by the * second column of the named character references * table). */ char[] val = NamedCharacters.VALUES[candidate]; if (val.Length == 1) { EmitOrAppendOne(val, returnState); } else { EmitOrAppendTwo(val, returnState); } // this is so complicated! if (strBufMark < this.strBuffer.Length) { // if (strBufOffset != -1) { // if ((returnState & (~1)) != 0) { // for (int i = strBufMark; i < strBufLen; i++) { // appendLongStrBuf(buf[strBufOffset + i]); // } // } else { // tokenHandler.Characters(buf, strBufOffset // + strBufMark, strBufLen // - strBufMark); // } // } else { //if ((returnState & DATA_AND_RCDATA_MASK) != 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) == 0) { int j = this.strBuffer.Length; for (int i = strBufMark; i < j; i++) { AppendLongStrBuf(strBuffer[i]); } } else { TokenListener.Characters(CopyFromStringBuiler(this.strBuffer, strBufMark, this.strBuffer.Length - strBufMark)); } // } } //if ((returnState & DATA_AND_RCDATA_MASK) == 0) if (((byte)returnState & DATA_AND_RCDATA_MASK) != 0) { reader.StartCollect(); } //state = Transition(state, returnState, reconsume, pos); state = returnState; //reconsume = true; reader.StepBack(); goto continueStateloop; /* * If the markup contains I'm ¬it; I tell you, the * entity is parsed as "not", as in, I'm ¬it; I tell * you. But if the markup was I'm ∉ I tell you, * the entity would be parsed as "notin;", resulting in * I'm ∉ I tell you. 
*/ } } // XXX reorder point case TokenizerState.s09_CLOSE_TAG_OPEN: { char c; if (!reader.ReadNext(out c)) { //------------------------------------ //eof goto breakStateloop; } /* * Otherwise, if the content model flag is set to the PCDATA * state, or if the next few characters do match that tag * name, consume the next input character: */ switch (c) { case '>': /* U+003E GREATER-THAN SIGN (>) Parse error. */ ErrLtSlashGt(); /* * Switch to the data state. */ reader.SkipOneAndStartCollect(); //state = Transition(state, Tokenizer.DATA, reconsume, pos); state = TokenizerState.s01_DATA; goto continueStateloop; case '\r': SilentCarriageReturn(); /* Anything else Parse error. */ ErrGarbageAfterLtSlash(); /* * Switch to the bogus comment state. */ ClearLongStrBufAndAppend('\n'); //state = Transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); state = TokenizerState.s44_BOGUS_COMMENT; goto breakStateloop; case '\n': /* Anything else Parse error. */ ErrGarbageAfterLtSlash(); /* * Switch to the bogus comment state. */ ClearLongStrBufAndAppend('\n'); //state = Transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); state = TokenizerState.s44_BOGUS_COMMENT; goto continueStateloop; case '\u0000': c = '\uFFFD'; // fall thru goto default; default: if (c >= 'A' && c <= 'Z') { c += (char)0x20; } if (c >= 'a' && c <= 'z') { /* * U+0061 LATIN SMALL LETTER A through to U+007A * LATIN SMALL LETTER Z Create a new end tag * token, */ endTag = true; /* * set its tag name to the input character, */ ClearStrBufAndAppend(c); /* * then switch to the tag name state. (Don't * emit the token yet; further details will be * filled in before it is emitted.) */ //state = Transition(state, Tokenizer.TAG_NAME, reconsume, pos); state = TokenizerState.s10_TAG_NAME; goto continueStateloop; } else { /* Anything else Parse error. */ ErrGarbageAfterLtSlash(); /* * Switch to the bogus comment state. 
*/ ClearLongStrBufAndAppend(c); //state = Transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); state = TokenizerState.s44_BOGUS_COMMENT; goto continueStateloop; } } } // END HOTSPOT WORKAROUND } } // stateloop breakStateloop: //FlushChars(buf, pos); FlushChars(); /* * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; } */ // Save locals stateSave = state; returnStateSave = returnState; }
/// <summary>
/// Closes out this element. Any text pending since the last tag is first handed to
/// <c>TryGetLiteral</c>; the resulting literal is yielded only when that call returns true.
/// When this iterator has a parent, the parent's element is yielded if the parent itself has
/// no parent (i.e. it sits at the top of the hierarchy), after which the parent is reset to
/// the current position and this iterator is marked as finished.
/// </summary>
///
/// <param name="factory">
/// The HTML factory to operate against.
/// </param>
///
/// <returns>
/// A lazily evaluated sequence of the objects that became complete by closing this element.
/// Because this is an iterator method, none of the side effects happen until the sequence
/// is enumerated.
/// </returns>
public IEnumerable<IDomObject> CloseElement(HtmlElementFactory factory)
{
    // Flush whatever text precedes the closing tag.
    IDomObject pendingLiteral;
    if (TryGetLiteral(factory, out pendingLiteral))
    {
        yield return pendingLiteral;
    }

    // Without a parent there is nothing further to close or hand back.
    if (Parent == null)
    {
        yield break;
    }

    // Only a parent at the top of the hierarchy is handed back to the caller.
    if (Parent.Parent == null)
    {
        yield return Parent.Element;
    }

    Parent.Reset(Pos);
    TokenizerState = TokenizerState.Finished;
}
/// <summary>
/// Splits <paramref name="sourceCode"/> into a flat list of tokens using a small
/// character-driven state machine (default, word, number, string and comment states).
/// </summary>
/// <param name="sourceCode">The source text to tokenize.</param>
/// <returns>The tokens recognised in the input, in source order.</returns>
/// <exception cref="InvalidOperationException">
/// The tokenizer state holds a value this method does not handle.
/// </exception>
public List<Token> Tokenize(string sourceCode)
{
    _tokens = new List<Token>();
    _currentToken = new Token(TokenType.Unknown, string.Empty);
    _currentTokenizerState = TokenizerState.Default;

    TokenType singleCharType;

    foreach (var character in sourceCode)
    {
        // A character that terminates a word or number is not part of that token, so it is
        // dispatched a second time against the state EndToken leaves behind.
        // NOTE(review): like the goto this replaces, the loop relies on EndToken moving the
        // tokenizer out of the Word/Number state.
        bool consumed;
        do
        {
            consumed = true;

            switch (_currentTokenizerState)
            {
                case TokenizerState.Default:
                    if (SingleCharTokens.TryGetValue(character, out singleCharType))
                    {
                        _tokens.Add(new Token(singleCharType, character.ToString()));
                    }
                    else if (char.IsLetter(character))
                    {
                        StartToken(TokenizerState.Word, character.ToString());
                    }
                    else if (char.IsDigit(character))
                    {
                        StartToken(TokenizerState.Number, character.ToString());
                    }
                    else if (character.Equals('"'))
                    {
                        StartToken(TokenizerState.String);
                    }
                    else if (character.Equals('\''))
                    {
                        StartToken(TokenizerState.Comment);
                    }
                    // Any other character (e.g. whitespace) is skipped.
                    break;

                case TokenizerState.Word:
                    if (char.IsLetterOrDigit(character))
                    {
                        _currentToken.Text += character;
                    }
                    else if (character.Equals(':'))
                    {
                        // A word directly followed by ':' is a label; the colon is consumed.
                        EndToken(TokenType.Label);
                    }
                    else
                    {
                        EndToken(TokenType.Word);
                        consumed = false;
                    }
                    break;

                case TokenizerState.Number:
                    // ToDo: support negative numbers and floating point numbers.
                    if (char.IsDigit(character))
                    {
                        _currentToken.Text += character;
                    }
                    else
                    {
                        EndToken(TokenType.Number);
                        consumed = false;
                    }
                    break;

                case TokenizerState.String:
                    if (character.Equals('"'))
                    {
                        EndToken(TokenType.String);
                    }
                    else
                    {
                        _currentToken.Text += character;
                    }
                    break;

                case TokenizerState.Comment:
                    // Comment characters are not accumulated; the comment ends at the line break.
                    if (character.Equals('\n'))
                    {
                        EndToken(TokenType.Comment);
                    }
                    break;

                default:
                    throw new InvalidOperationException("Unknown tokenizer state reached.");
            }
        }
        while (!consumed);
    }

    // Input may end while a token is still open (e.g. "abc" or "42" without a trailing
    // delimiter). Close it so the final token is not silently dropped.
    switch (_currentTokenizerState)
    {
        case TokenizerState.Word:
            EndToken(TokenType.Word);
            break;
        case TokenizerState.Number:
            EndToken(TokenType.Number);
            break;
        case TokenizerState.String:
            // NOTE(review): an unterminated string literal is emitted with the text read so
            // far rather than discarded; confirm this is the desired leniency.
            EndToken(TokenType.String);
            break;
        case TokenizerState.Comment:
            EndToken(TokenType.Comment);
            break;
    }

    return _tokens;
}
/// <summary>
/// Consumes the remaining template text, alternating between reading raw text and reading a
/// '&lt;$...&gt;' instruction, and appends the resulting tokens to the token list.
/// </summary>
/// <exception cref="InvalidOperationException">
/// An instruction was expected but none of the instruction patterns matched the remaining text.
/// </exception>
public void Tokenize()
{
    while (_remainingText != string.Empty)
    {
        if (_state == TokenizerState.ReadingText)
        {
            var textMatch = _rawTextRegex.Match(_remainingText);
            if (!textMatch.Success)
            {
                // No further instruction marker: everything left is raw text.
                _tokens.Add(new Token(TokenType.RawText, _remainingText));
                _remainingText = string.Empty;
                return;
            }

            // Emit the text in front of the instruction (if any), then the instruction opener.
            string leadingText = textMatch.Groups["text"].Value;
            if (leadingText.Length != 0)
            {
                _tokens.Add(new Token(TokenType.RawText, leadingText));
            }

            _tokens.Add(new Token(TokenType.InstructionBegin, "<$"));
            _remainingText = _remainingText.Substring(textMatch.Length);
            _state = TokenizerState.ReadingInstruction;
        }
        else if (_state == TokenizerState.ReadingInstruction)
        {
            // The instruction patterns are tried in a fixed order; the first match wins.
            var variableMatch = _varInstructionRegex.Match(_remainingText);
            if (variableMatch.Success)
            {
                _tokens.Add(new Token(TokenType.Variable, "var:"));
                _tokens.Add(new Token(TokenType.Identifier, variableMatch.Groups["name"].Value));
                _tokens.Add(new Token(TokenType.InstructionEnd, ">"));
                _remainingText = _remainingText.Substring(variableMatch.Length);
                _state = TokenizerState.ReadingText;
                continue;
            }

            var subtemplateMatch = _subtemplateInstructionRegex.Match(_remainingText);
            if (subtemplateMatch.Success)
            {
                _tokens.Add(new Token(TokenType.SubtemplateBegin, "subtemplate:"));
                _tokens.Add(new Token(TokenType.Identifier, subtemplateMatch.Groups["name"].Value));
                _tokens.Add(new Token(TokenType.InstructionEnd, ">"));
                _remainingText = _remainingText.Substring(subtemplateMatch.Length);
                _state = TokenizerState.ReadingText;
                continue;
            }

            var endSubtemplateMatch = _endSubtemplateInstructionRegex.Match(_remainingText);
            if (endSubtemplateMatch.Success)
            {
                _tokens.Add(new Token(TokenType.SubtemplateEnd, "endsubtemplate"));
                _tokens.Add(new Token(TokenType.InstructionEnd, ">"));
                _remainingText = _remainingText.Substring(endSubtemplateMatch.Length);
                _state = TokenizerState.ReadingText;
                continue;
            }

            throw new InvalidOperationException("Unable to understand the remaining text while parsing it, because some '<$...>' instruction was expected. " + $"Remaining text began with this instead: '{string.Concat(_remainingText.Take(20))}'.");
        }
    }
}
/// <summary>
/// Creates an action that pushes <paramref name="state"/> onto the state stack it is given.
/// </summary>
/// <param name="state">The state the returned action pushes.</param>
/// <returns>An action that performs the push when invoked with a stack.</returns>
internal static Action<Stack<TokenizerState>> PushState(TokenizerState state)
{
    Action<Stack<TokenizerState>> push = stack =>
    {
        stack.Push(state);
    };

    return push;
}
/// <summary>
/// Lazily splits the <c>original</c> string into tokens. Commas are returned as their own
/// "," token, single-quoted sections and (possibly nested) '(' / '[' groups are kept inside
/// one token, and whitespace separates plain tokens.
/// </summary>
/// <returns>An enumerator over the tokens, in the order they appear.</returns>
IEnumerator<string> IEnumerable<string>.GetEnumerator()
{
    StringBuilder buffer = new StringBuilder();
    TokenizerState mode = TokenizerState.WhiteSpace;
    int depth = 0;
    bool nextIsEscaped = false;

    for (int i = 0; i < original.Length; i++)
    {
        char ch = original[i];

        switch (mode)
        {
            case TokenizerState.WhiteSpace:
                switch (ch)
                {
                    case '\'':
                        mode = TokenizerState.Quoted;
                        buffer.Append(ch);
                        break;

                    case ',':
                        yield return ",";
                        break;

                    case '(':
                    case '[':
                        mode = TokenizerState.InParenthesis;
                        buffer.Append(ch);
                        depth = 1;
                        break;

                    default:
                        if (!char.IsWhiteSpace(ch))
                        {
                            mode = TokenizerState.Token;
                            buffer.Append(ch);
                        }
                        break;
                }
                break;

            case TokenizerState.Quoted:
                // Every character seen while quoted becomes part of the token.
                buffer.Append(ch);

                if (nextIsEscaped)
                {
                    nextIsEscaped = false;
                }
                // handle escaping of ' by using '' or \'
                else if (ch == '\\' || (ch == '\'' && i + 1 < original.Length && original[i + 1] == '\''))
                {
                    nextIsEscaped = true;
                }
                else if (ch == '\'')
                {
                    // Unescaped closing quote: the quoted token is complete.
                    yield return buffer.ToString();
                    mode = TokenizerState.WhiteSpace;
                    buffer.Length = 0;
                }
                break;

            case TokenizerState.InParenthesis:
                // Every character seen inside a group becomes part of the token.
                buffer.Append(ch);

                switch (ch)
                {
                    case ')':
                    case ']':
                        depth -= 1;
                        if (depth == 0)
                        {
                            yield return buffer.ToString();
                            buffer.Length = 0;
                            mode = TokenizerState.WhiteSpace;
                        }
                        break;

                    case '(':
                    case '[':
                        depth += 1;
                        break;
                }
                break;

            case TokenizerState.Token:
                if (char.IsWhiteSpace(ch))
                {
                    yield return buffer.ToString();
                    buffer.Length = 0;
                    mode = TokenizerState.WhiteSpace;
                    break;
                }

                switch (ch)
                {
                    case ',':
                        // stop current token, and send the , as well
                        yield return buffer.ToString();
                        buffer.Length = 0;
                        yield return ",";
                        mode = TokenizerState.WhiteSpace;
                        break;

                    case '(':
                    case '[':
                        mode = TokenizerState.InParenthesis;
                        depth = 1;
                        buffer.Append(ch);
                        break;

                    case '\'':
                        mode = TokenizerState.Quoted;
                        buffer.Append(ch);
                        break;

                    default:
                        buffer.Append(ch);
                        break;
                }
                break;

            default:
                throw new InvalidExpressionException("Could not understand the string " + original);
        }
    }

    // Emit whatever was still being collected when the input ended.
    if (buffer.Length > 0)
    {
        yield return buffer.ToString();
    }
}
/// <summary>
/// Convenience overload that forwards to the two-argument <c>ReplaceState</c> overload,
/// passing 1 as its second argument.
/// </summary>
/// <param name="state">The state handed on to the two-argument overload.</param>
/// <returns>The action produced by the two-argument overload.</returns>
internal static Action<Stack<TokenizerState>> ReplaceState(TokenizerState state)
{
    // NOTE(review): the second argument is presumably the number of states to replace - confirm against the overload.
    Action<Stack<TokenizerState>> replace = ReplaceState(state, 1);
    return replace;
}
/// <summary>
/// Reads and returns the next token from the underlying reader: a string token for a run of
/// non-terminal characters, a terminal token for a single terminal character (as decided by
/// <c>IsTerm</c>), or <c>CsvToken.End</c> once the input is exhausted.
/// </summary>
/// <returns>The next token.</returns>
/// <exception cref="ObjectDisposedException">The tokenizer has been disposed.</exception>
/// <exception cref="InvalidOperationException">
/// The tokenizer is in a state this method does not handle (for example after a failed flush).
/// </exception>
public CsvToken GetToken()
{
    if (disposed)
    {
        throw new ObjectDisposedException("tokenizer");
    }

    // Text collected for the token being built; local, so it never carries over between calls.
    StringBuilder text = null;

    while (true)
    {
        switch (state)
        {
            case TokenizerState.Unknown:
            {
                // First call: prime the one-character look-ahead held in 'input'.
                input = reader.Read();

                if (EndOfStream != input)
                {
                    TextPosition = TextPosition.Begin();
                    state = TokenizerState.Reading;
                    break;
                }

                state = TokenizerState.EndOfDocument;
                break;
            }

            case TokenizerState.FlushLastToken:
            {
                // The input ended while text was being collected: emit it as the last token.
                if (null != text && 0 < text.Length)
                {
                    state = TokenizerState.EndOfDocument;
                    return CsvToken.String(text.ToString());
                }

                state = TokenizerState.Failed;
                break;
            }

            case TokenizerState.EndOfDocument:
            {
                return CsvToken.End;
            }

            case TokenizerState.Reading:
            {
                if (EndOfStream == input)
                {
                    state = null == text ? TokenizerState.EndOfDocument : TokenizerState.FlushLastToken;
                    break;
                }

                if (null == text)
                {
                    text = new StringBuilder();
                }

                var current = (char)input;

                if (IsTerm(current))
                {
                    if (0 < text.Length)
                    {
                        // Return the pending text first; the terminal stays in 'input' and is
                        // returned by the next call.
                        return CsvToken.String(text.ToString());
                    }

                    input = reader.Read();
                    return CsvToken.Terminal(current);
                }

                text.Append(current);
                input = reader.Read();
                break;
            }

            default:
            {
                // Was a bare, message-less Exception; report which state was unexpected.
                throw new InvalidOperationException("The CSV tokenizer is in an unexpected state: " + state + ".");
            }
        }
    }
}
/// <summary>
/// Lazily tokenizes the value from the current position into alternating key and value tokens.
/// '=' separates a key from its value, '&amp;' separates pairs, and '/' (or the end of the
/// string while reading a key or value) finishes tokenizing. A key that is not followed by
/// '=' is reported together with a zero-length value token.
/// </summary>
/// <returns>The key and value tokens, in the order they are read.</returns>
/// <exception cref="FormatException">
/// An unexpected character or an unexpected end of the string was encountered. Because this
/// is an iterator, the exception surfaces during enumeration, after any tokens already yielded.
/// </exception>
/// <exception cref="NotSupportedException">The current state is not one this method handles.</exception>
public IEnumerable<Token> GetTokens()
{
    if (this.position >= this.value.Length)
    {
        yield break;
    }

    // Number of characters read for the token currently being collected.
    int pendingLength = 0;
    bool finished = false;
    string failure = null;

    while (!finished)
    {
        switch (this.currentState)
        {
            case TokenizerState.ReadyToReadKey:
                if (this.position >= this.value.Length)
                {
                    failure = "Unexpected string end in '{0}' state.".FormatInvariant(this.currentState);
                    this.currentState = TokenizerState.Error;
                }
                else
                {
                    char keyStart = this.value[this.position];
                    if (keyStart == '=' || keyStart == '&')
                    {
                        failure = "Unexpected character '{0}' in '{1}' state.".FormatInvariant(keyStart, this.currentState);
                        this.currentState = TokenizerState.Error;
                    }
                    else if (keyStart == '/')
                    {
                        this.currentState = TokenizerState.Finish;
                    }
                    else
                    {
                        pendingLength++;
                        this.currentState = TokenizerState.ReadKey;
                    }
                }
                break;

            case TokenizerState.ReadKey:
                if (this.position >= this.value.Length)
                {
                    // The string ended inside a key: report the key with an empty value.
                    yield return this.CreateToken(TokenType.Key, pendingLength);
                    yield return this.CreateToken(TokenType.Value, 0);
                    pendingLength = 0;
                    this.currentState = TokenizerState.Finish;
                }
                else
                {
                    char keyChar = this.value[this.position];
                    if (keyChar == '=')
                    {
                        yield return this.CreateToken(TokenType.Key, pendingLength);
                        pendingLength = 0;
                        this.currentState = TokenizerState.ReadValue;
                    }
                    else if (keyChar == '&' || keyChar == '/')
                    {
                        // Key without '=': report it with an empty value, then either start
                        // the next pair ('&') or stop ('/').
                        yield return this.CreateToken(TokenType.Key, pendingLength);
                        yield return this.CreateToken(TokenType.Value, 0);
                        pendingLength = 0;
                        this.currentState = keyChar == '&' ? TokenizerState.ReadyToReadKey : TokenizerState.Finish;
                    }
                    else
                    {
                        pendingLength++;
                    }
                }
                break;

            case TokenizerState.ReadValue:
                if (this.position >= this.value.Length)
                {
                    yield return this.CreateToken(TokenType.Value, pendingLength);
                    pendingLength = 0;
                    this.currentState = TokenizerState.Finish;
                }
                else
                {
                    char valueChar = this.value[this.position];
                    if (valueChar == '=')
                    {
                        failure = "Unexpected character '{0}' in '{1}' state.".FormatInvariant(valueChar, this.currentState);
                        this.currentState = TokenizerState.Error;
                    }
                    else if (valueChar == '&' || valueChar == '/')
                    {
                        yield return this.CreateToken(TokenType.Value, pendingLength);
                        pendingLength = 0;
                        this.currentState = valueChar == '&' ? TokenizerState.ReadyToReadKey : TokenizerState.Finish;
                    }
                    else
                    {
                        pendingLength++;
                    }
                }
                break;

            case TokenizerState.Finish:
            case TokenizerState.Error:
                finished = true;
                break;

            default:
                throw new NotSupportedException();
        }

        // The position advances after every pass, including the pass that only observes the
        // Finish/Error state.
        this.position++;
    }

    if (this.currentState == TokenizerState.Error)
    {
        throw new FormatException(failure);
    }
}