// Reads input characters and emits tokens until a state change is needed or
// the input ends (enclosing class not visible here; the referenced states
// suggest this is the tokenizer's data state).
//
//   '&'  -> hands off to CharacterReferenceInDataState, then keeps reading
//   '<'  -> returns TagOpenState.Instance
//   NUL  -> reports a parse error, emits the NUL character, keeps reading
//   EOF  -> emits an EndOfFileToken and returns null
//   else -> emits the character and returns this state
//
// tokenizer: receives the emitted characters and tokens.
// reader:    input the characters are read from.
public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
{
    while (true)
    {
        int current = Read(reader);

        if (current == '<')
        {
            return TagOpenState.Instance;
        }

        if (current == -1)
        {
            tokenizer.EmitToken(new EndOfFileToken());
            return null;
        }

        if (current == '&')
        {
            // The returned state is ignored; reading continues in this loop.
            CharacterReferenceInDataState.Instance.Process(tokenizer, reader, null);
            continue;
        }

        if (current == '\0')
        {
            ReportParseError();
            tokenizer.EmitChar('\0');
            continue;
        }

        // Return after every ordinary character: required to allow switching the state.
        tokenizer.EmitChar((char)current);
        return this;
    }
}
// Handles one input character per call (enclosing class not visible here; the
// referenced states suggest this is the tokenizer's RCDATA state).
//
//   '&'  -> returns CharacterReferenceInRCDATAState.Instance
//   '<'  -> returns RCDATALessThanSignState.Instance
//   NUL  -> reports a parse error and emits U+FFFD in its place
//   EOF  -> emits an EndOfFileToken and returns null
//   else -> emits the character unchanged
//
// tokenizer: receives the emitted characters and tokens.
// reader:    input the characters are read from.
// Returns the state to continue with, or null at end of input.
public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
{
    int c = Read(reader);
    switch (c)
    {
        case '&':
            return CharacterReferenceInRCDATAState.Instance;

        case '<':
            return RCDATALessThanSignState.Instance;

        case 0:
            ReportParseError();
            tokenizer.EmitChar('\uFFFD');
            return this;

        case -1:
            tokenizer.EmitToken(new EndOfFileToken());
            // Return null at end of input, the same way the data state does.
            // Returning this state would make the next call read EOF again and
            // emit another end-of-file token.
            // NOTE(review): assumes the driving loop treats null as "stop" — confirm.
            return null;

        default:
            tokenizer.EmitChar((char)c);
            return this;
    }
}
// Handles the character following "</" inside RCDATA content (enclosing class
// not visible here; presumably the RCDATA end tag open state).
//
// ASCII letter: creates a new end tag token whose tag name is the lowercase
//   form of the letter, appends the letter (original case) to the tokenizer's
//   temporary buffer (http://www.w3.org/TR/html5/syntax.html#temporary-buffer)
//   and switches to the RCDATA end tag name state. The token is not emitted
//   yet; further details are filled in before it is emitted.
// Anything else: emits '<' and '/', then hands the character back to the
//   RCDATA state so it is processed again there.
//
// tokenizer: receives emitted characters and owns the temporary buffer.
// reader:    input the character is read from.
public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
{
    int c = Read(reader);

    bool isUpper = base.IsUppercaseAsciiLetter(c);
    if (isUpper || base.IsLowercaseAsciiLetter(c))
    {
        char letter = (char)c;

        // Use the invariant mapping (equivalent to adding 0x0020 for A-Z).
        // Char.ToLower follows the current culture and would, for example,
        // turn 'I' into a dotless 'ı' under a Turkish culture.
        char lowered = isUpper ? Char.ToLowerInvariant(letter) : letter;

        EndTagToken token = new EndTagToken() { TagName = lowered.ToString() };
        tokenizer.TemporaryBuffer.Add(letter);
        RCDATAEndTagNameState.Instance.Token = token;
        return RCDATAEndTagNameState.Instance;
    }

    tokenizer.EmitChar('<');
    tokenizer.EmitChar('/');

    // Queue the character on the state that will reconsume it, as the other
    // states in this file do, rather than on this state's own queue.
    // NOTE(review): at end of input c is -1 and the cast does not preserve it — confirm EOF handling.
    RCDATAState.Instance.LastConsumedCharacters.Enqueue((char)c);
    return RCDATAState.Instance;
}
// Accumulates the name of a potential end tag inside RCDATA content
// (enclosing class not visible here; presumably the RCDATA end tag name state).
//
// Whitespace, '/' or '>' end the tag name, but only when the token built so
// far is an appropriate end tag token according to the tokenizer:
//   whitespace -> BeforeAttributeNameState
//   '/'        -> SelfClosingStartTagState
//   '>'        -> the token is emitted and the data state takes over
// ASCII letters extend the tag name (lowercased) and the temporary buffer
// (original case). In every other case, including a terminator after a
// non-appropriate tag name, the text seen so far is emitted as characters
// ("</" followed by the temporary buffer) and the current character is handed
// back to the RCDATA state.
//
// tokenizer: receives emitted output and owns the temporary buffer.
// reader:    input the character is read from.
public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
{
    int c = Read(reader);

    if (IsWhitespace(c))
    {
        if (tokenizer.IsAppropriateEndTagToken(Token))
        {
            return BeforeAttributeNameState.Instance;
        }
    }
    else if (c == '/')
    {
        if (tokenizer.IsAppropriateEndTagToken(Token))
        {
            return SelfClosingStartTagState.Instance;
        }
    }
    else if (c == '>')
    {
        if (tokenizer.IsAppropriateEndTagToken(Token))
        {
            tokenizer.EmitToken(Token); // TODO - is this the right token to emit?
            return DataState.Instance;
        }
    }
    else if (IsUppercaseAsciiLetter(c))
    {
        // Invariant lowercasing: Char.ToLower depends on the current culture
        // and would, for example, map 'I' to a dotless 'ı' under a Turkish culture.
        Token.TagName += Char.ToLowerInvariant((char)c);
        tokenizer.TemporaryBuffer.Add((char)c);
        return this;
    }
    else if (IsLowercaseAsciiLetter(c))
    {
        Token.TagName += (char)c;
        tokenizer.TemporaryBuffer.Add((char)c);
        return this;
    }

    // Not an end tag after all: replay what was consumed as plain characters.
    tokenizer.EmitChar('<');
    tokenizer.EmitChar('/');
    foreach (char buffered in tokenizer.TemporaryBuffer)
    {
        tokenizer.EmitChar(buffered);
    }

    // Hand the current character back to the RCDATA state for reprocessing.
    // NOTE(review): at end of input c is -1 and the cast does not preserve it — confirm EOF handling.
    RCDATAState.Instance.LastConsumedCharacters.Enqueue((char)c);
    return RCDATAState.Instance;
}
// Emits input characters verbatim, with no markup or character-reference
// handling (enclosing class not visible here; the behaviour matches the
// tokenizer's PLAINTEXT state).
//
//   NUL  -> reports a parse error, emits U+FFFD, keeps reading
//   EOF  -> emits an EndOfFileToken and returns null
//   else -> emits the character and returns this state
//
// tokenizer: receives the emitted characters and tokens.
// reader:    input the characters are read from.
public override BaseState Process(HtmlTokenizer tokenizer, System.IO.StreamReader reader)
{
    for (;;)
    {
        int c = Read(reader);
        switch (c)
        {
            case 0:
                ReportParseError();
                tokenizer.EmitChar('\uFFFD');
                break;

            case -1:
                tokenizer.EmitToken(new EndOfFileToken());
                // Leave the loop at end of input. A plain break here only exits
                // the switch, so the loop would read EOF and emit end-of-file
                // tokens forever. null matches what the data state returns.
                return null;

            default:
                tokenizer.EmitChar((char)c);
                return this;
        }
    }
}
// Handles the character following '<' inside RCDATA content (enclosing class
// not visible here; presumably the RCDATA less-than sign state).
//
//   '/'  -> clears the tokenizer's temporary buffer and switches to the
//           RCDATA end tag open state
//   else -> emits '<' and hands the character back to the RCDATA state
//
// tokenizer: receives emitted characters and owns the temporary buffer.
// reader:    input the character is read from.
public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
{
    int c = Read(reader);

    if (c == '/')
    {
        tokenizer.TemporaryBuffer.Clear();
        return RCDATAEndTagOpenState.Instance;
    }

    tokenizer.EmitChar('<');

    // Queue the character on the state that will reconsume it, as the other
    // states in this file do, rather than on this state's own queue.
    // NOTE(review): at end of input c is -1 and the cast does not preserve it — confirm EOF handling.
    RCDATAState.Instance.LastConsumedCharacters.Enqueue((char)c);
    return RCDATAState.Instance;
}
// Handles the character following '<' in ordinary content (enclosing class
// not visible here; presumably the tag open state).
//
//   '!'          -> MarkupDeclarationOpenState
//   '/'          -> EndTagOpenState
//   ASCII letter -> starts a new start tag token named with the lowercase
//                   letter and switches to TagNameState
//   '?'          -> parse error, BogusCommentState
//   else         -> parse error; '<' is emitted as a character and the
//                   current character is handed back to the data state
//
// tokenizer: receives the emitted characters.
// reader:    input the character is read from.
public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
{
    // NOTE(review): Read can return -1 at end of input; the cast to char does
    // not preserve that value — confirm whether EOF needs separate handling here.
    char c = (char)Read(reader);

    if (c == '!')
    {
        return MarkupDeclarationOpenState.Instance;
    }

    if (c == '/')
    {
        return EndTagOpenState.Instance;
    }

    bool isUpper = base.IsUppercaseAsciiLetter(c);
    if (isUpper || base.IsLowercaseAsciiLetter(c))
    {
        // Invariant lowercasing: Char.ToLower depends on the current culture
        // and would, for example, map 'I' to a dotless 'ı' under a Turkish culture.
        char lowered = isUpper ? Char.ToLowerInvariant(c) : c;

        StartTagToken token = new StartTagToken();
        token.TagName = lowered.ToString();
        TagNameState.Instance.Token = token;
        return TagNameState.Instance;
    }

    if (c == '?')
    {
        ReportParseError();
        return BogusCommentState.Instance;
    }

    ReportParseError();

    // The '<' turned out to be literal text, so emit it. The current character
    // is not emitted here: it is queued for the data state, which processes it
    // itself. Emitting c at this point would drop the '<' and duplicate c.
    tokenizer.EmitChar('<');
    DataState.Instance.LastConsumedCharacters.Enqueue(c);
    return DataState.Instance;
}
// Attempts to consume a character reference after a '&' and then continues
// in the data state.
// (http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references)
// (http://www.w3.org/TR/html5/syntax.html#additional-allowed-character)
//
// If nothing is returned by the attempt, a U+0026 AMPERSAND character (&) is
// emitted so the input text is preserved; otherwise the character that the
// reference stands for is emitted.
//
// tokenizer:                  receives the emitted characters.
// reader:                     input positioned just after the '&'.
// additionalAllowedCharacter: optional extra character that, like whitespace,
//                             '<' and end of input, means "not a reference".
// Returns DataState.Instance in every case.
public BaseState Process(HtmlTokenizer tokenizer, StreamReader reader, char? additionalAllowedCharacter)
{
    int c = Peek(reader);

    bool notAReference =
        c == 9 || c == 0x0A || c == 0x0C || c == ' ' || c == '<' || c == -1 ||
        (additionalAllowedCharacter.HasValue && c == additionalAllowedCharacter.Value);

    if (notAReference)
    {
        // Not a character reference: nothing is consumed, so the '&' is literal text.
        tokenizer.EmitChar('&');
    }
    else if (c == '#')
    {
        Read(reader); // consume '#'

        int nc = Peek(reader);
        bool isHex = nc == 'X' || nc == 'x';

        uint? val;
        if (isHex)
        {
            Read(reader); // consume the 'x' / 'X' marker
            val = ConsumeHexDigits(reader); // http://www.w3.org/TR/html5/infrastructure.html#ascii-hex-digits
        }
        else
        {
            val = ConsumeDigits(reader);
        }

        if (val.HasValue)
        {
            char parsedChar = GetCharFromNumericValue(val.Value);
            tokenizer.EmitChar(parsedChar);
        }
        else
        {
            // No digits followed: the specification says to unconsume '#'
            // (and the X, if any) and return nothing. Those characters were
            // already read above, so emit them together with the '&' instead
            // of silently dropping them.
            // NOTE(review): assumes the Consume*Digits helpers consume nothing
            // when they return null — confirm. The specification also treats
            // this case as a parse error, which is not reported here.
            tokenizer.EmitChar('&');
            tokenizer.EmitChar('#');
            if (isHex)
            {
                tokenizer.EmitChar((char)nc);
            }
        }
    }
    else
    {
        // Named character references are not implemented yet. What the
        // specification requires:
        //
        // Consume the maximum number of characters possible, with the consumed
        // characters matching one of the identifiers in the first column of the
        // named character references table (in a case-sensitive manner).
        //
        // If no match can be made, then no characters are consumed, and nothing
        // is returned. In this case, if the characters after the U+0026
        // AMPERSAND character (&) consist of a sequence of one or more
        // alphanumeric ASCII characters followed by a U+003B SEMICOLON
        // character (;), then this is a parse error.
        //
        // If the character reference is being consumed as part of an attribute,
        // and the last character matched is not a ";" (U+003B) character, and
        // the next character is either a "=" (U+003D) character or an
        // alphanumeric ASCII character, then, for historical reasons, all the
        // characters that were matched after the U+0026 AMPERSAND character (&)
        // must be unconsumed, and nothing is returned. However, if this next
        // character is in fact a "=" (U+003D) character, then this is a parse
        // error, because some legacy user agents will misinterpret the markup
        // in those cases.
        //
        // Otherwise, a character reference is parsed. If the last character
        // matched is not a ";" (U+003B) character, there is a parse error.
        // Return one or two character tokens for the character(s) corresponding
        // to the character reference name (as given by the second column of the
        // named character references table).
        //
        // Example: if the markup contains (not in an attribute) the string
        // "I'm &notit; I tell you", the character reference is parsed as "not",
        // as in "I'm ¬it; I tell you" (and this is a parse error). But if the
        // markup was "I'm &notin; I tell you", the character reference would be
        // parsed as "notin;", resulting in "I'm ∉ I tell you" (and no parse
        // error).
        //
        // Until that is implemented, behave as "no match": nothing is consumed
        // and the '&' is emitted as literal text rather than dropped.
        tokenizer.EmitChar('&');
    }

    return DataState.Instance;
}