Example #1
0
        /// <summary>
        /// Reads input characters and handles them according to the branches below:
        /// '&amp;' delegates to the character-reference handler, '&lt;' hands over to the
        /// tag open state, NUL is reported as a parse error and emitted unchanged,
        /// end of input emits an end-of-file token, and any other character is emitted as-is.
        /// </summary>
        /// <param name="tokenizer">Tokenizer that receives the emitted characters and tokens.</param>
        /// <param name="reader">Input the characters are read from.</param>
        /// <returns>
        /// <c>TagOpenState.Instance</c> after '&lt;', <c>null</c> after end of input,
        /// otherwise this state (returned after each ordinary character).
        /// </returns>
        public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
        {
            while (true)
            {
                int current = Read(reader);

                if (current == '&')
                {
                    // The handler's return value is deliberately ignored; keep reading here.
                    CharacterReferenceInDataState.Instance.Process(tokenizer, reader, null);
                    continue;
                }

                if (current == '<')
                {
                    return TagOpenState.Instance;
                }

                if (current == -1)
                {
                    tokenizer.EmitToken(new EndOfFileToken());
                    return null;
                }

                if (current == '\0')
                {
                    ReportParseError();
                    tokenizer.EmitChar('\0');
                    continue;
                }

                // Hand control back after every ordinary character so that the
                // caller has a chance to switch to a different state.
                tokenizer.EmitChar((char)current);
                return this;
            }
        }
Example #2
0
        /// <summary>
        /// Reads one input character and dispatches on it: '&amp;' and '&lt;' switch to the
        /// RCDATA character-reference and less-than-sign states, NUL is reported as a
        /// parse error and replaced by U+FFFD, end of input emits an end-of-file token,
        /// and any other character is emitted unchanged.
        /// </summary>
        /// <param name="tokenizer">Tokenizer that receives the emitted characters and tokens.</param>
        /// <param name="reader">Input the character is read from.</param>
        /// <returns>
        /// The next state to run, or <c>null</c> once the end-of-file token has been emitted.
        /// </returns>
        public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
        {
            int c = Read(reader);
            switch (c)
            {
                case '&':
                    return CharacterReferenceInRCDATAState.Instance;

                case '<':
                    return RCDATALessThanSignState.Instance;

                case 0:
                    ReportParseError();
                    tokenizer.EmitChar('\uFFFD');
                    return this;

                case -1:
                    tokenizer.EmitToken(new EndOfFileToken());
                    // Returning this state here would re-enter it at end of input and
                    // emit the end-of-file token again; the data state signals
                    // completion with null, so do the same.
                    // NOTE(review): assumes the driving loop treats null as "stop" — confirm.
                    return null;

                default:
                    tokenizer.EmitChar((char)c);
                    return this;
            }
        }
 /// <summary>
 /// Handles the character following "&lt;/" in RCDATA content. An ASCII letter starts a
 /// new end tag token (tag name lowercased) and switches to the RCDATA end tag name
 /// state; anything else emits "&lt;/" as text and returns to the RCDATA state with the
 /// character queued for reconsumption.
 /// </summary>
 /// <param name="tokenizer">Tokenizer that receives emitted characters and owns the temporary buffer.</param>
 /// <param name="reader">Input the character is read from.</param>
 /// <returns><c>RCDATAEndTagNameState.Instance</c> for a letter, otherwise <c>RCDATAState.Instance</c>.</returns>
 public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
 {
     int c = Read(reader);

     bool isUppercase = base.IsUppercaseAsciiLetter(c);
     if (isUppercase || base.IsLowercaseAsciiLetter(c))
     {
         // Create a new end tag token whose tag name is the lowercase form of the
         // current input character. ToLowerInvariant is used because Char.ToLower
         // follows the current culture (e.g. 'I' becomes a dotless i under tr-TR),
         // whereas the spec asks for a plain ASCII +0x20 mapping.
         char nameChar = isUppercase ? Char.ToLowerInvariant((char)c) : (char)c;
         EndTagToken token = new EndTagToken() { TagName = nameChar.ToString() };

         // Append the current input character (original case) to the temporary buffer.
         // (http://www.w3.org/TR/html5/syntax.html#temporary-buffer)
         tokenizer.TemporaryBuffer.Add((char)c);

         // Don't emit the token yet; further details are filled in by the
         // RCDATA end tag name state before it is emitted.
         RCDATAEndTagNameState.Instance.Token = token;
         return RCDATAEndTagNameState.Instance;
     }

     tokenizer.EmitChar('<');
     tokenizer.EmitChar('/');

     // Queue the character on the state we are returning, matching how the other
     // states hand a character back (they enqueue on the target state's instance).
     // NOTE(review): when c is -1 the cast yields U+FFFF — confirm Read copes with that.
     RCDATAState.Instance.LastConsumedCharacters.Enqueue((char)c);
     return RCDATAState.Instance;
 }
        /// <summary>
        /// Continues an RCDATA end tag name. Whitespace, '/' and '&gt;' finish the tag when
        /// the token is an appropriate end tag token; ASCII letters extend the tag name
        /// (lowercased) and the temporary buffer. In every other situation the text
        /// "&lt;/" plus the temporary buffer is emitted as characters and the current
        /// character is queued on the RCDATA state.
        /// </summary>
        /// <param name="tokenizer">Tokenizer that receives emitted output and owns the temporary buffer.</param>
        /// <param name="reader">Input the character is read from.</param>
        /// <returns>The next state to run.</returns>
        public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
        {
            int c = Read(reader);
            if (IsWhitespace(c))
            {
                if (tokenizer.IsAppropriateEndTagToken(Token))
                {
                    return BeforeAttributeNameState.Instance;
                }
                // Not an appropriate end tag: fall through to the "anything else" handling.
            }
            else if (c == '/')
            {
                if (tokenizer.IsAppropriateEndTagToken(Token))
                {
                    return SelfClosingStartTagState.Instance;
                }
            }
            else if (c == '>')
            {
                if (tokenizer.IsAppropriateEndTagToken(Token))
                {
                    tokenizer.EmitToken(Token); // TODO - is this the right token to emit?
                    return DataState.Instance;
                }
            }
            else if (IsUppercaseAsciiLetter(c))
            {
                // Culture-invariant lowercasing: Char.ToLower follows the current
                // culture and would turn 'I' into a dotless i under tr-TR.
                Token.TagName += Char.ToLowerInvariant((char)c);
                tokenizer.TemporaryBuffer.Add((char)c);
                return this;
            }
            else if (IsLowercaseAsciiLetter(c))
            {
                Token.TagName += (char)c;
                tokenizer.TemporaryBuffer.Add((char)c);
                return this;
            }

            // Anything else: the would-be end tag is ordinary text after all.
            tokenizer.EmitChar('<');
            tokenizer.EmitChar('/');
            foreach (char bc in tokenizer.TemporaryBuffer)
            {
                tokenizer.EmitChar(bc);
            }
            RCDATAState.Instance.LastConsumedCharacters.Enqueue((char)c);
            return RCDATAState.Instance;
        }
Example #5
0
 /// <summary>
 /// Reads input characters: NUL is reported as a parse error and replaced by U+FFFD,
 /// end of input emits an end-of-file token, and any other character is emitted
 /// unchanged.
 /// </summary>
 /// <param name="tokenizer">Tokenizer that receives the emitted characters and tokens.</param>
 /// <param name="reader">Input the characters are read from.</param>
 /// <returns>
 /// This state after an ordinary character, or <c>null</c> once the end-of-file
 /// token has been emitted.
 /// </returns>
 public override BaseState Process(HtmlTokenizer tokenizer, System.IO.StreamReader reader)
 {
     for (;;)
     {
         int c = Read(reader);
         switch (c)
         {
             case 0:
                 ReportParseError();
                 tokenizer.EmitChar('\uFFFD');
                 break; // leaves the switch only; the loop reads the next character

             case -1:
                 tokenizer.EmitToken(new EndOfFileToken());
                 // A 'break' here would only leave the switch, so the loop would keep
                 // reading at end of input and never terminate. Stop instead, using
                 // null like the data state does.
                 // NOTE(review): assumes the driving loop treats null as "stop" — confirm.
                 return null;

             default:
                 tokenizer.EmitChar((char)c);
                 return this;
         }
     }
 }
 /// <summary>
 /// Handles the character following '&lt;' in RCDATA content. A '/' clears the
 /// temporary buffer and moves to the RCDATA end tag open state; anything else emits
 /// the '&lt;' as text and returns to the RCDATA state with the character queued on it.
 /// </summary>
 /// <param name="tokenizer">Tokenizer that receives emitted characters and owns the temporary buffer.</param>
 /// <param name="reader">Input the character is read from.</param>
 /// <returns><c>RCDATAEndTagOpenState.Instance</c> for '/', otherwise <c>RCDATAState.Instance</c>.</returns>
 public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
 {
     int c = Read(reader);
     if (c == '/')
     {
         tokenizer.TemporaryBuffer.Clear();
         return RCDATAEndTagOpenState.Instance;
     }

     tokenizer.EmitChar('<');

     // Queue the character on the state we are returning, matching how the other
     // states hand a character back (they enqueue on the target state's instance).
     // NOTE(review): when c is -1 the cast yields U+FFFF — confirm Read copes with that.
     RCDATAState.Instance.LastConsumedCharacters.Enqueue((char)c);
     return RCDATAState.Instance;
 }
Example #7
0
        /// <summary>
        /// Handles the character following '&lt;' in ordinary data. '!' and '/' switch to
        /// the markup declaration open and end tag open states, an ASCII letter starts a
        /// new start tag token (tag name lowercased), '?' is a parse error leading to the
        /// bogus comment state, and anything else is a parse error: the '&lt;' is emitted
        /// as text and the character is queued on the data state.
        /// </summary>
        /// <param name="tokenizer">Tokenizer that receives the emitted characters.</param>
        /// <param name="reader">Input the character is read from.</param>
        /// <returns>The next state to run.</returns>
        public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
        {
            // NOTE(review): the cast turns an end-of-input value of -1 into U+FFFF,
            // which then ends up in the fallback branch below — confirm this is handled.
            char c = (char)Read(reader);
            if (c == '!')
            {
                return MarkupDeclarationOpenState.Instance;
            }

            if (c == '/')
            {
                return EndTagOpenState.Instance;
            }

            if (base.IsUppercaseAsciiLetter(c))
            {
                StartTagToken token = new StartTagToken();
                // Culture-invariant lowercasing: Char.ToLower follows the current
                // culture and would turn 'I' into a dotless i under tr-TR.
                token.TagName = Char.ToLowerInvariant(c).ToString();
                TagNameState.Instance.Token = token;
                return TagNameState.Instance;
            }

            if (base.IsLowercaseAsciiLetter(c))
            {
                StartTagToken token = new StartTagToken();
                token.TagName = c.ToString();
                TagNameState.Instance.Token = token;
                return TagNameState.Instance;
            }

            if (c == '?')
            {
                ReportParseError();
                return BogusCommentState.Instance;
            }

            // Anything else: the '<' was plain text. Emit the '<' itself (not the
            // current character, which is queued below and would otherwise be
            // output twice while the '<' went missing).
            ReportParseError();
            tokenizer.EmitChar('<');
            DataState.Instance.LastConsumedCharacters.Enqueue(c);
            return DataState.Instance;
        }
        /// <summary>
        /// Attempts to consume a character reference after an '&amp;' has been read and emits
        /// the resulting character, or a literal '&amp;' when no reference is recognised.
        /// (http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references)
        /// </summary>
        /// <param name="tokenizer">Tokenizer that receives the emitted characters.</param>
        /// <param name="reader">Input positioned just after the '&amp;'.</param>
        /// <param name="additionalAllowedCharacter">
        /// Optional extra character that, like whitespace, '&lt;' and end of input, means
        /// "this is not a character reference".
        /// (http://www.w3.org/TR/html5/syntax.html#additional-allowed-character)
        /// </param>
        /// <returns>Always <c>DataState.Instance</c>.</returns>
        public BaseState Process(HtmlTokenizer tokenizer, StreamReader reader, char? additionalAllowedCharacter)
        {
            int c = Peek(reader);

            bool isNotAReference =
                c == 9 || c == 0x0A || c == 0x0C || c == ' ' || c == '<' || c == -1 ||
                (additionalAllowedCharacter.HasValue && c == additionalAllowedCharacter.Value);

            if (isNotAReference)
            {
                // Nothing is consumed and nothing is returned, so the ampersand
                // itself is emitted as a character token.
                tokenizer.EmitChar('&');
            }
            else if (c == '#')
            {
                Read(reader); // consume '#'
                int nc = Peek(reader);
                bool isHex = nc == 'X' || nc == 'x';

                uint? val;
                if (isHex)
                {
                    Read(reader); // consume 'x' / 'X'
                    // http://www.w3.org/TR/html5/infrastructure.html#ascii-hex-digits
                    val = ConsumeHexDigits(reader);
                }
                else
                {
                    val = ConsumeDigits(reader);
                }

                if (val.HasValue)
                {
                    char parsedChar = GetCharFromNumericValue(val.Value);
                    tokenizer.EmitChar(parsedChar);
                }
                else
                {
                    // No digits: this was not a reference. The '#' (and 'x'/'X') were
                    // already read above and cannot be pushed back here, so emit
                    // them as text after the '&' instead of losing them.
                    // NOTE(review): assumes ConsumeDigits/ConsumeHexDigits consume
                    // nothing when they return null — confirm.
                    tokenizer.EmitChar('&');
                    tokenizer.EmitChar('#');
                    if (isHex)
                    {
                        tokenizer.EmitChar((char)nc);
                    }
                }
            }
            else
            {
                // TODO: named character references are not implemented yet. Per the spec:
                // consume the maximum number of characters matching an identifier in the
                // named character references table (case-sensitively). If no match can be
                // made, no characters are consumed and nothing is returned; if the
                // characters after the '&' are one or more ASCII alphanumerics followed
                // by ';', that is a parse error. Inside an attribute, a match not ending
                // in ';' that is followed by '=' or an ASCII alphanumeric must be
                // unconsumed (and '=' is a parse error). Otherwise a match not ending in
                // ';' is a parse error, and one or two character tokens are returned.
                //
                // Example: "I'm &notit; I tell you" parses "not" (parse error) giving
                // "I'm ¬it; I tell you", while "I'm &notin; I tell you" parses "notin;"
                // giving "I'm ∉ I tell you" (no parse error).
                //
                // Until that exists, behave as "no match": nothing has been consumed,
                // so emit the ampersand and let the caller emit what follows as text.
                tokenizer.EmitChar('&');
            }

            return DataState.Instance;
        }