public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
        {
            // Decide what follows an attribute name: more attributes,
            // a self-closing slash, the end of the tag, or EOF.
            int c = Read(reader);
            if (IsWhitespace(c))
            {
                // Keep collecting attributes for the same containing tag.
                BeforeAttributeNameState.Instance.Token = Token.ContainingTag;
                return BeforeAttributeNameState.Instance;
            }

            if (c == '/')
            {
                SelfClosingStartTagState.Instance.Token = Token.ContainingTag;
                return SelfClosingStartTagState.Instance;
            }

            if (c == '>')
            {
                // Tag complete: emit it and resume tokenizing character data.
                tokenizer.EmitToken(Token);
                return DataState.Instance;
            }

            if (c == -1)
            {
                // EOF: parse error; the spec says to reconsume in the data state.
                ReportParseError();
                return DataState.Instance;
                //reconsume...
            }

            // Anything else: parse error; reconsume the character in the
            // "before attribute name" state.
            ReportParseError();
            BeforeAttributeNameState.Instance.Token = Token.ContainingTag;
            // BUGFIX: the reconsumed character must be queued on the state we
            // actually return (BeforeAttributeNameState); the original enqueued
            // it on BeforeDocTypeNameState, so the character was lost.
            BeforeAttributeNameState.Instance.LastConsumedCharacters.Enqueue((char)c);
            return BeforeAttributeNameState.Instance;
        }
        public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
        {
            // Skip any run of whitespace characters first.
            int c = Read(reader);
            while (IsWhitespace(c))
            {
                c = Read(reader);
            }

            switch (c)
            {
                case '>':
                    // DOCTYPE finished: emit it and return to the data state.
                    tokenizer.EmitToken(Token);
                    return DataState.Instance;

                case -1:
                    // EOF: parse error; emit the token with force-quirks set.
                    ReportParseError();
                    Token.ForceQuirks = true;
                    tokenizer.EmitToken(Token);
                    return DataState.Instance;
                    // Reconsume the EOF character (?)

                default:
                    // Unexpected character: parse error, fall into bogus DOCTYPE.
                    ReportParseError();
                    return BogusDocTypeState.Instance;
            }
        }
 public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
 {
     int c = Read(reader);

     // An ASCII letter begins an RCDATA end tag name. The token's tag name is
     // lowercased, while the raw input character is appended to the temporary
     // buffer (http://www.w3.org/TR/html5/syntax.html#temporary-buffer).
     // The token is not emitted yet; the end tag name state fills it in.
     if (base.IsUppercaseAsciiLetter(c) || base.IsLowercaseAsciiLetter(c))
     {
         char raw = (char)c;
         string name = base.IsUppercaseAsciiLetter(c)
             ? Char.ToLower(raw).ToString()
             : raw.ToString();

         tokenizer.TemporaryBuffer.Add(raw);
         RCDATAEndTagNameState.Instance.Token = new EndTagToken(){ TagName = name };
         return RCDATAEndTagNameState.Instance;
     }

     // Not a tag after all: emit "</" as character data and reconsume the
     // current character back in the RCDATA state.
     tokenizer.EmitChar('<');
     tokenizer.EmitChar('/');
     LastConsumedCharacters.Enqueue((char)c);
     return RCDATAState.Instance;
 }
Example #4
0
        /// <summary>
        /// Parses the text supplied by a text provider over the given range,
        /// raising start/complete events and recording timing statistics.
        /// </summary>
        /// <param name="textProvider">Text provider</param>
        /// <param name="range">Range to parse</param>
        public void Parse(ITextProvider textProvider, ITextRange range)
        {
            DateTime? timeStart = null;
            if (Stats.Enabled)
            {
                timeStart = DateTime.UtcNow;
            }

            ParsingStarting?.Invoke(this, new HtmlParserRangeEventArgs(range));

            DocType = DocType.Undefined;

            _cs = new HtmlCharStream(textProvider, range);
            _tokenizer = new HtmlTokenizer(_cs);
            _softRangeEnd = range.End;

            OnTextState();

            ParsingComplete?.Invoke(this, new HtmlParserRangeEventArgs(range));

            if (Stats.Enabled)
            {
                // Throughput: characters per second, rounded to nearest integer.
                Stats.ParseTime = (DateTime.UtcNow - timeStart.Value);
                Stats.CharactersPerSecond = (int)(1000.0 * (double)_cs.Length / (double)Stats.ParseTime.TotalMilliseconds + 0.5);
            }
        }
Example #5
0
        public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
        {
            int c = Read(reader);

            // An ASCII letter starts a new end tag token; the tag name is
            // lowercased and tokenization continues in the tag name state.
            if (base.IsUppercaseAsciiLetter(c) || base.IsLowercaseAsciiLetter(c))
            {
                string name = base.IsUppercaseAsciiLetter(c)
                    ? Char.ToLower((char)c).ToString()
                    : ((char)c).ToString();
                TagNameState.Instance.Token = new EndTagToken() { TagName = name };
                return TagNameState.Instance;
            }

            switch (c)
            {
                case '>':
                    // "</>" : parse error, ignored; back to data.
                    ReportParseError();
                    return DataState.Instance;

                case -1:
                    // EOF right after "</": parse error; back to data.
                    ReportParseError();
                    return DataState.Instance;

                default:
                    // Anything else becomes a bogus comment.
                    ReportParseError();
                    return BogusCommentState.Instance;
            }
        }
Example #6
0
        static void Main(string[] args)
        {
            // Tokenize a sample document and dump tokens and errors to stdout.
            var tokenizer = new HtmlTokenizer(File.ReadAllText("simple.html"), true);

            Console.WriteLine("=== <Begin Tokenization> ===");
            var result = tokenizer.Run();
            Console.WriteLine("=== <End Tokenization> ===");
            Console.WriteLine();

            Console.WriteLine("=== <Begin Tokens> ===");
            foreach (var token in result.Tokens)
            {
                Console.WriteLine(token);
            }
            Console.WriteLine("=== <End Tokens> ===");
            Console.WriteLine();

            Console.WriteLine("=== <Begin Errors> ===");
            foreach (var error in result.Errors)
            {
                Console.WriteLine(error);
            }
            Console.WriteLine("=== <End Errors> ===");
        }
Example #7
0
        public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
        {
            int c = Read(reader);

            if (c == '&')
            {
                return CharacterReferenceInRCDATAState.Instance;
            }

            if (c == '<')
            {
                return RCDATALessThanSignState.Instance;
            }

            if (c == 0)
            {
                // NULL in RCDATA: parse error; emit U+FFFD REPLACEMENT CHARACTER.
                ReportParseError();
                tokenizer.EmitChar('\uFFFD');
                return this;
            }

            if (c == -1)
            {
                tokenizer.EmitToken(new EndOfFileToken());
                return this;
            }

            // Ordinary character data.
            tokenizer.EmitChar((char)c);
            return this;
        }
        public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
        {
            // Consume attribute-value characters until the closing quote.
            while (true)
            {
                int c = Read(reader);

                if (c == m_QuoteChar)
                {
                    AfterAttributeValueQuotedState.Instance.Token = Token;
                    return AfterAttributeValueQuotedState.Instance;
                }
                else if (c == '&')
                {
                    // Character reference inside the value; resolved in place.
                    CharacterReferenceInAttributeValueState.Instance.Process(reader, m_QuoteChar, Token);
                }
                else if (c == 0)
                {
                    // NULL: parse error; substitute U+FFFD and keep going.
                    ReportParseError();
                    Token.AttributeValue += '\uFFFD';
                }
                else if (c == -1)
                {
                    // EOF inside a quoted value: parse error; back to data.
                    ReportParseError();
                    return DataState.Instance;
                    // Reconsume the EOF character. (?)
                }
                else
                {
                    Token.AttributeValue += (char)c;
                }
            }
        }
Example #9
0
        public void After_Cancelling_Lookahead_Tokenizer_Returns_Same_Tokens_As_It_Did_Before_Lookahead()
        {
            HtmlTokenizer tokenizer = new HtmlTokenizer(
                new SeekableTextReader(new StringReader("<foo>"))
                );

            // The same three symbols are expected both inside the lookahead
            // and again after it is cancelled.
            var expected = new[]
            {
                new HtmlSymbol(0, 0, 0, "<", HtmlSymbolType.OpenAngle),
                new HtmlSymbol(1, 0, 1, "foo", HtmlSymbolType.Text),
                new HtmlSymbol(4, 0, 4, ">", HtmlSymbolType.CloseAngle),
            };

            using (tokenizer.Source.BeginLookahead())
            {
                foreach (var symbol in expected)
                {
                    Assert.Equal(symbol, tokenizer.NextSymbol());
                }
            }

            // Cancelling the lookahead rewinds the source, so the tokenizer
            // must replay the identical symbol sequence.
            foreach (var symbol in expected)
            {
                Assert.Equal(symbol, tokenizer.NextSymbol());
            }
        }
Example #10
0
            public override TokenStream tokenStream(string fieldName, java.io.Reader reader)
            {
                // Tokenize the HTML, then normalize every token to lower case.
                return new LowerCaseFilter(new HtmlTokenizer(reader));
            }
Example #11
0
        public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
        {
            while (true)
            {
                int c = Read(reader);

                if (c == '&')
                {
                    // Resolve the character reference inline, then keep consuming.
                    CharacterReferenceInDataState.Instance.Process(tokenizer, reader, null);
                }
                else if (c == '<')
                {
                    return TagOpenState.Instance;
                }
                else if (c == '\0')
                {
                    // NULL in the data state: parse error, but the character is
                    // emitted unchanged.
                    ReportParseError();
                    tokenizer.EmitChar('\0');
                }
                else if (c == -1)
                {
                    tokenizer.EmitToken(new EndOfFileToken());
                    return null;
                }
                else
                {
                    tokenizer.EmitChar((char)c);
                    return this; // Required to allow switching the state.
                }
            }
        }
            public void Push(HtmlContentIntermediateNode node)
            {
                var combined = new StringBuilder();
                var mapping = new List <(int offset, int sourceOffset)>();

                // Carry over any unconsumed tail of the current content,
                // preserving its original source offset.
                if (_content != null && _position < _content.Length)
                {
                    mapping.Add((0, _offsets[0].sourceOffset + _position));
                    combined.Append(_content, _position, _content.Length - _position);
                }

                // Append each HTML token from the node, recording where in the
                // combined string its source text begins.
                foreach (var child in node.Children)
                {
                    if (child is IntermediateToken token && token.IsHtml)
                    {
                        mapping.Add((combined.Length, token.Source.Value.AbsoluteIndex));
                        combined.Append(token.Content);
                    }
                }

                _content = combined.ToString();
                _offsets = mapping;
                _tokenizer = new HtmlTokenizer(new TextSource(_content), HtmlEntityService.Resolver);
                _position = 0;
            }
        public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
        {
            int c = Read(reader);

            if (c == '-')
            {
                // Second dash: possibly the end of the comment.
                CommentEndState.Instance.Token = Token;
                return CommentEndState.Instance;
            }

            if (c == -1)
            {
                // EOF after "-": parse error; emit what we have so far.
                ReportParseError();
                tokenizer.EmitToken(Token);
                return DataState.Instance;
                //Reconsume the EOF character (?)
            }

            if (c == 0)
            {
                // NULL after "-": parse error; the dash plus U+FFFD join the comment.
                ReportParseError();
                Token.Comment += "-\uFFFD";
            }
            else
            {
                Token.Comment += "-" + (char)c;
            }

            CommentState.Instance.Token = Token;
            return CommentState.Instance;
        }
Example #14
0
        public List <string> ExtractLinksHtmlKit()
        {
            stream.Seek(0, SeekOrigin.Begin);

            var links = new List <string>();
            var htmlTokenizer = new HtmlTokenizer(new StreamReader(stream));

            // Walk every token, keep only <a> tags, and collect each tag's
            // first href attribute value.
            while (htmlTokenizer.ReadNextToken(out var token))
            {
                if (token.Kind != HtmlKit.HtmlTokenKind.Tag)
                {
                    continue;
                }

                var tagToken = (HtmlTagToken)token;
                if (tagToken.Name != "a")
                {
                    continue;
                }

                foreach (var attribute in tagToken.Attributes)
                {
                    if (attribute.Name == "href")
                    {
                        links.Add(attribute.Value);
                        break;
                    }
                }
            }

            return links;
        }
Example #15
0
        /// <summary>
        /// Replaces an HTML document's text with its extracted plain-text
        /// blocks (one per line) and annotates each block's span. Adjacent text
        /// blocks separated only by "skippable" tags are merged into one block;
        /// a tag from mTagKeepList breaks the merge. No-op unless the
        /// document's contentType feature is "Html".
        /// </summary>
        public /*protected*/ override void ProcessDocument(Document document)
        {
            string contentType = document.Features.GetFeatureValue("contentType");

            if (contentType != "Html")
            {
                return;
            }
            try
            {
                HtmlTokenizer      htmlTokenizer = new HtmlTokenizer(document.Text, /*stemmer=*/ null, /*decode=*/ true, /*tokenize=*/ false, /*applySkipRules=*/ true);
                // idx tracks the character position of the next block within the
                // rebuilt output text (the +2 below presumably accounts for the
                // CR/LF added by AppendLine — TODO confirm).
                int                idx           = 0;
                ArrayList <string> txtBlocks     = new ArrayList <string>();
                // merge == true means the next text block continues the previous
                // annotation instead of starting a new one.
                bool               merge         = false;
                for (HtmlTokenizer.Enumerator e = (HtmlTokenizer.Enumerator)htmlTokenizer.GetEnumerator(); e.MoveNext();)
                {
                    if (e.CurrentToken.TokenType == HtmlTokenizer.TokenType.Text)
                    {
                        // Collapse the token's text onto one compacted line.
                        string textBlock = Utils.ToOneLine(e.Current.Trim(), /*compact=*/ true);
                        if (textBlock != "")
                        {
                            if (!merge)
                            {
                                // Start a fresh block and annotate its span.
                                txtBlocks.Add(textBlock);
                                document.AddAnnotation(new Annotation(idx, idx + textBlock.Length - 1, "TextBlock"));
                            }
                            else
                            {
                                // Extend the previous block: append the text with a
                                // single joining space (hence idx--), and replace
                                // the last annotation with one covering the
                                // widened span.
                                idx--;
                                txtBlocks.Last += " " + textBlock;
                                int oldStartIdx = document.GetAnnotationAt(document.AnnotationCount - 1).SpanStart;
                                document.RemoveAnnotationAt(document.AnnotationCount - 1);
                                document.AddAnnotation(new Annotation(oldStartIdx, idx + textBlock.Length - 1, "TextBlock"));
                            }
                            idx  += textBlock.Length + 2;
                            merge = true;
                        }
                    }
                    else
                    {
                        // A tag on the keep-list separates text blocks.
                        if (mTagKeepList.Contains(e.CurrentToken.TagName.ToLower()))
                        {
                            merge = false;
                        }
                    }
                }
                // Rebuild the document text as one block per line and flip the
                // content type to plain text.
                StringBuilder sb = new StringBuilder();
                foreach (string textBlock in txtBlocks)
                {
                    sb.AppendLine(textBlock);
                }
                document.Text = sb.ToString();
                document.Features.SetFeatureValue("contentType", "Text");
            }
            catch (Exception exception)
            {
                // Best-effort: log and leave the document unchanged on failure.
                mLogger.Error("ProcessDocument", exception);
            }
        }
Example #16
0
        public void TokenizationTagSelfClosingDetected()
        {
            // "<img />" must yield a tag token flagged as self-closing.
            var token = new HtmlTokenizer(new SourceManager("<img />")).Get();

            Assert.AreEqual(true, ((HtmlTagToken)token).IsSelfClosing);
        }
Example #17
0
        public void TokenizationTagSpacesBehind()
        {
            // Trailing spaces before '>' must not become part of the tag name.
            var token = new HtmlTokenizer(new SourceManager("<i   >")).Get();

            Assert.AreEqual("i", ((HtmlTagToken)token).Name);
        }
Example #18
0
        public void TokenizationCommentDetected()
        {
            // A well-formed "<!-- ... -->" yields a comment token.
            var token = new HtmlTokenizer(new SourceManager("<!-- hi my friend -->")).Get();

            Assert.AreEqual(HtmlTokenType.Comment, token.Type);
        }
Example #19
0
        public void TokenizationDoctypeDetected()
        {
            // "<!doctype html>" yields a DOCTYPE token.
            var token = new HtmlTokenizer(new SourceManager("<!doctype html>")).Get();

            Assert.AreEqual(HtmlTokenType.DOCTYPE, token.Type);
        }
Example #20
0
        public void TokenizationTagNameDetection()
        {
            // The tag token's Name must match the element name in the source.
            var token = new HtmlTokenizer(new SourceManager("<span>")).Get();

            Assert.AreEqual("span", ((HtmlTagToken)token).Name);
        }
Example #21
0
        public void TokenizationAttributeNameDetection()
        {
            // A bare (valueless) attribute still appears by name.
            var token = new HtmlTokenizer(new SourceManager("<input required>")).Get();

            Assert.AreEqual("required", ((HtmlTagToken)token).Attributes[0].Key);
        }
Example #22
0
        public void TokenizationAttributesDetected()
        {
            // All three single-quoted attributes must be captured.
            var token = new HtmlTokenizer(new SourceManager("<a target='_blank' href='http://whatever' title='ho'>")).Get();

            Assert.AreEqual(3, ((HtmlTagToken)token).Attributes.Count);
        }
Example #23
0
        public void TokenizationDoctypeDetected()
        {
            // "<!doctype html>" yields a Doctype token.
            var token = new HtmlTokenizer(new TextSource("<!doctype html>"), null).Get();

            Assert.AreEqual(HtmlTokenType.Doctype, token.Type);
        }
Example #24
0
        public void TokenizationFinalEOF()
        {
            // Empty input immediately yields an end-of-file token.
            var token = new HtmlTokenizer(new TextSource(""), null).Get();

            Assert.AreEqual(HtmlTokenType.EndOfFile, token.Type);
        }
Example #25
0
        public void TokenizationFinalEOF()
        {
            // Empty input immediately yields an EOF token.
            var token = new HtmlTokenizer(new SourceManager("")).Get();

            Assert.AreEqual(HtmlTokenType.EOF, token.Type);
        }
Example #26
0
        public void TokenizationTagMixedCaseHandling()
        {
            // Tag names are case-insensitive and normalized to lower case.
            var token = new HtmlTokenizer(new SourceManager("<InpUT>")).Get();

            Assert.AreEqual("input", ((HtmlTagToken)token).Name);
        }
        /// <summary>
        /// "Text" insertion mode: raw text / script content is inserted as
        /// character data until the matching end tag (or EOF) is seen, at which
        /// point the original insertion mode is restored.
        /// </summary>
        public override BaseInsertionModeState ProcessToken(HtmlTokenizer tokenizer, ITokenQueue queue, BaseToken token, IDocument doc)
        {
            CharacterToken characterToken = token as CharacterToken;
            if (characterToken != null)
            {
                // FIX: the INode returned by InsertCharacter was previously
                // stored in an unused local; discard it instead.
                InsertCharacter(characterToken, doc);
                return this;
            }

            if (token is EndOfFileToken)
            {
                ReportParseError();
                //TODO - If the current node is a script element, mark the script element as "already started".
                //TODO - Pop the current node off the stack of open elements.
                return TreeConstruction.Instance.GetOriginalInsertionModeState();
            }

            EndTagToken endTagToken = token as EndTagToken;
            if (endTagToken != null && endTagToken.TagName == "script")
            {
                // TODO: full </script> handling is unimplemented. Per the spec
                // this must perform a microtask checkpoint, provide a stable
                // state, pop the script element, restore the insertion mode,
                // manage the insertion point and script nesting level, prepare
                // the script, and run any pending parsing-blocking script
                // (which may spin the event loop or re-enter the parser).
                // See http://www.w3.org/TR/html5/syntax.html#scriptEndTag
                return TreeConstruction.Instance.GetOriginalInsertionModeState();
            }

            // Any other end tag: pop the current node and switch back to the
            // original insertion mode.
            TreeConstruction.Instance.StackOfOpenElements.Pop();
            return TreeConstruction.Instance.GetOriginalInsertionModeState();
        }
Example #28
0
        public void TokenizationBogusCommentEmpty()
        {
            // "<!>" becomes an empty bogus comment.
            var token = new HtmlTokenizer(new TextSource("<!>"), null).Get();

            Assert.AreEqual(HtmlTokenType.Comment, token.Type);
            Assert.AreEqual(String.Empty, token.Data);
        }
Example #29
0
        public void TokenizationBogusCommentQuestionMark()
        {
            // "<?>" becomes a bogus comment containing "?".
            var token = new HtmlTokenizer(new SourceManager("<?>")).Get();

            Assert.AreEqual(HtmlTokenType.Comment, token.Type);
            Assert.AreEqual("?", ((HtmlCommentToken)token).Data);
        }
Example #30
0
        public void TokenizationBogusCommentClosingTag()
        {
            // "</ >" (no tag name) becomes a bogus comment containing " ".
            var token = new HtmlTokenizer(new SourceManager("</ >")).Get();

            Assert.AreEqual(HtmlTokenType.Comment, token.Type);
            Assert.AreEqual(" ", ((HtmlCommentToken)token).Data);
        }
Example #31
0
        public void TokenizationStartTagDetection()
        {
            // "<p>" yields a start-tag token named "p".
            var token = new HtmlTokenizer(new SourceManager("<p>")).Get();

            Assert.AreEqual(HtmlTokenType.StartTag, token.Type);
            Assert.AreEqual("p", ((HtmlTagToken)token).Name);
        }
Example #32
0
        public void TokenizationBogusCommentEmpty()
        {
            // "<!>" becomes an empty bogus comment.
            var token = new HtmlTokenizer(new SourceManager("<!>")).Get();

            Assert.AreEqual(HtmlTokenType.Comment, token.Type);
            Assert.AreEqual(String.Empty, ((HtmlCommentToken)token).Data);
        }
 private BaseInsertionModeState ProcessUsingRulesOf(HtmlTokenizer tokenizer, ITokenQueue queue, BaseToken token, IDocument doc)
 {
     // Delegate to the "in body" insertion mode; if it requests a mode
     // switch, honor it, otherwise stay in this mode.
     var result = InBodyInsertionModeState.Instance.ProcessToken(tokenizer, queue, token, doc);
     return result == InBodyInsertionModeState.Instance ? this : result;
 }
Example #34
0
        public void TokenizationBogusCommentQuestionMark()
        {
            // "<?>" becomes a bogus comment containing "?".
            var token = new HtmlTokenizer(new TextSource("<?>"), null).Get();

            Assert.AreEqual(HtmlTokenType.Comment, token.Type);
            Assert.AreEqual("?", token.Data);
        }
Example #35
0
        public void TokenizationBogusCommentClosingTag()
        {
            // "</ >" (no tag name) becomes a bogus comment containing " ".
            var token = new HtmlTokenizer(new TextSource("</ >"), null).Get();

            Assert.AreEqual(HtmlTokenType.Comment, token.Type);
            Assert.AreEqual(" ", token.Data);
        }
        public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
        {
            // Skip leading whitespace before the attribute value.
            int c = Read(reader);
            while (IsWhitespace(c))
            {
                c = Read(reader);
            }

            switch (c)
            {
                case '"':
                    AttributeValueQuotedState.InstanceDoubleQuoted.Token = Token;
                    return AttributeValueQuotedState.InstanceDoubleQuoted;

                case '\'':
                    AttributeValueQuotedState.InstanceSingleQuoted.Token = Token;
                    return AttributeValueQuotedState.InstanceSingleQuoted;

                case '&':
                    // Ampersand starts an unquoted value; reconsume it there.
                    AttributeValueUnquotedState.Instance.Token = Token;
                    AttributeValueUnquotedState.Instance.LastConsumedCharacters.Enqueue((char)c);
                    return AttributeValueUnquotedState.Instance;

                case 0:
                    // NULL: parse error; the value starts with U+FFFD, unquoted.
                    ReportParseError();
                    Token.AttributeValue += "\uFFFD";
                    AttributeValueUnquotedState.Instance.Token = Token;
                    return AttributeValueUnquotedState.Instance;

                case '>':
                    // Missing attribute value: parse error; emit the tag anyway.
                    ReportParseError();
                    tokenizer.EmitToken(Token);
                    return DataState.Instance;

                case -1:
                    ReportParseError();
                    return DataState.Instance; //reconsume... ?

                default:
                    if (c == '<' || c == '=' || c == '`')
                    {
                        // Parse error, but the character still joins the value.
                        ReportParseError();
                    }
                    Token.AttributeValue += (char)c;
                    AttributeValueUnquotedState.Instance.Token = Token;
                    return AttributeValueUnquotedState.Instance;
            }
        }
Example #37
0
        public void Should_Parse_Start_Tags(string source, string expectedTagName)
        {
            // The first token of the source must be a start tag whose name and
            // data both equal the expected tag name.
            HtmlToken token = new HtmlTokenizer(source).GetToken();

            Assert.True(token.Type == HtmlToken.TokenType.START_TAG);
            Assert.Equal(expectedTagName, token.Name);
            Assert.Equal(expectedTagName, token.Data);
        }
        /// <summary>
        /// "Before head" insertion mode
        /// (http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode):
        /// ignores whitespace, inserts comments, rejects DOCTYPEs, opens the
        /// head element on a "head" start tag, and otherwise synthesizes an
        /// implied head element and reprocesses the current token "in head".
        /// </summary>
        public override BaseInsertionModeState ProcessToken(HtmlTokenizer tokenizer, ITokenQueue queue, BaseToken token, IDocument doc)
        {
            // Whitespace character tokens are ignored in this mode.
            if (IsWhitespace(token))
            {
                return this;
            }

            CommentToken commentToken = token as CommentToken;
            if (commentToken != null)
            {
                InsertComment(commentToken, doc);
                return this;
            }

            // A DOCTYPE here is a parse error and is ignored.
            if (token is DocTypeToken)
            {
                ReportParseError();
                return this;
            }

            StartTagToken startTagToken = token as StartTagToken;
            if (startTagToken != null)
            {
                if (startTagToken.TagName == "html")
                {
                    //TODO - Process the token using the rules for the "in body" insertion mode.
                    return this;
                }
                else if (startTagToken.TagName == "head")
                {
                    ((Document)doc).head = (IHTMLHeadElement)base.InsertHtmlElement(startTagToken, doc);
                    return InHeadInsertionModeState.Instance;
                }
            }

            // FIX: per the spec, only end tags OTHER than head/body/html/br are
            // a parse error to be ignored. The previous condition also
            // error-ignored every non-end-tag token (character tokens, other
            // start tags), which must instead fall through to "anything else".
            EndTagToken endTagToken = token as EndTagToken;
            if (endTagToken != null &&
                endTagToken.TagName != "head" &&
                endTagToken.TagName != "body" &&
                endTagToken.TagName != "html" &&
                endTagToken.TagName != "br")
            {
                ReportParseError();
                return this;
            }

            // Anything else: insert an HTML element for a "head" start tag token
            // with no attributes and set the head element pointer.
            StartTagToken dummyToken = new StartTagToken(){ TagName = "head" };
            ((Document)doc).head = (IHTMLHeadElement)InsertHtmlElement(dummyToken, doc);

            // Switch the insertion mode to "in head" and reprocess the token.
            queue.EnqueueTokenForReprocessing(token);
            return InHeadInsertionModeState.Instance;
        }
        public BaseState Process(HtmlTokenizer tokenizer, StreamReader reader, char? additionalAllowedCharacter)
        {
            // Character reference state: attempts to consume "&..." as a character
            // reference and emits the resulting character(s), or a bare '&' when no
            // reference is present. Always returns to the data state afterwards.
            //
            // Switch to the data state.
            // Attempt to consume a character reference, with no additional allowed character.
            // (http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references)
            // (http://www.w3.org/TR/html5/syntax.html#additional-allowed-character)
            //
            // If nothing is returned, emit a U+0026 AMPERSAND character (&) token.
            // Otherwise, emit the character tokens that were returned.

            // Peek (don't consume) so the non-reference cases below leave the
            // character for the next state to handle.
            int c = Peek(reader);

            if (c == 9 || c == 0x0A || c == 0x0C || c == ' ' || c == '<' || c == -1 ||
                additionalAllowedCharacter.HasValue && c == additionalAllowedCharacter.Value)
            {
                // Whitespace, '<', EOF, or the additional allowed character: not a
                // character reference. Nothing is consumed and nothing is emitted
                // here; NOTE(review): the spec says a lone '&' should be emitted as a
                // character token by the consumer — confirm the caller does this.

            } else if (c == '#')
            {
                // Numeric character reference: "&#..." (decimal) or "&#x..." (hex).
                Read(reader);
                int nc = Peek(reader);
                uint? val = null;
                if (nc == 'X' || nc == 'x')
                {
                    Read(reader);
                    val = ConsumeHexDigits(reader);
                    // http://www.w3.org/TR/html5/infrastructure.html#ascii-hex-digits
                } else
                {
                    val = ConsumeDigits(reader);
                }

                if (val.HasValue)
                {
                    // Map the numeric value to a character (handles the spec's
                    // replacement table / out-of-range cases in the helper).
                    char parsedChar = GetCharFromNumericValue(val.Value);
                    tokenizer.EmitChar(parsedChar);
                } else
                {
                    // No digits followed "&#": parse error per spec.
                    // NOTE(review): the consumed '#' (and possibly 'x'/'X') should be
                    // unconsumed per the spec, but only '&' is emitted here — the
                    // following characters are lost. TODO confirm/fix pushback.
                    tokenizer.EmitChar('&');
                }
            } else
            {
                // Named character reference — not yet implemented (spec text below).
                //
                // Consume the maximum number of characters possible, with the consumed characters matching one of the identifiers in the first column of the named character references table (in a case-sensitive manner).
                // If no match can be made, then no characters are consumed, and nothing is returned. In this case, if the characters after the U+0026 AMPERSAND character (&) consist of a sequence of one or more alphanumeric ASCII characters followed by a U+003B SEMICOLON character (;), then this is a parse error.
                // If the character reference is being consumed as part of an attribute, and the last character matched is not a ";" (U+003B) character, and the next character is either a "=" (U+003D) character or an alphanumeric ASCII character, then, for historical reasons, all the characters that were matched after the U+0026 AMPERSAND character (&) must be unconsumed, and nothing is returned. However, if this next character is in fact a "=" (U+003D) character, then this is a parse error, because some legacy user agents will misinterpret the markup in those cases.
                // Otherwise, a character reference is parsed. If the last character matched is not a ";" (U+003B) character, there is a parse error.
                // Return one or two character tokens for the character(s) corresponding to the character reference name (as given by the second column of the named character references table).
                //
                // Code Example:
                // If the markup contains (not in an attribute) the string "I'm &notit; I tell you", the character reference is
                // parsed as "not", as in, "I'm ¬it; I tell you" (and this is a parse error). But if the markup was
                // "I'm &notin; I tell you", the character reference would be parsed as "notin;", resulting in "I'm ∉ I tell you"
                // (and no parse error).

            }
            return DataState.Instance;
        }
Example #40
0
        public void TokenizationLongerCharacterReference()
        {
            const string content = "&abcdefghijklmnopqrstvwxyzABCDEFGHIJKLMNOPQRSTV;";

            var source    = new TextSource(content);
            var tokenizer = new HtmlTokenizer(source, null);
            var token     = tokenizer.Get();

            // An over-long, unmatched entity name must be passed through verbatim
            // as character data rather than being treated as a reference.
            Assert.AreEqual(HtmlTokenType.Character, token.Type);
            Assert.AreEqual(content, token.Data);
        }
Example #41
0
        public void TokenizationCDataDetected()
        {
            var source    = new SourceManager("<![CDATA[hi mum how <!-- are you doing />]]>");
            var tokenizer = new HtmlTokenizer(source);
            tokenizer.AcceptsCharacterData = true;

            var token = tokenizer.Get();

            // With character data accepted, the CDATA section surfaces as a
            // character token (the embedded comment/tag syntax is inert).
            Assert.AreEqual(HtmlTokenType.Character, token.Type);
        }
Example #42
0
 /// <summary>
 /// Creates a new instance of the HTML parser for the given document;
 /// the tokenizer is wired to the document's source and event sink.
 /// </summary>
 /// <param name="document">
 /// The document instance to be constructed.
 /// </param>
 internal HtmlDomBuilder(HtmlDocument document)
 {
     _document = document;
     _tokenizer = new HtmlTokenizer(document.Source, document.Options.Events);

     // Tree-construction bookkeeping starts out empty / in its initial mode.
     _currentMode = HtmlTreeMode.Initial;
     _frameset = true;
     _openElements = new List<Element>();
     _formattingElements = new List<Element>();
     _templateModes = new Stack<HtmlTreeMode>();
 }
Example #43
0
 /// <summary>
 /// Creates a new instance of the HTML parser for the given document,
 /// resolving named character references through the document's entity
 /// service (falling back to the default resolver).
 /// </summary>
 /// <param name="document">
 /// The document instance to be constructed.
 /// </param>
 internal HtmlDomBuilder(HtmlDocument document)
 {
     var entities = document.Options.GetService<IEntityService>() ?? HtmlEntityService.Resolver;

     _document = document;
     _tokenizer = new HtmlTokenizer(document.Source, document.Options.Events, entities);

     // Tree-construction bookkeeping starts out empty / in its initial mode.
     _currentMode = HtmlTreeMode.Initial;
     _frameset = true;
     _openElements = new List<Element>();
     _formattingElements = new List<Element>();
     _templateModes = new Stack<HtmlTreeMode>();
 }
Example #44
0
 public void After_Accepting_Lookahead_Tokenizer_Returns_Next_Token()
 {
     var tokenizer = new HtmlTokenizer(new SeekableTextReader(new StringReader("<foo>")));

     // Consume "<" and "foo" under a lookahead, then commit the position.
     using (LookaheadToken la = tokenizer.Source.BeginLookahead())
     {
         Assert.Equal(new HtmlSymbol(0, 0, 0, "<", HtmlSymbolType.OpenAngle), tokenizer.NextSymbol());
         Assert.Equal(new HtmlSymbol(1, 0, 1, "foo", HtmlSymbolType.Text), tokenizer.NextSymbol());
         la.Accept();
     }

     // Because the lookahead was accepted, tokenization resumes at ">".
     Assert.Equal(new HtmlSymbol(4, 0, 4, ">", HtmlSymbolType.CloseAngle), tokenizer.NextSymbol());
 }
Example #45
0
        public void After_Accepting_Lookahead_Tokenizer_Returns_Next_Token()
        {
            HtmlTokenizer tokenizer = new HtmlTokenizer(new SeekableTextReader(new StringReader("<foo>")));

            // Read "<" and "foo" inside a lookahead and commit it.
            using (LookaheadToken la = tokenizer.Source.BeginLookahead())
            {
                Assert.Equal(new HtmlSymbol(0, 0, 0, "<", HtmlSymbolType.OpenAngle), tokenizer.NextSymbol());
                Assert.Equal(new HtmlSymbol(1, 0, 1, "foo", HtmlSymbolType.Text), tokenizer.NextSymbol());
                la.Accept();
            }

            // An accepted lookahead advances the tokenizer, so ">" comes next.
            Assert.Equal(new HtmlSymbol(4, 0, 4, ">", HtmlSymbolType.CloseAngle), tokenizer.NextSymbol());
        }
 public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
 {
     // RCDATA less-than-sign state: "</" may begin an end tag; any other
     // character means the "<" was literal RCDATA text.
     int c = Read(reader);
     if (c != '/')
     {
         // Not an end tag: emit the "<" as a character token and hand the
         // just-read character back to the RCDATA state for reprocessing.
         tokenizer.EmitChar('<');
         LastConsumedCharacters.Enqueue((char)c);
         return RCDATAState.Instance;
     }

     // Start collecting a possible end tag name from scratch.
     tokenizer.TemporaryBuffer.Clear();
     return RCDATAEndTagOpenState.Instance;
 }
		public void GetTokens_SingleVoidTag_ReturnsSequence()
		{
			const string input = @"<root />";

			var tokenizer = new HtmlTokenizer();
			var actual = tokenizer.GetTokens(input).ToArray();

			// A self-closing tag tokenizes as a single void-element token.
			var expected = new[] { MarkupGrammar.TokenElementVoid(new DataName("root")) };
			Assert.Equal(expected, actual);
		}
Example #48
0
 public void After_Cancelling_Lookahead_Tokenizer_Returns_Same_Tokens_As_It_Did_Before_Lookahead()
 {
     var tokenizer = new HtmlTokenizer(new SeekableTextReader(new StringReader("<foo>")));

     var expected = new[]
     {
         new HtmlSymbol(0, 0, 0, "<", HtmlSymbolType.OpenAngle),
         new HtmlSymbol(1, 0, 1, "foo", HtmlSymbolType.Text),
         new HtmlSymbol(4, 0, 4, ">", HtmlSymbolType.CloseAngle),
     };

     // Consume every symbol inside a lookahead that is never accepted...
     using (tokenizer.Source.BeginLookahead())
     {
         foreach (var symbol in expected)
         {
             Assert.Equal(symbol, tokenizer.NextSymbol());
         }
     }

     // ...so disposing (cancelling) the lookahead rewinds the source and the
     // same symbol sequence is produced again.
     foreach (var symbol in expected)
     {
         Assert.Equal(symbol, tokenizer.NextSymbol());
     }
 }
        public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
        {
            // After DOCTYPE name state: skip whitespace, then look for the
            // "PUBLIC" / "SYSTEM" keyword or the end of the DOCTYPE.
            int c;
            do
            {
                c = Read(reader);
            } while (IsWhitespace(c));

            if (c == '>')
            {
                // DOCTYPE complete: emit it and return to the data state.
                tokenizer.EmitToken(Token);
                return DataState.Instance;
            }

            if (c == -1)
            {
                // Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag to on. Emit that DOCTYPE token. Reconsume the EOF character.
                ReportParseError();
                Token.ForceQuirks = true;
                tokenizer.EmitToken(Token);
                return DataState.Instance;
            }

            // Look ahead at six characters (the current one plus five more) to
            // check for the keywords. NOTE(review): near EOF, ReadBlock may fill
            // fewer than 5 slots; the remaining '\0's can never match a keyword,
            // so we correctly fall through to the bogus DOCTYPE path.
            char[] buffer = new char[6];
            buffer[0] = (char)c;
            int extra = ReadBlock(reader, buffer, 1, 5);
            string bufferStr = new string(buffer);

            // If the six characters starting from the current input character are an ASCII case-insensitive match for the
            // word "PUBLIC", then consume those characters and switch to the after DOCTYPE public keyword state.
            if (bufferStr.Equals("public", StringComparison.OrdinalIgnoreCase))
            {
                AfterDocTypePublicKeywordState.Instance.Token = Token;
                return AfterDocTypePublicKeywordState.Instance;
            }

            // Otherwise, if the six characters starting from the current input character are an ASCII case-insensitive match
            // for the word "SYSTEM", then consume those characters and switch to the after DOCTYPE system keyword state.
            if (bufferStr.Equals("system", StringComparison.OrdinalIgnoreCase))
            {
                // BUGFIX: the current DOCTYPE token was previously not handed to the
                // next state (unlike the PUBLIC branch above), so the system
                // identifier would have been parsed into a stale token.
                AfterDocTypeSystemKeywordState.Instance.Token = Token;
                return AfterDocTypeSystemKeywordState.Instance;
            }

            // Otherwise, this is a parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the bogus DOCTYPE
            // state.
            // BUGFIX: this used to Seek() the BaseStream backwards — unreliable under
            // StreamReader's internal buffering — and then transitioned to the bogus
            // COMMENT state, contradicting both the spec and the comment above.
            // Instead, push back the extra characters we over-read and continue in
            // the bogus DOCTYPE state with the current token.
            ReportParseError();
            Token.ForceQuirks = true;
            for (int i = 1; i <= extra; i++)
            {
                BogusDocTypeState.Instance.LastConsumedCharacters.Enqueue(buffer[i]);
            }
            BogusDocTypeState.Instance.Token = Token;
            return BogusDocTypeState.Instance;
        }
Example #50
0
        public void TokenizationUnusualDoctype()
        {
            var source    = new SourceManager("<!DOCTYPE root_element SYSTEM \"DTD_location\">");
            var tokenizer = new HtmlTokenizer(source);

            var token = tokenizer.Get();
            Assert.AreEqual(HtmlTokenType.DOCTYPE, token.Type);

            // A non-HTML DOCTYPE still yields a name and a system identifier.
            var doctype = (HtmlDoctypeToken)token;
            Assert.IsFalse(doctype.IsNameMissing);
            Assert.AreEqual("root_element", doctype.Name);
            Assert.IsFalse(doctype.IsSystemIdentifierMissing);
            Assert.AreEqual("DTD_location", doctype.SystemIdentifier);
        }
        public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
        {
            // RCDATA end tag name state: accumulates the candidate end tag name;
            // only an "appropriate" end tag (matching the last start tag) closes
            // the RCDATA run — otherwise everything is replayed as text.
            int c = Read(reader);
            if (IsWhitespace(c))
            {
                if (tokenizer.IsAppropriateEndTagToken(Token))
                {
                    // BUGFIX: hand the current end tag token to the next state, as
                    // sibling tag states do; previously the next state kept whatever
                    // stale token it held from an earlier tag.
                    BeforeAttributeNameState.Instance.Token = Token;
                    return BeforeAttributeNameState.Instance;
                }
            }
            else if (c == '/')
            {
                if (tokenizer.IsAppropriateEndTagToken(Token))
                {
                    // BUGFIX: propagate the current token (see whitespace branch).
                    SelfClosingStartTagState.Instance.Token = Token;
                    return SelfClosingStartTagState.Instance;
                }
            }
            else if (c == '>')
            {
                if (tokenizer.IsAppropriateEndTagToken(Token))
                {
                    // Emit the completed end tag token.
                    tokenizer.EmitToken(Token);
                    return DataState.Instance;
                }
            }
            else if (IsUppercaseAsciiLetter(c))
            {
                // Lowercase into the tag name, keep the original in the buffer
                // in case this turns out not to be an appropriate end tag.
                Token.TagName += Char.ToLower((char)c);
                tokenizer.TemporaryBuffer.Add((char)c);
                return this;
            }
            else if (IsLowercaseAsciiLetter(c))
            {
                Token.TagName += (char)c;
                tokenizer.TemporaryBuffer.Add((char)c);
                return this;
            }

            // Anything else (or an inappropriate end tag): replay "</" plus the
            // buffered characters as text and reprocess the current character.
            tokenizer.EmitChar('<');
            tokenizer.EmitChar('/');
            foreach (char bc in tokenizer.TemporaryBuffer)
            {
                tokenizer.EmitChar(bc);
            }
            RCDATAState.Instance.LastConsumedCharacters.Enqueue((char)c);
            return RCDATAState.Instance;
        }
        public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
        {
            // After DOCTYPE public keyword state: decide how (or whether) the
            // public identifier is quoted.
            int c = Read(reader);

            if (IsWhitespace(c))
            {
                BeforeDocTypePublicIdentifierState.Instance.Token = Token;
                return BeforeDocTypePublicIdentifierState.Instance;
            }

            switch (c)
            {
                case '"':
                    // Missing whitespace before the identifier: parse error, but
                    // continue with a double-quoted public identifier.
                    ReportParseError();
                    Token.PublicIdentifier = string.Empty;
                    DocTypePublicIdentifierQuotedState.InstanceDoubleQuoted.Token = Token;
                    return DocTypePublicIdentifierQuotedState.InstanceDoubleQuoted;

                case '\'':
                    // Same as above with a single-quoted identifier.
                    ReportParseError();
                    Token.PublicIdentifier = string.Empty;
                    DocTypePublicIdentifierQuotedState.InstanceSingleQuoted.Token = Token;
                    return DocTypePublicIdentifierQuotedState.InstanceSingleQuoted;

                case '>':
                case -1:
                    // Premature end of the DOCTYPE (or EOF): parse error, force
                    // quirks, emit the token, and return to the data state.
                    ReportParseError();
                    Token.ForceQuirks = true;
                    tokenizer.EmitToken(Token);
                    return DataState.Instance;

                default:
                    // Anything else: parse error; skip the rest as a bogus DOCTYPE.
                    ReportParseError();
                    Token.ForceQuirks = true;
                    BogusDocTypeState.Instance.Token = Token;
                    return BogusDocTypeState.Instance;
            }
        }
Example #53
0
 /// <summary>
 /// Creates a new instance of the HTML parser for the given document,
 /// wiring tokenizer errors into the document's context and resolving
 /// the element factories and entity provider from the parser options.
 /// </summary>
 /// <param name="document">
 /// The document instance to be constructed.
 /// </param>
 public HtmlDomBuilder(HtmlDocument document)
 {
     var options = document.Options;
     var context = document.Context;

     _document = document;
     _tokenizer = new HtmlTokenizer(document.Source, options.GetProvider<IEntityProvider>() ?? HtmlEntityService.Resolver);
     _tokenizer.Error += (_, error) => context.Fire(error);

     // Tree-construction bookkeeping starts out empty / in its initial mode.
     _currentMode = HtmlTreeMode.Initial;
     _frameset = true;
     _openElements = new List<Element>();
     _formattingElements = new List<Element>();
     _templateModes = new Stack<HtmlTreeMode>();

     _htmlFactory = options.GetFactory<IElementFactory<HtmlElement>>();
     _mathFactory = options.GetFactory<IElementFactory<MathElement>>();
     _svgFactory = options.GetFactory<IElementFactory<SvgElement>>();
 }
        public override BaseInsertionModeState ProcessToken(HtmlTokenizer tokenizer, ITokenQueue queue, BaseToken token, IDocument doc)
        {
            // "after body" insertion mode of the HTML5 tree construction stage.

            // Whitespace is processed using the rules of the "in body" mode.
            if (IsWhitespace(token))
            {
                return ProcessUsingRulesOf(tokenizer, queue, token, doc);
            }

            if (token is CommentToken)
            {
                //TODO - Insert a comment as the last child of the first element in the stack of open elements (the html element).
                return this;
            }

            if (token is DocTypeToken)
            {
                // A DOCTYPE after the body is a parse error; ignore the token.
                ReportParseError();
                return this;
            }

            StartTagToken start = token as StartTagToken;
            if (start != null && start.TagName == "html")
            {
                return ProcessUsingRulesOf(tokenizer, queue, token, doc);
            }

            EndTagToken end = token as EndTagToken;
            if (end != null && end.TagName == "html")
            {
                //TODO - If the parser was originally created as part of the HTML fragment parsing algorithm,
                //TODO - this is a parse error; ignore the token. (fragment case)
                //TODO - Otherwise, switch the insertion mode to "after after body".
                return AfterAfterBodyInsertionModeState.Instance;
            }

            if (token is EndOfFileToken)
            {
                base.StopParsing();
                return null;
            }

            // Anything else: parse error; reprocess the token in "in body" mode.
            ReportParseError();
            queue.EnqueueTokenForReprocessing(token);
            return InBodyInsertionModeState.Instance;
        }
Example #55
0
        public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
        {
            // DOCTYPE name state: accumulate the DOCTYPE name one character at a
            // time until whitespace, '>', or EOF ends it.
            while (true)
            {
                int c = Read(reader);

                if (IsWhitespace(c))
                {
                    // Name finished; continue after the DOCTYPE name.
                    AfterDocTypeNameState.Instance.Token = Token;
                    return AfterDocTypeNameState.Instance;
                }

                if (c == '>')
                {
                    // DOCTYPE complete: emit the token.
                    tokenizer.EmitToken(Token);
                    return DataState.Instance;
                }

                if (c == -1)
                {
                    // Parse error: EOF inside the DOCTYPE name. Force quirks,
                    // emit what we have, and return to the data state.
                    ReportParseError();
                    Token.ForceQuirks = true;
                    tokenizer.EmitToken(Token);
                    return DataState.Instance;
                }

                if (c == 0)
                {
                    // Parse error: a NUL becomes U+FFFD in the name.
                    ReportParseError();
                    Token.Name += "\uFFFD";
                }
                else if (base.IsUppercaseAsciiLetter(c))
                {
                    // Uppercase ASCII is lowercased in the DOCTYPE name.
                    Token.Name += Char.ToLower((char)c);
                }
                else
                {
                    Token.Name += (char)c;
                }
            }
        }
        public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
        {
            // Before DOCTYPE name state: skip whitespace, then start the DOCTYPE
            // token's name from the first significant character.
            int c = Read(reader);
            while (IsWhitespace(c))
            {
                c = Read(reader);
            }

            DocTypeToken token = new DocTypeToken();
            DocTypeNameState.Instance.Token = token;

            switch (c)
            {
                case '>':
                    // Parse error: a DOCTYPE with no name. Force quirks and emit.
                    ReportParseError();
                    token.ForceQuirks = true;
                    tokenizer.EmitToken(token);
                    return DataState.Instance;

                case -1:
                    // Parse error: EOF before any name. Force quirks, emit, and
                    // return to the data state.
                    ReportParseError();
                    token.ForceQuirks = true;
                    tokenizer.EmitToken(token);
                    return DataState.Instance;

                case 0:
                    // Parse error: a NUL becomes U+FFFD as the first name character.
                    ReportParseError();
                    token.Name = "\uFFFD";
                    return DocTypeNameState.Instance;

                default:
                    // Uppercase ASCII is lowercased; everything else is taken as-is.
                    token.Name = base.IsUppercaseAsciiLetter(c)
                        ? Char.ToLower((char)c).ToString()
                        : ((char)c).ToString();
                    return DocTypeNameState.Instance;
            }
        }
Example #57
0
 public override BaseState Process(HtmlTokenizer tokenizer, System.IO.StreamReader reader)
 {
     // Consumes input and emits character tokens; NULs become U+FFFD (with a
     // parse error) and consumption continues, while EOF and ordinary
     // characters end this state's turn.
     for (;;){
         int c = Read(reader);
         switch(c)
         {
             case 0:
                 ReportParseError();
                 tokenizer.EmitChar('\uFFFD');
                 break;
             case -1:
                 // End of input: emit the EOF token and stop.
                 // BUGFIX: previously this case fell back into the loop via
                 // `break`, re-reading EOF and emitting end-of-file tokens forever.
                 tokenizer.EmitToken(new EndOfFileToken());
                 return this;
             default:
                 tokenizer.EmitChar((char)c);
                 return this;
         }
     }
 }
Example #58
0
        public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
        {
            // DOCTYPE state: entered right after "<!DOCTYPE" has been consumed.
            int c = Read(reader);
            if (IsWhitespace(c))
            {
                return BeforeDocTypeNameState.Instance;
            }

            if (c==-1)
            {
                // Parse error. Switch to the data state. Create a new DOCTYPE token,
                // set its force-quirks flag on, and EMIT it.
                // BUGFIX: the token was previously created but never emitted (the
                // local was unused), so consumers never saw the quirks-mode DOCTYPE.
                ReportParseError();
                DocTypeToken token = new DocTypeToken() { ForceQuirks = true };
                tokenizer.EmitToken(token);
                return DataState.Instance;
            }

            // Anything else: parse error; reconsume the character in the
            // before DOCTYPE name state.
            ReportParseError();
            BeforeDocTypeNameState.Instance.LastConsumedCharacters.Enqueue((char)c);
            return BeforeDocTypeNameState.Instance;
        }
Example #59
0
        public override BaseState Process(HtmlTokenizer tokenizer, StreamReader reader)
        {
            // Tag open state: entered right after a '<' has been consumed.
            // BUGFIX: the original cast Read()'s result straight to char, turning
            // the EOF sentinel (-1) into U+FFFF so end-of-input was never detected.
            int c = Read(reader);
            if (c == '!')
            {
                return MarkupDeclarationOpenState.Instance;
            }

            if (c == '/')
            {
                return EndTagOpenState.Instance;
            }

            if (base.IsUppercaseAsciiLetter(c))
            {
                // Start a new start tag token with the lowercased first letter.
                StartTagToken token = new StartTagToken();
                token.TagName = Char.ToLower((char)c).ToString();
                TagNameState.Instance.Token = token;
                return TagNameState.Instance;
            }

            if (base.IsLowercaseAsciiLetter(c))
            {
                StartTagToken token = new StartTagToken();
                token.TagName = ((char)c).ToString();
                TagNameState.Instance.Token = token;
                return TagNameState.Instance;
            }

            if (c == '?')
            {
                // Parse error: "<?" is handled as a bogus comment.
                ReportParseError();
                return BogusCommentState.Instance;
            }

            if (c == -1)
            {
                // Parse error: EOF after '<'. Emit the '<' as text and return to
                // the data state (which will see EOF again).
                ReportParseError();
                tokenizer.EmitChar('<');
                return DataState.Instance;
            }

            // Anything else: parse error. Per spec, emit a '<' character token and
            // reconsume the current character in the data state.
            // BUGFIX: the original emitted the consumed character itself AND
            // re-enqueued it for the data state — duplicating it in the output and
            // dropping the '<' entirely.
            ReportParseError();
            tokenizer.EmitChar('<');
            DataState.Instance.LastConsumedCharacters.Enqueue((char)c);
            return DataState.Instance;
        }
 /// <summary>
 /// Processes a single token in this insertion mode and returns the insertion
 /// mode state the tree builder should use next (implemented by each concrete
 /// insertion-mode state; see the overrides elsewhere in this file).
 /// </summary>
 public abstract BaseInsertionModeState ProcessToken(HtmlTokenizer tokenizer, ITokenQueue queue, BaseToken token, IDocument doc);