Example #1
0
 // Starts an asynchronous parse of a small document, checks the parser state
 // before the task has finished, then blocks and verifies the finished tree.
 public void TestAsyncHtmlParsing()
 {
     var html = "<html><head><title>My test</title></head><body><p>Some text</p></body></html>";
     var htmlParser = new HtmlParser(html);
     var parsing = htmlParser.ParseAsync();

     // The test expects the task to still be running at this point, while the
     // Result object is already reachable. The completion check is deliberately
     // repeated after touching Result.
     Assert.IsFalse(parsing.IsCompleted);
     Assert.IsNotNull(htmlParser.Result);
     Assert.IsFalse(parsing.IsCompleted);

     // Block until parsing is done before inspecting the content.
     parsing.Wait();
     Assert.IsTrue(parsing.IsCompleted);

     var document = htmlParser.Result;
     Assert.IsNotNull(document);
     Assert.AreEqual("My test", document.Title);

     var body = document.Body;
     Assert.AreEqual(1, body.ChildElementCount);
     Assert.AreEqual("Some text", body.Children[0].TextContent);
 }
Example #2
0
        /// <summary>
        /// Parses the <c>HtmlFiles.W3C</c> sample asynchronously and, while the
        /// parse task is running, prints the elapsed milliseconds and the current
        /// number of elements roughly every 15 ms.
        /// </summary>
        /// <returns>A task that completes once parsing has finished and "Done!" was printed.</returns>
        static async Task TestAsync()
        {
            Console.WriteLine("Starting async!");
            var sw = Stopwatch.StartNew();
            var parser = new HtmlParser(HtmlFiles.W3C);

            var task = parser.ParseAsync();

            // Poll instead of awaiting directly so intermediate progress can be reported.
            while (!task.IsCompleted)
            {
                await Task.Delay(15);
                Console.WriteLine("{0} | {1} elements", sw.ElapsedMilliseconds, parser.Result.All.Length);
            }

            sw.Stop();

            // IsCompleted is also true for faulted and canceled tasks. Awaiting the
            // finished task returns immediately on success and rethrows otherwise,
            // so a failed parse is not silently reported as "Done!".
            await task;

            Console.WriteLine("Done!");
        }
Example #3
0
        /// <summary>
        /// Resets this document (location, cookie, children, ready state and
        /// quirks mode) and creates a parser over the stream obtained for the
        /// given URL.
        /// </summary>
        /// <param name="url">The address that is stored as location and passed to <c>Builder.Stream</c>.</param>
        /// <returns>The <c>Result</c> of the parser that was created for this document.</returns>
        public HTMLDocument Load(String url)
        {
            location = url;
            Cookie = new Cookie();

            // Remove existing children from last to first. The index has to count
            // DOWN: the previous i++ stepped past the end of the collection after
            // the first removal.
            for (int i = _children.Length - 1; i >= 0; i--)
            {
                RemoveChild(_children[i]);
            }

            ReadyState = Readiness.Loading;
            QuirksMode = QuirksMode.Off;
            var stream = Builder.Stream(url);
            var source = new SourceManager(stream);
            var parser = new HtmlParser(this, source);

            // NOTE(review): Parse() is not called explicitly here; presumably
            // reading Result triggers or exposes the parse - confirm against HtmlParser.
            return parser.Result;
        }
Example #4
0
        // With scripting switched on, the test expects the content of both
        // <noscript> elements to end up as plain text: "<!--" inside the one in
        // <head>, then "X" and a second <noscript> holding "-->" inside <body>.
        public void ScriptNoScriptWithCommentStartAndTextInsideBeforeClosing()
        {
            var document = new HTMLDocument();
            var htmlParser = new HtmlParser(document, "<!doctype html><noscript><!--</noscript>X<noscript>--></noscript>");
            document.Options = new DocumentOptions(scripting: true);
            htmlParser.Parse();

            // #document > doctype
            var doctype = document.ChildNodes[0] as DocumentType;
            Assert.IsNotNull(doctype);
            Assert.AreEqual(NodeType.DocumentType, doctype.NodeType);
            Assert.AreEqual("html", doctype.Name);

            // #document > html
            var htmlNode = document.ChildNodes[1];
            Assert.AreEqual(2, htmlNode.ChildNodes.Length);
            Assert.AreEqual(0, htmlNode.Attributes.Length);
            Assert.AreEqual("html", htmlNode.NodeName);
            Assert.AreEqual(NodeType.Element, htmlNode.NodeType);

            // html > head
            var headNode = htmlNode.ChildNodes[0];
            Assert.AreEqual(1, headNode.ChildNodes.Length);
            Assert.AreEqual(0, headNode.Attributes.Length);
            Assert.AreEqual("head", headNode.NodeName);
            Assert.AreEqual(NodeType.Element, headNode.NodeType);

            // head > noscript
            var headNoScript = headNode.ChildNodes[0];
            Assert.AreEqual(1, headNoScript.ChildNodes.Length);
            Assert.AreEqual(0, headNoScript.Attributes.Length);
            Assert.AreEqual("noscript", headNoScript.NodeName);
            Assert.AreEqual(NodeType.Element, headNoScript.NodeType);

            // head > noscript > text
            var headNoScriptText = headNoScript.ChildNodes[0];
            Assert.AreEqual(NodeType.Text, headNoScriptText.NodeType);
            Assert.AreEqual("<!--", headNoScriptText.TextContent);

            // html > body
            var bodyNode = htmlNode.ChildNodes[1];
            Assert.AreEqual(2, bodyNode.ChildNodes.Length);
            Assert.AreEqual(0, bodyNode.Attributes.Length);
            Assert.AreEqual("body", bodyNode.NodeName);
            Assert.AreEqual(NodeType.Element, bodyNode.NodeType);

            // body > text
            var bodyText = bodyNode.ChildNodes[0];
            Assert.AreEqual(NodeType.Text, bodyText.NodeType);
            Assert.AreEqual("X", bodyText.TextContent);

            // body > noscript
            var bodyNoScript = bodyNode.ChildNodes[1];
            Assert.AreEqual(1, bodyNoScript.ChildNodes.Length);
            Assert.AreEqual(0, bodyNoScript.Attributes.Length);
            Assert.AreEqual("noscript", bodyNoScript.NodeName);
            Assert.AreEqual(NodeType.Element, bodyNoScript.NodeType);

            // body > noscript > text
            var bodyNoScriptText = bodyNoScript.ChildNodes[0];
            Assert.AreEqual(NodeType.Text, bodyNoScriptText.NodeType);
            Assert.AreEqual("-->", bodyNoScriptText.TextContent);
        }
Example #5
0
        // With scripting switched on, the test expects "<iframe>" to be kept as
        // raw text inside the <noscript> in <head>, and the trailing "X" to be
        // the only child of <body>.
        public void ScriptNoScriptAfterDoctypeWithIFrameContentAndTextAfter()
        {
            var document = new HTMLDocument();
            var htmlParser = new HtmlParser(document, "<!doctype html><noscript><iframe></noscript>X");
            document.Options = new DocumentOptions(scripting: true);
            htmlParser.Parse();

            // #document > doctype
            var doctype = document.ChildNodes[0] as DocumentType;
            Assert.IsNotNull(doctype);
            Assert.AreEqual(NodeType.DocumentType, doctype.NodeType);
            Assert.AreEqual("html", doctype.Name);

            // #document > html
            var htmlNode = document.ChildNodes[1];
            Assert.AreEqual(2, htmlNode.ChildNodes.Length);
            Assert.AreEqual(0, htmlNode.Attributes.Length);
            Assert.AreEqual("html", htmlNode.NodeName);
            Assert.AreEqual(NodeType.Element, htmlNode.NodeType);

            // html > head
            var headNode = htmlNode.ChildNodes[0];
            Assert.AreEqual(1, headNode.ChildNodes.Length);
            Assert.AreEqual(0, headNode.Attributes.Length);
            Assert.AreEqual("head", headNode.NodeName);
            Assert.AreEqual(NodeType.Element, headNode.NodeType);

            // head > noscript
            var headNoScript = headNode.ChildNodes[0];
            Assert.AreEqual(1, headNoScript.ChildNodes.Length);
            Assert.AreEqual(0, headNoScript.Attributes.Length);
            Assert.AreEqual("noscript", headNoScript.NodeName);
            Assert.AreEqual(NodeType.Element, headNoScript.NodeType);

            // head > noscript > text
            var headNoScriptText = headNoScript.ChildNodes[0];
            Assert.AreEqual(NodeType.Text, headNoScriptText.NodeType);
            Assert.AreEqual("<iframe>", headNoScriptText.TextContent);

            // html > body
            var bodyNode = htmlNode.ChildNodes[1];
            Assert.AreEqual(1, bodyNode.ChildNodes.Length);
            Assert.AreEqual(0, bodyNode.Attributes.Length);
            Assert.AreEqual("body", bodyNode.NodeName);
            Assert.AreEqual(NodeType.Element, bodyNode.NodeType);

            // body > text
            var bodyText = bodyNode.ChildNodes[0];
            Assert.AreEqual(NodeType.Text, bodyText.NodeType);
            Assert.AreEqual("X", bodyText.TextContent);
        }
Example #6
0
        // With scripting switched on, the test expects the markup inside the
        // <noscript> within the paragraph to be a single text node, followed by
        // a regular <span> element containing "B".
        public void TreeParagraphWithTightAttributesAndNoScriptTagScriptingEnabled()
        {
            var document = new HTMLDocument();
            var htmlParser = new HtmlParser(document, @"<p id=""status""><noscript><strong>A</strong></noscript><span>B</span></p>");
            document.Options = new DocumentOptions(scripting: true);
            htmlParser.Parse();

            // #document > html
            var htmlNode = document.ChildNodes[0];
            Assert.AreEqual(2, htmlNode.ChildNodes.Length);
            Assert.AreEqual(0, htmlNode.Attributes.Length);
            Assert.AreEqual("html", htmlNode.NodeName);
            Assert.AreEqual(NodeType.Element, htmlNode.NodeType);

            // html > head (expected to be empty)
            var headNode = htmlNode.ChildNodes[0];
            Assert.AreEqual(0, headNode.ChildNodes.Length);
            Assert.AreEqual(0, headNode.Attributes.Length);
            Assert.AreEqual("head", headNode.NodeName);
            Assert.AreEqual(NodeType.Element, headNode.NodeType);

            // html > body
            var bodyNode = htmlNode.ChildNodes[1];
            Assert.AreEqual(1, bodyNode.ChildNodes.Length);
            Assert.AreEqual(0, bodyNode.Attributes.Length);
            Assert.AreEqual("body", bodyNode.NodeName);
            Assert.AreEqual(NodeType.Element, bodyNode.NodeType);

            // body > p[id=status]
            var paragraph = bodyNode.ChildNodes[0];
            Assert.AreEqual(2, paragraph.ChildNodes.Length);
            Assert.AreEqual(1, paragraph.Attributes.Length);
            Assert.AreEqual("p", paragraph.NodeName);
            Assert.AreEqual(NodeType.Element, paragraph.NodeType);
            Assert.AreEqual("status", paragraph.Attributes["id"].Value);

            // p > noscript
            var noScript = paragraph.ChildNodes[0];
            Assert.AreEqual(1, noScript.ChildNodes.Length);
            Assert.AreEqual(0, noScript.Attributes.Length);
            Assert.AreEqual("noscript", noScript.NodeName);
            Assert.AreEqual(NodeType.Element, noScript.NodeType);

            // p > noscript > text
            var noScriptText = noScript.ChildNodes[0];
            Assert.AreEqual(NodeType.Text, noScriptText.NodeType);
            Assert.AreEqual("<strong>A</strong>", noScriptText.TextContent);

            // p > span
            var span = paragraph.ChildNodes[1];
            Assert.AreEqual(1, span.ChildNodes.Length);
            Assert.AreEqual(0, span.Attributes.Length);
            Assert.AreEqual("span", span.NodeName);
            Assert.AreEqual(NodeType.Element, span.NodeType);

            // p > span > text
            var spanText = span.ChildNodes[0];
            Assert.AreEqual(NodeType.Text, spanText.NodeType);
            Assert.AreEqual("B", spanText.TextContent);
        }
Example #7
0
        // With scripting switched on, the test expects the outer <noscript> to
        // be closed by the first </noscript>: its text is "<!--<noscript>" and
        // the remaining "-->" becomes a text node in <body>.
        public void TreeNoScriptWithNoScriptCommentInside()
        {
            var document = new HTMLDocument();
            var htmlParser = new HtmlParser(document, @"<noscript><!--<noscript></noscript>--></noscript>");
            document.Options = new DocumentOptions(scripting: true);
            htmlParser.Parse();

            // #document > html
            var htmlNode = document.ChildNodes[0];
            Assert.AreEqual(2, htmlNode.ChildNodes.Length);
            Assert.AreEqual(0, htmlNode.Attributes.Length);
            Assert.AreEqual("html", htmlNode.NodeName);
            Assert.AreEqual(NodeType.Element, htmlNode.NodeType);

            // html > head
            var headNode = htmlNode.ChildNodes[0];
            Assert.AreEqual(1, headNode.ChildNodes.Length);
            Assert.AreEqual(0, headNode.Attributes.Length);
            Assert.AreEqual("head", headNode.NodeName);
            Assert.AreEqual(NodeType.Element, headNode.NodeType);

            // head > noscript
            var headNoScript = headNode.ChildNodes[0];
            Assert.AreEqual(1, headNoScript.ChildNodes.Length);
            Assert.AreEqual(0, headNoScript.Attributes.Length);
            Assert.AreEqual("noscript", headNoScript.NodeName);
            Assert.AreEqual(NodeType.Element, headNoScript.NodeType);

            // head > noscript > text
            var headNoScriptText = headNoScript.ChildNodes[0];
            Assert.AreEqual(NodeType.Text, headNoScriptText.NodeType);
            Assert.AreEqual("<!--<noscript>", headNoScriptText.TextContent);

            // html > body
            var bodyNode = htmlNode.ChildNodes[1];
            Assert.AreEqual(1, bodyNode.ChildNodes.Length);
            Assert.AreEqual(0, bodyNode.Attributes.Length);
            Assert.AreEqual("body", bodyNode.NodeName);
            Assert.AreEqual(NodeType.Element, bodyNode.NodeType);

            // body > text
            var bodyText = bodyNode.ChildNodes[0];
            Assert.AreEqual(NodeType.Text, bodyText.NodeType);
            Assert.AreEqual("-->", bodyText.TextContent);
        }
Example #8
0
        /// <summary>
        /// Resets this document (location, cookie, children, ready state and
        /// quirks mode) and starts fetching the given URL. Once the fetch task
        /// has finished successfully, the response is parsed into this document
        /// from a task continuation.
        /// </summary>
        /// <param name="url">The address that is assigned to the location and passed to <c>Builder.GetFromUrl</c>.</param>
        /// <returns>This document; it is returned right away, without waiting for the fetch or the parse.</returns>
        public HTMLDocument Load(String url)
        {
            _location.Href = url;
            Cookie = new Cookie();

            // Remove existing children from last to first. The index has to count
            // DOWN: the previous i++ stepped past the end of the collection after
            // the first removal.
            for (int i = _children.Length - 1; i >= 0; i--)
            {
                RemoveChild(_children[i]);
            }

            ReadyState = Readiness.Loading;
            QuirksMode = QuirksMode.Off;
            var task = Builder.GetFromUrl(url);

            task.ContinueWith(m =>
            {
                // A continuation only runs after the antecedent has finished, so
                // only the outcome matters. Canceled tasks must be excluded as
                // well, because reading Result on them throws.
                if (!m.IsFaulted && !m.IsCanceled)
                {
                    var stream = m.Result;
                    var source = new SourceManager(stream);
                    var parser = new HtmlParser(this, source);
                    parser.Parse();
                }
                // NOTE(review): on failure nothing is reported and ReadyState is
                // left at Loading - confirm whether callers rely on that.
            });

            return this;
        }