/// <summary>
/// Starts an asynchronous parse of a small HTML document and checks the task state and the
/// parser result both before and after waiting for the task to finish.
/// </summary>
public void TestAsyncHtmlParsing()
{
    var markup = "<html><head><title>My test</title></head><body><p>Some text</p></body></html>";
    var htmlParser = new HtmlParser(markup);

    var pending = htmlParser.ParseAsync();

    // NOTE(review): the assertions before Wait() expect the task to still be running when
    // they execute, so they depend on timing -- confirm this is intended.
    Assert.IsFalse(pending.IsCompleted);
    Assert.IsNotNull(htmlParser.Result);
    Assert.IsFalse(pending.IsCompleted);

    pending.Wait();

    // After the wait the task is finished and the document content can be inspected.
    Assert.IsTrue(pending.IsCompleted);
    Assert.IsNotNull(htmlParser.Result);
    Assert.AreEqual("My test", htmlParser.Result.Title);
    Assert.AreEqual(1, htmlParser.Result.Body.ChildElementCount);
    Assert.AreEqual("Some text", htmlParser.Result.Body.Children[0].TextContent);
}
/// <summary>
/// Parses the W3C sample document asynchronously and, roughly every 15 ms until the parse
/// task completes, prints the elapsed time together with the current element count.
/// </summary>
static async Task TestAsync()
{
    Console.WriteLine("Starting async!");

    var timer = Stopwatch.StartNew();
    var htmlParser = new HtmlParser(HtmlFiles.W3C);
    var parsing = htmlParser.ParseAsync();

    while (true)
    {
        if (parsing.IsCompleted)
        {
            break;
        }

        // Yield for a short moment, then report the progress made so far.
        await Task.Delay(15);
        Console.WriteLine("{0} | {1} elements", timer.ElapsedMilliseconds, htmlParser.Result.All.Length);
    }

    timer.Stop();
    Console.WriteLine("Done!");
}
/// <summary>
/// Loads the document from the given URL: stores the location, assigns a fresh cookie,
/// removes all current child nodes, resets the ready state and quirks mode, and creates a
/// parser over the stream obtained for the URL.
/// </summary>
/// <param name="url">The URL to load the content from.</param>
/// <returns>The <c>Result</c> of the parser created for this document.</returns>
public HTMLDocument Load(String url)
{
    location = url;
    Cookie = new Cookie();

    // Walk the children from last to first so that removing one never shifts the index
    // of a child that still has to be visited. (The index must count DOWN; counting up
    // from the last index runs past the end of the list after the first removal.)
    for (int i = _children.Length - 1; i >= 0; i--)
    {
        RemoveChild(_children[i]);
    }

    ReadyState = Readiness.Loading;
    QuirksMode = QuirksMode.Off;

    var stream = Builder.Stream(url);
    var source = new SourceManager(stream);
    var parser = new HtmlParser(this, source);
    return parser.Result;
}
/// <summary>
/// Parses a doctype followed by a noscript containing a comment start, the text "X" and a
/// second noscript, with the scripting option set to true, and verifies the resulting tree:
/// the first noscript lands in head with text "&lt;!--", while body holds "X" and a noscript
/// with text "--&gt;".
/// </summary>
public void ScriptNoScriptWithCommentStartAndTextInsideBeforeClosing()
{
    var document = new HTMLDocument();
    var markup = "<!doctype html><noscript><!--</noscript>X<noscript>--></noscript>";
    var htmlParser = new HtmlParser(document, markup);
    document.Options = new DocumentOptions(scripting: true);
    htmlParser.Parse();

    // Document type node
    var doctype = document.ChildNodes[0] as DocumentType;
    Assert.IsNotNull(doctype);
    Assert.AreEqual(NodeType.DocumentType, doctype.NodeType);
    Assert.AreEqual("html", doctype.Name);

    // <html>
    var html = document.ChildNodes[1];
    Assert.AreEqual(2, html.ChildNodes.Length);
    Assert.AreEqual(0, html.Attributes.Length);
    Assert.AreEqual("html", html.NodeName);
    Assert.AreEqual(NodeType.Element, html.NodeType);

    // <html> / <head>
    var head = html.ChildNodes[0];
    Assert.AreEqual(1, head.ChildNodes.Length);
    Assert.AreEqual(0, head.Attributes.Length);
    Assert.AreEqual("head", head.NodeName);
    Assert.AreEqual(NodeType.Element, head.NodeType);

    // <html> / <head> / <noscript>
    var headNoscript = head.ChildNodes[0];
    Assert.AreEqual(1, headNoscript.ChildNodes.Length);
    Assert.AreEqual(0, headNoscript.Attributes.Length);
    Assert.AreEqual("noscript", headNoscript.NodeName);
    Assert.AreEqual(NodeType.Element, headNoscript.NodeType);

    // The comment start is kept as plain text inside the noscript element.
    var headNoscriptText = headNoscript.ChildNodes[0];
    Assert.AreEqual(NodeType.Text, headNoscriptText.NodeType);
    Assert.AreEqual("<!--", headNoscriptText.TextContent);

    // <html> / <body>
    var body = html.ChildNodes[1];
    Assert.AreEqual(2, body.ChildNodes.Length);
    Assert.AreEqual(0, body.Attributes.Length);
    Assert.AreEqual("body", body.NodeName);
    Assert.AreEqual(NodeType.Element, body.NodeType);

    // <html> / <body> / text
    var bodyText = body.ChildNodes[0];
    Assert.AreEqual(NodeType.Text, bodyText.NodeType);
    Assert.AreEqual("X", bodyText.TextContent);

    // <html> / <body> / <noscript>
    var bodyNoscript = body.ChildNodes[1];
    Assert.AreEqual(1, bodyNoscript.ChildNodes.Length);
    Assert.AreEqual(0, bodyNoscript.Attributes.Length);
    Assert.AreEqual("noscript", bodyNoscript.NodeName);
    Assert.AreEqual(NodeType.Element, bodyNoscript.NodeType);

    // <html> / <body> / <noscript> / text
    var bodyNoscriptText = bodyNoscript.ChildNodes[0];
    Assert.AreEqual(NodeType.Text, bodyNoscriptText.NodeType);
    Assert.AreEqual("-->", bodyNoscriptText.TextContent);
}
/// <summary>
/// Parses a doctype followed by a noscript that contains an iframe start tag and trailing
/// text "X", with the scripting option set to true, and verifies the resulting tree: the
/// noscript lands in head with text "&lt;iframe&gt;" and body holds only the text "X".
/// </summary>
public void ScriptNoScriptAfterDoctypeWithIFrameContentAndTextAfter()
{
    var document = new HTMLDocument();
    var markup = "<!doctype html><noscript><iframe></noscript>X";
    var htmlParser = new HtmlParser(document, markup);
    document.Options = new DocumentOptions(scripting: true);
    htmlParser.Parse();

    // Document type node
    var doctype = document.ChildNodes[0] as DocumentType;
    Assert.IsNotNull(doctype);
    Assert.AreEqual(NodeType.DocumentType, doctype.NodeType);
    Assert.AreEqual("html", doctype.Name);

    // <html>
    var html = document.ChildNodes[1];
    Assert.AreEqual(2, html.ChildNodes.Length);
    Assert.AreEqual(0, html.Attributes.Length);
    Assert.AreEqual("html", html.NodeName);
    Assert.AreEqual(NodeType.Element, html.NodeType);

    // <html> / <head>
    var head = html.ChildNodes[0];
    Assert.AreEqual(1, head.ChildNodes.Length);
    Assert.AreEqual(0, head.Attributes.Length);
    Assert.AreEqual("head", head.NodeName);
    Assert.AreEqual(NodeType.Element, head.NodeType);

    // <html> / <head> / <noscript>
    var noscript = head.ChildNodes[0];
    Assert.AreEqual(1, noscript.ChildNodes.Length);
    Assert.AreEqual(0, noscript.Attributes.Length);
    Assert.AreEqual("noscript", noscript.NodeName);
    Assert.AreEqual(NodeType.Element, noscript.NodeType);

    // The iframe start tag is kept as plain text inside the noscript element.
    var noscriptText = noscript.ChildNodes[0];
    Assert.AreEqual(NodeType.Text, noscriptText.NodeType);
    Assert.AreEqual("<iframe>", noscriptText.TextContent);

    // <html> / <body>
    var body = html.ChildNodes[1];
    Assert.AreEqual(1, body.ChildNodes.Length);
    Assert.AreEqual(0, body.Attributes.Length);
    Assert.AreEqual("body", body.NodeName);
    Assert.AreEqual(NodeType.Element, body.NodeType);

    // <html> / <body> / text
    var bodyText = body.ChildNodes[0];
    Assert.AreEqual(NodeType.Text, bodyText.NodeType);
    Assert.AreEqual("X", bodyText.TextContent);
}
/// <summary>
/// Parses a paragraph with an id attribute that contains a noscript (wrapping a strong
/// element) and a span, with the scripting option set to true, and verifies the resulting
/// tree: the noscript content is a single text node "&lt;strong&gt;A&lt;/strong&gt;" and the
/// span holds the text "B".
/// </summary>
public void TreeParagraphWithTightAttributesAndNoScriptTagScriptingEnabled()
{
    var document = new HTMLDocument();
    var markup = "<p id=\"status\"><noscript><strong>A</strong></noscript><span>B</span></p>";
    var htmlParser = new HtmlParser(document, markup);
    document.Options = new DocumentOptions(scripting: true);
    htmlParser.Parse();

    // <html>
    var html = document.ChildNodes[0];
    Assert.AreEqual(2, html.ChildNodes.Length);
    Assert.AreEqual(0, html.Attributes.Length);
    Assert.AreEqual("html", html.NodeName);
    Assert.AreEqual(NodeType.Element, html.NodeType);

    // <html> / <head>
    var head = html.ChildNodes[0];
    Assert.AreEqual(0, head.ChildNodes.Length);
    Assert.AreEqual(0, head.Attributes.Length);
    Assert.AreEqual("head", head.NodeName);
    Assert.AreEqual(NodeType.Element, head.NodeType);

    // <html> / <body>
    var body = html.ChildNodes[1];
    Assert.AreEqual(1, body.ChildNodes.Length);
    Assert.AreEqual(0, body.Attributes.Length);
    Assert.AreEqual("body", body.NodeName);
    Assert.AreEqual(NodeType.Element, body.NodeType);

    // <html> / <body> / <p id="status">
    var paragraph = body.ChildNodes[0];
    Assert.AreEqual(2, paragraph.ChildNodes.Length);
    Assert.AreEqual(1, paragraph.Attributes.Length);
    Assert.AreEqual("p", paragraph.NodeName);
    Assert.AreEqual(NodeType.Element, paragraph.NodeType);
    Assert.AreEqual("status", paragraph.Attributes["id"].Value);

    // <html> / <body> / <p> / <noscript>
    var noscript = paragraph.ChildNodes[0];
    Assert.AreEqual(1, noscript.ChildNodes.Length);
    Assert.AreEqual(0, noscript.Attributes.Length);
    Assert.AreEqual("noscript", noscript.NodeName);
    Assert.AreEqual(NodeType.Element, noscript.NodeType);

    // The markup inside the noscript element is kept as plain text.
    var noscriptText = noscript.ChildNodes[0];
    Assert.AreEqual(NodeType.Text, noscriptText.NodeType);
    Assert.AreEqual("<strong>A</strong>", noscriptText.TextContent);

    // <html> / <body> / <p> / <span>
    var span = paragraph.ChildNodes[1];
    Assert.AreEqual(1, span.ChildNodes.Length);
    Assert.AreEqual(0, span.Attributes.Length);
    Assert.AreEqual("span", span.NodeName);
    Assert.AreEqual(NodeType.Element, span.NodeType);

    // <html> / <body> / <p> / <span> / text
    var spanText = span.ChildNodes[0];
    Assert.AreEqual(NodeType.Text, spanText.NodeType);
    Assert.AreEqual("B", spanText.TextContent);
}
/// <summary>
/// Parses a noscript whose content is a comment wrapping another noscript, with the
/// scripting option set to true, and verifies the resulting tree: the head noscript holds
/// the text "&lt;!--&lt;noscript&gt;" and the remaining "--&gt;" ends up as text in body.
/// </summary>
public void TreeNoScriptWithNoScriptCommentInside()
{
    var document = new HTMLDocument();
    var markup = "<noscript><!--<noscript></noscript>--></noscript>";
    var htmlParser = new HtmlParser(document, markup);
    document.Options = new DocumentOptions(scripting: true);
    htmlParser.Parse();

    // <html>
    var html = document.ChildNodes[0];
    Assert.AreEqual(2, html.ChildNodes.Length);
    Assert.AreEqual(0, html.Attributes.Length);
    Assert.AreEqual("html", html.NodeName);
    Assert.AreEqual(NodeType.Element, html.NodeType);

    // <html> / <head>
    var head = html.ChildNodes[0];
    Assert.AreEqual(1, head.ChildNodes.Length);
    Assert.AreEqual(0, head.Attributes.Length);
    Assert.AreEqual("head", head.NodeName);
    Assert.AreEqual(NodeType.Element, head.NodeType);

    // <html> / <head> / <noscript>
    var noscript = head.ChildNodes[0];
    Assert.AreEqual(1, noscript.ChildNodes.Length);
    Assert.AreEqual(0, noscript.Attributes.Length);
    Assert.AreEqual("noscript", noscript.NodeName);
    Assert.AreEqual(NodeType.Element, noscript.NodeType);

    // Everything up to the first closing noscript tag is kept as plain text.
    var noscriptText = noscript.ChildNodes[0];
    Assert.AreEqual(NodeType.Text, noscriptText.NodeType);
    Assert.AreEqual("<!--<noscript>", noscriptText.TextContent);

    // <html> / <body>
    var body = html.ChildNodes[1];
    Assert.AreEqual(1, body.ChildNodes.Length);
    Assert.AreEqual(0, body.Attributes.Length);
    Assert.AreEqual("body", body.NodeName);
    Assert.AreEqual(NodeType.Element, body.NodeType);

    // <html> / <body> / text
    var bodyText = body.ChildNodes[0];
    Assert.AreEqual(NodeType.Text, bodyText.NodeType);
    Assert.AreEqual("-->", bodyText.TextContent);
}
/// <summary>
/// Starts loading the document from the given URL: stores the location, assigns a fresh
/// cookie, removes all current child nodes, resets the ready state and quirks mode, and
/// requests the content. Parsing runs in a continuation once the request task finishes,
/// so this method returns before the document has been parsed.
/// </summary>
/// <param name="url">The URL to load the content from.</param>
/// <returns>This document instance.</returns>
public HTMLDocument Load(String url)
{
    _location.Href = url;
    Cookie = new Cookie();

    // Walk the children from last to first so that removing one never shifts the index
    // of a child that still has to be visited. (The index must count DOWN; counting up
    // from the last index runs past the end of the list after the first removal.)
    for (int i = _children.Length - 1; i >= 0; i--)
    {
        RemoveChild(_children[i]);
    }

    ReadyState = Readiness.Loading;
    QuirksMode = QuirksMode.Off;

    var task = Builder.GetFromUrl(url);

    task.ContinueWith(m =>
    {
        // Only read the result of a request that ran to completion; accessing Result on
        // a faulted or cancelled task would throw inside this continuation.
        if (m.IsCompleted && !m.IsFaulted && !m.IsCanceled)
        {
            var stream = m.Result;
            var source = new SourceManager(stream);
            var parser = new HtmlParser(this, source);
            parser.Parse();
        }
    });

    return this;
}