/// <summary>
/// Checks how nested content inside heading tags is serialised after parsing.
/// </summary>
public void testHeaderContents()
{
    // Browsers let h* tags (h1 .. h9) hold any internal content other than another h*.
    // No spec says so (they define headings as phrasing content only), so: reality over theory.
    const string markup = "<h1>Hello <div>There</div> now</h1> <h2>More <h3>Content</h3></h2>";
    const string expected = "<h1>Hello <div>There</div> now</h1> <h2>More </h2><h3>Content</h3>";

    Document parsed = NSoupClient.Parse(markup);
    string bodyHtml = TextUtil.StripNewLines(parsed.Body.Html());

    Assert.AreEqual(expected, bodyHtml);
}
/// <summary>
/// Checks the body HTML produced for a bold tag that is closed inside a paragraph it opened around.
/// </summary>
public void handlesMisnestedTagsBP()
{
    // whatwg case: <b><p></b></p>
    const string misnested = "<b>1<p>2</b>3</p>";
    const string expected = "<b>1</b>\n<p><b>2</b>3</p>";

    Document parsed = NSoupClient.Parse(misnested);
    string bodyHtml = parsed.Body.Html();

    Assert.AreEqual(expected, bodyHtml);
}
/// <summary>
/// Checks that an unquoted meta tag exposes its charset as a separate attribute.
/// </summary>
public void findsCharsetInMalformedMeta()
{
    // This markup is the example cited as the reason for html5's <meta charset> element.
    const string markup = "<meta http-equiv=Content-Type content=text/html; charset=gb2312>";

    Document parsed = NSoupClient.Parse(markup);
    string charset = parsed.Select("meta").Attr("charset");

    Assert.AreEqual("gb2312", charset);
}
/// <summary>
/// Checks the full document HTML produced when text is scattered before, between and after
/// the html, head and body tags.
/// </summary>
public void normalisesDocument()
{
    const string scattered = "<!doctype html>One<html>Two<head>Three<link></head>Four<body>Five </body>Six </html>Seven ";
    const string expected = "<!DOCTYPE html><html><head></head><body>OneTwoThree<link />FourFive Six Seven </body></html>";

    Document parsed = NSoupClient.Parse(scattered);
    string documentHtml = TextUtil.StripNewLines(parsed.Html());

    Assert.AreEqual(expected, documentHtml);
}
/// <summary>
/// Checks that a table following a frameset does not appear in the serialised document.
/// </summary>
public void ignoresContentAfterFrameset()
{
    const string markup = "<html><head><title>One</title></head><frameset><frame /><frame /></frameset><table></table></html>";
    const string expected = "<html><head><title>One</title></head><frameset><frame /><frame /></frameset></html>";

    Document parsed = NSoupClient.Parse(markup);
    string documentHtml = TextUtil.StripNewLines(parsed.Html());

    // Expect no body, no table - and no crash.
    Assert.AreEqual(expected, documentHtml);
}
/// <summary>
/// Checks that unclosed formatting tags are re-opened, with their attributes, in the next paragraph.
/// </summary>
public void reconstructFormattingElements()
{
    // Covers attributes and more than one <b> in the open formatting stack.
    const string markup = "<p><b class=one>One <i>Two <b>Three</p><p>Hello</p>";
    const string expected =
        "<p><b class=\"one\">One <i>Two <b>Three</b></i></b></p>\n<p><b class=\"one\"><i><b>Hello</b></i></b></p>";

    Document parsed = NSoupClient.Parse(markup);
    string bodyHtml = parsed.Body.Html();

    Assert.AreEqual(expected, bodyHtml);
}
/// <summary>
/// Checks that a tag whose quoted attribute is never terminated yields no document text.
/// </summary>
public void dropsUnterminatedAttribute()
{
    // NSoupClient used to parse this to <p id="foo">, but whatwg and webkit drop it.
    const string unterminated = "<p id=\"foo";

    Document parsed = NSoupClient.Parse(unterminated);
    string text = parsed.Text();

    Assert.AreEqual("", text);
}
/// <summary>
/// Checks that a slash used as an unquoted attribute value is kept as the value.
/// </summary>
public void handlesSolidusAtAttributeEnd()
{
    // [<a href=/>link</a>] must come out as [<a href="/">link</a>],
    // not as [<a href="" /><a>link</a>].
    const string markup = "<a href=/>link</a>";
    const string expected = "<a href=\"/\">link</a>";

    Document parsed = NSoupClient.Parse(markup);
    string bodyHtml = parsed.Body.Html();

    Assert.AreEqual(expected, bodyHtml);
}
/// <summary>
/// Checks that all three paragraphs are found, and that the body has three children,
/// when the input contains repeated closing body and html tags.
/// </summary>
public void handlesMultiClosingBody()
{
    const string markup = "<body><p>Hello</body><p>there</p></body></body></html><p>now";

    Document parsed = NSoupClient.Parse(markup);
    var paragraphCount = parsed.Select("p").Count;
    var bodyChildCount = parsed.Body.Children.Count;

    Assert.AreEqual(3, paragraphCount);
    Assert.AreEqual(3, bodyChildCount);
}
/// <summary>
/// Checks that tags with unknown namespace prefixes round-trip through the parser.
/// </summary>
public void handlesUnknownNamespaceTags()
{
    // Note: in html mode the first foo:bar should not really be allowed to self-close.
    const string markup = "<foo:bar id='1' /><abc:def id=2>Foo<p>Hello</p></abc:def><foo:bar>There</foo:bar>";
    const string expected =
        "<foo:bar id=\"1\" /><abc:def id=\"2\">Foo<p>Hello</p></abc:def><foo:bar>There</foo:bar>";

    Document parsed = NSoupClient.Parse(markup);
    string bodyHtml = TextUtil.StripNewLines(parsed.Body.Html());

    Assert.AreEqual(expected, bodyHtml);
}
/// <summary>
/// Traverses the first div with a TestNodeVisitor and checks the markup it accumulates.
/// </summary>
public void traverse()
{
    const string expected = "<div><p><#text></#text></p></div>";
    Document parsed = NSoupClient.Parse("<div><p>Hello</p></div><div>There</div>");
    StringBuilder visited = new StringBuilder();

    // Only the first div is walked, so the second div must not show up in the output.
    TestNodeVisitor visitor = new TestNodeVisitor(visited);
    parsed.Select("div").First.Traverse(visitor);

    Assert.AreEqual(expected, visited.ToString());
}
/// <summary>
/// Checks that a textarea's content is reported both as its text and as its value.
/// </summary>
public void handlesTextArea()
{
    const string content = "Hello";
    Document parsed = NSoupClient.Parse("<textarea>Hello</textarea>");

    Elements textAreas = parsed.Select("textarea");

    Assert.AreEqual(content, textAreas.Text);
    Assert.AreEqual(content, textAreas.Val());
}
/// <summary>
/// Checks the body HTML produced for bold and italic tags closed in the wrong order.
/// </summary>
public void handlesMisnestedTagsBI()
{
    // whatwg case: <b><i></b></i>
    const string misnested = "<p>1<b>2<i>3</b>4</i>5</p>";
    const string expected = "<p>1<b>2<i>3</i></b><i>4</i>5</p>";

    Document parsed = NSoupClient.Parse(misnested);
    string bodyHtml = parsed.Body.Html();

    // Adoption agency runs on </b>; the formatters are reconstructed on "4".
    Assert.AreEqual(expected, bodyHtml);
}
/// <summary>
/// Checks that a base tag without an href leaves link resolution to the base URI
/// passed to the parser.
/// </summary>
public void handlesBaseWithoutHref()
{
    const string markup = "<head><base target='_blank'></head><body><a href=/foo>Test</a></body>";
    const string baseUri = "http://example.com/";

    Document parsed = NSoupClient.Parse(markup, baseUri);
    Element link = parsed.Select("a").First;

    Assert.AreEqual("/foo", link.Attr("href"));
    Assert.AreEqual("http://example.com/foo", link.Attr("abs:href"));
}
/// <summary>
/// Checks the serialised form of a frameset document, including its head content.
/// </summary>
public void handlesFrames()
{
    const string markup =
        "<html><head><script></script><noscript></noscript></head><frameset><frame src=foo></frame><frame src=foo></frameset></html>";
    const string expected =
        "<html><head><script></script><noscript></noscript></head><frameset><frame src=\"foo\" /><frame src=\"foo\" /></frameset></html>";

    Document parsed = NSoupClient.Parse(markup);
    string documentHtml = TextUtil.StripNewLines(parsed.Html());

    // No body may be auto-vivified for a frameset document.
    Assert.AreEqual(expected, documentHtml);
}
/// <summary>
/// Checks that td tags outside any table are dropped, leaving their content as a text run.
/// </summary>
public void discardsNakedTds()
{
    // NSoupClient used to turn this into an implicit table; browsers make it a text run.
    const string markup = "<td>Hello<td><p>There<p>now";
    const string expected = "Hello<p>There</p><p>now</p>";

    Document parsed = NSoupClient.Parse(markup);
    string bodyHtml = TextUtil.StripNewLines(parsed.Body.Html());

    // Original note kept from the source: <tbody> is introduced if no implicitly creating table,
    // but allows tr to be directly under table.
    // NOTE(review): that note does not obviously relate to this assertion - confirm or remove.
    Assert.AreEqual(expected, bodyHtml);
}
/// <summary>
/// Checks that a cloned document serialises identically to its source, doctype included.
/// </summary>
public void testClonesDeclarations()
{
    const string expected = "<!DOCTYPE html><html><head><title>Doctype test</title></head><body></body></html>";
    Document original = NSoupClient.Parse("<!DOCTYPE html><html><head><title>Doctype test");

    Document copy = (Document)original.Clone();

    // The copy must first match the source, then match the known-good serialisation.
    Assert.AreEqual(original.Html(), copy.Html());
    Assert.AreEqual(expected, TextUtil.StripNewLines(copy.Html()));
}
/// <summary>
/// Checks that a doctype split over several lines is serialised on one line with single spaces.
/// </summary>
public void handlesWhitespaceInoDocType()
{
    const string multiLineDocType =
        "<!DOCTYPE html\r\n"
        + " PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\r\n"
        + " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">";
    const string expected =
        "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">";

    Document parsed = NSoupClient.Parse(multiLineDocType);

    // The doctype is the first child node of the document.
    string docTypeHtml = parsed.ChildNodes[0].OuterHtml();

    Assert.AreEqual(expected, docTypeHtml);
}
/// <summary>
/// Checks that the text of a bare body snippet keeps its original order after parsing.
/// </summary>
public void createsStructureFromBodySnippet()
{
    // The "bar baz" part naturally goes into the body, but "foo" goes into the root,
    // and the normalisation routine needs to move it to the start of the body.
    const string snippet = "foo <b>bar</b> baz";

    Document parsed = NSoupClient.Parse(snippet);
    string text = parsed.Text();

    Assert.AreEqual("foo bar baz", text);
}
/// <summary>
/// Parses a test resource stream with an explicitly supplied XML parser and checks the
/// serialised result.
/// </summary>
public void testSupplyParserToDataStream()
{
    const string expected = "<doc><val>One<val>Two</val>Three</val></doc>";

    using (Stream xmlStream = getFile("Test.htmltests.xml-test.xml"))
    {
        // The charset argument is passed as null, exactly as before.
        Document parsed = NSoupClient.Parse(
            xmlStream,
            null,
            "http://foo.com",
            NSoup.Parse.Parser.XmlParser());

        string documentHtml = TextUtil.StripNewLines(parsed.Html());

        Assert.AreEqual(expected, documentHtml);
    }
}
/// <summary>
/// Checks that base entities are recognised without a trailing semicolon while extended
/// entities are recognised only with one.
/// </summary>
public void relaxedBaseEntityMatchAndStrictExtendedMatch()
{
    // Extended entities need a ';' at the end to match; base entities do not.
    // The literals are written as entity references: the previous text had been HTML-decoded,
    // which left a bare double quote inside the string and removed the entities under test.
    const string html = "&amp &quot &reg &icy &hopf &icy; &hopf;";
    const string expected = "&amp; &quot; &reg; &amp;icy &amp;hopf &icy; &hopf;";

    Document doc = NSoupClient.Parse(html);

    // Changes output only, so the extended entities are visible in the serialised form.
    doc.OutputSettings().EscapeMode = Entities.EscapeMode.Extended;

    Assert.AreEqual(StringUtil.NormaliseWhitespace(expected), doc.Body.Html());
}
/// <summary>
/// Checks that an unknown entity name is not shortened until it matches a known one.
/// </summary>
public void doesNotFindShortestMatchingEntity()
{
    // Previous behaviour was to identify a possible entity, then chomp down the string until
    // a match was found (as defined in html5). In practice that led to spurious matches
    // against the author's intent.
    // "&clubsuite;" is not a known entity and must stay escaped; "&clubsuit;" is and must decode.
    const string html = "One &clubsuite; &clubsuit;";

    // \u2663 is the club-suit character, written as an escape so the expectation does not
    // depend on the encoding of this source file (it was previously stored as mojibake).
    const string expected = "One &amp;clubsuite; \u2663";

    Document doc = NSoupClient.Parse(html);

    Assert.AreEqual(StringUtil.NormaliseWhitespace(expected), doc.Body.Html());
}
/// <summary>
/// Checks entities whose names mix letters and digits (sup1 .. frac34): they must survive in
/// the element HTML and decode to their characters in the element text.
/// </summary>
public void letterDigitEntities()
{
    // The input and the HTML expectation are written as entity references; the previous
    // literals had been decoded to raw characters, so no entity was actually being parsed.
    const string entities = "&sup1;&sup2;&sup3;&frac14;&frac12;&frac34;";
    string html = "<p>" + entities + "</p>";

    Document doc = NSoupClient.Parse(html);
    Element p = doc.Select("p").First;

    Assert.AreEqual(entities, p.Html());

    // Superscript 1, 2, 3 and the fractions 1/4, 1/2, 3/4, as escapes so the expectation
    // does not depend on the encoding of this source file.
    Assert.AreEqual("\u00B9\u00B2\u00B3\u00BC\u00BD\u00BE", p.Text());
}
/// <summary>
/// Checks where a bold tag and stray text placed inside a table end up after parsing.
/// </summary>
public void handlesUnexpectedMarkupInTables()
{
    // whatwg - tests markers in active formatting (if they didn't work, would get in in table).
    // Also tests foster parenting.
    const string markup = "<table><b><tr><td>aaa</td></tr>bbb</table>ccc";
    const string expected =
        "<b></b><b>bbb</b><table><tbody><tr><td>aaa</td></tr></tbody></table><b>ccc</b>";

    Document parsed = NSoupClient.Parse(markup);
    string bodyHtml = TextUtil.StripNewLines(parsed.Body.Html());

    Assert.AreEqual(expected, bodyHtml);
}
/// <summary>
/// Regression test for bug 64: a span placed directly inside a tbody must be closed
/// without adopting the rows, and must not cause a second table to be created.
/// </summary>
public void handlesSpanInTBody()
{
    // test for bug 64
    string h = "<table><tbody><span class='1'><tr><td>One</td></tr><tr><td>Two</td></tr></span></tbody></table>";
    Document doc = NSoupClient.Parse(h);

    // Expected value goes first so that a failure reports expected/actual the right way round.
    Assert.AreEqual(0, doc.Select("span").First.Children.Count); // the span gets closed
    Assert.AreEqual(1, doc.Select("table").Count); // only one table
}
/// <summary>
/// Checks that escaped ampersands are decoded in both attribute values and element text.
/// </summary>
public void handlesEscapedData()
{
    // The input uses "&amp;" so that escaped data is actually exercised; the previous literal
    // had been decoded to a bare "&".
    const string html = "<div title='Surf &amp; Turf'>Reef &amp; Beef</div>";

    Document doc = NSoupClient.Parse(html);
    Element div = doc.GetElementsByTag("div")[0];

    Assert.AreEqual("Surf & Turf", div.Attr("title"));
    Assert.AreEqual("Reef & Beef", div.Text());
}
/// <summary>
/// Wraps every paragraph in a div and checks the resulting body HTML.
/// </summary>
public void wrapDiv()
{
    const string markup = "<p><b>This</b> is <b>jsoup</b>.</p> <p>How do you like it?</p>";
    const string expected =
        "<div><p><b>This</b> is <b>jsoup</b>.</p></div> <div><p>How do you like it?</p></div>";
    Document parsed = NSoupClient.Parse(markup);

    parsed.Select("p").Wrap("<div></div>");

    string bodyHtml = TextUtil.StripNewLines(parsed.Body.Html());
    Assert.AreEqual(expected, bodyHtml);
}
/// <summary>
/// Downloads the page for the item at <paramref name="index"/>, parses it, and returns the
/// "_href" attribute of the first element matching "a#fn_pc_download".
/// </summary>
/// <param name="index">Position in <c>Items</c> of the entry whose page is fetched.</param>
/// <returns>The value of the matched anchor's "_href" attribute.</returns>
public String Down(int index)
{
    // "utf-8" is presumably the page encoding expected by the helper - TODO confirm.
    var pageUrl = this.Items[index].Url;
    String pageHtml = WebHelper.GetHtmlCodeByWebClientWithGzip(pageUrl, "utf-8");

    Document page = NSoupClient.Parse(pageHtml);
    Element downloadLink = page.Select("a#fn_pc_download").First;

    return downloadLink.Attr("_href");
}
/// <summary>
/// Checks attribute parsing when CR/LF pairs appear between, around and inside attributes.
/// </summary>
public void handlesNewLinesAndReturns()
{
    const string markup = "<a\r\nfoo='bar\r\nqux'\r\nbar\r\n=\r\ntwo>One</a>";

    Document parsed = NSoupClient.Parse(markup);
    Element anchor = parsed.Select("a").First;

    Assert.AreEqual(2, anchor.Attributes.Count);

    // Newlines inside quoted attribute values are currently preserved. todo confirm if should.
    Assert.AreEqual("bar\r\nqux", anchor.Attr("foo"));
    Assert.AreEqual("two", anchor.Attr("bar"));
}
/// <summary>
/// Checks that an unclosed textarea ends at the next paragraph tag instead of consuming
/// the rest of the input.
/// </summary>
public void parsesUnterminatedTextarea()
{
    // Don't parse right to the end, but break on <p>.
    const string markup = "<body><p><textarea>one<p>two";

    Document parsed = NSoupClient.Parse(markup);
    Element textArea = parsed.Select("textarea").First;
    Element secondParagraph = parsed.Select("p")[1];

    Assert.AreEqual("one", textArea.Text());
    Assert.AreEqual("two", secondParagraph.Text());
}