// Cleans markup holding one http image and one ftp image with
// Whitelist.BasicWithImages. The expected result keeps the http src and the
// alt text, and reduces the ftp image to a bare <img>.
public void BasicWithImagesTest()
{
    const string input = "<div><p><img src='http://example.com/' alt=Image></p><p><img src='ftp://ftp.example.com'></p></div>";
    const string expected = "<p><img src=\"http://example.com/\" alt=\"Image\"></p><p><img></p>";

    string cleaned = Dcsoup.Clean(input, Whitelist.BasicWithImages);

    Assert.AreEqual(expected, TextUtil.StripNewlines(cleaned));
}
// The old jsoup parser wrapped form children such as <textarea> in an implicit
// <form>; the body HTML is now expected to equal the input unchanged.
public void NoImplicitFormForTextAreas()
{
    const string markup = "<textarea>One</textarea>";

    Document parsed = Dcsoup.Parse(markup);

    Assert.AreEqual(markup, parsed.Body.Html);
}
// Serialises one parsed document twice, first with HTML and then with XML
// output syntax, and checks both renderings. Per the expected strings, HTML
// syntax writes boolean attributes bare (async, checked) and leaves <img>
// unclosed, while XML syntax writes async="" / checked="checked" and
// self-closes <img />.
public void TestHtmlAndXmlSyntax()
{
    // Markup-significant characters are spelled as character references
    // (&amp; &lt; &gt; &quot;) so the literal is valid C# and the parser is
    // handed escaped input.
    string h = "<!DOCTYPE html><body><img async checked='checked' src='&amp;&lt;&gt;\"'>&lt;&gt;&amp;&quot;<foo />bar";
    Document doc = Dcsoup.Parse(h);

    // NOTE(review): children of <body> are written at two-space indent, one
    // level deeper than <body> itself - confirm against the pretty-printer.
    // In the expected output '&' and '"' are escaped inside the attribute
    // value, while text content escapes '&', '<' and '>' but keeps '"'.
    doc.OutputSettings.Syntax = DocumentSyntax.Html;
    Assert.AreEqual(
        "<!DOCTYPE html>\n" +
        "<html>\n" +
        " <head></head>\n" +
        " <body>\n" +
        "  <img async checked src=\"&amp;<>&quot;\">&lt;&gt;&amp;\"\n" +
        "  <foo />bar\n" +
        " </body>\n" +
        "</html>",
        doc.Html);

    doc.OutputSettings.Syntax = DocumentSyntax.Xml;
    Assert.AreEqual(
        "<!DOCTYPE html>\n" +
        "<html>\n" +
        " <head></head>\n" +
        " <body>\n" +
        "  <img async=\"\" checked=\"checked\" src=\"&amp;<>&quot;\" />&lt;&gt;&amp;\"\n" +
        "  <foo />bar\n" +
        " </body>\n" +
        "</html>",
        doc.Html);
}
// HTML5 treats <a> as either phrasing or block content, so the <div> nested in
// the anchor is expected to stay where it was written.
public void TestAFlowContents()
{
    const string markup = "<a>Hello <div>there</div> <span>now</span></a>";

    Document parsed = Dcsoup.Parse(markup);

    Assert.AreEqual(markup, TextUtil.StripNewlines(parsed.Body.Html));
}
// Comments that precede any element are expected to be emitted ahead of the
// generated <html> element rather than inside it.
public void CommentBeforeHtml()
{
    const string input = "<!-- comment --><!-- comment 2 --><p>One</p>";
    const string expected = "<!-- comment --><!-- comment 2 --><html><head></head><body><p>One</p></body></html>";

    Document parsed = Dcsoup.Parse(input);

    Assert.AreEqual(expected, TextUtil.StripNewlines(parsed.Html));
}
// A <body> tag that appears inside <font> is expected to be normalised: the
// body keeps its class attribute and the <font> element ends up inside it.
public void NormalisedBodyAfterContent()
{
    const string input = "<font face=Arial><body class=name><div>One</div></body></font>";
    const string expected = "<html><head></head><body class=\"name\"><font face=\"Arial\"><div>One</div></font></body></html>";

    Document parsed = Dcsoup.Parse(input);

    Assert.AreEqual(expected, TextUtil.StripNewlines(parsed.Html));
}
// As with h1 tags, the spec makes SPAN phrasing-only, but browsers and
// publishers treat it as a block tag, so the nested <div> must stay in place.
public void TestSpanContents()
{
    const string markup = "<span>Hello <div>there</div> <span>now</span></span>";

    Document parsed = Dcsoup.Parse(markup);

    Assert.AreEqual(markup, TextUtil.StripNewlines(parsed.Body.Html));
}
// Returns the response content parsed as an HTML document. The document is
// parsed on first use and cached in htmlDoc; later calls return the cached
// instance. The content is encoded as UTF-8 and parsed with the base URI
// derived from Endpoint.
public Document GetAsDoc()
{
    if (htmlDoc is null)
    {
        string baseUri = HttpUtil.GetBaseUrl(Endpoint);
        byte[] buffer = Encoding.UTF8.GetBytes(GetResponseContent());

        // Construct the stream over the bytes so it starts at position 0.
        // Writing into an empty MemoryStream leaves the position at the end,
        // and a reader that consumes from the current position then sees no data.
        using (MemoryStream memStream = new MemoryStream(buffer))
        {
            htmlDoc = Dcsoup.Parse(memStream, "UTF-8", baseUri, Parser.HtmlParser);
        }
    }

    return htmlDoc;
}
// A relative href resolved against a file://localhost/ base URI is expected to
// yield an absolute file://localhost/ URL.
public void HandleAbsOnLocalhostFileUris()
{
    const string markup = "<a href='password'>One/a><a href='/var/log/messages'>Two</a>";
    const string baseUri = "file://localhost/etc/";

    Document parsed = Dcsoup.Parse(markup, baseUri);
    Element firstAnchor = parsed.Select("a").First;

    Assert.AreEqual("file://localhost/etc/password", firstAnchor.AbsUrl("href"));
}
// Script elements are expected to be removed entirely by the relaxed
// whitelist, leaving an empty string.
public void TestDropScript()
{
    const string input = "<SCRIPT SRC=//ha.ckers.org/.j><SCRIPT>alert(/XSS/.source)</SCRIPT>";
    const string expected = "";

    string cleaned = Dcsoup.Clean(input, Whitelist.Relaxed);

    Assert.AreEqual(expected, cleaned);
}
// An <img> whose src is a javascript: URL is expected to keep the tag but lose
// the src attribute under the relaxed whitelist.
public void TestDropImageScript()
{
    const string input = "<IMG SRC=\"javascript:alert('XSS')\">";
    const string expected = "<img>";

    string cleaned = Dcsoup.Clean(input, Whitelist.Relaxed);

    Assert.AreEqual(expected, cleaned);
}
// A leading <?import ...> processing instruction is expected to be dropped,
// leaving only the paragraph.
public void TestDropXmlProc()
{
    const string input = "<?import namespace=\"xss\"><p>Hello</p>";
    const string expected = "<p>Hello</p>";

    string cleaned = Dcsoup.Clean(input, Whitelist.Relaxed);

    Assert.AreEqual(expected, cleaned);
}
// HTML comments are expected to be stripped by the relaxed whitelist while the
// surrounding paragraph text is kept.
public void TestDropComments()
{
    const string input = "<p>Hello<!-- no --></p>";
    const string expected = "<p>Hello</p>";

    string cleaned = Dcsoup.Clean(input, Whitelist.Relaxed);

    Assert.AreEqual(expected, cleaned);
}
// The relaxed whitelist keeps headings and tables; the expected output also
// shows the added <tbody> and the closed first <td>.
public void TestRelaxed()
{
    const string input = "<h1>Head</h1><table><tr><td>One<td>Two</td></tr></table>";
    const string expected = "<h1>Head</h1><table><tbody><tr><td>One</td><td>Two</td></tr></tbody></table>";

    string cleaned = Dcsoup.Clean(input, Whitelist.Relaxed);

    Assert.AreEqual(expected, TextUtil.StripNewlines(cleaned));
}
// A "<" that does not start a real tag must be kept as text.
// Expected parse: <div {#text=Hello < There <&>}>
public void HandlesInvalidStartTags()
{
    const string markup = "<div>Hello < There <&></div>";

    Document parsed = Dcsoup.Parse(markup);
    Element firstDiv = parsed.Select("div").First;

    Assert.AreEqual("Hello < There <&>", firstDiv.Text);
}
// Parses a single character at code point 135361 (beyond U+FFFF) and expects
// the first text node of the body to serialise back to the same string.
public void TestWithSupplementaryCharacter()
{
    string supplementary = char.ConvertFromUtf32(135361);

    Document parsed = Dcsoup.Parse(supplementary);
    TextNode firstText = parsed.Body.TextNodes[0];

    Assert.AreEqual(supplementary, firstText.OuterHtml.Trim());
}
// Input with <html><body> but no <head> is expected to gain an empty <head>
// and the missing closing tags.
public void NormalisesHeadlessBody()
{
    const string input = "<html><body><span class=\"foo\">bar</span>";
    const string expected = "<html><head></head><body><span class=\"foo\">bar</span></body></html>";

    Document parsed = Dcsoup.Parse(input);

    Assert.AreEqual(expected, TextUtil.StripNewlines(parsed.Html));
}
// An XML declaration in HTML input is expected to come out as a comment
// (<!--?xml ... ?-->) placed before the <html> element.
public void HandlesXmlDeclarationAsBogusComment()
{
    const string input = "<?xml encoding='UTF-8' ?><body>One</body>";
    const string expected = "<!--?xml encoding='UTF-8' ?--> <html> <head></head> <body> One </body> </html>";

    Document parsed = Dcsoup.Parse(input);

    Assert.AreEqual(expected, StringUtil.NormaliseWhitespace(parsed.OuterHtml));
}
// jsoup used to disallow hgroup inside h{n}, but that restriction is not in
// the spec and browsers accept it, so the hgroups are expected to stay nested
// in the open <h2>.
public void TestHgroup()
{
    const string input = "<h1>Hello <h2>There <hgroup><h1>Another<h2>headline</hgroup> <hgroup><h1>More</h1><p>stuff</p></hgroup>";
    const string expected = "<h1>Hello </h1><h2>There <hgroup><h1>Another</h1><h2>headline</h2></hgroup> <hgroup><h1>More</h1><p>stuff</p></hgroup></h2>";

    Document parsed = Dcsoup.Parse(input);

    Assert.AreEqual(expected, TextUtil.StripNewlines(parsed.Body.Html));
}
// Tags written inside a <textarea> are expected to be reproduced unchanged in
// the body HTML.
public void HandlesTagsInTextarea()
{
    const string markup = "<textarea><p>Jsoup</p></textarea>";

    Document parsed = Dcsoup.Parse(markup);

    Assert.AreEqual(markup, parsed.Body.Html);
}
// jsoup used to allow an <img> inside a head <noscript>, but that is against
// the spec when parsing with noscript; the image is expected to move to the
// start of <body>.
public void TestNoImagesInNoScriptInHead()
{
    const string input = "<html><head><noscript><img src='foo'></noscript></head><body><p>Hello</p></body></html>";
    const string expected = "<html><head><noscript></noscript></head><body><img src=\"foo\"><p>Hello</p></body></html>";

    Document parsed = Dcsoup.Parse(input);

    Assert.AreEqual(expected, TextUtil.StripNewlines(parsed.Html));
}
// Text that follows a <script> element is expected to remain in the body
// after the script, with its content intact.
public void HandlesTextAfterData()
{
    const string input = "<html><body>pre <script>inner</script> aft</body></html>";
    const string expected = "<html><head></head><body>pre <script>inner</script> aft</body></html>";

    Document parsed = Dcsoup.Parse(input);

    Assert.AreEqual(expected, TextUtil.StripNewlines(parsed.Html));
}
// HTML5 does not define <font>, and it is often used as flow content, so the
// nested <div> is expected to stay inside it.
public void TestFontFlowContents()
{
    const string markup = "<font>Hello <div>there</div> <span>now</span></font>";

    Document parsed = Dcsoup.Parse(markup);

    Assert.AreEqual(markup, TextUtil.StripNewlines(parsed.Body.Html));
}
// Parses the expenses table example from
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#examples-0
// and compares the body HTML (newlines stripped) with the expected markup.
public void HandlesWhatWgExpensesTableExample()
{
    const string input = "<table> <colgroup> <col> </colgroup><colgroup> <col> <col> <col> </colgroup><thead> <tr> <th> </th><th>2008 </th><th>2007 </th><th>2006 </th></tr></thead><tbody> <tr> <th scope=\"rowgroup\"> Research and development </th><td> $ 1,109 </td><td> $ 782 </td><td> $ 712 </td></tr><tr> <th scope=\"row\"> Percentage of net sales </th><td> 3.4% </td><td> 3.3% </td><td> 3.7% </td></tr></tbody><tbody> <tr> <th scope=\"rowgroup\"> Selling, general, and administrative </th><td> $ 3,761 </td><td> $ 2,963 </td><td> $ 2,433 </td></tr><tr> <th scope=\"row\"> Percentage of net sales </th><td> 11.6% </td><td> 12.3% </td><td> 12.6% </td></tr></tbody></table>";
    const string expected = "<table> <colgroup> <col> </colgroup><colgroup> <col> <col> <col> </colgroup><thead> <tr> <th> </th><th>2008 </th><th>2007 </th><th>2006 </th></tr></thead><tbody> <tr> <th scope=\"rowgroup\"> Research and development </th><td> $ 1,109 </td><td> $ 782 </td><td> $ 712 </td></tr><tr> <th scope=\"row\"> Percentage of net sales </th><td> 3.4% </td><td> 3.3% </td><td> 3.7% </td></tr></tbody><tbody> <tr> <th scope=\"rowgroup\"> Selling, general, and administrative </th><td> $ 3,761 </td><td> $ 2,963 </td><td> $ 2,433 </td></tr><tr> <th scope=\"row\"> Percentage of net sales </th><td> 11.6% </td><td> 12.3% </td><td> 12.6% </td></tr></tbody></table>";

    Document parsed = Dcsoup.Parse(input);

    Assert.AreEqual(expected, TextUtil.StripNewlines(parsed.Body.Html));
}
// A self-closed <td ... /> is expected to be emitted as an empty cell with an
// explicit closing tag.
public void EmptyTdTag()
{
    const string markup = "<table><tr><td>One</td><td id='2' /></tr></table>";

    Document parsed = Dcsoup.Parse(markup);
    Element firstRow = parsed.Select("tr").First;

    Assert.AreEqual("<td>One</td>\n<td id=\"2\"></td>", firstRow.Html);
}
// Tables opened while another table's cell is still open are expected to nest
// inside that cell, each gaining its own <tbody> and <tr>.
public void NoTableDirectInTable()
{
    const string input = "<table> <td>One <td><table><td>Two</table> <table><td>Three";
    const string expected = "<table> <tbody><tr><td>One </td><td><table><tbody><tr><td>Two</td></tr></tbody></table> <table><tbody><tr><td>Three</td></tr></tbody></table></td></tr></tbody></table>";

    Document parsed = Dcsoup.Parse(input);

    Assert.AreEqual(expected, TextUtil.StripNewlines(parsed.Body.Html));
}
// A comment placed inside a table row is expected to stay inside that row
// when the markup is parsed as a body fragment.
public void HandlesCommentsInTable()
{
    const string fragment = "<table><tr><td>text</td><!-- Comment --></tr></table>";
    const string expected = "<html><head></head><body><table><tbody><tr><td>text</td><!-- Comment --></tr></tbody></table></body></html>";

    Document parsed = Dcsoup.ParseBodyFragment(fragment);

    Assert.AreEqual(expected, TextUtil.StripNewlines(parsed.OuterHtml));
}
// The inner table carries a doubled </tr></tr>. The extra end tag must be
// ignored, otherwise it would close the table.
public void IgnoresDupeEndTrTag()
{
    const string input = "<table><tr><td>One</td><td><table><tr><td>Two</td></tr></tr></table></td><td>Three</td></tr></table>";
    const string expected = "<table><tbody><tr><td>One</td><td><table><tbody><tr><td>Two</td></tr></tbody></table></td><td>Three</td></tr></tbody></table>";

    Document parsed = Dcsoup.Parse(input);

    Assert.AreEqual(expected, TextUtil.StripNewlines(parsed.Body.Html));
}
// Parses text made only of character references and checks how it is
// re-serialised once the escape mode is switched to Xhtml: <, > and & are
// expected back as references, while the double quote, the apostrophe and the
// multiplication sign are expected as literal characters.
public void TestXhtmlReferences()
{
    // The input is written as character references so the literal is valid C#
    // (no bare double quote) and the parser is given escaped text to decode.
    Document doc = Dcsoup.Parse("&lt; &gt; &amp; &quot; &apos; &times;");
    doc.OutputSettings.EscapeMode = DocumentEscapeMode.Xhtml;

    Assert.AreEqual("&lt; &gt; &amp; \" ' ×", doc.Body.Html);
}
// Once "script" is added to the relaxed whitelist, markup containing a
// <script> element is expected to validate.
public void TestScriptTagInWhiteList()
{
    const string markup = "Hello<script>alert('Doh')</script>World !";

    Whitelist relaxedWithScript = Whitelist.Relaxed;
    relaxedWithScript.AddTags("script");

    bool isValid = Dcsoup.IsValid(markup, relaxedWithScript);

    Assert.IsTrue(isValid);
}