Exemple #1
0
        public void BasicWithImagesTest()
        {
            // Cleaning with the BasicWithImages whitelist is expected to drop the <div> wrapper,
            // keep the http image with its attributes, and strip the ftp:// src from the second image.
            const string dirtyHtml = "<div><p><img src='http://example.com/' alt=Image></p><p><img src='ftp://ftp.example.com'></p></div>";
            const string expected  = "<p><img src=\"http://example.com/\" alt=\"Image\"></p><p><img></p>";

            string sanitized = Dcsoup.Clean(dirtyHtml, Whitelist.BasicWithImages);
            string actual    = TextUtil.StripNewlines(sanitized);

            Assert.AreEqual(expected, actual);
        }
Exemple #2
0
        public void NoImplicitFormForTextAreas()
        {
            // The old jsoup parser wrapped form children such as <textarea> in an implicit <form>;
            // the body is now expected to contain the textarea exactly as written.
            const string markup = "<textarea>One</textarea>";

            Document parsed = Dcsoup.Parse(markup);

            Assert.AreEqual("<textarea>One</textarea>", parsed.Body.Html);
        }
Exemple #3
0
        public void TestHtmlAndXmlSyntax()
        {
            // Expected serialisation in HTML syntax: boolean attributes are minimised and
            // the void <img> element is not self-closed.
            const string expectedHtml = "<!DOCTYPE html>\n" +
                                        "<html>\n" +
                                        " <head></head>\n" +
                                        " <body>\n" +
                                        "  <img async checked src=\"&amp;<>&quot;\">&lt;&gt;&amp;\"\n" +
                                        "  <foo />bar\n" +
                                        " </body>\n" +
                                        "</html>";

            // Expected serialisation in XML syntax: every attribute carries a value and
            // <img> is written as a self-closing tag.
            const string expectedXml = "<!DOCTYPE html>\n" +
                                       "<html>\n" +
                                       " <head></head>\n" +
                                       " <body>\n" +
                                       "  <img async=\"\" checked=\"checked\" src=\"&amp;<>&quot;\" />&lt;&gt;&amp;\"\n" +
                                       "  <foo />bar\n" +
                                       " </body>\n" +
                                       "</html>";

            Document parsed = Dcsoup.Parse("<!DOCTYPE html><body><img async checked='checked' src='&<>\"'>&lt;&gt;&amp;&quot;<foo />bar");

            // Same document, rendered once per output syntax.
            parsed.OutputSettings.Syntax = DocumentSyntax.Html;
            Assert.AreEqual(expectedHtml, parsed.Html);

            parsed.OutputSettings.Syntax = DocumentSyntax.Xml;
            Assert.AreEqual(expectedXml, parsed.Html);
        }
Exemple #4
0
        public void TestAFlowContents()
        {
            // In HTML5 an <a> may hold phrasing or block content, so the nested <div>
            // is expected to stay inside the anchor.
            const string markup = "<a>Hello <div>there</div> <span>now</span></a>";

            Document parsed   = Dcsoup.Parse(markup);
            string   bodyHtml = TextUtil.StripNewlines(parsed.Body.Html);

            Assert.AreEqual("<a>Hello <div>there</div> <span>now</span></a>", bodyHtml);
        }
Exemple #5
0
        public void CommentBeforeHtml()
        {
            // Comments preceding any element are expected to stay ahead of the generated <html> root.
            const string expected = "<!-- comment --><!-- comment 2 --><html><head></head><body><p>One</p></body></html>";

            Document parsed = Dcsoup.Parse("<!-- comment --><!-- comment 2 --><p>One</p>");
            string   actual = TextUtil.StripNewlines(parsed.Html);

            Assert.AreEqual(expected, actual);
        }
Exemple #6
0
        public void NormalisedBodyAfterContent()
        {
            // A <body> tag appearing after content has started: its attributes are expected on the
            // real body, with the earlier <font> element nested inside it.
            const string markup   = "<font face=Arial><body class=name><div>One</div></body></font>";
            const string expected = "<html><head></head><body class=\"name\"><font face=\"Arial\"><div>One</div></font></body></html>";

            Document parsed = Dcsoup.Parse(markup);

            Assert.AreEqual(expected, TextUtil.StripNewlines(parsed.Html));
        }
Exemple #7
0
        public void TestSpanContents()
        {
            // As with h1 tags, the spec says SPAN is phrasing-only, but browsers and publishers
            // treat it as a block tag, so the nested <div> is expected to be left in place.
            const string markup = "<span>Hello <div>there</div> <span>now</span></span>";

            Document parsed   = Dcsoup.Parse(markup);
            string   bodyHtml = TextUtil.StripNewlines(parsed.Body.Html);

            Assert.AreEqual("<span>Hello <div>there</div> <span>now</span></span>", bodyHtml);
        }
Exemple #8
0
        /// <summary>
        /// Returns the response content parsed as an HTML <see cref="Document"/>. The document is
        /// parsed on the first call and the cached <c>htmlDoc</c> instance is returned afterwards.
        /// </summary>
        /// <returns>
        /// The document parsed from <c>GetResponseContent()</c>, using the base URI obtained from
        /// <c>HttpUtil.GetBaseUrl(Endpoint)</c>.
        /// </returns>
        public Document GetAsDoc()
        {
            if (htmlDoc is null)
            {
                string baseUri = HttpUtil.GetBaseUrl(Endpoint);
                byte[] buffer  = Encoding.UTF8.GetBytes(GetResponseContent());

                // Build the stream over the bytes instead of Write()-ing them into an empty stream:
                // Write() leaves Position at the end, so a parser that reads from the current
                // position would see no data. A stream constructed over the array starts at 0.
                using (MemoryStream memStream = new MemoryStream(buffer))
                {
                    htmlDoc = Dcsoup.Parse(memStream, "UTF-8", baseUri, Parser.HtmlParser);
                }
            }

            return htmlDoc;
        }
Exemple #9
0
        public void HandleAbsOnLocalhostFileUris()
        {
            // A relative href is expected to resolve against a file://localhost base URI.
            const string baseUri = "file://localhost/etc/";
            const string markup  = "<a href='password'>One/a><a href='/var/log/messages'>Two</a>";

            Document parsed      = Dcsoup.Parse(markup, baseUri);
            Element  firstAnchor = parsed.Select("a").First;

            Assert.AreEqual("file://localhost/etc/password", firstAnchor.AbsUrl("href"));
        }
Exemple #10
0
        public void TestDropScript()
        {
            // Script elements, including their content, are expected to vanish under the relaxed whitelist.
            const string dirtyHtml = "<SCRIPT SRC=//ha.ckers.org/.j><SCRIPT>alert(/XSS/.source)</SCRIPT>";

            string sanitized = Dcsoup.Clean(dirtyHtml, Whitelist.Relaxed);

            Assert.AreEqual("", sanitized);
        }
Exemple #11
0
        public void TestDropImageScript()
        {
            // The <img> tag is expected to survive, but its javascript: src must be removed.
            const string dirtyHtml = "<IMG SRC=\"javascript:alert('XSS')\">";

            string sanitized = Dcsoup.Clean(dirtyHtml, Whitelist.Relaxed);

            Assert.AreEqual("<img>", sanitized);
        }
Exemple #12
0
        public void TestDropXmlProc()
        {
            // The leading <?import ...> processing instruction is expected to be removed,
            // leaving only the paragraph.
            const string dirtyHtml = "<?import namespace=\"xss\"><p>Hello</p>";

            string sanitized = Dcsoup.Clean(dirtyHtml, Whitelist.Relaxed);

            Assert.AreEqual("<p>Hello</p>", sanitized);
        }
Exemple #13
0
        public void TestDropComments()
        {
            // HTML comments are expected to be stripped from cleaned output.
            const string dirtyHtml = "<p>Hello<!-- no --></p>";

            string sanitized = Dcsoup.Clean(dirtyHtml, Whitelist.Relaxed);

            Assert.AreEqual("<p>Hello</p>", sanitized);
        }
Exemple #14
0
        public void TestRelaxed()
        {
            // Headings and table markup pass the relaxed whitelist; the expected output also
            // shows the implied <tbody> and the closed first <td>.
            const string dirtyHtml = "<h1>Head</h1><table><tr><td>One<td>Two</td></tr></table>";
            const string expected  = "<h1>Head</h1><table><tbody><tr><td>One</td><td>Two</td></tr></tbody></table>";

            string sanitized = Dcsoup.Clean(dirtyHtml, Whitelist.Relaxed);

            Assert.AreEqual(expected, TextUtil.StripNewlines(sanitized));
        }
Exemple #15
0
        public void HandlesInvalidStartTags()
        {
            // Expected parse: <div {#text=Hello < There <&>}> (the stray '<' characters stay as text).
            const string markup = "<div>Hello < There <&amp;></div>";

            Document parsed  = Dcsoup.Parse(markup);
            string   divText = parsed.Select("div").First.Text;

            Assert.AreEqual("Hello < There <&>", divText);
        }
Exemple #16
0
        public void TestWithSupplementaryCharacter()
        {
            // Code point 135361 lies outside the Basic Multilingual Plane, so it is a surrogate pair in UTF-16.
            string supplementary = char.ConvertFromUtf32(135361);

            Document parsed    = Dcsoup.Parse(supplementary);
            TextNode firstText = parsed.Body.TextNodes[0];

            Assert.AreEqual(supplementary, firstText.OuterHtml.Trim());
        }
Exemple #17
0
        public void NormalisesHeadlessBody()
        {
            // Input has no <head> and no closing tags; an empty head and the closers are expected in the output.
            const string markup   = "<html><body><span class=\"foo\">bar</span>";
            const string expected = "<html><head></head><body><span class=\"foo\">bar</span></body></html>";

            Document parsed = Dcsoup.Parse(markup);

            Assert.AreEqual(expected, TextUtil.StripNewlines(parsed.Html));
        }
Exemple #18
0
        public void HandlesXmlDeclarationAsBogusComment()
        {
            // The HTML parser is expected to turn an XML declaration into a comment node.
            const string expected = "<!--?xml encoding='UTF-8' ?--> <html> <head></head> <body> One </body> </html>";

            Document parsed = Dcsoup.Parse("<?xml encoding='UTF-8' ?><body>One</body>");
            string   actual = StringUtil.NormaliseWhitespace(parsed.OuterHtml);

            Assert.AreEqual(expected, actual);
        }
Exemple #19
0
        public void TestHgroup()
        {
            // jsoup used to disallow hgroup inside h{n}, but that is not in the spec and browsers accept it.
            const string markup   = "<h1>Hello <h2>There <hgroup><h1>Another<h2>headline</hgroup> <hgroup><h1>More</h1><p>stuff</p></hgroup>";
            const string expected = "<h1>Hello </h1><h2>There <hgroup><h1>Another</h1><h2>headline</h2></hgroup> <hgroup><h1>More</h1><p>stuff</p></hgroup></h2>";

            Document parsed = Dcsoup.Parse(markup);

            Assert.AreEqual(expected, TextUtil.StripNewlines(parsed.Body.Html));
        }
Exemple #20
0
        public void HandlesTagsInTextarea()
        {
            // Markup inside a <textarea> is plain text, so it is expected to come back escaped.
            const string markup = "<textarea><p>Jsoup</p></textarea>";

            Document parsed = Dcsoup.Parse(markup);

            Assert.AreEqual("<textarea>&lt;p&gt;Jsoup&lt;/p&gt;</textarea>", parsed.Body.Html);
        }
Exemple #21
0
        public void TestNoImagesInNoScriptInHead()
        {
            // jsoup used to allow an <img> inside a head <noscript>, which is against the spec;
            // the image is now expected to be moved into the body.
            const string markup   = "<html><head><noscript><img src='foo'></noscript></head><body><p>Hello</p></body></html>";
            const string expected = "<html><head><noscript></noscript></head><body><img src=\"foo\"><p>Hello</p></body></html>";

            Document parsed = Dcsoup.Parse(markup);

            Assert.AreEqual(expected, TextUtil.StripNewlines(parsed.Html));
        }
Exemple #22
0
        public void HandlesTextAfterData()
        {
            // Text on both sides of a <script> data element is expected to be preserved.
            const string markup   = "<html><body>pre <script>inner</script> aft</body></html>";
            const string expected = "<html><head></head><body>pre <script>inner</script> aft</body></html>";

            Document parsed = Dcsoup.Parse(markup);

            Assert.AreEqual(expected, TextUtil.StripNewlines(parsed.Html));
        }
Exemple #23
0
        public void TestFontFlowContents()
        {
            // HTML5 does not define <font>; it is often used as flow content, so the nested <div>
            // is expected to stay inside it.
            const string markup = "<font>Hello <div>there</div> <span>now</span></font>";

            Document parsed   = Dcsoup.Parse(markup);
            string   bodyHtml = TextUtil.StripNewlines(parsed.Body.Html);

            Assert.AreEqual("<font>Hello <div>there</div> <span>now</span></font>", bodyHtml);
        }
Exemple #24
0
        public void HandlesWhatWgExpensesTableExample()
        {
            // Table taken from the WHATWG spec examples:
            // http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#examples-0
            const string markup   = "<table> <colgroup> <col> </colgroup><colgroup> <col> <col> <col> </colgroup><thead> <tr> <th> </th><th>2008 </th><th>2007 </th><th>2006 </th></tr></thead><tbody> <tr> <th scope=\"rowgroup\"> Research and development </th><td> $ 1,109 </td><td> $ 782 </td><td> $ 712 </td></tr><tr> <th scope=\"row\"> Percentage of net sales </th><td> 3.4% </td><td> 3.3% </td><td> 3.7% </td></tr></tbody><tbody> <tr> <th scope=\"rowgroup\"> Selling, general, and administrative </th><td> $ 3,761 </td><td> $ 2,963 </td><td> $ 2,433 </td></tr><tr> <th scope=\"row\"> Percentage of net sales </th><td> 11.6% </td><td> 12.3% </td><td> 12.6% </td></tr></tbody></table>";
            const string expected = "<table> <colgroup> <col> </colgroup><colgroup> <col> <col> <col> </colgroup><thead> <tr> <th> </th><th>2008 </th><th>2007 </th><th>2006 </th></tr></thead><tbody> <tr> <th scope=\"rowgroup\"> Research and development </th><td> $ 1,109 </td><td> $ 782 </td><td> $ 712 </td></tr><tr> <th scope=\"row\"> Percentage of net sales </th><td> 3.4% </td><td> 3.3% </td><td> 3.7% </td></tr></tbody><tbody> <tr> <th scope=\"rowgroup\"> Selling, general, and administrative </th><td> $ 3,761 </td><td> $ 2,963 </td><td> $ 2,433 </td></tr><tr> <th scope=\"row\"> Percentage of net sales </th><td> 11.6% </td><td> 12.3% </td><td> 12.6% </td></tr></tbody></table>";

            Document parsed = Dcsoup.Parse(markup);

            Assert.AreEqual(expected, TextUtil.StripNewlines(parsed.Body.Html));
        }
Exemple #25
0
        public void EmptyTdTag()
        {
            // A self-closed <td /> is expected to serialise as an empty <td></td> pair.
            const string markup = "<table><tr><td>One</td><td id='2' /></tr></table>";

            Document parsed  = Dcsoup.Parse(markup);
            string   rowHtml = parsed.Select("tr").First.Html;

            Assert.AreEqual("<td>One</td>\n<td id=\"2\"></td>", rowHtml);
        }
Exemple #26
0
        public void NoTableDirectInTable()
        {
            // Nested tables are expected to end up inside a cell, each with its implied <tbody> and <tr>.
            const string markup   = "<table> <td>One <td><table><td>Two</table> <table><td>Three";
            const string expected = "<table> <tbody><tr><td>One </td><td><table><tbody><tr><td>Two</td></tr></tbody></table> <table><tbody><tr><td>Three</td></tr></tbody></table></td></tr></tbody></table>";

            Document parsed = Dcsoup.Parse(markup);

            Assert.AreEqual(expected, TextUtil.StripNewlines(parsed.Body.Html));
        }
Exemple #27
0
        public void HandlesCommentsInTable()
        {
            // A comment inside a table row is expected to stay in that row after fragment parsing.
            const string fragment = "<table><tr><td>text</td><!-- Comment --></tr></table>";
            const string expected = "<html><head></head><body><table><tbody><tr><td>text</td><!-- Comment --></tr></tbody></table></body></html>";

            Document parsed = Dcsoup.ParseBodyFragment(fragment);

            Assert.AreEqual(expected, TextUtil.StripNewlines(parsed.OuterHtml));
        }
Exemple #28
0
        public void IgnoresDupeEndTrTag()
        {
            // The inner table contains two consecutive </tr></tr>; the duplicate must be ignored,
            // otherwise it would close the table.
            const string markup   = "<table><tr><td>One</td><td><table><tr><td>Two</td></tr></tr></table></td><td>Three</td></tr></table>";
            const string expected = "<table><tbody><tr><td>One</td><td><table><tbody><tr><td>Two</td></tr></tbody></table></td><td>Three</td></tr></tbody></table>";

            Document parsed = Dcsoup.Parse(markup);

            Assert.AreEqual(expected, TextUtil.StripNewlines(parsed.Body.Html));
        }
Exemple #29
0
        public void TestXhtmlReferences()
        {
            // In Xhtml escape mode only <, > and & are expected to be written as entities;
            // the quote, apostrophe and multiplication sign come out as literal characters.
            const string markup = "&lt; &gt; &amp; &quot; &apos; &times;";

            Document parsed = Dcsoup.Parse(markup);
            parsed.OutputSettings.EscapeMode = DocumentEscapeMode.Xhtml;

            Assert.AreEqual("&lt; &gt; &amp; \" ' ×", parsed.Body.Html);
        }
Exemple #30
0
        public void TestScriptTagInWhiteList()
        {
            // Once "script" is added to the relaxed whitelist, markup containing a <script>
            // element is expected to validate.
            const string markup = "Hello<script>alert('Doh')</script>World !";

            Whitelist allowed = Whitelist.Relaxed;
            allowed.AddTags("script");

            bool isValid = Dcsoup.IsValid(markup, allowed);

            Assert.IsTrue(isValid);
        }