示例#1
0
        /// <summary>
        /// Checks that attributes of a tag are still parsed when CR/LF pairs appear inside a quoted
        /// value, between attributes, and on either side of the '=' sign.
        /// </summary>
        public virtual void HandlesNewLinesAndReturns()
        {
            // One anchor whose attribute list is broken up by "\r\n" at every possible position.
            String markup = "<a\r\nfoo='bar\r\nqux'\r\nbar\r\n=\r\ntwo>One</a>";
            var parsed = iText.StyledXmlParser.Jsoup.Jsoup.Parse(markup);
            iText.StyledXmlParser.Jsoup.Nodes.Element anchor = parsed.Select("a").First();

            // Only 'foo' and 'bar' are expected to be present.
            var attributeCount = anchor.Attributes().Size();
            NUnit.Framework.Assert.AreEqual(2, attributeCount);

            // The line break inside the quoted value is expected to be kept verbatim.
            var fooValue = anchor.Attr("foo");
            NUnit.Framework.Assert.AreEqual("bar\r\nqux", fooValue);

            // The line breaks around '=' are expected not to leak into the value.
            var barValue = anchor.Attr("bar");
            NUnit.Framework.Assert.AreEqual("two", barValue);
        }
示例#2
0
        /// <summary>
        /// Parses the stored news.com.au home page and checks its title, a headline, and the
        /// resolution of a relative and an already-absolute link via the "abs:" attribute prefix.
        /// </summary>
        public virtual void TestNewsHomepage()
        {
            // Decode explicitly as UTF-8; the third argument is the base URI used for "abs:" lookups.
            FileInfo pageFile = iText.StyledXmlParser.Jsoup.PortTestUtil.GetFile("/htmltests/news-com-au-home.html");
            Document page = iText.StyledXmlParser.Jsoup.Jsoup.Parse(pageFile, "UTF-8", "http://www.news.com.au/");

            var pageTitle = page.Title();
            NUnit.Framework.Assert.AreEqual("News.com.au | News from Australia and around the world online | NewsComAu",
                                            pageTitle);

            var headline = page.Select(".id1225817868581 h4").Text().Trim();
            NUnit.Framework.Assert.AreEqual("Brace yourself for Metro meltdown", headline);

            // Relative href: the raw attribute stays relative, "abs:href" is resolved against the base URI.
            iText.StyledXmlParser.Jsoup.Nodes.Element horoscopes = page.Select("a[href=/entertainment/horoscopes]").First();
            var relativeHref = horoscopes.Attr("href");
            NUnit.Framework.Assert.AreEqual("/entertainment/horoscopes", relativeHref);
            var resolvedHref = horoscopes.Attr("abs:href");
            NUnit.Framework.Assert.AreEqual("http://www.news.com.au/entertainment/horoscopes", resolvedHref);

            // Already-absolute href: raw and "abs:" values are expected to be identical.
            iText.StyledXmlParser.Jsoup.Nodes.Element heraldSun = page.Select("a[href*=naughty-corners-are-a-bad-idea]").First();
            var externalHref = heraldSun.Attr("href");
            NUnit.Framework.Assert.AreEqual("http://www.heraldsun.com.au/news/naughty-corners-are-a-bad-idea-for-kids/story-e6frf7jo-1225817899003",
                                            externalHref);
            NUnit.Framework.Assert.AreEqual(heraldSun.Attr("href"), heraldSun.Attr("abs:href"));
        }
示例#3
0
        /// <summary>
        /// Checks how a valued attribute, a value-less attribute and an attribute with an explicit
        /// empty value are parsed, which of them is represented as a BooleanAttribute, and that
        /// the element serialises back to the original markup.
        /// </summary>
        public virtual void ParsesBooleanAttributes()
        {
            String markup = "<a normal=\"123\" boolean empty=\"\"></a>";
            var parsed = iText.StyledXmlParser.Jsoup.Jsoup.Parse(markup);
            iText.StyledXmlParser.Jsoup.Nodes.Element anchor = parsed.Select("a").First();

            // Value lookups: both the value-less and the explicitly empty attribute read as "".
            var normalValue = anchor.Attr("normal");
            NUnit.Framework.Assert.AreEqual("123", normalValue);
            var booleanValue = anchor.Attr("boolean");
            NUnit.Framework.Assert.AreEqual("", booleanValue);
            var emptyValue = anchor.Attr("empty");
            NUnit.Framework.Assert.AreEqual("", emptyValue);

            IList <iText.StyledXmlParser.Jsoup.Nodes.Attribute> parsedAttributes = anchor.Attributes().AsList();
            NUnit.Framework.Assert.AreEqual(3, parsedAttributes.Count, "There should be 3 attribute present");

            // Assuming the list order always follows the parsed html
            bool normalIsBoolean = parsedAttributes[0] is BooleanAttribute;
            NUnit.Framework.Assert.IsFalse(normalIsBoolean, "'normal' attribute should not be boolean");
            bool booleanIsBoolean = parsedAttributes[1] is BooleanAttribute;
            NUnit.Framework.Assert.IsTrue(booleanIsBoolean, "'boolean' attribute should be boolean");
            bool emptyIsBoolean = parsedAttributes[2] is BooleanAttribute;
            NUnit.Framework.Assert.IsFalse(emptyIsBoolean, "'empty' attribute should not be boolean");

            // Round trip: the serialised element must equal the input markup.
            var serialised = anchor.OuterHtml();
            NUnit.Framework.Assert.AreEqual(markup, serialised);
        }
示例#4
0
        /// <summary>
        /// Parses the stored Yahoo! JAPAN page as UTF-8 and checks the title, the absolute URL
        /// computed for a relative link, and the Japanese link text.
        /// </summary>
        public virtual void TestYahooJp()
        {
            // http charset is utf-8.
            FileInfo pageFile = iText.StyledXmlParser.Jsoup.PortTestUtil.GetFile("/htmltests/yahoo-jp.html");
            Document page = iText.StyledXmlParser.Jsoup.Jsoup.Parse(pageFile, "UTF-8", "http://www.yahoo.co.jp/index.html");

            var pageTitle = page.Title();
            NUnit.Framework.Assert.AreEqual("Yahoo! JAPAN", pageTitle);

            iText.StyledXmlParser.Jsoup.Nodes.Element rankingLink = page.Select("a[href=t/2322m2]").First();

            // session put into <base>
            var resolvedHref = rankingLink.Attr("abs:href");
            NUnit.Framework.Assert.AreEqual("http://www.yahoo.co.jp/_ylh=X3oDMTB0NWxnaGxsBF9TAzIwNzcyOTYyNjUEdGlkAzEyBHRtcGwDZ2Ex/t/2322m2",
                                            resolvedHref);

            var linkText = rankingLink.Text();
            NUnit.Framework.Assert.AreEqual("全国、人気の駅ランキング", linkText);
        }
示例#5
0
        /// <summary>
        /// Parses the stored Baidu home page without naming a charset, so the charset has to be
        /// picked up from the page's own meta tag, then checks attribute values, selectors using
        /// Chinese text, the detected output charset, and escaping when the output charset is
        /// switched to ASCII.
        /// </summary>
        public virtual void TestBaidu()
        {
            // tests <meta http-equiv="Content-Type" content="text/html;charset=gb2312">
            FileInfo @in = iText.StyledXmlParser.Jsoup.PortTestUtil.GetFile("/htmltests/baidu-cn-home.html");
            Document doc = iText.StyledXmlParser.Jsoup.Jsoup.Parse(@in, null, "http://www.baidu.com");

            // http charset is gb2312, but NOT specifying it, to test http-equiv parse
            iText.StyledXmlParser.Jsoup.Nodes.Element submit = doc.Select("#su").First();
            NUnit.Framework.Assert.AreEqual("百度一下", submit.Attr("value"));
            // test from attribute match
            submit = doc.Select("input[value=百度一下]").First();
            NUnit.Framework.Assert.AreEqual("su", submit.Id());
            iText.StyledXmlParser.Jsoup.Nodes.Element newsLink = doc.Select("a:contains(新)").First();
            NUnit.Framework.Assert.AreEqual(newsHref, newsLink.AbsUrl("href"));
            // check auto-detect from meta
            NUnit.Framework.Assert.AreEqual("GB2312", doc.OutputSettings().Charset().DisplayName());
            // The separator in the title is the fullwidth comma U+FF0C, not an ASCII ','. The ASCII
            // expectation below escapes it as &#xff0c;, which an ASCII comma would never produce.
            // It is written as an escape so editors or tools cannot silently normalise it.
            NUnit.Framework.Assert.AreEqual("<title>百度一下\uFF0C你就知道      </title>", doc.Select("title").OuterHtml());
            // With an ASCII output charset every non-ASCII character must be emitted as a numeric entity.
            doc.OutputSettings().Charset("ascii");
            NUnit.Framework.Assert.AreEqual("<title>&#x767e;&#x5ea6;&#x4e00;&#x4e0b;&#xff0c;&#x4f60;&#x5c31;&#x77e5;&#x9053;      </title>"
                                            , doc.Select("title").OuterHtml());
        }
示例#6
0
 /// <summary>
 /// Tests whether <paramref name="element"/> carries the attribute named by <c>key</c> and
 /// <c>PortUtil.HasMatch</c> reports a match of <c>pattern</c> against that attribute's value.
 /// </summary>
 /// <param name="root">Not used by this evaluator.</param>
 /// <param name="element">Element whose attribute is inspected.</param>
 /// <returns><c>true</c> if the attribute exists and its value matches; otherwise <c>false</c>.</returns>
 public override bool Matches(iText.StyledXmlParser.Jsoup.Nodes.Element root, iText.StyledXmlParser.Jsoup.Nodes.Element
                              element)
 {
     // Without the attribute there is nothing to match against.
     if (!element.HasAttr(key))
     {
         return false;
     }

     var attributeValue = element.Attr(key);
     return PortUtil.HasMatch(pattern, attributeValue);
 }
示例#7
0
 /// <summary>
 /// Tests whether <paramref name="element"/> carries the attribute named by <c>key</c> and the
 /// invariant-lower-cased attribute value contains <c>value</c>.
 /// </summary>
 /// <param name="root">Not used by this evaluator.</param>
 /// <param name="element">Element whose attribute is inspected.</param>
 /// <returns><c>true</c> if the attribute exists and contains <c>value</c>; otherwise <c>false</c>.</returns>
 public override bool Matches(iText.StyledXmlParser.Jsoup.Nodes.Element root, iText.StyledXmlParser.Jsoup.Nodes.Element
                              element)
 {
     // Without the attribute there is nothing to search in.
     if (!element.HasAttr(key))
     {
         return false;
     }

     // Only the attribute value is lower-cased here; 'value' is used as stored.
     var loweredAttribute = element.Attr(key).ToLowerInvariant();
     return loweredAttribute.Contains(value);
 }
示例#8
0
 /// <summary>
 /// Tests whether the value of the attribute named by <c>key</c> on <paramref name="element"/>
 /// differs from <c>value</c> when compared with <c>EqualsIgnoreCase</c>.
 /// </summary>
 /// <param name="root">Not used by this evaluator.</param>
 /// <param name="element">Element whose attribute is inspected.</param>
 /// <returns><c>true</c> if the values are not equal ignoring case; otherwise <c>false</c>.</returns>
 public override bool Matches(iText.StyledXmlParser.Jsoup.Nodes.Element root, iText.StyledXmlParser.Jsoup.Nodes.Element
                              element)
 {
     // No HasAttr check here: whatever Attr(key) returns is what gets compared.
     var attributeValue = element.Attr(key);
     bool sameIgnoringCase = value.EqualsIgnoreCase(attributeValue);
     return !sameIgnoringCase;
 }
示例#9
0
        /// <summary>
        /// Decodes a buffer of raw bytes to text and parses it into a document.
        /// </summary>
        /// <remarks>
        /// The bytes are read into a buffer first and decoded afterwards so the charset can be
        /// switched midstream when the document itself declares one.
        /// <para>
        /// Resolution order: the charset returned by <c>DetectCharsetFromBom</c> replaces the
        /// caller's <paramref name="charsetName"/>. If that result is <c>null</c>, the data is
        /// decoded with <c>defaultCharset</c>, parsed once, and inspected for a
        /// <c>meta[http-equiv=content-type]</c> / <c>meta[charset]</c> element or a leading
        /// <c>&lt;?xml encoding=...?&gt;</c> declaration. When a validated charset different from
        /// <c>defaultCharset</c> is found, the buffer is rewound, decoded again and re-parsed.
        /// </para>
        /// todo - this is getting gnarly. needs a rewrite.
        /// </remarks>
        /// <param name="byteData">Raw document bytes; decoding reads the buffer and it may be rewound.</param>
        /// <param name="charsetName">Charset to decode with, or <c>null</c> to detect it from the content.</param>
        /// <param name="baseUri">Base URI handed to the parser.</param>
        /// <param name="parser">Parser used for both the detection pass and the final parse.</param>
        /// <returns>The parsed document.</returns>
        internal static Document ParseByteData(ByteBuffer byteData, String charsetName, String baseUri, iText.StyledXmlParser.Jsoup.Parser.Parser
                                               parser)
        {
            String docData;
            Document doc = null;

            // look for BOM - overrides any other header or input
            charsetName = DetectCharsetFromBom(byteData, charsetName);
            if (charsetName == null)
            {
                // determine from meta. safe first parse as UTF-8
                // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
                docData = EncodingUtil.GetEncoding(defaultCharset).Decode(byteData).ToString();
                doc = parser.ParseInput(docData, baseUri);
                iText.StyledXmlParser.Jsoup.Nodes.Element meta = doc.Select("meta[http-equiv=content-type], meta[charset]"
                                                                            ).First();
                String foundCharset = null;
                // if not found, will keep utf-8 as best attempt
                if (meta != null)
                {
                    if (meta.HasAttr("http-equiv"))
                    {
                        foundCharset = GetCharsetFromContentType(meta.Attr("content"));
                    }
                    if (foundCharset == null && meta.HasAttr("charset"))
                    {
                        foundCharset = meta.Attr("charset");
                    }
                }
                // look for <?xml encoding='ISO-8859-1'?>
                // Guard on the child count first: a document parsed from empty input can have no
                // child nodes at all, and ChildNode(0) would then fail with an out-of-range error.
                if (foundCharset == null && doc.ChildNodeSize() > 0 && doc.ChildNode(0) is XmlDeclaration)
                {
                    XmlDeclaration prolog = (XmlDeclaration)doc.ChildNode(0);
                    if (prolog.Name().Equals("xml"))
                    {
                        foundCharset = prolog.Attr("encoding");
                    }
                }
                foundCharset = ValidateCharset(foundCharset);
                if (foundCharset != null && !foundCharset.Equals(defaultCharset))
                {
                    // need to re-decode
                    foundCharset = iText.IO.Util.StringUtil.ReplaceAll(foundCharset.Trim(), "[\"']", "");
                    charsetName = foundCharset;
                    // The first decode consumed the buffer; rewind before decoding again.
                    byteData.Rewind();
                    docData = EncodingUtil.GetEncoding(foundCharset).Decode(byteData).ToString();
                    // Discard the detection-pass document so the re-decoded text is parsed below.
                    doc = null;
                }
            }
            else
            {
                // specified by content type header (or by user on file load)
                Validate.NotEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML"
                                  );
                docData = EncodingUtil.GetEncoding(charsetName).Decode(byteData).ToString();
            }
            // doc is still set only when the detection pass result is being kept as-is; in every
            // other case parse the decoded text now and record the charset on the output settings.
            if (doc == null)
            {
                doc = parser.ParseInput(docData, baseUri);
                doc.OutputSettings().Charset(charsetName);
            }
            return(doc);
        }