Пример #1
0
        /// <summary>
        /// Parses the saved Baidu home page without passing a charset, so the encoding has to be
        /// taken from the document's http-equiv meta tag, then checks attribute selection,
        /// absolute URL resolution and the charset-dependent serialisation of the title.
        /// </summary>
        public void TestBaidu()
        {
            // tests <meta http-equiv="Content-Type" content="text/html;charset=gb2312">
            string   @in = GetFilePath("/htmltests/baidu-cn-home.html");
            // http charset is gb2312, but NOT specifying it, to test http-equiv parse
            Document doc = Dcsoup.ParseFile(@in, null, "http://www.baidu.com/");

            Element submit = doc.Select("#su").First;
            Assert.AreEqual("百度一下", submit.Attr("value"));

            // test from attribute match
            submit = doc.Select("input[value=百度一下]").First;
            Assert.AreEqual("su", submit.Id);

            Element newsLink = doc.Select("a:Contains(新)").First;
            Assert.AreEqual("http://news.baidu.com/", newsLink.AbsUrl("href")); // trailing slash to domain name

            // check auto-detect from meta
            Assert.AreEqual("gb2312", doc.OutputSettings.Charset.WebName); // charset name is lower case
            // The title holds a fullwidth comma (U+FF0C): the ASCII assertion below expects it
            // escaped as &#xff0c;, so the unescaped expectation must contain that same character.
            Assert.AreEqual("<title>百度一下，你就知道      </title>", doc.Select("title").OuterHtml);

            doc.OutputSettings.Charset = Encoding.ASCII;
            Assert.AreEqual("<title>&#x767e;&#x5ea6;&#x4e00;&#x4e0b;&#xff0c;&#x4f60;&#x5c31;&#x77e5;&#x9053;      </title>",
                            doc.Select("title").OuterHtml);
        }
Пример #2
0
        /// <summary>
        /// Checks whether the URL held by an attribute starts with one of the allowed protocols.
        /// </summary>
        /// <param name="el">Element owning the attribute; used to resolve the value to an absolute URL.</param>
        /// <param name="attr">
        /// Attribute under test. Unless relative links are preserved, its value is overwritten
        /// with the resolved URL so the output html carries the absolute form.
        /// </param>
        /// <param name="protocols">Allowed protocols; each is compared as its <c>ToString()</c> followed by ":".</param>
        /// <returns><c>true</c> if the lower-cased URL starts with any allowed protocol prefix; otherwise <c>false</c>.</returns>
        private bool TestValidProtocol(Element el, Nodes.Attribute attr, ICollection <Whitelist.Protocol> protocols)
        {
            // try to resolve relative urls to abs, and optionally update the attribute so output html has abs.
            // rels without a baseuri get removed
            string value = el.AbsUrl(attr.Key);

            if (value.Length == 0)
            {
                // if it could not be made abs, run as-is to allow custom unknown protocols
                value = attr.Value;
            }
            if (!preserveRelativeLinks)
            {
                attr.Value = value;
            }

            // Lower-case once, with the invariant culture: the culture-sensitive ToLower() maps
            // 'I' to a dotless 'ı' under e.g. Turkish, so "FILE:" would never match "file:".
            string lowered = value.ToLowerInvariant();

            foreach (Whitelist.Protocol protocol in protocols)
            {
                string prefix = protocol.ToString() + ":";
                if (lowered.StartsWith(prefix, StringComparison.Ordinal))
                {
                    return true;
                }
            }
            return false;
        }
Пример #3
0
        /// <summary>
        /// A relative href must resolve against a file:// base URI that names localhost as its host.
        /// </summary>
        public void HandleAbsOnLocalhostFileUris()
        {
            // NOTE(review): "One/a>" looks like a typo for "One</a>"; the markup is kept as-is.
            const string html    = "<a href='password'>One/a><a href='/var/log/messages'>Two</a>";
            const string baseUri = "file://localhost/etc/";

            Document parsed    = Dcsoup.Parse(html, baseUri);
            Element  firstLink = parsed.Select("a").First;
            string   resolved  = firstLink.AbsUrl("href");

            Assert.AreEqual("file://localhost/etc/password", resolved);
        }
        /// <summary>
        /// Parses the saved Baidu home page without passing a charset, so the encoding has to be
        /// taken from the document's http-equiv meta tag, then checks attribute selection,
        /// absolute URL resolution and the encoding-dependent serialisation of the title.
        /// </summary>
        public void testBaidu()
        {
            // tests <meta http-equiv="Content-Type" content="text/html;charset=gb2312">
            using (Stream input = getFile("Test.htmltests.baidu-cn-home.html"))
            {
                // http charset is gb2312, but NOT specifying it, to test http-equiv parse
                Document doc = NSoup.NSoupClient.Parse(input, null, "http://www.baidu.com/");

                Element submit = doc.Select("#su").First;
                Assert.AreEqual("百度一下", submit.Attr("value"));

                // test from attribute match
                submit = doc.Select("input[value=百度一下]").First;
                Assert.AreEqual("su", submit.Id);

                // The expected URL carries a trailing slash; this was changed from the slash-less
                // form due to System.Uri's behavior and is considered acceptable.
                Element newsLink = doc.Select("a:contains(新)").First;
                Assert.AreEqual("http://news.baidu.com/", newsLink.AbsUrl("href"));

                // check auto-detect from meta
                Assert.AreEqual("GB2312", doc.OutputSettings().Encoding.WebName.ToUpperInvariant());
                // The title holds a fullwidth comma (U+FF0C): the ASCII assertion below expects it
                // escaped as &#65292;, so the unescaped expectation must contain that same character.
                Assert.AreEqual("<title>百度一下，你就知道      </title>", doc.Select("title").OuterHtml());

                doc.OutputSettings().SetEncoding("ascii");
                Assert.AreEqual("<title>&#30334;&#24230;&#19968;&#19979;&#65292;&#20320;&#23601;&#30693;&#36947;      </title>",
                                doc.Select("title").OuterHtml());
            }
        }
Пример #5
0
        /// <summary>
        /// Checks whether the URL held by an attribute starts with one of the allowed protocols.
        /// </summary>
        /// <param name="el">Element owning the attribute; used to resolve the value to an absolute URL.</param>
        /// <param name="attr">
        /// Attribute under test. Unless relative links are preserved, its value is overwritten
        /// with the resolved URL so the output html carries the absolute form.
        /// </param>
        /// <param name="protocols">Allowed protocols; each is compared as its <c>ToString()</c> followed by ":".</param>
        /// <returns><c>true</c> if the lower-cased URL starts with any allowed protocol prefix; otherwise <c>false</c>.</returns>
        private bool TestValidProtocol(Element el, NSoup.Nodes.Attribute attr, HashSet <Protocol> protocols)
        {
            // try to resolve relative urls to abs, and optionally update the attribute so output html has abs.
            // rels without a baseuri get removed
            string value = el.AbsUrl(attr.Key);

            if (value.Length == 0)
            {
                value = attr.Value; // if it could not be made abs, run as-is to allow custom unknown protocols
            }
            if (!_preserveRelativeLinks)
            {
                attr.Value = value;
            }

            // The value does not change inside the loop, so lower-case it only once.
            string lowered = value.ToLowerInvariant();

            foreach (Protocol protocol in protocols)
            {
                string prefix = protocol.ToString() + ":";
                // Ordinal comparison: the single-argument StartsWith is culture-sensitive and can
                // ignore or fold characters, which is not wanted when matching a URL scheme.
                if (lowered.StartsWith(prefix, System.StringComparison.Ordinal))
                {
                    return true;
                }
            }
            return false;
        }
        /// <summary>
        /// The "abs:" attribute prefix on an image src must yield the same absolute URL as AbsUrl.
        /// </summary>
        public void handlesAbsOnImage()
        {
            const string markup  = "<p><img src=\"/rez/osi_logo.png\" /></p>";
            const string baseUri = "http://jsoup.org/";

            Document parsed = NSoup.NSoupClient.Parse(markup, baseUri);
            Element  image  = parsed.Select("img").First;

            // the prefixed attribute lookup resolves against the base URI
            Assert.AreEqual("http://jsoup.org/rez/osi_logo.png", image.Attr("abs:src"));
            // and agrees with the dedicated accessor
            Assert.AreEqual(image.AbsUrl("src"), image.Attr("abs:src"));
        }
Пример #7
0
        /// <summary>
        /// A protocol-relative src ("//host/path") must take its scheme from the base URI.
        /// </summary>
        public void HandlesProtocolRelativeUrl()
        {
            Document parsed = Dcsoup.Parse("<img src='//example.net/img.jpg'>", "https://example.com/");

            Element image    = parsed.Select("img").First;
            string  resolved = image.AbsUrl("src");

            Assert.AreEqual("https://example.net/img.jpg", resolved);
        }
Пример #8
0
        /// <summary>
        /// Relative and root-relative hrefs must resolve against a host-less file:/// base URI.
        /// </summary>
        public void HandleAbsOnFileUris()
        {
            // NOTE(review): "One/a>" looks like a typo for "One</a>"; the markup is kept as-is.
            const string html    = "<a href='password'>One/a><a href='/var/log/messages'>Two</a>";
            const string baseUri = "file:///etc/"; // double slash // is required

            Document parsed = Dcsoup.Parse(html, baseUri);

            // relative href: appended to the base directory
            Element relativeLink = parsed.Select("a").First;
            Assert.AreEqual("file:///etc/password", relativeLink.AbsUrl("href")); // double slash // is required

            // root-relative href: replaces the base path
            Element rootedLink = parsed.Select("a")[1];
            Assert.AreEqual("file:///var/log/messages", rootedLink.AbsUrl("href")); // double slash // is required
        }
Пример #9
0
        /// <summary>
        /// Checks how AbsUrl combines an element's base URI with a relative and an absolute
        /// attribute value, for an empty base, a usable base and a base with an unknown scheme.
        /// </summary>
        public void handlesBaseUri()
        {
            const string absoluteHref = "http://bar/qux";

            Tag anchorTag = Tag.ValueOf("a");
            Attributes sharedAttributes = new Attributes();
            sharedAttributes.Add("relHref", "/foo");
            sharedAttributes.Add("absHref", absoluteHref);

            // empty base URI
            Element withoutBase = new Element(anchorTag, "", sharedAttributes);
            Assert.AreEqual("", withoutBase.AbsUrl("relHref")); // with no base, should NOT fallback to href attrib, whatever it is
            Assert.AreEqual(absoluteHref, withoutBase.AbsUrl("absHref")); // no base but valid attrib, return attrib

            // usable base URI
            Element withValidBase = new Element(anchorTag, "http://foo/", sharedAttributes);
            Assert.AreEqual("http://foo/foo", withValidBase.AbsUrl("relHref")); // construct abs from base + rel
            Assert.AreEqual(absoluteHref, withValidBase.AbsUrl("absHref")); // href is abs, so returns that
            Assert.AreEqual("", withValidBase.AbsUrl("noval"));

            // base URI with an unknown scheme
            Element withBadBase = new Element(anchorTag, "wtf://no-such-protocol/", sharedAttributes);
            Assert.AreEqual(absoluteHref, withBadBase.AbsUrl("absHref")); // base fails, but href good, so get that
            Assert.AreEqual("", withBadBase.AbsUrl("relHref")); // base fails, only rel href, so return nothing
        }
        /// <summary>
        /// An href that is only a query, or a file name plus query, must resolve against the
        /// base URI's path and replace the base URI's own query string.
        /// </summary>
        public void absHandlesRelativeQuery()
        {
            const string markup  = "<a href='?foo'>One</a> <a href='bar.html?foo'>Two</a>";
            const string baseUri = "http://jsoup.org/path/file?bar";

            Document parsed = NSoup.NSoupClient.Parse(markup, baseUri);

            // query-only href keeps the base file name
            Element queryOnlyLink = parsed.Select("a").First;
            Assert.AreEqual("http://jsoup.org/path/file?foo", queryOnlyLink.AbsUrl("href"));

            // file + query href replaces the base file name
            Element fileAndQueryLink = parsed.Select("a")[1];
            Assert.AreEqual("http://jsoup.org/path/bar.html?foo", fileAndQueryLink.AbsUrl("href"));
        }
Пример #11
0
        /// <summary>
        /// URLs that start with "//" must inherit the scheme of the document's base URI,
        /// both through AbsUrl and through the "abs:" attribute prefix.
        /// </summary>
        public void HandlesAbsOnProtocolessAbsoluteUris()
        {
            const string linkMarkup = "<a href='//example.net/foo'>One</a>";

            Document httpDoc  = Dcsoup.Parse(linkMarkup, "http://example.com/");
            Document httpsDoc = Dcsoup.Parse(linkMarkup, "https://example.com/");

            Element httpLink  = httpDoc.Select("a").First;
            Element httpsLink = httpsDoc.Select("a").First;

            // same markup, scheme follows the respective base URI
            Assert.AreEqual("http://example.net/foo", httpLink.AbsUrl("href"));
            Assert.AreEqual("https://example.net/foo", httpsLink.AbsUrl("href"));

            // unquoted attribute value, read through the abs: prefix on the selection
            Document imageDoc = Dcsoup.Parse("<img src=//www.google.com/images/errors/logo_sm.gif alt=Google>", "https://google.com");

            Assert.AreEqual("https://www.google.com/images/errors/logo_sm.gif", imageDoc.Select("img").Attr("abs:src"));
        }
Пример #12
0
        /// <summary>
        /// Adopts the absolute href of a base element as the base URI. Only the first element
        /// that yields a non-empty absolute href is honoured; later calls are ignored.
        /// </summary>
        /// <param name="baseEl">Element whose "href" attribute is resolved through AbsUrl.</param>
        public void MaybeSetBaseUri(Element baseEl)
        {
            // only listen to the first <base href> in parse
            if (_baseUriSetFromDoc)
            {
                return;
            }

            string resolvedHref = baseEl.AbsUrl("href");
            if (resolvedHref.Length == 0)
            {
                // ignore <base target> etc
                return;
            }

            _baseUri = resolvedHref;
            _baseUriSetFromDoc = true;
            // set on the doc so doc.createElement(Tag) will get updated base, and to update all descendants
            _doc.BaseUri = resolvedHref;
        }
        /// <summary>
        /// Checks how AbsUrl combines an element's base URI with a relative and an absolute
        /// attribute value, for an empty base, a usable base and a base with an unknown scheme.
        /// </summary>
        public void handlesBaseUri()
        {
            const string relKey       = "relHref";
            const string absKey       = "absHref";
            const string absoluteHref = "http://bar/qux";

            Tag        anchorTag        = Tag.ValueOf("a");
            Attributes sharedAttributes = new Attributes();

            sharedAttributes.Add(relKey, "/foo");
            sharedAttributes.Add(absKey, absoluteHref);

            // empty base URI
            Element withoutBase = new Element(anchorTag, "", sharedAttributes);

            Assert.AreEqual("", withoutBase.AbsUrl(relKey));           // with no base, should NOT fallback to href attrib, whatever it is
            Assert.AreEqual(absoluteHref, withoutBase.AbsUrl(absKey)); // no base but valid attrib, return attrib

            // usable base URI
            Element withValidBase = new Element(anchorTag, "http://foo/", sharedAttributes);

            Assert.AreEqual("http://foo/foo", withValidBase.AbsUrl(relKey)); // construct abs from base + rel
            Assert.AreEqual(absoluteHref, withValidBase.AbsUrl(absKey));     // href is abs, so returns that
            Assert.AreEqual("", withValidBase.AbsUrl("noval"));

            // base URI with an unknown scheme
            Element withBadBase = new Element(anchorTag, "wtf://no-such-protocol/", sharedAttributes);

            Assert.AreEqual(absoluteHref, withBadBase.AbsUrl(absKey)); // base fails, but href good, so get that
            Assert.AreEqual("", withBadBase.AbsUrl(relKey));           // base fails, only rel href, so return nothing
        }
        /// <summary>
        /// HasAttr with the "abs:" prefix must report true only when the attribute can be
        /// resolved to an absolute URL; the document here is parsed without a base URI.
        /// </summary>
        public void handlesAbsPrefixOnHasAttr()
        {
            // 1: no abs url; 2: has abs url
            const string markup = "<a id=1 href='/foo'>One</a> <a id=2 href='http://jsoup.org/'>Two</a>";

            Document parsed       = NSoup.NSoupClient.Parse(markup);
            Element  relativeLink = parsed.Select("#1").First;
            Element  absoluteLink = parsed.Select("#2").First;

            // relative href: attribute exists, but no absolute form is available
            Assert.IsFalse(relativeLink.HasAttr("abs:href"));
            Assert.IsTrue(relativeLink.HasAttr("href"));
            Assert.AreEqual("", relativeLink.AbsUrl("href"));

            // absolute href: both lookups succeed
            Assert.IsTrue(absoluteLink.HasAttr("abs:href"));
            Assert.IsTrue(absoluteLink.HasAttr("href"));
            Assert.AreEqual("http://jsoup.org/", absoluteLink.AbsUrl("href"));
        }
Пример #15
0
        /// <summary>
        /// Adopts the absolute href of a base element as the base URI. Only the first element
        /// that yields a non-empty absolute href is honoured; later calls are ignored.
        /// </summary>
        /// <param name="base">Element whose "href" attribute is resolved through AbsUrl.</param>
        internal void MaybeSetBaseUri(Element @base)
        {
            // only listen to the first <base href> in parse
            if (!baseUriSetFromDoc)
            {
                string resolvedHref = @base.AbsUrl("href");

                // ignore <base target> etc
                bool hasHref = resolvedHref.Length != 0;
                if (hasHref)
                {
                    baseUri           = resolvedHref;
                    baseUriSetFromDoc = true;
                    doc.BaseUri       = resolvedHref;
                }
            }
        }
Пример #16
0
        /// <summary>
        /// Checks whether the URL held by an attribute starts with one of the allowed protocols.
        /// </summary>
        /// <param name="el">Element owning the attribute; used to resolve the value to an absolute URL.</param>
        /// <param name="attr">
        /// Attribute under test. Unless relative links are preserved, its value is overwritten
        /// with the resolved URL so the output html carries the absolute form.
        /// </param>
        /// <param name="protocols">Allowed protocols; each is compared as its <c>ToString()</c> followed by ":".</param>
        /// <returns><c>true</c> if the lower-cased URL starts with any allowed protocol prefix; otherwise <c>false</c>.</returns>
        private bool TestValidProtocol(Element el, NSoup.Nodes.Attribute attr, HashSet<Protocol> protocols)
        {
            // try to resolve relative urls to abs, and optionally update the attribute so output html has abs.
            // rels without a baseuri get removed
            string value = el.AbsUrl(attr.Key);
            if (value.Length == 0)
            {
                value = attr.Value; // if it could not be made abs, run as-is to allow custom unknown protocols
            }
            if (!_preserveRelativeLinks)
            {
                attr.Value = value;
            }

            // The value does not change inside the loop, so lower-case it only once.
            string lowered = value.ToLowerInvariant();

            foreach (Protocol protocol in protocols)
            {
                string prefix = protocol.ToString() + ":";
                // Ordinal comparison: the single-argument StartsWith is culture-sensitive and can
                // ignore or fold characters, which is not wanted when matching a URL scheme.
                if (lowered.StartsWith(prefix, System.StringComparison.Ordinal))
                {
                    return true;
                }
            }
            return false;
        }
Пример #17
0
        /// <summary>
        /// Records the absolute href of a base element as the base URI, once. After the first
        /// non-empty href has been accepted, further calls return without doing anything.
        /// </summary>
        /// <param name="baseEl">Element whose "href" attribute is resolved through AbsUrl.</param>
        public void MaybeSetBaseUri(Element baseEl)
        {
            // only listen to the first <base href> in parse
            if (_baseUriSetFromDoc)
            {
                return;
            }

            string absoluteHref = baseEl.AbsUrl("href");
            bool   hrefMissing  = absoluteHref.Length == 0;

            // ignore <base target> etc
            if (hrefMissing)
            {
                return;
            }

            _baseUri = absoluteHref;
            _baseUriSetFromDoc = true;
            // set on the doc so doc.createElement(Tag) will get updated base, and to update all descendants
            _doc.BaseUri = absoluteHref;
        }
        /// <summary>
        /// Saves the media referenced by every element in <paramref name="elements"/> and prints
        /// a summary line when done. Each URL is registered with the media trie, handed to
        /// <c>Save</c>, and followed by a sleep of <c>data.delay</c> to slow down requests.
        /// </summary>
        /// <param name="elements">
        /// Elements to download. "a" elements are read through their "href" attribute, every
        /// other tag through "src".
        /// </param>
        /// <param name="fileInfo">
        /// Optional precomputed targets, indexed in parallel with <paramref name="elements"/>
        /// (so it needs at least as many entries). When null, a target file is derived from each
        /// URL. When empty, nothing is downloaded.
        /// </param>
        public void DownloadElements(Element[] elements, FileInfo[] fileInfo = null)
        {
            int totalDownloaded = 0;
            int totalBounced    = 0;

            //Returns if there are no files to be downloaded
            if (fileInfo != null && fileInfo.Length == 0)
            {
                return;
            }

            for (int i = 0; i < elements.Length; i++)
            {
                Element content = elements[i];

                //Links carry their target in href; images and every other tag use src
                string urlAttribute = content.Tag.ToString() == "a" ? "href" : "src";
                string absURL       = content.AbsUrl(urlAttribute);

                //An element whose URL could not be resolved has nothing to download, and a URL
                //without any '/' cannot be split into location and name below; count both as
                //denied instead of letting Substring throw
                bool unresolved = string.IsNullOrEmpty(absURL);
                int  nameIndex  = unresolved ? -1 : absURL.LastIndexOf('/');
                if (unresolved || (fileInfo == null && nameIndex < 0))
                {
                    totalBounced++;
                    continue;
                }

                data.mediaTrie.InsertURL(absURL);

                FileInfo file;
                //Doesn't recalculate file info if it doesn't have to
                if (fileInfo == null)
                {
                    //Name of the element, reduced to letters and dots
                    string elementName = Regex.Replace(absURL.Substring(nameIndex + 1), "[^A-Za-z.]", "");
                    //File location of the element
                    string elementLocation = absURL.Substring(0, nameIndex);
                    //Keeps only the last 20 characters of long names
                    if (elementName.Length > 20)
                    {
                        elementName = elementName.Substring(elementName.Length - 20);
                    }
                    //Inserts hash into filename to avoid duplicates
                    string hashCode = Convert.ToString(content.GetHashCode());
                    elementName = elementName.Insert(0, hashCode);

                    //Gallery mode writes into the output folder, otherwise into a directory derived from the URL
                    string directory = data.gallery ? data.outputFolder : webStringUtils.UrlToDir(elementLocation);
                    file = new FileInfo(directory + elementName);
                }
                else
                {
                    file = fileInfo[i];
                }

                //Defers downloading to the saver
                Save(absURL, file);
                //Sleeps to slow down image requests
                Thread.Sleep(data.delay);
                totalDownloaded++;
            }
            string report = "Downloaded " + totalDownloaded + " media files, denied " + totalBounced;

            CU.WCol(CU.nl + report + CU.nl, CU.c);
        }