/// <summary>
/// Parses the saved Baidu home page without supplying a charset, so the encoding has to be
/// taken from the page's own meta http-equiv Content-Type declaration (gb2312).
/// </summary>
public void TestBaidu()
{
    // tests <meta http-equiv="Content-Type" content="text/html;charset=gb2312">
    string path = GetFilePath("/htmltests/baidu-cn-home.html");

    // The HTTP charset is gb2312, but null is passed on purpose to exercise http-equiv parsing.
    Document page = Dcsoup.ParseFile(path, null, "http://www.baidu.com/");

    Element button = page.Select("#su").First;
    Assert.AreEqual("百度一下", button.Attr("value"));

    // Find the same element again, this time through an attribute-value match.
    button = page.Select("input[value=百度一下]").First;
    Assert.AreEqual("su", button.Id);

    Element news = page.Select("a:Contains(新)").First;
    string newsUrl = news.AbsUrl("href");
    Assert.AreEqual("http://news.baidu.com/", newsUrl); // trailing slash after the domain name

    // Charset auto-detected from the meta tag; the expected name is lower case.
    Assert.AreEqual("gb2312", page.OutputSettings.Charset.WebName);
    Assert.AreEqual("<title>百度一下,你就知道 </title>", page.Select("title").OuterHtml);

    // Switch the output charset and render the title again.
    page.OutputSettings.Charset = Encoding.ASCII;
    Assert.AreEqual("<title>百度一下,你就知道 </title>", page.Select("title").OuterHtml);
}
/// <summary>
/// Checks whether the value of <paramref name="attr"/> starts with one of the allowed protocols.
/// </summary>
/// <remarks>
/// The value is first resolved through <c>el.AbsUrl(attr.Key)</c>. When that yields an empty
/// string the raw attribute value is used instead, so custom or unknown protocols can still be
/// tested. Unless <c>preserveRelativeLinks</c> is set, the attribute is overwritten with the
/// value that was tested.
/// </remarks>
/// <param name="el">Element that owns the attribute; used to resolve the URL.</param>
/// <param name="attr">Attribute whose value is tested (and possibly rewritten).</param>
/// <param name="protocols">Allowed protocols; each is compared as <c>protocol.ToString() + ":"</c>.</param>
/// <returns><c>true</c> if the lower-cased value starts with any allowed protocol prefix; otherwise <c>false</c>.</returns>
private bool TestValidProtocol(Element el, Nodes.Attribute attr, ICollection<Whitelist.Protocol> protocols)
{
    // Try to resolve relative URLs to absolute ones.
    string value = el.AbsUrl(attr.Key);
    if (value.Length == 0)
    {
        // It could not be made absolute: test it as-is to allow custom unknown protocols.
        value = attr.Value;
    }

    if (!preserveRelativeLinks)
    {
        // Optionally update the attribute so the output HTML carries the tested value.
        attr.Value = value;
    }

    // Lower-case with the invariant culture: a culture-sensitive ToLower() can map 'I' to a
    // dotless 'ı' (e.g. tr-TR), which would make "FILE:" fail to match "file:".
    // Done once here because the value does not change inside the loop.
    string lowered = value.ToLowerInvariant();

    foreach (Whitelist.Protocol protocol in protocols)
    {
        string prot = protocol.ToString() + ":";
        if (lowered.StartsWith(prot, StringComparison.Ordinal))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// A relative href must resolve against a <c>file://localhost/</c> base URI.
/// </summary>
public void HandleAbsOnLocalhostFileUris()
{
    const string html = "<a href='password'>One/a><a href='/var/log/messages'>Two</a>";
    const string baseUri = "file://localhost/etc/";

    Document parsed = Dcsoup.Parse(html, baseUri);
    Element firstLink = parsed.Select("a").First;

    Assert.AreEqual("file://localhost/etc/password", firstLink.AbsUrl("href"));
}
/// <summary>
/// Parses the saved Baidu home page from a stream without supplying a charset, so the encoding
/// has to be taken from the page's meta http-equiv Content-Type declaration (gb2312).
/// </summary>
public void testBaidu()
{
    // tests <meta http-equiv="Content-Type" content="text/html;charset=gb2312">
    using (Stream pageStream = getFile("Test.htmltests.baidu-cn-home.html"))
    {
        // The HTTP charset is gb2312, but null is passed on purpose to exercise http-equiv parsing.
        Document page = NSoup.NSoupClient.Parse(pageStream, null, "http://www.baidu.com/");

        Element button = page.Select("#su").First;
        Assert.AreEqual("百度一下", button.Attr("value"));

        // Find the same element again, this time through an attribute-value match.
        button = page.Select("input[value=百度一下]").First;
        Assert.AreEqual("su", button.Id);

        Element news = page.Select("a:contains(新)").First;
        // The expected URL used to be "http://news.baidu.com" (no trailing slash). It was changed
        // because of System.Uri's behavior; the original author considered that acceptable.
        string newsUrl = news.AbsUrl("href");
        Assert.AreEqual("http://news.baidu.com/", newsUrl);

        // Encoding auto-detected from the meta tag (compared in upper case).
        string detected = page.OutputSettings().Encoding.WebName.ToUpperInvariant();
        Assert.AreEqual("GB2312", detected);
        Assert.AreEqual("<title>百度一下,你就知道 </title>", page.Select("title").OuterHtml());

        // Switch the output encoding and render the title again.
        page.OutputSettings().SetEncoding("ascii");
        Assert.AreEqual("<title>百度一下,你就知道 </title>", page.Select("title").OuterHtml());
    }
}
/// <summary>
/// Checks whether the value of <paramref name="attr"/> starts with one of the allowed protocols.
/// </summary>
/// <remarks>
/// The value is first resolved through <c>el.AbsUrl(attr.Key)</c>. When that yields an empty
/// string the raw attribute value is used instead, so custom or unknown protocols can still be
/// tested. Unless <c>_preserveRelativeLinks</c> is set, the attribute is overwritten with the
/// value that was tested.
/// </remarks>
/// <param name="el">Element that owns the attribute; used to resolve the URL.</param>
/// <param name="attr">Attribute whose value is tested (and possibly rewritten).</param>
/// <param name="protocols">Allowed protocols; each is compared as <c>protocol.ToString() + ":"</c>.</param>
/// <returns><c>true</c> if the lower-cased value starts with any allowed protocol prefix; otherwise <c>false</c>.</returns>
private bool TestValidProtocol(Element el, NSoup.Nodes.Attribute attr, HashSet<Protocol> protocols)
{
    // Try to resolve relative URLs to absolute ones.
    string value = el.AbsUrl(attr.Key);
    if (value.Length == 0)
    {
        // It could not be made absolute: test it as-is to allow custom unknown protocols.
        value = attr.Value;
    }

    if (!_preserveRelativeLinks)
    {
        // Optionally update the attribute so the output HTML carries the tested value.
        attr.Value = value;
    }

    // Lower-cased once; the value does not change inside the loop.
    string lowered = value.ToLowerInvariant();

    foreach (Protocol protocol in protocols)
    {
        string prot = protocol.ToString() + ":";

        // Ordinal comparison: StartsWith(string) without a StringComparison is culture-sensitive,
        // which is not appropriate for matching a protocol prefix in untrusted markup.
        if (lowered.StartsWith(prot, System.StringComparison.Ordinal))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// An image's root-relative <c>src</c> must resolve against the document base URI, and the
/// <c>abs:src</c> attribute form must agree with <c>AbsUrl("src")</c>.
/// </summary>
public void handlesAbsOnImage()
{
    const string markup = "<p><img src=\"/rez/osi_logo.png\" /></p>";

    Document page = NSoup.NSoupClient.Parse(markup, "http://jsoup.org/");
    Element image = page.Select("img").First;

    Assert.AreEqual("http://jsoup.org/rez/osi_logo.png", image.Attr("abs:src"));
    Assert.AreEqual(image.AbsUrl("src"), image.Attr("abs:src"));
}
/// <summary>
/// A protocol-relative <c>src</c> ("//host/path") must take its scheme from the base URI.
/// </summary>
public void HandlesProtocolRelativeUrl()
{
    Document parsed = Dcsoup.Parse("<img src='//example.net/img.jpg'>", "https://example.com/");
    Element image = parsed.Select("img").First;

    string resolved = image.AbsUrl("src");

    Assert.AreEqual("https://example.net/img.jpg", resolved);
}
/// <summary>
/// Relative and root-relative hrefs must resolve against a <c>file:///</c> base URI.
/// The "//" after "file:" is required in the base URI and in both expected results.
/// </summary>
public void HandleAbsOnFileUris()
{
    const string html = "<a href='password'>One/a><a href='/var/log/messages'>Two</a>";
    const string baseUri = "file:///etc/";

    Document parsed = Dcsoup.Parse(html, baseUri);

    // Relative href: resolved inside the base directory.
    Element relativeLink = parsed.Select("a").First;
    Assert.AreEqual("file:///etc/password", relativeLink.AbsUrl("href"));

    // Root-relative href: replaces the base path entirely.
    Element rootedLink = parsed.Select("a")[1];
    Assert.AreEqual("file:///var/log/messages", rootedLink.AbsUrl("href"));
}
/// <summary>
/// Exercises <c>AbsUrl</c> on elements built with no base URI, a valid base URI and a base URI
/// using an unknown protocol, for both a relative and an absolute attribute value.
/// </summary>
public void handlesBaseUri()
{
    const string absolute = "http://bar/qux";

    Tag anchor = Tag.ValueOf("a");
    Attributes attributes = new Attributes();
    attributes.Add("relHref", "/foo");
    attributes.Add("absHref", absolute);

    // No base URI at all.
    Element withoutBase = new Element(anchor, "", attributes);
    Assert.AreEqual("", withoutBase.AbsUrl("relHref"));        // must NOT fall back to the raw attribute value
    Assert.AreEqual(absolute, withoutBase.AbsUrl("absHref"));  // attribute is already absolute, so it is returned

    // Valid base URI.
    Element based = new Element(anchor, "http://foo/", attributes);
    Assert.AreEqual("http://foo/foo", based.AbsUrl("relHref")); // absolute URL built from base + relative
    Assert.AreEqual(absolute, based.AbsUrl("absHref"));         // attribute is absolute, returned unchanged
    Assert.AreEqual("", based.AbsUrl("noval"));                 // attribute that was never added

    // Base URI with an unknown protocol.
    Element oddBase = new Element(anchor, "wtf://no-such-protocol/", attributes);
    Assert.AreEqual(absolute, oddBase.AbsUrl("absHref"));       // base fails, but the attribute itself is good
    Assert.AreEqual("", oddBase.AbsUrl("relHref"));             // base fails and attribute is relative: nothing
}
/// <summary>
/// Hrefs consisting of only a query ("?foo") or of a file plus query must resolve against a
/// base URI that itself carries a query string.
/// </summary>
public void absHandlesRelativeQuery()
{
    const string markup = "<a href='?foo'>One</a> <a href='bar.html?foo'>Two</a>";
    const string baseUri = "http://jsoup.org/path/file?bar";

    Document page = NSoup.NSoupClient.Parse(markup, baseUri);

    // Query-only href: keeps the base file, swaps the query.
    Element queryOnly = page.Select("a").First;
    Assert.AreEqual("http://jsoup.org/path/file?foo", queryOnly.AbsUrl("href"));

    // File + query href: replaces the base file and query.
    Element fileAndQuery = page.Select("a")[1];
    Assert.AreEqual("http://jsoup.org/path/bar.html?foo", fileAndQuery.AbsUrl("href"));
}
/// <summary>
/// URLs written without a protocol ("//host/path") must inherit http or https from the base
/// URI, both via <c>AbsUrl</c> and via the <c>abs:</c> attribute prefix.
/// </summary>
public void HandlesAbsOnProtocolessAbsoluteUris()
{
    const string linkHtml = "<a href='//example.net/foo'>One</a>";

    Document httpDoc = Dcsoup.Parse(linkHtml, "http://example.com/");
    Document httpsDoc = Dcsoup.Parse(linkHtml, "https://example.com/");

    Element httpLink = httpDoc.Select("a").First;
    Element httpsLink = httpsDoc.Select("a").First;

    Assert.AreEqual("http://example.net/foo", httpLink.AbsUrl("href"));
    Assert.AreEqual("https://example.net/foo", httpsLink.AbsUrl("href"));

    // Unquoted attribute value, resolved through the "abs:" attribute prefix.
    Document imageDoc = Dcsoup.Parse(
        "<img src=//www.google.com/images/errors/logo_sm.gif alt=Google>",
        "https://google.com");
    string imageUrl = imageDoc.Select("img").Attr("abs:src");
    Assert.AreEqual("https://www.google.com/images/errors/logo_sm.gif", imageUrl);
}
/// <summary>
/// Adopts the absolute <c>href</c> of <paramref name="baseEl"/> as the base URI, unless a base
/// URI has already been taken from the document.
/// </summary>
/// <param name="baseEl">Element whose <c>href</c> is resolved via <c>AbsUrl("href")</c>.</param>
public void MaybeSetBaseUri(Element baseEl)
{
    // Only the first <base href> seen during the parse is honoured.
    if (!_baseUriSetFromDoc)
    {
        string resolved = baseEl.AbsUrl("href");

        // An empty result means there is no usable href (e.g. <base target>): ignore it.
        if (resolved.Length > 0)
        {
            _baseUri = resolved;
            _baseUriSetFromDoc = true;

            // Also set on the doc so doc.createElement(Tag) gets the updated base,
            // and to update all descendants.
            _doc.BaseUri = resolved;
        }
    }
}
/// <summary>
/// Exercises <c>AbsUrl</c> on elements built with no base URI, a valid base URI and a base URI
/// using an unknown protocol, for both a relative and an absolute attribute value.
/// </summary>
public void handlesBaseUri()
{
    const string absolute = "http://bar/qux";

    Tag anchor = Tag.ValueOf("a");
    Attributes attributes = new Attributes();
    attributes.Add("relHref", "/foo");
    attributes.Add("absHref", absolute);

    // No base URI at all.
    Element withoutBase = new Element(anchor, "", attributes);
    Assert.AreEqual("", withoutBase.AbsUrl("relHref"));        // must NOT fall back to the raw attribute value
    Assert.AreEqual(absolute, withoutBase.AbsUrl("absHref"));  // attribute is already absolute, so it is returned

    // Valid base URI.
    Element based = new Element(anchor, "http://foo/", attributes);
    Assert.AreEqual("http://foo/foo", based.AbsUrl("relHref")); // absolute URL built from base + relative
    Assert.AreEqual(absolute, based.AbsUrl("absHref"));         // attribute is absolute, returned unchanged
    Assert.AreEqual("", based.AbsUrl("noval"));                 // attribute that was never added

    // Base URI with an unknown protocol.
    Element oddBase = new Element(anchor, "wtf://no-such-protocol/", attributes);
    Assert.AreEqual(absolute, oddBase.AbsUrl("absHref"));       // base fails, but the attribute itself is good
    Assert.AreEqual("", oddBase.AbsUrl("relHref"));             // base fails and attribute is relative: nothing
}
/// <summary>
/// <c>HasAttr("abs:href")</c> must be true only when an absolute URL can be produced; the
/// document here is parsed without a base URI.
/// </summary>
public void handlesAbsPrefixOnHasAttr()
{
    // #1 has a relative href (no abs url); #2 has an absolute href (has abs url).
    const string markup = "<a id=1 href='/foo'>One</a> <a id=2 href='http://jsoup.org/'>Two</a>";

    Document page = NSoup.NSoupClient.Parse(markup);
    Element relativeLink = page.Select("#1").First;
    Element absoluteLink = page.Select("#2").First;

    // Relative href: the plain attribute exists, the abs: form does not.
    Assert.IsFalse(relativeLink.HasAttr("abs:href"));
    Assert.IsTrue(relativeLink.HasAttr("href"));
    Assert.AreEqual("", relativeLink.AbsUrl("href"));

    // Absolute href: both forms exist and AbsUrl returns the value.
    Assert.IsTrue(absoluteLink.HasAttr("abs:href"));
    Assert.IsTrue(absoluteLink.HasAttr("href"));
    Assert.AreEqual("http://jsoup.org/", absoluteLink.AbsUrl("href"));
}
/// <summary>
/// Adopts the absolute <c>href</c> of <paramref name="base"/> as the base URI, unless a base
/// URI has already been taken from the document.
/// </summary>
/// <param name="base">Element whose <c>href</c> is resolved via <c>AbsUrl("href")</c>.</param>
internal void MaybeSetBaseUri(Element @base)
{
    // Only the first <base href> seen during the parse is honoured.
    if (!baseUriSetFromDoc)
    {
        string resolved = @base.AbsUrl("href");

        // An empty result means there is no usable href (e.g. <base target>): ignore it.
        if (resolved.Length > 0)
        {
            baseUri = resolved;
            baseUriSetFromDoc = true;
            doc.BaseUri = resolved;
        }
    }
}
/// <summary>
/// Checks whether the value of <paramref name="attr"/> starts with one of the allowed protocols.
/// </summary>
/// <remarks>
/// The value is first resolved through <c>el.AbsUrl(attr.Key)</c>. When that yields an empty
/// string the raw attribute value is used instead, so custom or unknown protocols can still be
/// tested. Unless <c>_preserveRelativeLinks</c> is set, the attribute is overwritten with the
/// value that was tested.
/// </remarks>
/// <param name="el">Element that owns the attribute; used to resolve the URL.</param>
/// <param name="attr">Attribute whose value is tested (and possibly rewritten).</param>
/// <param name="protocols">Allowed protocols; each is compared as <c>protocol.ToString() + ":"</c>.</param>
/// <returns><c>true</c> if the lower-cased value starts with any allowed protocol prefix; otherwise <c>false</c>.</returns>
private bool TestValidProtocol(Element el, NSoup.Nodes.Attribute attr, HashSet<Protocol> protocols)
{
    // Try to resolve relative URLs to absolute ones.
    string value = el.AbsUrl(attr.Key);
    if (value.Length == 0)
    {
        // It could not be made absolute: test it as-is to allow custom unknown protocols.
        value = attr.Value;
    }

    if (!_preserveRelativeLinks)
    {
        // Optionally update the attribute so the output HTML carries the tested value.
        attr.Value = value;
    }

    // Lower-cased once; the value does not change inside the loop.
    string lowered = value.ToLowerInvariant();

    foreach (Protocol protocol in protocols)
    {
        string prot = protocol.ToString() + ":";

        // Ordinal comparison: StartsWith(string) without a StringComparison is culture-sensitive,
        // which is not appropriate for matching a protocol prefix in untrusted markup.
        if (lowered.StartsWith(prot, System.StringComparison.Ordinal))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Adopts the absolute <c>href</c> of <paramref name="baseEl"/> as the base URI, unless a base
/// URI has already been taken from the document.
/// </summary>
/// <param name="baseEl">Element whose <c>href</c> is resolved via <c>AbsUrl("href")</c>.</param>
public void MaybeSetBaseUri(Element baseEl)
{
    // Only the first <base href> seen during the parse is honoured.
    if (!_baseUriSetFromDoc)
    {
        string resolved = baseEl.AbsUrl("href");

        // An empty result means there is no usable href (e.g. <base target>): ignore it.
        if (resolved.Length > 0)
        {
            _baseUri = resolved;
            _baseUriSetFromDoc = true;

            // Also set on the doc so doc.createElement(Tag) gets the updated base,
            // and to update all descendants.
            _doc.BaseUri = resolved;
        }
    }
}
/// <summary>
/// Saves every element in <paramref name="elements"/> by resolving its URL and handing it to
/// <c>Save</c>, then prints a summary of how many files were downloaded and how many were denied.
/// </summary>
/// <remarks>
/// The URL is read from <c>href</c> for <c>a</c> tags and from <c>src</c> for every other tag.
/// When <paramref name="fileInfo"/> is not supplied, the target file name is derived from the
/// URL: the part after the last '/', stripped to letters and dots, cut to its last 20
/// characters and prefixed with the element's hash code to avoid duplicates. An element whose
/// URL contains no '/' (including an empty URL) cannot be turned into a file name; it is
/// skipped and counted as denied instead of aborting the whole batch.
/// </remarks>
/// <param name="elements">Elements to download.</param>
/// <param name="fileInfo">
/// Optional precomputed target files, indexed in parallel with <paramref name="elements"/>.
/// An empty (non-null) array means there is nothing to download.
/// </param>
public void DownloadElements(Element[] elements, FileInfo[] fileInfo = null)
{
    int totalDownloaded = 0;
    int totalBounced = 0;

    // Returns if there are no files to be downloaded
    if (fileInfo != null && fileInfo.Length == 0)
    {
        return;
    }

    for (int i = 0; i < elements.Length; i++)
    {
        Element content = elements[i];

        // Links carry their target in href; images and everything else use src.
        string urlAttribute = content.Tag.ToString() == "a" ? "href" : "src";
        string absURL = content.AbsUrl(urlAttribute);

        data.mediaTrie.InsertURL(absURL);

        FileInfo file;

        // Doesn't recalculate file info if it doesn't have to
        if (fileInfo == null)
        {
            int nameIndex = absURL.LastIndexOf('/');
            if (nameIndex < 0)
            {
                // No '/' means no location/name split is possible; previously this threw from
                // Substring(0, -1) and stopped the remaining downloads.
                totalBounced++;
                continue;
            }

            // Name of the element
            string elementName = Regex.Replace(absURL.Substring(nameIndex + 1), "[^A-Za-z.]", "");

            // File location of the element
            string elementLocation = absURL.Substring(0, nameIndex);

            if (elementName.Length > 20)
            {
                elementName = elementName.Substring(elementName.Length - 20);
            }

            // Inserts hash into filename to avoid duplicates
            string hashCode = Convert.ToString(content.GetHashCode());
            elementName = elementName.Insert(0, hashCode);

            if (!data.gallery)
            {
                file = new FileInfo(webStringUtils.UrlToDir(elementLocation) + elementName);
            }
            else
            {
                file = new FileInfo(data.outputFolder + elementName);
            }
        }
        else
        {
            file = fileInfo[i];
        }

        // Defers downloading to the saver
        Save(absURL, file);

        // Sleeps to slow down image requests
        Thread.Sleep(data.delay);
        totalDownloaded++;
    }

    string report = "Downloaded " + totalDownloaded + " media files, denied " + totalBounced;
    CU.WCol(CU.nl + report + CU.nl, CU.c);
}