/// <summary>
/// Parses markup whose body tag is nested inside a font element and checks the
/// serialized document against the expected normalised structure.
/// </summary>
public void normalisedBodyAfterContent()
{
    const string input = "<font face=Arial><body class=name><div>One</div></body></font>";
    const string expected = "<html><head></head><body class=\"name\"><font face=\"Arial\"><div>One</div></font></body></html>";

    Document parsed = NSoupClient.Parse(input);
    string actual = TextUtil.StripNewLines(parsed.Html());

    Assert.AreEqual(expected, actual);
}
/// <summary>
/// Downloads the page at <c>this.Url</c>, parses it and rebuilds <c>this.Items</c> from every
/// <c>ul[class=v]</c> entry found under <c>div[class=collgrid4w] div[class=items]</c>.
/// </summary>
/// <returns>
/// The freshly populated <c>this.Items</c> list. It is empty when the expected container is not
/// present in the page; rows missing one of the expected cells are skipped.
/// </returns>
public List <BaseItem> GetList()
{
    this.Items = new List <BaseItem>();

    String temp = WebHelper.GetHtmlCodeByWebClient(this.Url, "utf-8");
    Document doc = NSoupClient.Parse(temp);

    Element et = doc.Select("div[class=collgrid4w] div[class=items]").First;
    if (et == null)
    {
        // The container was not found (layout change, error page, ...): nothing to list.
        return(this.Items);
    }

    foreach (Element item in et.Select("ul[class=v]"))
    {
        Element e1 = item.Select("li[class=v_link] a").First;
        Element e2 = item.Select("li[class=v_user] a").First;
        Element e3 = item.Select("li[class=v_time] span[class=num]").First;
        if (e1 == null || e2 == null || e3 == null)
        {
            // Skip an incomplete row instead of failing the whole listing.
            continue;
        }

        BaseItem b = new BaseItem()
        {
            Name  = e1.Attr("title"),
            Url   = e1.Attr("href"),
            Time  = e3.OwnText(),
            Owner = e2.OwnText()
        };
        this.Items.Add(b);
    }

    return(this.Items);
}
/// <summary>
/// Fetches the 302.pl test URL with a GET and checks that the title of the
/// resulting document contains "jsoup".
/// </summary>
public void followsTempRedirect()
{
    // Per the original note, the expected redirect target is http://jsoup.org
    const string redirectingUrl = "http://direct.infohound.net/tools/302.pl";

    IConnection connection = NSoupClient.Connect(redirectingUrl);
    Document fetched = connection.Get();

    bool titleMentionsJsoup = fetched.Title.Contains("jsoup");
    Assert.IsTrue(titleMentionsJsoup);
}
/// <summary>
/// Cleans markup containing relative href/src values against a base URI and
/// checks that the output carries the absolute URLs.
/// </summary>
public void resolvesRelativeLinks()
{
    const string baseUri = "http://example.com/";
    String dirty = "<a href='/foo'>Link</a><img src='/bar'>";
    String expected = "<a href=\"http://example.com/foo\" rel=\"nofollow\">Link</a>\n<img src=\"http://example.com/bar\" />";

    String cleaned = NSoupClient.Clean(dirty, baseUri, Whitelist.BasicWithImages);

    Assert.AreEqual(expected, cleaned);
}
/// <summary>
/// Checks that caller-supplied output settings override the defaults used by
/// <c>NSoupClient.Clean</c>.
/// </summary>
/// <remarks>
/// NOTE(review): the input and the two custom-output expectations contain the literal
/// character rather than an entity reference; they may have been entity-decoded somewhere
/// upstream of this file. Confirm against the original test before relying on them.
/// </remarks>
public void supplyOutputSettings()
{
    const string baseUri = "http://foo.com/";
    string html = "<div><p>ℬ</p></div>";

    // First pass: compact output with the extended escape mode.
    OutputSettings settings = new OutputSettings();
    settings.PrettyPrint(false);
    settings.SetEscapeMode(Entities.EscapeMode.Extended);

    string customOut = NSoupClient.Clean(html, baseUri, Whitelist.Relaxed, settings);
    string defaultOut = NSoupClient.Clean(html, baseUri, Whitelist.Relaxed);

    Assert.AreNotSame(defaultOut, customOut);
    Assert.AreEqual("<div><p>ℬ</p></div>", customOut);
    Assert.AreEqual("<div>\n" + " <p>ℬ</p>\n" + "</div>", defaultOut);

    // Second pass: same settings object, now ASCII-encoded with the base escape mode.
    settings.SetEncoding(Encoding.ASCII);
    settings.SetEscapeMode(Entities.EscapeMode.Base);

    String customOut2 = NSoupClient.Clean(html, baseUri, Whitelist.Relaxed, settings);
    Assert.AreEqual("<div><p>ℬ</p></div>", customOut2);
}
/// <summary>
/// For every (ChildEnum, DegreeEnum) combination, loads the matching zujuan.com question page
/// and registers each link found in the first <c>div.search-type div.con-items</c> group
/// through <c>BookService.AddBook</c>.
/// </summary>
/// <remarks>
/// A page that does not contain the expected group is skipped so that one unexpected response
/// does not abort the remaining combinations.
/// </remarks>
public void InitBook()
{
    foreach (var child in typeof(ChildEnum).GetEnumSource())
    {
        foreach (var degree in typeof(DegreeEnum).GetEnumSource())
        {
            var childId = child.Item1.NullToInt();
            var degreeId = degree.Item1.NullToInt();
            var url = string.Format("https://www.zujuan.com/question?chid={0}&xd={1}", child.Item1, degree.Item1);
            var html = new HttpUnitHelper().GetRealHtmlTrice(url);
            var doc = NSoupClient.Parse(html);

            // Get the textbook versions for the subject under the current degree.
            var groups = doc.Select("div.search-type div.con-items");
            if (groups.Count == 0)
            {
                // Expected filter group is absent on this page; indexing [0] would throw.
                continue;
            }

            var bookTypeDoc = groups[0].GetElementsByTag("a");
            foreach (var element in bookTypeDoc)
            {
                var elementId = element.Attr("data-bcaid");
                var name = element.Text();
                Console.WriteLine("add book");
                BookService.AddBook(childId, degreeId, name, elementId.NullToInt());
            }
        }
    }
}
/// <summary>
/// Fills in the details of <paramref name="collection"/> from a raw response body.
/// </summary>
/// <param name="collection">The collection to enrich; returned unchanged when nothing can be extracted.</param>
/// <param name="text">Raw response text, either JSON (as decided by <c>IsJson</c>) or HTML.</param>
/// <param name="rule">Extraction rule; when <c>null</c> the collection is returned as is.</param>
/// <param name="sourceUrl">Forwarded unchanged to the extraction overloads.</param>
/// <returns>The resulting collection.</returns>
/// <remarks>
/// Any exception raised during extraction is swallowed and the collection held at that point
/// is returned (best-effort behaviour of the original code).
/// </remarks>
public static Collection GetCollectionDetail(Collection collection, string text, Rule rule, string sourceUrl)
{
    if (rule == null)
    {
        return(collection);
    }

    try
    {
        bool hasPictureItemRule = rule.item != null
                                  && rule.pictureRule != null
                                  && rule.pictureRule.item != null;

        if (hasPictureItemRule)
        {
            // Reuse the list-based extractor with a single-element list.
            List <Collection> single = new List <Collection> { collection };
            collection = GetCollection(single, text, rule, sourceUrl)[0];
        }
        else if (IsJson(text))
        {
            var json = JToken.Parse(text);
            collection = GetCollectionDetail(collection, json, rule, sourceUrl);
        }
        else
        {
            var document = NSoupClient.Parse(text);
            collection = GetCollectionDetail(collection, document, rule, sourceUrl);
        }
    }
    catch (Exception)
    {
        // Intentionally ignored: fall through and return what we have.
    }

    return(collection);
}
/// <summary>
/// Parses markup that starts with two comments and checks that both comments are
/// serialized ahead of the html element.
/// </summary>
public void commentBeforeHtml()
{
    const string input = "<!-- comment --><!-- comment 2 --><p>One</p>";
    const string expected = "<!-- comment --><!-- comment 2 --><html><head></head><body><p>One</p></body></html>";

    Document parsed = NSoupClient.Parse(input);
    string actual = TextUtil.StripNewLines(parsed.Html());

    Assert.AreEqual(expected, actual);
}
/// <summary>
/// Parses a table row containing a self-closed td and checks the inner HTML of the
/// first tr element.
/// </summary>
public void emptyTdTag()
{
    const string input = "<table><tr><td>One</td><td id='2' /></tr></table>";
    const string expectedRowHtml = "<td>One</td>\n<td id=\"2\"></td>";

    Document parsed = NSoupClient.Parse(input);
    Element firstRow = parsed.Select("tr").First;

    Assert.AreEqual(expectedRowHtml, firstRow.Html());
}
/// <summary>
/// Checks that an anchor wrapping both a div and a span is kept intact in the parsed body.
/// </summary>
public void testAFlowContents()
{
    // HTML5 allows <a> to act as either phrasing or block content.
    const string markup = "<a>Hello <div>there</div> <span>now</span></a>";

    Document parsed = NSoupClient.Parse(markup);
    string bodyHtml = TextUtil.StripNewLines(parsed.Body.Html());

    Assert.AreEqual(markup, bodyHtml);
}
/// <summary>
/// Checks that a font element wrapping both a div and a span is kept intact in the parsed body.
/// </summary>
public void testFontFlowContents()
{
    // HTML5 does not define <font>; it is often used as flow content.
    const string markup = "<font>Hello <div>there</div> <span>now</span></font>";

    Document parsed = NSoupClient.Parse(markup);
    string bodyHtml = TextUtil.StripNewLines(parsed.Body.Html());

    Assert.AreEqual(markup, bodyHtml);
}
/// <summary>
/// Checks that an img placed inside a noscript in the head is serialized in the body,
/// leaving the noscript element empty.
/// </summary>
public void testNoImagesInNoScriptInHead()
{
    // The parser used to allow this, but it is against the spec when parsing with noscript.
    const string input = "<html><head><noscript><img src='foo'></noscript></head><body><p>Hello</p></body></html>";
    const string expected = "<html><head><noscript></noscript></head><body><img src=\"foo\" /><p>Hello</p></body></html>";

    Document parsed = NSoupClient.Parse(input);
    string actual = TextUtil.StripNewLines(parsed.Html());

    Assert.AreEqual(expected, actual);
}
/// <summary>
/// Checks that a span wrapping both a div and a nested span is kept intact in the parsed body.
/// </summary>
public void testSpanContents()
{
    // As with h1 tags, the spec says SPAN is phrasing only, but browsers and
    // publishers treat span as a block tag.
    const string markup = "<span>Hello <div>there</div> <span>now</span></span>";

    Document parsed = NSoupClient.Parse(markup);
    string bodyHtml = TextUtil.StripNewLines(parsed.Body.Html());

    Assert.AreEqual(markup, bodyHtml);
}
/// <summary>
/// Parses unclosed headings mixed with hgroup elements and checks the resulting body markup.
/// </summary>
public void testHgroup()
{
    // The parser used to disallow hgroup inside h{n}, but that is not in the spec
    // and browsers accept it.
    const string input = "<h1>Hello <h2>There <hgroup><h1>Another<h2>headline</hgroup> <hgroup><h1>More</h1><p>stuff</p></hgroup>";
    const string expected = "<h1>Hello </h1><h2>There <hgroup><h1>Another</h1><h2>headline</h2></hgroup> <hgroup><h1>More</h1><p>stuff</p></hgroup></h2>";

    Document parsed = NSoupClient.Parse(input);
    string bodyHtml = TextUtil.StripNewLines(parsed.Body.Html());

    Assert.AreEqual(expected, bodyHtml);
}
/// <summary>
/// Completion handler for the crawler: parses the downloaded page source and extracts the
/// book name, author, image link, summary and the list of chapter titles.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data; <c>e.PageSource</c> is the HTML to parse.</param>
/// <remarks>
/// Returns early when the page has no <c>book-info</c> element or that element has no
/// <c>h1</c> child, instead of failing inside the event handler.
/// NOTE(review): <c>bookValue</c> and <c>bookItems</c> are populated but not used further in
/// this method — confirm whether they should be stored or published.
/// </remarks>
private void QiDianCrawler_OnCompleted(object sender, OnCompletedEventArgs e)
{
    Document htmlDoc = NSoupClient.Parse(e.PageSource);

    Elements bookInfos = htmlDoc.GetElementsByClass("book-info");
    if (bookInfos.Count == 0)
    {
        // Not a book page (or the layout changed): nothing to extract.
        return;
    }

    var bookInfo = bookInfos[0];
    var h1 = bookInfo.Children.FirstOrDefault(n => n.TagName() == "h1");
    if (h1 == null)
    {
        // FirstOrDefault yields null when there is no h1 child; avoid dereferencing it.
        return;
    }

    BookValue bookValue = new BookValue();
    bookValue.BookName = h1.GetElementsByTag("em").Text;
    bookValue.BookAuthor = h1.GetElementsByTag("span")[0].GetElementsByTag("a").Text;
    bookValue.BookImage = bookInfo.GetElementsByClass("J-getJumpUrl")[0].Attr("href");
    bookValue.BookSummary = htmlDoc.GetElementsByClass("book-intro")[0].GetElementsByTag("p").Html();

    // Collect the text of every li found in the first "cf" element of each volume.
    List <string> bookItems = new List <string>();
    var volumes = htmlDoc.GetElementsByClass("volume");
    foreach (var item in volumes)
    {
        var li = item.GetElementsByClass("cf")[0].GetElementsByTag("li");
        foreach (var l in li)
        {
            bookItems.Add(l.Text());
        }
    }
}
/// <summary>
/// Checks that a lone textarea is serialized in the body without any wrapping element.
/// </summary>
public void noImplicitFormForTextAreas()
{
    // The old parser created implicit forms for form children such as <textarea>; no longer.
    const string markup = "<textarea>One</textarea>";

    Document parsed = NSoupClient.Parse(markup);
    string bodyHtml = parsed.Body.Html();

    Assert.AreEqual(markup, bodyHtml);
}
/// <summary>
/// For every (ChildEnum, DegreeEnum) combination, loads the knowledge-tree page and passes
/// each <c>#J_Tree ul li</c> node to <c>AddTree</c> with a parent id of 0.
/// </summary>
public static void GrabTopKnowledge()
{
    foreach (var chid in typeof(ChildEnum).GetEnumSource())
    {
        foreach (var xd in typeof(DegreeEnum).GetEnumSource())
        {
            var url = $"https://www.zujuan.com/question?chid={chid.Item1}&xd={xd.Item1}&tree_type=knowledge";

            var pageHtml = new HttpUnitHelper().GetRealHtmlTrice(url);
            var treeNodes = NSoupClient.Parse(pageHtml).Select("#J_Tree ul li");

            foreach (var node in treeNodes)
            {
                var nodeId = node.Attr("data-treeid").NullToInt();
                var nodeName = node.GetElementsByTag("em")[0].Text();

                // Every node found here is registered with parent id 0.
                int parentId = 0;
                AddTree(nodeName, parentId, nodeId, url);
            }
        }
    }
}
/// <summary>
/// Parses a body fragment with a comment inside a table row and checks that the comment
/// is kept inside the row in the serialized document.
/// </summary>
public void handlesCommentsInTable()
{
    const string fragment = "<table><tr><td>text</td><!-- Comment --></tr></table>";
    const string expected = "<html><head></head><body><table><tbody><tr><td>text</td><!-- Comment --></tr></tbody></table></body></html>";

    Document parsed = NSoupClient.ParseBodyFragment(fragment);
    string actual = TextUtil.StripNewLines(parsed.OuterHtml());

    Assert.AreEqual(expected, actual);
}
/// <summary>
/// Search button handler: queries the site search with the text from <c>textBox1</c>, parses
/// the result page and fills <c>listView1</c> with one row per <c>div.result-item</c>.
/// </summary>
/// <param name="sender">Event source (unused; <c>null</c> on retry).</param>
/// <param name="e">Event data (unused; <c>null</c> on retry).</param>
/// <remarks>
/// On an <see cref="HttpRequestException"/> the message is logged and the user is asked whether
/// to retry; accepting re-invokes this handler.
/// </remarks>
private async void button1_Click(object sender, EventArgs e)
{
    listView1.Items.Clear();
    try
    {
        // Escape the keyword so that characters such as '&', '#', '+' or spaces typed by the
        // user cannot truncate or corrupt the query string.
        string keyword = Uri.EscapeDataString(textBox1.Text);
        string url = string.Format("http://zhannei.baidu.com/cse/search?s=1393206249994657467&q={0}", keyword);

        string html;
        using (HttpClient client = new HttpClient())
        {
            // Dispose the client created for this click instead of leaking it.
            html = await client.GetStringAsync(url);
        }

        Elements elements = NSoupClient.Parse(html).Select("div.result-list div.result-item");
        foreach (var item in elements)
        {
            BookInfo bookInfo = BookResolver.GetBookInfo(item);

            ListViewItem viewItem = new ListViewItem(bookInfo.Name);
            viewItem.SubItems.Add(bookInfo.Author);
            viewItem.SubItems.Add(bookInfo.UpdateTime.ToString("yyyy-MM-dd"));
            viewItem.SubItems.Add(bookInfo.Loeva);
            viewItem.Tag = bookInfo; // keep the full record reachable from the row
            listView1.Items.Add(viewItem);
        }
    }
    catch (HttpRequestException httpRequestException)
    {
        Log.Info(httpRequestException.Message);
        if (MessageBox.Show("查询失败,是否重试", "温馨提示", MessageBoxButtons.OKCancel, MessageBoxIcon.Question) == DialogResult.OK)
        {
            button1_Click(null, null);
        }
    }
}
/// <summary>
/// For every category returned by <c>DataService.GetCategorylist</c>, loads the matching
/// question page and passes each <c>#J_Tree ul li</c> node to <c>AddTree</c>, using the
/// category id as the parent id.
/// </summary>
public static void GrabTopCate()
{
    foreach (var cate in DataService.GetCategorylist())
    {
        var url = $"https://www.zujuan.com/question?categories={cate.CategoryId}&bookversion={cate.BookVersionId}&nianji={cate.CategoryId}&chid={cate.Child}&xd={cate.Degree}";

        var pageHtml = new HttpUnitHelper().GetRealHtmlTrice(url);
        var treeNodes = NSoupClient.Parse(pageHtml).Select("#J_Tree ul li");

        foreach (var node in treeNodes)
        {
            var nodeId = node.Attr("data-treeid").NullToInt();
            var nodeName = node.GetElementsByTag("em")[0].Text();

            // Nodes found on a category page hang directly under that category.
            var parentId = cate.CategoryId;
            AddTree(cate.CategoryId, nodeName, parentId, nodeId);
        }
    }
}
/// <summary>
/// Downloads the page at <c>this.Url</c> (gzip-aware helper, utf-8), parses it and rebuilds
/// <c>this.Items</c> from every <c>ul[class=v]</c> entry under the first
/// <c>div[class=items]</c>. Time and owner are not read from the page and are set to
/// the placeholder "未知".
/// </summary>
/// <returns>
/// The freshly populated <c>this.Items</c> list. It is empty when the container is absent;
/// rows without a title link are skipped.
/// </returns>
public List <BaseItem> GetList()
{
    this.Items = new List <BaseItem>();

    String temp = WebHelper.GetHtmlCodeByWebClientWithGzip(this.Url, "utf-8");
    Document doc = NSoupClient.Parse(temp);

    Element et = doc.Select("div[class=items]").First;
    if (et == null)
    {
        // The container was not found (layout change, error page, ...): nothing to list.
        return(this.Items);
    }

    foreach (Element item in et.Select("ul[class=v]"))
    {
        // Use the null-returning First property so a row without a title link can be skipped
        // rather than raising on an empty selection.
        Element e1 = item.Select("li[class=v_title] a").First;
        if (e1 == null)
        {
            continue;
        }

        BaseItem b = new BaseItem()
        {
            Url   = e1.Attr("href"),
            Name  = e1.Attr("title"),
            Time  = "未知",
            Owner = "未知"
        };
        this.Items.Add(b);
    }

    return(this.Items);
}
/// <summary>
/// Checks that an href containing bare ampersand sequences is returned unchanged by
/// <c>Attr("href")</c>.
/// </summary>
public void moreAttributeUnescapes()
{
    const string expectedHref = "&wr_id=123&mid-size=true&ok=&wr";
    String html = "<a href='&wr_id=123&mid-size=true&ok=&wr'>Check</a>";

    Document parsed = NSoupClient.Parse(html);
    Elements anchors = parsed.Select("a");
    Element firstAnchor = anchors.First;

    Assert.AreEqual(expectedHref, firstAnchor.Attr("href"));
}
/// <summary>
/// Cleans markup with a whitelist configured to preserve relative links and checks that the
/// relative href/src values are kept while the javascript: src is dropped.
/// </summary>
public void preservesRelatedLinksIfConfigured()
{
    const string baseUri = "http://example.com/";
    const string dirty = "<a href='/foo'>Link</a><img src='/bar'> <img src='javascript:alert()'>";
    const string expected = "<a href=\"/foo\" rel=\"nofollow\">Link</a>\n<img src=\"/bar\" /> \n<img />";

    var whitelist = Whitelist.BasicWithImages.PreserveRelativeLinks(true);
    string cleaned = NSoupClient.Clean(dirty, baseUri, whitelist);

    Assert.AreEqual(expected, cleaned);
}
/// <summary>
/// Executes a request against the 302.pl test URL with redirect following disabled and
/// checks that the response status code is 302.
/// </summary>
public void doesntRedirectIfSoConfigured()
{
    const string redirectingUrl = "http://direct.infohound.net/tools/302.pl";

    IConnection connection = NSoupClient.Connect(redirectingUrl)
                                        .FollowRedirects(false)
                                        .IgnoreContentType(true);
    IResponse response = connection.Execute();

    // HttpStatusCode.Found is the named constant for 302.
    bool isFound = response.StatusCode() == System.Net.HttpStatusCode.Found;
    Assert.IsTrue(isFound);
}
/// <summary>
/// Downloads the page at <c>this.Url</c> (gzip-aware helper, gbk), parses it and rebuilds
/// <c>this.Items</c> from every <c>div[class=pack pack_video_card]</c> card.
/// </summary>
/// <returns>
/// The freshly populated <c>this.Items</c> list. Cards missing the caption link, the info
/// list or its first entries are skipped.
/// </returns>
/// <remarks>
/// The item URL is rebuilt as <c>http://www.tudou.com/programs/view/{key}/</c>, where
/// <c>key</c> is the last path segment of the caption link with everything from the first
/// '.' removed (left untouched when the segment has no dot).
/// </remarks>
public List <BaseItem> GetList()
{
    this.Items = new List <BaseItem>();

    String temp = WebHelper.GetHtmlCodeByWebClientWithGzip(this.Url, "gbk");
    Document doc = NSoupClient.Parse(temp);

    // Length of the label that precedes the duration in the first info entry.
    int durationPrefixLength = "时长:".Length;

    Elements items = doc.Select("div[class=pack pack_video_card]");
    foreach (Element item in items)
    {
        Element e1 = item.Select("h1[class=caption] a").First;
        Element e2 = item.Select("ul[class=info]").First;
        if (e1 == null || e2 == null)
        {
            // Incomplete card: skip it instead of failing the whole listing.
            continue;
        }

        Element durationEntry = e2.Select("li").First;
        Element ownerLink = e2.Select("li a").First;
        if (durationEntry == null || ownerLink == null)
        {
            continue;
        }

        String title = e1.Attr("title");
        String url = e1.Attr("href");

        // Drop the leading label; keep the raw text when it is too short to contain it.
        String durationText = durationEntry.OwnText();
        String time = durationText.Length >= durationPrefixLength
            ? durationText.Substring(durationPrefixLength)
            : durationText;

        String upOwner = ownerLink.Attr("title");

        // Last path segment; LastIndexOf returns -1 when there is no '/', giving Substring(0).
        String key = url.Substring(url.LastIndexOf('/') + 1);

        // Remove(-1) would throw, so only strip the extension when a dot is present.
        int dotIndex = key.IndexOf('.');
        if (dotIndex >= 0)
        {
            key = key.Remove(dotIndex);
        }

        BaseItem b = new BaseItem()
        {
            Url   = "http://www.tudou.com/programs/view/" + key + "/",
            Name  = title,
            Time  = time,
            Owner = upOwner
        };
        this.Items.Add(b);
    }

    return(this.Items);
}
/// <summary>
/// For every book returned by <c>BookService.GetBooklist</c>, loads the matching question
/// page and registers each link of the second <c>div.search-type div.con-items</c> group
/// through <c>DataService.AddCategory</c>, together with the total read from
/// <c>div.total b</c>.
/// </summary>
public void InitCategory()
{
    foreach (var book in BookService.GetBooklist())
    {
        var url = string.Format("https://www.zujuan.com/question?bookversion={0}&chid={1}&xd={2}", book.BookVersionId, book.Child, book.Degree);
        var pageHtml = new HttpUnitHelper().GetRealHtmlTrice(url);
        var doc = NSoupClient.Parse(pageHtml);

        // Get the textbook versions for the subject under the current degree.
        var filterGroups = doc.Select("div.search-type div.con-items");
        var categoryLinks = filterGroups[1].GetElementsByTag("a");

        var totalElements = doc.Select("div.total b");
        var total = totalElements[0].Text().NullToInt();

        foreach (var link in categoryLinks)
        {
            var rawId = link.Attr("data-bcaid");
            var name = link.Text();
            DataService.AddCategory(book.Id, rawId.NullToInt(), name, total);
        }
    }
}
/// <summary>
/// Parses text made of entity references, switches the document to the XHTML escape mode
/// and checks the body markup that is written back out.
/// </summary>
/// <remarks>
/// NOTE(review): both string literals had been entity-decoded into text with unescaped double
/// quotes, which does not compile. They are restored here as entity references following the
/// upstream jsoup test of the same name — confirm the expected output against this port.
/// </remarks>
public void testXhtmlReferences()
{
    const string input = "&lt; &gt; &amp; &quot; &apos; &times;";
    // Per the upstream test, the five XML entities stay escaped; &times; is emitted as the literal character.
    const string expected = "&lt; &gt; &amp; &quot; &apos; ×";

    Document doc = NSoupClient.Parse(input);
    doc.OutputSettings().SetEscapeMode(Entities.EscapeMode.Xhtml);

    Assert.AreEqual(expected, doc.Body.Html());
}
/// <summary>
/// Rebuilds <c>this.Items</c> for the album identified by <c>this.key</c>: the album page is
/// downloaded, the album id is read from the first <c>body&gt;script</c> element with the
/// pattern <c>,aid=(\d*)</c>, and the item list is then requested from the
/// <c>alist.action</c> endpoint and deserialized.
/// </summary>
/// <returns>
/// The freshly populated <c>this.Items</c> list. It is empty when the script element, the
/// album id or the JSON payload is missing.
/// </returns>
public List <BaseItem> GetList()
{
    this.Items = new List <BaseItem>();

    String pageUrl = String.Format("http://www.tudou.com/albumplay/{0}", this.key);
    String temp = WebHelper.GetHtmlCodeByWebClientWithGzip(pageUrl, "gbk");
    Document doc = NSoupClient.Parse(temp);

    Element scriptElement = doc.Select("body>script").First;
    if (scriptElement == null)
    {
        // No inline script to read the album id from.
        return(this.Items);
    }

    Match aidMatch = Regex.Match(scriptElement.Html(), @",aid=(\d*)");
    String aid = aidMatch.Groups[1].Value;
    if (aid.Length == 0)
    {
        // Covers both "pattern not found" and "aid= with no digits"; a request without an id is pointless.
        return(this.Items);
    }

    RestClient rc = new RestClient();
    var request = new RestRequest("http://www.tudou.com/tvp/alist.action?a={aid}");
    request.AddUrlSegment("aid", aid);

    String json = rc.Execute(request).Content;
    if (String.IsNullOrEmpty(json))
    {
        return(this.Items);
    }

    ItemList dataItems = JsonConvert.DeserializeObject <ItemList>(json);
    if (dataItems == null || dataItems.items == null)
    {
        return(this.Items);
    }

    foreach (Item item in dataItems.items)
    {
        // Add through the same Items member that is assigned above and returned below.
        this.Items.Add(new BaseItem()
        {
            Time  = item.time,
            Name  = item.kw,
            Url   = item.iid,
            Owner = "官方"
        });
    }

    return(this.Items);
}
/// <summary>
/// Posts to the 302-rel.pl test URL and checks that the title of the resulting document
/// contains "HTML Tidy Online".
/// </summary>
public void followsRelativeRedirect()
{
    // Per the original note, this redirects to ./ - i.e. /tools/
    const string redirectingUrl = "http://direct.infohound.net/tools/302-rel.pl";

    IConnection connection = NSoupClient.Connect(redirectingUrl);
    Document fetched = connection.Post();

    bool titleMatches = fetched.Title.Contains("HTML Tidy Online");
    Assert.IsTrue(titleMatches);
}
/// <summary>
/// Parses markup that has html and body tags but no head, and checks that the serialized
/// document contains an empty head and closed body/html tags.
/// </summary>
public void normalisesHeadlessBody()
{
    const string input = "<html><body><span class=\"foo\">bar</span>";
    const string expected = "<html><head></head><body><span class=\"foo\">bar</span></body></html>";

    Document parsed = NSoupClient.Parse(input);
    string actual = TextUtil.StripNewLines(parsed.Html());

    Assert.AreEqual(expected, actual);
}