// Parses markup in which a <font> element wraps <body> and checks the normalised document.
// NOTE(review): the "actual" argument of the assertion is cut off in this excerpt.
public void normalisedBodyAfterContent()
            Document doc = NSoupClient.Parse("<font face=Arial><body class=name><div>One</div></body></font>");

            // The expected markup has <body> (keeping its class attribute) enclosing the <font> element.
            Assert.AreEqual("<html><head></head><body class=\"name\"><font face=\"Arial\"><div>One</div></font></body></html>",
Exemple #2
        // Resets this.Items, downloads the page at this.Url as utf-8, parses it, and creates a
        // BaseItem for every "ul[class=v]" under the first "div[class=collgrid4w] div[class=items]" match.
        // NOTE(review): the code that stores each BaseItem and the return statement are not visible
        // in this excerpt; no check that the selectors matched anything is visible either.
        public List <BaseItem> GetList()
            this.Items = new List <BaseItem>();

            String   temp = WebHelper.GetHtmlCodeByWebClient(this.Url, "utf-8");
            Document doc  = NSoupClient.Parse(temp);
            Element  et   = doc.Select("div[class=collgrid4w] div[class=items]").First;
            Elements list = et.Select("ul[class=v]");

            foreach (Element item in list)
                // Link element: supplies the item's URL (href) and name (title attribute).
                Element e1      = item.Select("li[class=v_link] a").First;
                String  url     = e1.Attr("href");
                String  title   = e1.Attr("title");
                // Owner comes from the own text of the "v_user" link.
                Element e2      = item.Select("li[class=v_user] a").First;
                String  upOwner = e2.OwnText();
                // Time comes from the own text of the "num" span inside "v_time".
                Element e3      = item.Select("li[class=v_time] span[class=num]").First;
                String  time    = e3.OwnText();

                BaseItem b = new BaseItem()
                    Name = title, Url = url, Time = time, Owner = upOwner


Exemple #3
        // Opens a connection through NSoupClient.Connect and fetches the document with Get().
        // NOTE(review): the URL literal is empty and no assertion is visible in this excerpt.
        public void followsTempRedirect()
            IConnection con = NSoupClient.Connect(""); //
            Document    doc = con.Get();

        // Cleans HTML containing a relative href and src with Whitelist.BasicWithImages and
        // compares the cleaned markup with the expected string.
        // NOTE(review): the base-URI argument and the expected href/src values are empty strings
        // here — presumably elided from the excerpt; confirm against the original test.
        public void resolvesRelativeLinks()
            String html  = "<a href='/foo'>Link</a><img src='/bar'>";
            String clean = NSoupClient.Clean(html, "", Whitelist.BasicWithImages);

            Assert.AreEqual("<a href=\"\" rel=\"nofollow\">Link</a>\n<img src=\"\" />", clean);
        // Compares NSoupClient.Clean output produced with a caller-supplied OutputSettings
        // against the output produced with the default settings.
        public void supplyOutputSettings()
            // test that one can override the default document output settings
            OutputSettings os = new OutputSettings();
            // NOTE(review): the statements that configure `os` appear to be missing from this excerpt.


            string html       = "<div><p>&bernou;</p></div>";
            string customOut  = NSoupClient.Clean(html, "", Whitelist.Relaxed, os);
            string defaultOut = NSoupClient.Clean(html, "", Whitelist.Relaxed);

            Assert.AreNotSame(defaultOut, customOut);

            // Custom output is expected on one line with the named entity kept as written.
            Assert.AreEqual("<div><p>&bernou;</p></div>", customOut);
            // Default output is expected indented over several lines with the entity shown as a literal character.
            Assert.AreEqual("<div>\n" +
                            " <p>ℬ</p>\n" +
                            "</div>", defaultOut);

            // NOTE(review): same arguments as customOut but a different expected value (numeric
            // entity); `os` is presumably reconfigured in lines not shown here — confirm.
            String customOut2 = NSoupClient.Clean(html, "", Whitelist.Relaxed, os);

            Assert.AreEqual("<div><p>&#8492;</p></div>", customOut2);
Exemple #6
        // For every (ChildEnum, DegreeEnum) pair: builds a URL from the two Item1 values, fetches
        // and parses the page, and calls BookService.AddBook for each <a> inside the first
        // " div.con-items" match.
        public void InitBook()
            foreach (var child in typeof(ChildEnum).GetEnumSource())
                foreach (var degree in typeof(DegreeEnum).GetEnumSource())
                    var childId  = child.Item1.NullToInt();
                    var degreeId = degree.Item1.NullToInt();
                    // NOTE(review): the format string has no visible host/path prefix — presumably elided.
                    var url      = string.Format("{0}&xd={1}", child.Item1, degree.Item1);

                    var html = new HttpUnitHelper().GetRealHtmlTrice(url);

                    var doc = NSoupClient.Parse(html);

                    // Index [0] is used directly; no check for an empty selection is visible here.
                    var bookTypeDoc = doc.Select(" div.con-items")[0].GetElementsByTag("a");

                    foreach (var element in bookTypeDoc)
                        // The "data-bcaid" attribute is converted with NullToInt and passed as AddBook's last argument.
                        var elementId = element.Attr("data-bcaid");
                        var name      = element.Text();

                        Console.WriteLine("add book");
                        BookService.AddBook(childId, degreeId, name, elementId.NullToInt());
Exemple #7
        // Fills in `collection` from `text` according to `rule`: the text is parsed either as
        // HTML (NSoupClient.Parse) or as JSON (JToken.Parse) and handed to another
        // GetCollectionDetail overload.
        // NOTE(review): braces, the `try`, the `else` branches and the return statement are not
        // visible in this excerpt, so the exact branching cannot be determined from here.
        public static Collection GetCollectionDetail(Collection collection, string text, Rule rule, string sourceUrl)
            // NOTE(review): as shown, `rule` is dereferenced right after this null check; the body
            // of the check appears to be missing — confirm against the original.
            if (rule == null)
                if (rule.item != null && rule.pictureRule != null && rule.pictureRule.item != null)
                    List <Collection> collections = new List <Collection>();
                    // Takes the first element of GetCollection's result.
                    collection = GetCollection(collections, text, rule, sourceUrl)[0];
                    if (!IsJson(text))
                        var element = NSoupClient.Parse(text);
                        collection = GetCollectionDetail(collection, element, rule, sourceUrl);
                        // Presumably the JSON branch (an `else` that is not visible here).
                        var elemet = JToken.Parse(text);
                        collection = GetCollectionDetail(collection, elemet, rule, sourceUrl);
            // NOTE(review): the handler body is not visible in this excerpt.
            catch (Exception)

        // Parses input that starts with two HTML comments and checks that both comments are
        // emitted ahead of the generated <html> element.
        public void commentBeforeHtml()
            string   h   = "<!-- comment --><!-- comment 2 --><p>One</p>";
            Document doc = NSoupClient.Parse(h);

            Assert.AreEqual("<!-- comment --><!-- comment 2 --><html><head></head><body><p>One</p></body></html>", TextUtil.StripNewLines(doc.Html()));
        // Parses a table row containing a self-closed <td id='2' /> and checks the row's inner
        // HTML: the expected string renders that cell as an empty <td id="2"></td>.
        public void emptyTdTag()
            string   h   = "<table><tr><td>One</td><td id='2' /></tr></table>";
            Document doc = NSoupClient.Parse(h);

            Assert.AreEqual("<td>One</td>\n<td id=\"2\"></td>", doc.Select("tr").First.Html());
        // Checks that an <a> element containing a <div> and a <span> comes back from the parser
        // with the same nesting (compared after stripping newlines).
        public void testAFlowContents()
            // html5 has <a> as either phrasing or block
            Document doc = NSoupClient.Parse("<a>Hello <div>there</div> <span>now</span></a>");

            Assert.AreEqual("<a>Hello <div>there</div> <span>now</span></a>", TextUtil.StripNewLines(doc.Body.Html()));
        // Checks that a <font> element containing a <div> and a <span> comes back from the parser
        // with the same nesting (compared after stripping newlines).
        public void testFontFlowContents()
            // html5 has no definition of <font>; often used as flow
            Document doc = NSoupClient.Parse("<font>Hello <div>there</div> <span>now</span></font>");

            Assert.AreEqual("<font>Hello <div>there</div> <span>now</span></font>", TextUtil.StripNewLines(doc.Body.Html()));
        // Parses a document with an <img> inside <noscript> inside <head>; the expected output
        // leaves <noscript> empty and places the <img> at the start of <body>.
        public void testNoImagesInNoScriptInHead()
            // NSoupClient used to allow, but against spec if parsing with noscript
            Document doc = NSoupClient.Parse("<html><head><noscript><img src='foo'></noscript></head><body><p>Hello</p></body></html>");

            Assert.AreEqual("<html><head><noscript></noscript></head><body><img src=\"foo\" /><p>Hello</p></body></html>", TextUtil.StripNewLines(doc.Html()));
        // Checks that a <span> containing a <div> and a nested <span> comes back from the parser
        // with the same nesting (compared after stripping newlines).
        public void testSpanContents()
            // like h1 tags, the spec says SPAN is phrasing only, but browsers and publisher treat span as a block tag
            Document doc = NSoupClient.Parse("<span>Hello <div>there</div> <span>now</span></span>");

            Assert.AreEqual("<span>Hello <div>there</div> <span>now</span></span>", TextUtil.StripNewLines(doc.Body.Html()));
        // Parses unclosed headings mixed with <hgroup> elements and checks the resulting body
        // markup: the expected string closes the first <h1> before <h2> and keeps both
        // <hgroup> elements inside that <h2>.
        public void testHgroup()
            // NSoupClient used to not allow hgroup in h{n}, but that's not in spec, and browsers are OK
            Document doc = NSoupClient.Parse("<h1>Hello <h2>There <hgroup><h1>Another<h2>headline</hgroup> <hgroup><h1>More</h1><p>stuff</p></hgroup>");

            Assert.AreEqual("<h1>Hello </h1><h2>There <hgroup><h1>Another</h1><h2>headline</h2></hgroup> <hgroup><h1>More</h1><p>stuff</p></hgroup></h2>", TextUtil.StripNewLines(doc.Body.Html()));
Exemple #15
        // Completion handler: parses e.PageSource and fills a BookValue (name, author, image,
        // summary) from the first "book-info" element and the first "book-intro" element, then
        // walks the "volume" elements.
        // NOTE(review): the body of the inner loop and any use of bookValue/bookItems are not
        // visible in this excerpt.
        private void QiDianCrawler_OnCompleted(object sender, OnCompletedEventArgs e)
            Document  htmlDoc   = NSoupClient.Parse(e.PageSource);
            Elements  bookInfos = htmlDoc.GetElementsByClass("book-info");
            // Index [0] is used directly; no check for an empty selection is visible here.
            var       bookInfo  = bookInfos[0];
            // FirstOrDefault may yield null when there is no <h1> child; h1 is dereferenced below without a visible check.
            var       h1        = bookInfo.Children.FirstOrDefault(n => n.TagName() == "h1");
            BookValue bookValue = new BookValue();

            bookValue.BookName    = h1.GetElementsByTag("em").Text;
            bookValue.BookAuthor  = h1.GetElementsByTag("span")[0].GetElementsByTag("a").Text;
            // The image value is taken from the href attribute of the first "J-getJumpUrl" element.
            bookValue.BookImage   = bookInfo.GetElementsByClass("J-getJumpUrl")[0].Attr("href");
            bookValue.BookSummary = htmlDoc.GetElementsByClass("book-intro")[0].GetElementsByTag("p").Html();

            List <string> bookItems = new List <string>();
            var           volumes   = htmlDoc.GetElementsByClass("volume");

            foreach (var item in volumes)
                // <li> elements of the first "cf" element inside each volume.
                var li = item.GetElementsByClass("cf")[0].GetElementsByTag("li");
                foreach (var l in li)
        // Checks that parsing a bare <textarea> yields a body containing only that <textarea>,
        // with no wrapping element added.
        public void noImplicitFormForTextAreas()
            // old NSoupClient parser would create implicit forms for form children like <textarea>, but no more
            Document doc = NSoupClient.Parse("<textarea>One</textarea>");

            Assert.AreEqual("<textarea>One</textarea>", doc.Body.Html());
Exemple #17
        // For every (ChildEnum, DegreeEnum) pair: fetches and parses a page, then calls AddTree
        // for each "#J_Tree ul li" element with its name, a parent id of 0, its tree id and the URL.
        // NOTE(review): the loop variables chid and xd are not used in the visible lines;
        // presumably they feed the elided URL expression.
        public static void GrabTopKnowledge()
            foreach (var chid in typeof(ChildEnum).GetEnumSource())
                foreach (var xd in typeof(DegreeEnum).GetEnumSource())
                    //if (CateExist(cate.CategoryId))
                    //    continue;
                    // NOTE(review): the right-hand side of this assignment is missing from the excerpt.
                    var url =
                    //var result = HttpClientHolder.GetRequest(url);
                    var res = new HttpUnitHelper().GetRealHtmlTrice(url);
                    var doc = NSoupClient.Parse(res);

                    var topItems = doc.Select("#J_Tree ul li");

                    foreach (var item in topItems)
                        var id   = item.Attr("data-treeid").NullToInt();
                        // Name is the text of the first <em> inside the item; index [0] is used without a visible emptiness check.
                        var name = item.GetElementsByTag("em")[0].Text();
                        var pid  = 0;

                        AddTree(name, pid, id, url);
        // Parses a table fragment with a comment inside <tr> via ParseBodyFragment; the expected
        // outer HTML keeps the comment inside the row and contains a <tbody> that the input lacked.
        public void handlesCommentsInTable()
            string   html = "<table><tr><td>text</td><!-- Comment --></tr></table>";
            Document node = NSoupClient.ParseBodyFragment(html);

            Assert.AreEqual("<html><head></head><body><table><tbody><tr><td>text</td><!-- Comment --></tr></tbody></table></body></html>", TextUtil.StripNewLines(node.OuterHtml()));
Exemple #19
        // Click handler: downloads the page for the URL built from textBox1.Text, selects
        // "div.result-list div.result-item" elements and builds a ListViewItem per result.
        // On HttpRequestException it asks the user whether to retry.
        // NOTE(review): the `try` keyword and the code that adds viewItem to a control are not
        // visible in this excerpt.
        private async void button1_Click(object sender, EventArgs e)
                // NOTE(review): the format string is just "{0}" — a URL prefix was presumably elided.
                string     url    = string.Format("{0}", textBox1.Text);
                HttpClient client = new HttpClient();
                string     html   = await client.GetStringAsync(url);

                //List<BookInfo> books = new List<BookInfo>();

                Elements elements = NSoupClient.Parse(html).Select("div.result-list div.result-item");

                foreach (var item in elements)
                    BookInfo bookInfo = BookResolver.GetBookInfo(item);

                    // The BookInfo is attached to the list item through its Tag property.
                    ListViewItem viewItem = new ListViewItem(bookInfo.Name);
                    viewItem.Tag = bookInfo;
            catch (HttpRequestException httpRequestException)

                // Dialog text reads roughly "Query failed, retry?" with the title "Friendly reminder";
                // choosing OK calls this handler again with null arguments.
                if (MessageBox.Show("查询失败,是否重试", "温馨提示", MessageBoxButtons.OKCancel, MessageBoxIcon.Question) == DialogResult.OK)
                    button1_Click(null, null);
Exemple #20
        // For every category returned by DataService.GetCategorylist(): fetches and parses a page,
        // then calls AddTree for each "#J_Tree ul li" element, using the category id as parent id.
        public static void GrabTopCate()
            var listCategory = DataService.GetCategorylist();

            foreach (var cate in listCategory)
                //if (CateExist(cate.CategoryId))
                //    continue;
                // NOTE(review): the right-hand side of this assignment is missing from the excerpt.
                var url =
                //var result = HttpClientHolder.GetRequest(url);
                var res = new HttpUnitHelper().GetRealHtmlTrice(url);
                var doc = NSoupClient.Parse(res);

                var topItems = doc.Select("#J_Tree ul li");

                foreach (var item in topItems)
                    var id   = item.Attr("data-treeid").NullToInt();
                    // Name is the text of the first <em> inside the item; index [0] is used without a visible emptiness check.
                    var name = item.GetElementsByTag("em")[0].Text();
                    var pid  = cate.CategoryId;

                    // cate.CategoryId is passed twice: directly as the first argument and again as pid.
                    AddTree(cate.CategoryId, name, pid, id);
        // Resets this.Items, downloads the page at this.Url (gzip-aware helper, utf-8), parses it,
        // and creates a BaseItem for every "ul[class=v]" under the first "div[class=items]" match.
        // NOTE(review): the code that stores each BaseItem and the return statement are not visible
        // in this excerpt.
        public List <BaseItem> GetList()
            this.Items = new List <BaseItem>();

            String   temp = WebHelper.GetHtmlCodeByWebClientWithGzip(this.Url, "utf-8");
            Document doc  = NSoupClient.Parse(temp);
            Element  et   = doc.Select("div[class=items]").First;
            Elements list = et.Select("ul[class=v]");

            foreach (Element item in list)
                // NOTE(review): First is invoked as a method here, while the line above uses it as a property — confirm which is intended.
                Element e1    = item.Select("li[class=v_title] a").First();
                String  url   = e1.Attr("href");
                String  title = e1.Attr("title");

                // Time and Owner are set to a fixed placeholder string (Chinese for "unknown").
                BaseItem b = new BaseItem()
                    Url = url, Name = title, Time = "未知", Owner = "未知"


Exemple #22
        // Parses a link whose href contains several ampersand sequences and checks that the
        // attribute value read back equals the text written in the source markup.
        public void moreAttributeUnescapes()
            String   html = "<a href='&wr_id=123&mid-size=true&ok=&wr'>Check</a>";
            Elements els  = NSoupClient.Parse(html).Select("a");

            Assert.AreEqual("&wr_id=123&mid-size=true&ok=&wr", els.First.Attr("href"));
        // Cleans HTML with Whitelist.BasicWithImages.PreserveRelativeLinks(true); the expected
        // output keeps the relative href and src values and drops the javascript: src attribute.
        public void preservesRelatedLinksIfConfigured()
            string html  = "<a href='/foo'>Link</a><img src='/bar'> <img src='javascript:alert()'>";
            string clean = NSoupClient.Clean(html, "", Whitelist.BasicWithImages.PreserveRelativeLinks(true));

            Assert.AreEqual("<a href=\"/foo\" rel=\"nofollow\">Link</a>\n<img src=\"/bar\" /> \n<img />", clean);
        // Executes a connection configured with FollowRedirects(false) and IgnoreContentType(true)
        // and asserts that the response status code is 302.
        // NOTE(review): the URL literal is empty in this excerpt — presumably elided.
        public void doesntRedirectIfSoConfigured()
            IConnection con = NSoupClient.Connect("").FollowRedirects(false).IgnoreContentType(true);
            IResponse   res = con.Execute();

            Assert.IsTrue(res.StatusCode() == (System.Net.HttpStatusCode) 302);
Exemple #25
        // Resets this.Items, downloads the page at this.Url (gzip-aware helper, gbk), parses it,
        // and creates a BaseItem for every "div[class=pack pack_video_card]" element.
        // NOTE(review): the code that stores each BaseItem and the return statement are not visible
        // in this excerpt.
        public List <BaseItem> GetList()
            this.Items = new List <BaseItem>();

            String   temp  = WebHelper.GetHtmlCodeByWebClientWithGzip(this.Url, "gbk");
            Document doc   = NSoupClient.Parse(temp);
            Elements items = doc.Select("div[class=pack pack_video_card]");

            foreach (Element item in items)
                Element e1      = item.Select("h1[class=caption] a").First;
                Element e2      = item.Select("ul[class=info]").First;
                String  title   = e1.Attr("title");
                String  url     = e1.Attr("href");
                // Skips as many leading characters as the literal label has; the text is not checked to start with that label.
                String  time    = e2.Select("li").First.OwnText().Substring("时长:".Length);
                String  upOwner = e2.Select("li a").First.Attr("title");
                // key = the part of the href after the last "/" and before the first "." in that part.
                String  key     = url.Substring(url.LastIndexOf("/") + 1);
                // NOTE(review): IndexOf returns -1 when no "." is present, which Remove rejects — confirm hrefs always contain one.
                key = key.Remove(key.IndexOf("."));

                BaseItem b = new BaseItem()
                    Url = "" + key + "/", Name = title, Time = time, Owner = upOwner


        // For every book from BookService.GetBooklist(): builds a URL from the book's version id,
        // child and degree, fetches and parses the page, and calls DataService.AddCategory for
        // each <a> inside the second " div.con-items" match.
        public void InitCategory()
            var listBook = BookService.GetBooklist();

            foreach (var book in listBook)
                // NOTE(review): the format string has no visible host/path prefix — presumably elided.
                var url = string.Format("{0}&chid={1}&xd={2}",
                                        book.BookVersionId, book.Child, book.Degree);

                var html = new HttpUnitHelper().GetRealHtmlTrice(url);
                var doc  = NSoupClient.Parse(html);

                // Index [1] selects the second match; no check on the number of matches is visible here.
                var categoryDoc = doc.Select(" div.con-items")[1].GetElementsByTag("a");
                // total is the text of the first <b> element converted with NullToInt; the same value is passed for every category.
                var total = doc.Select(" b")[0].Text().NullToInt();
                foreach (var element in categoryDoc)
                    var elementId = element.Attr("data-bcaid");
                    var name      = element.Text();

                    DataService.AddCategory(book.Id, elementId.NullToInt(), name, total);
Exemple #27
        // Parses text made of six entity references and checks the body HTML: the expected string
        // keeps the first five as entities and shows the last one as a literal character.
        public void testXhtmlReferences()
            Document doc = NSoupClient.Parse("&lt; &gt; &amp; &quot; &apos; &times;");

            Assert.AreEqual("&lt; &gt; &amp; &quot; &apos; ×", doc.Body.Html());
Exemple #28
        // Resets this.Items, downloads and parses a page (gzip-aware helper, gbk), extracts an
        // "aid" number from the first "body>script" element, requests data for that aid through
        // RestClient, deserialises the response into an ItemList and adds a BaseItem per entry.
        // NOTE(review): the return statement is not visible in this excerpt.
        public List <BaseItem> GetList()
            this.Items = new List <BaseItem>();

            // NOTE(review): the format string is just "{0}" — a URL prefix was presumably elided.
            String   pageUrl  = String.Format("{0}", this.key);
            String   temp     = WebHelper.GetHtmlCodeByWebClientWithGzip(pageUrl, "gbk");
            Document doc      = NSoupClient.Parse(temp);
            String   script   = doc.Select("body>script").First.Html();
            // Captures the digits following ",aid=" in the script text; Match.Success is not checked in the visible lines.
            Match    aidMatch = Regex.Match(script, @",aid=(\d*)");
            String   aid      = aidMatch.Groups[1].Value;

            RestClient rc      = new RestClient();
            var        request = new RestRequest("{aid}");

            request.AddUrlSegment("aid", aid);
            ItemList dataItems = JsonConvert.DeserializeObject <ItemList>(rc.Execute(request).Content);

            foreach (Item item in dataItems.items)
                // NOTE(review): this adds to `this.items` (lower case) while the list above is assigned
                // to `this.Items`; the Name initializer is also truncated in this excerpt.
                this.items.Add(new BaseItem()
                    Time = item.time, Name =, Url = item.iid, Owner = "官方"

Exemple #29
        // Sends a POST through the connection and asserts that the resulting document's title
        // contains "HTML Tidy Online".
        // NOTE(review): the URL literal is empty in this excerpt — presumably elided.
        public void followsRelativeRedirect()
            IConnection con = NSoupClient.Connect(""); // to ./ - /tools/
            Document    doc = con.Post();

            Assert.IsTrue(doc.Title.Contains("HTML Tidy Online"));
        public void normalisesHeadlessBody()
            Document doc = NSoupClient.Parse("<html><body><span class=\"foo\">bar</span>");

            Assert.AreEqual("<html><head></head><body><span class=\"foo\">bar</span></body></html>",