Esempio n. 1
0
        /// <summary>
        /// Parses markup in which the body tag is nested inside a font element and
        /// expects the serialised document to place the font element inside the body.
        /// </summary>
        public void normalisedBodyAfterContent()
        {
            const string expected = "<html><head></head><body class=\"name\"><font face=\"Arial\"><div>One</div></font></body></html>";

            Document parsed = NSoupClient.Parse("<font face=Arial><body class=name><div>One</div></body></font>");
            var      actual = TextUtil.StripNewLines(parsed.Html());

            Assert.AreEqual(expected, actual);
        }
Esempio n. 2
0
        /// <summary>
        /// Downloads the page at <c>this.Url</c> as UTF-8, parses it, and rebuilds
        /// <c>this.Items</c> with one entry per <c>ul[class=v]</c> list found in the
        /// first <c>div[class=collgrid4w] div[class=items]</c> container.
        /// </summary>
        /// <returns>The freshly populated <c>this.Items</c> list.</returns>
        public List <BaseItem> GetList()
        {
            this.Items = new List <BaseItem>();

            String   markup    = WebHelper.GetHtmlCodeByWebClient(this.Url, "utf-8");
            Document page      = NSoupClient.Parse(markup);
            Element  container = page.Select("div[class=collgrid4w] div[class=items]").First;

            foreach (Element row in container.Select("ul[class=v]"))
            {
                // Link anchor carries both the target URL and the display title.
                Element link     = row.Select("li[class=v_link] a").First;
                String  href     = link.Attr("href");
                String  caption  = link.Attr("title");

                // Own text of the user anchor and of the time span become Owner and Time.
                Element userLink = row.Select("li[class=v_user] a").First;
                String  uploader = userLink.OwnText();
                Element timeSpan = row.Select("li[class=v_time] span[class=num]").First;
                String  length   = timeSpan.OwnText();

                this.Items.Add(new BaseItem()
                {
                    Name = caption, Url = href, Time = length, Owner = uploader
                });
            }

            return(this.Items);
        }
Esempio n. 3
0
        /// <summary>
        /// Issues a GET against a URL that answers with a 302 and expects the title of
        /// the resulting document to contain "jsoup".
        /// </summary>
        public void followsTempRedirect()
        {
            // The endpoint redirects to http://jsoup.org
            Document landed = NSoupClient.Connect("http://direct.infohound.net/tools/302.pl").Get();

            var titleMentionsJsoup = landed.Title.Contains("jsoup");

            Assert.IsTrue(titleMentionsJsoup);
        }
        /// <summary>
        /// Cleans markup with root-relative link and image URLs against the base URI
        /// "http://example.com/" and expects both URLs to come back absolute.
        /// </summary>
        public void resolvesRelativeLinks()
        {
            const string dirty    = "<a href='/foo'>Link</a><img src='/bar'>";
            const string expected = "<a href=\"http://example.com/foo\" rel=\"nofollow\">Link</a>\n<img src=\"http://example.com/bar\" />";

            String cleaned = NSoupClient.Clean(dirty, "http://example.com/", Whitelist.BasicWithImages);

            Assert.AreEqual(expected, cleaned);
        }
        /// <summary>
        /// Checks that output settings passed to <c>NSoupClient.Clean</c> override the
        /// defaults: first with pretty-printing off and the extended escape mode, then
        /// with ASCII encoding and the base escape mode.
        /// </summary>
        public void supplyOutputSettings()
        {
            const string source  = "<div><p>&bernou;</p></div>";
            const string baseUri = "http://foo.com/";

            // Custom settings: no pretty-printing, extended entity escaping.
            OutputSettings settings = new OutputSettings();
            settings.PrettyPrint(false);
            settings.SetEscapeMode(Entities.EscapeMode.Extended);

            string withCustom  = NSoupClient.Clean(source, baseUri, Whitelist.Relaxed, settings);
            string withDefault = NSoupClient.Clean(source, baseUri, Whitelist.Relaxed);

            Assert.AreNotSame(withDefault, withCustom);

            Assert.AreEqual("<div><p>&bernou;</p></div>", withCustom);
            Assert.AreEqual("<div>\n <p>ℬ</p>\n</div>", withDefault);

            // Same settings object, switched to ASCII output with base escaping.
            settings.SetEncoding(Encoding.ASCII);
            settings.SetEscapeMode(Entities.EscapeMode.Base);
            String asciiOut = NSoupClient.Clean(source, baseUri, Whitelist.Relaxed, settings);

            Assert.AreEqual("<div><p>&#8492;</p></div>", asciiOut);
        }
Esempio n. 6
0
        /// <summary>
        /// For every combination of <c>ChildEnum</c> and <c>DegreeEnum</c> entries,
        /// fetches the matching zujuan.com question page and registers each anchor of
        /// the first <c>div.search-type div.con-items</c> container via
        /// <c>BookService.AddBook</c>.
        /// </summary>
        public void InitBook()
        {
            foreach (var subject in typeof(ChildEnum).GetEnumSource())
            {
                foreach (var stage in typeof(DegreeEnum).GetEnumSource())
                {
                    var subjectId = subject.Item1.NullToInt();
                    var stageId   = stage.Item1.NullToInt();
                    var pageUrl   = string.Format("https://www.zujuan.com/question?chid={0}&xd={1}", subject.Item1, stage.Item1);

                    var document = NSoupClient.Parse(new HttpUnitHelper().GetRealHtmlTrice(pageUrl));

                    // Get the textbook versions listed for this subject under the current degree.
                    var versionLinks = document.Select("div.search-type div.con-items")[0].GetElementsByTag("a");

                    foreach (var link in versionLinks)
                    {
                        var versionId = link.Attr("data-bcaid");
                        var title     = link.Text();

                        Console.WriteLine("add book");
                        BookService.AddBook(subjectId, stageId, title, versionId.NullToInt());
                    }
                }
            }
        }
Esempio n. 7
0
        /// <summary>
        /// Fills in the details of <paramref name="collection"/> from the raw response
        /// <paramref name="text"/> according to <paramref name="rule"/>.
        /// </summary>
        /// <remarks>
        /// When the rule has an item selector and a picture rule with its own item
        /// selector, the work is delegated to <c>GetCollection</c> with a one-element
        /// list and the first result is used. Otherwise the text is parsed as JSON when
        /// <c>IsJson</c> says so, or as HTML if not, and handed to the matching
        /// <c>GetCollectionDetail</c> overload.
        /// Extraction is best-effort: any exception is traced and the collection is
        /// returned in whatever state it had when the failure occurred.
        /// </remarks>
        /// <param name="collection">The collection to enrich.</param>
        /// <param name="text">Raw HTML or JSON text to extract from.</param>
        /// <param name="rule">Extraction rule; when null the collection is returned untouched.</param>
        /// <param name="sourceUrl">URL passed through to the extraction helpers.</param>
        /// <returns>The enriched collection, or the input collection when nothing could be extracted.</returns>
        public static Collection GetCollectionDetail(Collection collection, string text, Rule rule, string sourceUrl)
        {
            if (rule == null)
            {
                return(collection);
            }
            try
            {
                if (rule.item != null && rule.pictureRule != null && rule.pictureRule.item != null)
                {
                    List <Collection> collections = new List <Collection>();
                    collections.Add(collection);
                    collection = GetCollection(collections, text, rule, sourceUrl)[0];
                }
                else
                {
                    if (!IsJson(text))
                    {
                        var element = NSoupClient.Parse(text);
                        collection = GetCollectionDetail(collection, element, rule, sourceUrl);
                    }
                    else
                    {
                        var element = JToken.Parse(text);
                        collection = GetCollectionDetail(collection, element, rule, sourceUrl);
                    }
                }
            }
            catch (Exception ex)
            {
                // Still best-effort (callers keep receiving a collection), but the failure
                // is now traced instead of vanishing without a sign.
                System.Diagnostics.Trace.TraceWarning("GetCollectionDetail failed for '{0}': {1}", sourceUrl, ex);
            }

            return(collection);
        }
Esempio n. 8
0
        /// <summary>
        /// Parses markup that starts with two comments and expects both comments to be
        /// serialised ahead of the html element.
        /// </summary>
        public void commentBeforeHtml()
        {
            const string expected = "<!-- comment --><!-- comment 2 --><html><head></head><body><p>One</p></body></html>";

            Document parsed = NSoupClient.Parse("<!-- comment --><!-- comment 2 --><p>One</p>");
            var      actual = TextUtil.StripNewLines(parsed.Html());

            Assert.AreEqual(expected, actual);
        }
Esempio n. 9
0
        /// <summary>
        /// Parses a table row containing a self-closed td and expects that cell to be
        /// serialised as an empty td element with its id attribute.
        /// </summary>
        public void emptyTdTag()
        {
            const string expected = "<td>One</td>\n<td id=\"2\"></td>";

            Document parsed  = NSoupClient.Parse("<table><tr><td>One</td><td id='2' /></tr></table>");
            var      rowHtml = parsed.Select("tr").First.Html();

            Assert.AreEqual(expected, rowHtml);
        }
Esempio n. 10
0
        /// <summary>
        /// Expects an anchor wrapping text, a div and a span to round-trip unchanged
        /// through the parser.
        /// </summary>
        public void testAFlowContents()
        {
            // In HTML5 an <a> element may act as phrasing or as block content.
            const string markup = "<a>Hello <div>there</div> <span>now</span></a>";

            Document parsed   = NSoupClient.Parse(markup);
            var      bodyHtml = TextUtil.StripNewLines(parsed.Body.Html());

            Assert.AreEqual("<a>Hello <div>there</div> <span>now</span></a>", bodyHtml);
        }
Esempio n. 11
0
        /// <summary>
        /// Expects a font element wrapping text, a div and a span to round-trip
        /// unchanged through the parser.
        /// </summary>
        public void testFontFlowContents()
        {
            // HTML5 does not define <font>; in practice it is often used as flow content.
            const string markup = "<font>Hello <div>there</div> <span>now</span></font>";

            Document parsed   = NSoupClient.Parse(markup);
            var      bodyHtml = TextUtil.StripNewLines(parsed.Body.Html());

            Assert.AreEqual("<font>Hello <div>there</div> <span>now</span></font>", bodyHtml);
        }
Esempio n. 12
0
        /// <summary>
        /// Expects an img placed inside a noscript in the head to be moved into the
        /// body, leaving the noscript element empty.
        /// </summary>
        public void testNoImagesInNoScriptInHead()
        {
            // The parser used to allow this, but it is against the spec when parsing with noscript.
            const string expected = "<html><head><noscript></noscript></head><body><img src=\"foo\" /><p>Hello</p></body></html>";

            Document parsed = NSoupClient.Parse("<html><head><noscript><img src='foo'></noscript></head><body><p>Hello</p></body></html>");
            var      actual = TextUtil.StripNewLines(parsed.Html());

            Assert.AreEqual(expected, actual);
        }
Esempio n. 13
0
        /// <summary>
        /// Expects a span wrapping text, a div and another span to round-trip
        /// unchanged through the parser.
        /// </summary>
        public void testSpanContents()
        {
            // As with h1 tags, the spec says span is phrasing-only, yet browsers and
            // publishers treat it as a block tag.
            const string markup = "<span>Hello <div>there</div> <span>now</span></span>";

            Document parsed   = NSoupClient.Parse(markup);
            var      bodyHtml = TextUtil.StripNewLines(parsed.Body.Html());

            Assert.AreEqual("<span>Hello <div>there</div> <span>now</span></span>", bodyHtml);
        }
Esempio n. 14
0
        /// <summary>
        /// Parses unclosed headings mixed with hgroup elements and checks the
        /// resulting body markup, in which hgroup stays nested inside the h2.
        /// </summary>
        public void testHgroup()
        {
            // The parser used to disallow hgroup inside h{n}; that is not in the spec and browsers accept it.
            const string expected = "<h1>Hello </h1><h2>There <hgroup><h1>Another</h1><h2>headline</h2></hgroup> <hgroup><h1>More</h1><p>stuff</p></hgroup></h2>";

            Document parsed   = NSoupClient.Parse("<h1>Hello <h2>There <hgroup><h1>Another<h2>headline</hgroup> <hgroup><h1>More</h1><p>stuff</p></hgroup>");
            var      bodyHtml = TextUtil.StripNewLines(parsed.Body.Html());

            Assert.AreEqual(expected, bodyHtml);
        }
Esempio n. 15
0
        /// <summary>
        /// Crawl-completed handler: parses <c>e.PageSource</c> and reads the book name,
        /// author, image link, summary and the text of every list item in each
        /// "volume" element.
        /// </summary>
        /// <remarks>
        /// NOTE(review): in the visible code the extracted values are collected into
        /// locals only and are not stored or returned — confirm whether more was intended.
        /// </remarks>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data carrying the downloaded page source.</param>
        private void QiDianCrawler_OnCompleted(object sender, OnCompletedEventArgs e)
        {
            Document page      = NSoupClient.Parse(e.PageSource);
            Elements infoNodes = page.GetElementsByClass("book-info");
            var      infoNode  = infoNodes[0];
            var      heading   = infoNode.Children.FirstOrDefault(child => child.TagName() == "h1");

            BookValue details = new BookValue
            {
                BookName    = heading.GetElementsByTag("em").Text,
                BookAuthor  = heading.GetElementsByTag("span")[0].GetElementsByTag("a").Text,
                BookImage   = infoNode.GetElementsByClass("J-getJumpUrl")[0].Attr("href"),
                BookSummary = page.GetElementsByClass("book-intro")[0].GetElementsByTag("p").Html()
            };

            // Gather the text of every <li> under the first "cf" element of each volume.
            List <string> chapterTitles = new List <string>();

            foreach (var volume in page.GetElementsByClass("volume"))
            {
                var entries = volume.GetElementsByClass("cf")[0].GetElementsByTag("li");
                foreach (var entry in entries)
                {
                    chapterTitles.Add(entry.Text());
                }
            }
        }
Esempio n. 16
0
        /// <summary>
        /// Expects a lone textarea to be serialised as-is, without a form element
        /// being wrapped around it.
        /// </summary>
        public void noImplicitFormForTextAreas()
        {
            // The old parser created implicit forms for form children such as <textarea>; it no longer does.
            const string markup = "<textarea>One</textarea>";

            Document parsed   = NSoupClient.Parse(markup);
            var      bodyHtml = parsed.Body.Html();

            Assert.AreEqual("<textarea>One</textarea>", bodyHtml);
        }
Esempio n. 17
0
        /// <summary>
        /// For every combination of <c>ChildEnum</c> and <c>DegreeEnum</c> entries,
        /// loads the knowledge-tree question page and passes each <c>#J_Tree ul li</c>
        /// node to <c>AddTree</c> with a parent id of 0.
        /// </summary>
        public static void GrabTopKnowledge()
        {
            foreach (var subject in typeof(ChildEnum).GetEnumSource())
            {
                foreach (var stage in typeof(DegreeEnum).GetEnumSource())
                {
                    var pageUrl =
                        $"https://www.zujuan.com/question?chid={subject.Item1}&xd={stage.Item1}&tree_type=knowledge";
                    var document = NSoupClient.Parse(new HttpUnitHelper().GetRealHtmlTrice(pageUrl));

                    foreach (var node in document.Select("#J_Tree ul li"))
                    {
                        var nodeId   = node.Attr("data-treeid").NullToInt();
                        var label    = node.GetElementsByTag("em")[0].Text();
                        var parentId = 0;

                        AddTree(label, parentId, nodeId, pageUrl);
                    }
                }
            }
        }
Esempio n. 18
0
        /// <summary>
        /// Parses a table fragment with a comment inside a row and expects the comment
        /// to stay inside that row in the serialised output.
        /// </summary>
        public void handlesCommentsInTable()
        {
            const string expected = "<html><head></head><body><table><tbody><tr><td>text</td><!-- Comment --></tr></tbody></table></body></html>";

            Document parsed = NSoupClient.ParseBodyFragment("<table><tr><td>text</td><!-- Comment --></tr></table>");
            var      actual = TextUtil.StripNewLines(parsed.OuterHtml());

            Assert.AreEqual(expected, actual);
        }
Esempio n. 19
0
        /// <summary>
        /// Click handler: searches the Baidu site-search endpoint for the keyword in
        /// <c>textBox1</c> and fills <c>listView1</c> with one row per result item.
        /// </summary>
        /// <remarks>
        /// The keyword is percent-encoded before being placed in the query string, so
        /// characters such as '&amp;', '#' or '+' no longer corrupt the request.
        /// On an <see cref="HttpRequestException"/> the user is asked whether to retry.
        /// </remarks>
        /// <param name="sender">Event source (only forwarded on retry).</param>
        /// <param name="e">Event data (only forwarded on retry).</param>
        private async void button1_Click(object sender, EventArgs e)
        {
            listView1.Items.Clear();
            try
            {
                // Escape the user-supplied keyword so reserved URL characters stay part of the query value.
                string keyword = Uri.EscapeDataString(textBox1.Text ?? string.Empty);
                string url     = string.Format("http://zhannei.baidu.com/cse/search?s=1393206249994657467&q={0}", keyword);
                string html;

                // Dispose the client once the response text has been read.
                using (HttpClient client = new HttpClient())
                {
                    html = await client.GetStringAsync(url);
                }

                Elements elements = NSoupClient.Parse(html).Select("div.result-list div.result-item");

                foreach (var item in elements)
                {
                    BookInfo bookInfo = BookResolver.GetBookInfo(item);

                    ListViewItem viewItem = new ListViewItem(bookInfo.Name);
                    viewItem.SubItems.Add(bookInfo.Author);
                    viewItem.SubItems.Add(bookInfo.UpdateTime.ToString("yyyy-MM-dd"));
                    viewItem.SubItems.Add(bookInfo.Loeva);
                    viewItem.Tag = bookInfo;
                    listView1.Items.Add(viewItem);
                }
            }
            catch (HttpRequestException httpRequestException)
            {
                Log.Info(httpRequestException.Message);

                if (MessageBox.Show("查询失败,是否重试", "温馨提示", MessageBoxButtons.OKCancel, MessageBoxIcon.Question) == DialogResult.OK)
                {
                    button1_Click(sender, e);
                }
            }
        }
Esempio n. 20
0
        /// <summary>
        /// For every category returned by <c>DataService.GetCategorylist</c>, loads the
        /// matching question page and passes each <c>#J_Tree ul li</c> node to
        /// <c>AddTree</c> with the category id as its parent id.
        /// </summary>
        public static void GrabTopCate()
        {
            foreach (var category in DataService.GetCategorylist())
            {
                var pageUrl =
                    $"https://www.zujuan.com/question?categories={category.CategoryId}&bookversion={category.BookVersionId}&nianji={category.CategoryId}&chid={category.Child}&xd={category.Degree}";
                var document = NSoupClient.Parse(new HttpUnitHelper().GetRealHtmlTrice(pageUrl));

                foreach (var node in document.Select("#J_Tree ul li"))
                {
                    var nodeId   = node.Attr("data-treeid").NullToInt();
                    var label    = node.GetElementsByTag("em")[0].Text();
                    var parentId = category.CategoryId;

                    AddTree(category.CategoryId, label, parentId, nodeId);
                }
            }
        }
Esempio n. 21
0
        /// <summary>
        /// Downloads the page at <c>this.Url</c> (gzip-aware, UTF-8), parses it, and
        /// rebuilds <c>this.Items</c> with one entry per <c>ul[class=v]</c> list found in
        /// the first <c>div[class=items]</c> container. Time and Owner are set to the
        /// fixed placeholder "未知".
        /// </summary>
        /// <returns>The freshly populated <c>this.Items</c> list.</returns>
        public List <BaseItem> GetList()
        {
            this.Items = new List <BaseItem>();

            String   markup    = WebHelper.GetHtmlCodeByWebClientWithGzip(this.Url, "utf-8");
            Document page      = NSoupClient.Parse(markup);
            Element  container = page.Select("div[class=items]").First;

            foreach (Element row in container.Select("ul[class=v]"))
            {
                // The title anchor carries both the target URL and the display title.
                Element link    = row.Select("li[class=v_title] a").First();
                String  href    = link.Attr("href");
                String  caption = link.Attr("title");

                this.Items.Add(new BaseItem()
                {
                    Url = href, Name = caption, Time = "未知", Owner = "未知"
                });
            }

            return(this.Items);
        }
Esempio n. 22
0
        /// <summary>
        /// Expects ampersand sequences in an href that are not valid entities to be
        /// returned verbatim by <c>Attr("href")</c>.
        /// </summary>
        public void moreAttributeUnescapes()
        {
            const string expected = "&wr_id=123&mid-size=true&ok=&wr";

            Elements anchors = NSoupClient.Parse("<a href='&wr_id=123&mid-size=true&ok=&wr'>Check</a>").Select("a");
            var      href    = anchors.First.Attr("href");

            Assert.AreEqual(expected, href);
        }
        /// <summary>
        /// With <c>PreserveRelativeLinks(true)</c> on the whitelist, expects relative
        /// href/src values to be kept as-is while the javascript: src is dropped.
        /// </summary>
        public void preservesRelatedLinksIfConfigured()
        {
            const string dirty    = "<a href='/foo'>Link</a><img src='/bar'> <img src='javascript:alert()'>";
            const string expected = "<a href=\"/foo\" rel=\"nofollow\">Link</a>\n<img src=\"/bar\" /> \n<img />";

            string cleaned = NSoupClient.Clean(dirty, "http://example.com/", Whitelist.BasicWithImages.PreserveRelativeLinks(true));

            Assert.AreEqual(expected, cleaned);
        }
Esempio n. 24
0
        /// <summary>
        /// With redirects disabled and content-type checks ignored, expects the raw
        /// response status of the redirecting URL to be 302.
        /// </summary>
        public void doesntRedirectIfSoConfigured()
        {
            IConnection connection = NSoupClient.Connect("http://direct.infohound.net/tools/302.pl")
                                     .FollowRedirects(false)
                                     .IgnoreContentType(true);
            IResponse response = connection.Execute();

            // HttpStatusCode.Found is the named constant for 302.
            Assert.IsTrue(response.StatusCode() == System.Net.HttpStatusCode.Found);
        }
Esempio n. 25
0
        /// <summary>
        /// Downloads the page at <c>this.Url</c> (gzip-aware, GBK), parses it, and
        /// rebuilds <c>this.Items</c> with one entry per
        /// <c>div[class=pack pack_video_card]</c> card.
        /// </summary>
        /// <remarks>
        /// The item URL is rebuilt from the last path segment of the card's href with
        /// everything from the first '.' onward removed. When that segment contains no
        /// '.', it is now used whole instead of throwing
        /// <see cref="ArgumentOutOfRangeException"/>.
        /// </remarks>
        /// <returns>The freshly populated <c>this.Items</c> list.</returns>
        public List <BaseItem> GetList()
        {
            this.Items = new List <BaseItem>();

            String   temp  = WebHelper.GetHtmlCodeByWebClientWithGzip(this.Url, "gbk");
            Document doc   = NSoupClient.Parse(temp);
            Elements items = doc.Select("div[class=pack pack_video_card]");

            foreach (Element item in items)
            {
                Element e1      = item.Select("h1[class=caption] a").First;
                Element e2      = item.Select("ul[class=info]").First;
                String  title   = e1.Attr("title");
                String  url     = e1.Attr("href");
                // Drops the leading "时长:" label by length from the first info entry's text.
                String  time    = e2.Select("li").First.OwnText().Substring("时长:".Length);
                String  upOwner = e2.Select("li a").First.Attr("title");

                // Last path segment of the href, without its extension.
                String  key     = url.Substring(url.LastIndexOf('/') + 1);
                int     dot     = key.IndexOf('.');
                if (dot >= 0)
                {
                    // Only strip when a dot exists; Remove(-1) would throw.
                    key = key.Remove(dot);
                }

                BaseItem b = new BaseItem()
                {
                    Url = "http://www.tudou.com/programs/view/" + key + "/", Name = title, Time = time, Owner = upOwner
                };

                this.Items.Add(b);
            }

            return(this.Items);
        }
Esempio n. 26
0
        /// <summary>
        /// For every book returned by <c>BookService.GetBooklist</c>, loads the matching
        /// question page and registers each anchor of the second
        /// <c>div.search-type div.con-items</c> container via
        /// <c>DataService.AddCategory</c>, together with the number read from
        /// <c>div.total b</c>.
        /// </summary>
        public void InitCategory()
        {
            foreach (var book in BookService.GetBooklist())
            {
                var pageUrl = string.Format("https://www.zujuan.com/question?bookversion={0}&chid={1}&xd={2}",
                                            book.BookVersionId, book.Child, book.Degree);

                var document = NSoupClient.Parse(new HttpUnitHelper().GetRealHtmlTrice(pageUrl));

                // Anchors of the second "con-items" container (index 1) are the entries to register.
                var categoryLinks = document.Select("div.search-type div.con-items")[1].GetElementsByTag("a");

                // The same total is stored with every category taken from this page.
                var total = document.Select("div.total b")[0].Text().NullToInt();

                foreach (var link in categoryLinks)
                {
                    var categoryId = link.Attr("data-bcaid");
                    var title      = link.Text();

                    DataService.AddCategory(book.Id, categoryId.NullToInt(), title, total);
                }
            }
        }
Esempio n. 27
0
        /// <summary>
        /// With the Xhtml escape mode, expects the lt/gt/amp/quot/apos references to be
        /// written as entities while the times sign is emitted as a literal character.
        /// </summary>
        public void testXhtmlReferences()
        {
            const string expected = "&lt; &gt; &amp; &quot; &apos; ×";

            Document parsed = NSoupClient.Parse("&lt; &gt; &amp; &quot; &apos; &times;");
            parsed.OutputSettings().SetEscapeMode(Entities.EscapeMode.Xhtml);

            var bodyHtml = parsed.Body.Html();

            Assert.AreEqual(expected, bodyHtml);
        }
Esempio n. 28
0
        /// <summary>
        /// Loads the album page for <c>this.key</c>, reads the album id from the first
        /// <c>body&gt;script</c> element with the pattern <c>,aid=(\d*)</c>, requests the
        /// album listing, and adds one entry per returned item with the owner "官方".
        /// </summary>
        /// <returns><c>this.Items</c>.</returns>
        public List <BaseItem> GetList()
        {
            this.Items = new List <BaseItem>();

            // Fetch the album page and pull the numeric album id out of its first body script.
            String   albumUrl   = String.Format("http://www.tudou.com/albumplay/{0}", this.key);
            String   markup     = WebHelper.GetHtmlCodeByWebClientWithGzip(albumUrl, "gbk");
            Document page       = NSoupClient.Parse(markup);
            String   scriptText = page.Select("body>script").First.Html();
            String   albumId    = Regex.Match(scriptText, @",aid=(\d*)").Groups[1].Value;

            // Request the listing for that album id and deserialize the response body.
            RestClient client  = new RestClient();
            var        listing = new RestRequest("http://www.tudou.com/tvp/alist.action?a={aid}");
            listing.AddUrlSegment("aid", albumId);

            ItemList payload = JsonConvert.DeserializeObject <ItemList>(client.Execute(listing).Content);

            foreach (Item entry in payload.items)
            {
                BaseItem converted = new BaseItem()
                {
                    Time = entry.time, Name = entry.kw, Url = entry.iid, Owner = "官方"
                };

                this.items.Add(converted);
            }

            return(this.Items);
        }
Esempio n. 29
0
        /// <summary>
        /// Issues a POST against a URL that redirects to a relative location and
        /// expects the title of the resulting document to contain "HTML Tidy Online".
        /// </summary>
        public void followsRelativeRedirect()
        {
            // The endpoint redirects to ./ which resolves to /tools/
            Document landed = NSoupClient.Connect("http://direct.infohound.net/tools/302-rel.pl").Post();

            var titleMatches = landed.Title.Contains("HTML Tidy Online");

            Assert.IsTrue(titleMatches);
        }
Esempio n. 30
0
        /// <summary>
        /// Parses a document with no head and unclosed body/html tags and expects an
        /// empty head to be added and the open elements to be closed.
        /// </summary>
        public void normalisesHeadlessBody()
        {
            const string expected = "<html><head></head><body><span class=\"foo\">bar</span></body></html>";

            Document parsed = NSoupClient.Parse("<html><body><span class=\"foo\">bar</span>");
            var      actual = TextUtil.StripNewLines(parsed.Html());

            Assert.AreEqual(expected, actual);
        }