Beispiel #1
0
        /// <summary>
        /// Parses book entries out of the page markup with XPath queries and
        /// appends every entry that is not yet in <c>bookList</c>.
        /// </summary>
        /// <remarks>
        /// The markup returned by <c>doc.InnerHtml()</c> is loaded into a separate
        /// <c>HtmlDocument</c> so that XPath can be evaluated against it. Nodes that
        /// do not look like a book entry (no title link, or a title link without an
        /// <c>href</c>) are skipped; optional fields that cannot be located are stored
        /// as empty strings instead of raising a <see cref="NullReferenceException"/>.
        /// </remarks>
        /// <param name="doc">Document whose markup is parsed.</param>
        /// <param name="count">Incremented once for every book newly added to <c>bookList</c>.</param>
        private static void ParseDataWithXpath(IHtmlDocument doc, ref int count)
        {
            HtmlDocument hdoc = new HtmlDocument();

            hdoc.LoadHtml(doc.InnerHtml().ToString());

            HtmlNode books = hdoc.DocumentNode.SelectSingleNode("/html/body/div[1]/div/div/div[2]/div/div[3]/div");
            if (books == null)
            {
                // SelectSingleNode returns null when the XPath matches nothing
                // (e.g. the page layout changed or the response was an error page).
                return;
            }

            foreach (var bookinfo in books.ChildNodes)
            {
                // ChildNodes also yields text/comment nodes (whitespace between
                // elements); those have no title link and are not book entries.
                HtmlNode titleLink = bookinfo.SelectSingleNode("div/div/div/div[3]/a");
                if (titleLink == null)
                {
                    continue;
                }

                // Without an href there is no book URL to build, so skip the entry.
                var href = titleLink.Attributes["href"];
                if (href == null)
                {
                    continue;
                }

                var book = new Book
                {
                    Title      = titleLink.InnerText,
                    Author     = GetInnerTextOrEmpty(bookinfo, "div/div/div/div[5]/text()[1]").Replace("作者: ", ""),
                    WordNumber = GetInnerTextOrEmpty(bookinfo, "div/div/div/div[5]/text()[2]"),
                    UpdateTime = GetInnerTextOrEmpty(bookinfo, "div/div/div/div[5]/text()[3]"),
                    Score      = GetInnerTextOrEmpty(bookinfo, "div/div/div/div[5]/span"),
                    RateNumber = GetInnerTextOrEmpty(bookinfo, "div/div/div/div[4]/span[3]"),
                    Url        = new Uri("http://www.yousuu.com" + href.Value)
                };

                // The duplicate check and the insert must happen under the same lock
                // so that two callers cannot both add the same book.
                lock (bookList)
                {
                    if (!bookList.Contains(book))
                    {
                        count++;
                        bookList.Add(book); // add the record to the shared list
                    }
                }
            }
        }

        /// <summary>
        /// Returns the inner text of the first node matching <paramref name="xpath"/>
        /// relative to <paramref name="parent"/>, or an empty string when nothing matches.
        /// </summary>
        /// <param name="parent">Node the XPath expression is evaluated against.</param>
        /// <param name="xpath">Relative XPath expression selecting the wanted node.</param>
        /// <returns>The matched node's <c>InnerText</c>, or <see cref="string.Empty"/>.</returns>
        private static string GetInnerTextOrEmpty(HtmlNode parent, string xpath)
        {
            HtmlNode node = parent.SelectSingleNode(xpath);
            return node == null ? string.Empty : node.InnerText;
        }
Beispiel #2
0
        /// <summary>
        /// Extracts book entries from the page markup with a regular expression
        /// and appends every entry that is not yet in <c>bookList</c>.
        /// </summary>
        /// <param name="doc">Document whose <c>InnerHtml()</c> markup is scanned.</param>
        /// <param name="count">Incremented once for every book newly added to <c>bookList</c>.</param>
        private static void ParseDataWithRegex(IHtmlDocument doc, ref int count)
        {
            // One match corresponds to one book entry; the named groups capture the
            // link, title, rating count, author, word count, update time and score.
            string bookPattern =
                "<div class=\"title\"><a href=\"(?<href>/book/[\\d]+)\" target=\"_blank\">(?<title>[\\w\\s《》—]+)</a></div><div class=\"rating\"><span class=\"allstar00\"></span><span class=\"rating_nums\"></span><span>\\((?<ratenumber>\\w+)评价\\)</span></div><div class=\"abstract\">作者:\\s*(?<author>[\\w\\s-]+)\\s*<br />字数:\\s*(?<wordnumber>[\\w\\.]+)\\s*<br />最后更新:\\s*(?<updatetime>[\\w\\s]+)<br />综合评分:\\s*<span class=\"num2star\">(?<score>[\\w\\.]+)</span></div>";

            string html = doc.InnerHtml().ToString();
            MatchCollection hits = Regex.Matches(html, bookPattern, RegexOptions.IgnoreCase);

            foreach (Match hit in hits)
            {
                GroupCollection groups = hit.Groups;

                var entry = new Book
                {
                    Title      = groups["title"].Value,
                    Author     = groups["author"].Value,
                    WordNumber = groups["wordnumber"].Value,
                    UpdateTime = groups["updatetime"].Value,
                    Score      = groups["score"].Value,
                    RateNumber = groups["ratenumber"].Value,
                    Url        = new Uri("http://www.yousuu.com" + groups["href"].Value)
                };

                // Duplicate check and insert share one lock on the list.
                lock (bookList)
                {
                    if (bookList.Contains(entry))
                    {
                        continue; // already collected
                    }

                    count++;
                    bookList.Add(entry); // add the record to the shared list
                }
            }
        }