/// <summary>
/// Parses the book entries of a page with XPath queries and adds every book
/// that is not already contained in <c>bookList</c>.
/// </summary>
/// <remarks>
/// The page's inner HTML is re-loaded into an HtmlAgilityPack document so that
/// XPath can be used. Child nodes that do not contain all of the expected
/// nodes (title link with an <c>href</c>, author, word count, update time,
/// score, rating count) are skipped instead of raising a
/// <see cref="NullReferenceException"/>.
/// </remarks>
/// <param name="doc">Document whose inner HTML is parsed.</param>
/// <param name="count">Incremented once for each book this call adds to <c>bookList</c>.</param>
private static void ParseDataWithXpath(IHtmlDocument doc, ref int count)
{
    HtmlDocument hdoc = new HtmlDocument();
    hdoc.LoadHtml(doc.InnerHtml().ToString());

    HtmlNode books = hdoc.DocumentNode.SelectSingleNode("/html/body/div[1]/div/div/div[2]/div/div[3]/div");
    if (books == null)
    {
        // SelectSingleNode yields null when the path matches nothing
        // (e.g. the page layout differs); there is nothing to parse then.
        return;
    }

    foreach (HtmlNode bookinfo in books.ChildNodes)
    {
        // ChildNodes also contains non-element nodes (whitespace text, comments).
        // For those the relative queries below match nothing, so the null
        // checks skip them together with malformed entries.
        HtmlNode titleLink = bookinfo.SelectSingleNode("div/div/div/div[3]/a");
        if (titleLink == null)
        {
            continue;
        }

        var hrefAttribute = titleLink.Attributes["href"];
        if (hrefAttribute == null)
        {
            continue;
        }

        string author = SelectInnerTextOrNull(bookinfo, "div/div/div/div[5]/text()[1]");
        string wordNumber = SelectInnerTextOrNull(bookinfo, "div/div/div/div[5]/text()[2]");
        string updateTime = SelectInnerTextOrNull(bookinfo, "div/div/div/div[5]/text()[3]");
        string score = SelectInnerTextOrNull(bookinfo, "div/div/div/div[5]/span");
        string rateNumber = SelectInnerTextOrNull(bookinfo, "div/div/div/div[4]/span[3]");
        if (author == null || wordNumber == null || updateTime == null || score == null || rateNumber == null)
        {
            continue;
        }

        var book = new Book
        {
            Title = titleLink.InnerText,
            // Strip the "作者: " label that precedes the author name in the text node.
            Author = author.Replace("作者: ", ""),
            WordNumber = wordNumber,
            UpdateTime = updateTime,
            Score = score,
            RateNumber = rateNumber,
            // The href is concatenated onto the site root to form the book URL.
            Url = new Uri("http://www.yousuu.com" + hrefAttribute.Value)
        };

        lock (bookList)
        {
            if (!bookList.Contains(book))
            {
                count++;
                bookList.Add(book); // add the record to the shared list
            }
        }
        // Console.WriteLine(book.ToString()); // print the book info to the console
    }
}

/// <summary>
/// Returns the inner text of the first node matching <paramref name="xpath"/>
/// relative to <paramref name="parent"/>, or <c>null</c> when nothing matches.
/// </summary>
private static string SelectInnerTextOrNull(HtmlNode parent, string xpath)
{
    HtmlNode match = parent.SelectSingleNode(xpath);
    return match == null ? null : match.InnerText;
}
/// <summary>
/// Extracts book entries from the page's HTML with a regular expression and
/// adds every book that is not already contained in <c>bookList</c>.
/// </summary>
/// <param name="doc">Document whose inner HTML is scanned.</param>
/// <param name="count">Incremented once for each book this call adds to <c>bookList</c>.</param>
private static void ParseDataWithRegex(IHtmlDocument doc, ref int count)
{
    // One match per book entry. The fragments below are concatenated at
    // compile time into a single pattern; named groups capture the fields.
    const string entryPattern =
        "<div class=\"title\"><a href=\"(?<href>/book/[\\d]+)\" target=\"_blank\">(?<title>[\\w\\s《》—]+)</a></div>"
        + "<div class=\"rating\"><span class=\"allstar00\"></span><span class=\"rating_nums\"></span><span>\\((?<ratenumber>\\w+)评价\\)</span></div>"
        + "<div class=\"abstract\">作者:\\s*(?<author>[\\w\\s-]+)\\s*<br />"
        + "字数:\\s*(?<wordnumber>[\\w\\.]+)\\s*<br />"
        + "最后更新:\\s*(?<updatetime>[\\w\\s]+)<br />"
        + "综合评分:\\s*<span class=\"num2star\">(?<score>[\\w\\.]+)</span></div>";

    string html = doc.InnerHtml().ToString();
    MatchCollection entries = Regex.Matches(html, entryPattern, RegexOptions.IgnoreCase);

    foreach (Match entry in entries)
    {
        GroupCollection fields = entry.Groups;

        var parsed = new Book
        {
            Title = fields["title"].Value,
            Author = fields["author"].Value,
            WordNumber = fields["wordnumber"].Value,
            UpdateTime = fields["updatetime"].Value,
            Score = fields["score"].Value,
            RateNumber = fields["ratenumber"].Value,
            // The captured href is concatenated onto the site root to form the book URL.
            Url = new Uri("http://www.yousuu.com" + fields["href"].Value)
        };

        lock (bookList)
        {
            if (bookList.Contains(parsed))
            {
                // Already recorded; leave the list and the counter untouched.
                continue;
            }

            count++;
            bookList.Add(parsed); // add the record to the shared list
        }
        // Console.WriteLine(parsed.ToString()); // print the book info to the console
    }
}