/// <summary>
/// Appends the textual content of <paramref name="node"/> to <paramref name="result"/>.
/// </summary>
/// <remarks>
/// If the node's inner text is non-empty, the trimmed text is appended. Otherwise, when the
/// node is contained in <paramref name="inputs"/> and carries a <c>value</c> attribute, a
/// single space followed by the trimmed attribute value is appended. In every other case
/// nothing is written.
/// </remarks>
/// <param name="node">Node whose text (or <c>value</c> attribute) is appended.</param>
/// <param name="result">Builder that receives the extracted text.</param>
/// <param name="inputs">
/// Collection of input nodes to test membership against. May be <c>null</c>, in which case
/// no node is treated as an input.
/// </param>
private void GetNodeText(HtmlNode node, StringBuilder result, HtmlNodeCollection inputs)
{
    var text = node.InnerText;
    if (!string.IsNullOrEmpty(text))
    {
        result.Append(text.Trim());
        return;
    }

    // The caller passes the result of SelectNodes("//input"), which can be null when the
    // document has no matching node; guard before calling Contains to avoid a
    // NullReferenceException on text-less leaf nodes.
    if (inputs == null || !inputs.Contains(node))
    {
        return;
    }

    var att = node.Attributes["value"];
    if (att != null)
    {
        // Compute the value first so nothing is appended if reading it fails.
        string value = att.Value.Trim();
        result.Append(' ').Append(value);
    }
}
/// <summary>
/// Converts an HTML string into plain text.
/// </summary>
/// <remarks>
/// Walks every descendant of the root node returned by <c>GetRoot</c>. Nodes without
/// children contribute their text through <c>GetNodeText</c>; a line terminator is appended
/// whenever the visited node is one of the document's <c>br</c> or <c>div</c> nodes.
/// </remarks>
/// <param name="html">HTML markup to convert.</param>
/// <returns>The accumulated text.</returns>
public string Parse(string html)
{
    HtmlNode root = GetRoot(html);

    // Each of these may be null when the document contains no matching node.
    HtmlNodeCollection inputNodes = root.SelectNodes("//input");
    HtmlNodeCollection breakNodes = root.SelectNodes("//br");
    HtmlNodeCollection divNodes = root.SelectNodes("//div");

    var output = new StringBuilder();
    foreach (HtmlNode current in root.DescendantNodes())
    {
        // Only leaf nodes carry text of their own.
        if (!current.HasChildNodes)
        {
            GetNodeText(current, output, inputNodes);
        }

        bool startsNewLine = breakNodes != null && breakNodes.Contains(current);
        if (!startsNewLine)
        {
            startsNewLine = divNodes != null && divNodes.Contains(current);
        }

        if (startsNewLine)
        {
            output.AppendLine();
        }
    }

    return output.ToString();
}
/// <summary>
/// Fills <paramref name="bookInfo"/> with data extracted from a book detail page.
/// </summary>
/// <remarks>
/// Reads, in order: the title (<c>span[@property='v:itemreviewed']</c>), the labelled
/// key/value entries inside <c>div[@id='info']</c>, the average score and vote count, up to
/// five star-percentage entries, the content/author introductions, and the tag links.
/// If no title node is found the method returns without modifying
/// <paramref name="bookInfo"/>.
/// </remarks>
/// <param name="bookInfo">Object that receives the extracted values.</param>
/// <param name="html">HTML markup of the page.</param>
private void ParseBookHtml(BookInfo bookInfo, string html)
{
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    #region BookName
    var nodeList = doc.DocumentNode.SelectNodes("//span[@property='v:itemreviewed']");
    // SelectNodes can return null when nothing matches, so null must also bail out here;
    // otherwise the indexer below dereferences null.
    if (nodeList == null || nodeList.Count < 1)
    {
        return;
    }
    bookInfo.BookName = nodeList[0].InnerText;
    #endregion

    #region Info
    HtmlNode infoNode = doc.DocumentNode.SelectSingleNode("//div[@id='info']");
    // ".//" keeps the search inside the info div; a leading "//" is an absolute path and
    // would match label spans anywhere in the document.
    HtmlNodeCollection infoes = infoNode != null
        ? infoNode.SelectNodes(".//span[@class='pl']")
        : null;
    if (infoes != null)
    {
        foreach (HtmlNode info in infoes)
        {
            string key = info.InnerText.Trim();

            // The value is the concatenated text of every sibling that follows the label,
            // up to (but excluding) the next label span.
            var valueBuilder = new StringBuilder();
            for (HtmlNode sibling = info.NextSibling;
                 sibling != null && !infoes.Contains(sibling);
                 sibling = sibling.NextSibling)
            {
                valueBuilder.Append(sibling.InnerText.Trim());
            }
            string value = valueBuilder.ToString();

            switch (key)
            {
                case "作者":
                    // This label has no trailing colon, so the colon ends up at the
                    // start of the collected value.
                    bookInfo.Author = value.TrimStart(':');
                    break;
                case "出版社:":
                    bookInfo.Publisher = value;
                    break;
                case "原作名:":
                    // Only used as a fallback when no author has been set yet.
                    if (string.IsNullOrEmpty(bookInfo.Author))
                    {
                        bookInfo.Author = value;
                    }
                    break;
                case "译者":
                    // Recognized but not stored.
                    break;
                case "出版年:":
                    bookInfo.PublishDate = value;
                    break;
                case "页数:":
                    bookInfo.PageNum = value;
                    break;
                case "定价:":
                    bookInfo.Price = value;
                    break;
                case "ISBN:":
                    bookInfo.ISBN = value;
                    break;
                default:
                    break;
            }
        }
    }
    #endregion

    #region Score
    // Scraped numbers are machine-formatted, not localized (presumably '.' as the decimal
    // separator - verify against the page); parse them with the invariant culture so the
    // result does not depend on the host's regional settings. The styles below are the
    // defaults used by the two-argument TryParse overloads.
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
    const System.Globalization.NumberStyles FloatStyle =
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
    const System.Globalization.NumberStyles IntStyle = System.Globalization.NumberStyles.Integer;

    HtmlNode averageNode = doc.DocumentNode.SelectSingleNode("//strong[@property='v:average']");
    if (averageNode != null)
    {
        float average;
        if (float.TryParse(averageNode.InnerText, FloatStyle, invariant, out average))
        {
            bookInfo.AverageScore = average;
        }
    }

    HtmlNode voteNode = doc.DocumentNode.SelectSingleNode("//span[@property='v:votes']");
    if (voteNode != null)
    {
        int vote;
        if (int.TryParse(voteNode.InnerText, IntStyle, invariant, out vote))
        {
            bookInfo.RatingNum = vote;
        }
    }

    var starNodeList = doc.DocumentNode.SelectNodes("//span[@class='rating_per']");
    int starNum = 0;
    if (starNodeList != null)
    {
        foreach (HtmlNode starNode in starNodeList)
        {
            // At most five entries are consumed.
            if (starNum >= 5)
            {
                break;
            }

            string star = starNode.InnerText.TrimEnd('%');
            float dStar;
            if (float.TryParse(star, FloatStyle, invariant, out dStar))
            {
                // Percentage text is converted to a 0..1 fraction.
                SetStar(bookInfo, starNum, dStar / 100f);
            }

            // The slot index advances even when an entry fails to parse.
            starNum++;
        }
    }
    #endregion

    #region Intro
    var introNodeList = doc.DocumentNode.SelectNodes("//div[@class='intro']");
    // Both descriptions are assigned only when at least two intro blocks exist.
    if (introNodeList != null && introNodeList.Count >= 2)
    {
        bookInfo.ContentDescription = introNodeList[0].InnerText;
        bookInfo.AuthorDescription = introNodeList[1].InnerText;
    }
    #endregion

    #region Tag
    HtmlNodeCollection tagNodeLists = doc.DocumentNode.SelectNodes("//a[@class=' tag']");
    StringBuilder tagStringBuilder = new StringBuilder();
    if (tagNodeLists != null)
    {
        foreach (var tagNode in tagNodeLists)
        {
            string tag = tagNode.InnerText.Trim();
            if (!string.IsNullOrEmpty(tag))
            {
                tagStringBuilder.AppendFormat("{0},", tag);
            }
        }
    }
    // Tags are stored comma-separated without a trailing comma.
    bookInfo.Tags = tagStringBuilder.ToString().TrimEnd(',');
    #endregion
}