/// <summary>
/// Downloads a single article page and parses it into an <see cref="Article"/>.
/// </summary>
/// <param name="link">Article link, appended to <c>Global.RELATIVE_URL</c> to build the page URL.</param>
/// <returns>
/// An article with its title, its HTML content and one <c>ArticleExt</c> entry for every
/// <c>div.son5</c> block found on the page.
/// </returns>
/// <exception cref="InvalidOperationException">
/// Thrown by <c>First()</c> when an expected element (<c>div.shileft</c>, <c>div.son1 h1</c>,
/// <c>div.bookvson2</c>, its first <c>p</c>, or an anchor inside a <c>div.son5</c>) is missing.
/// </exception>
private static Article GetArticle(string link)
{
    Article article = new Article();
    string url = Global.RELATIVE_URL + link;

    HtmlDocument doc = ParserUtil.GetHtmlDocument(url);
    HtmlNode html = doc.DocumentNode;
    HtmlNode shileft = html.CssSelect("div.shileft").First();

    HtmlNode titleNode = shileft.CssSelect("div.son1").First().CssSelect("h1").First();
    article.Title = ParserUtil.DeleteUselessChar(titleNode.InnerText);

    // The first <p> of the content container is dropped before the HTML is captured
    // (presumably it is not part of the article body - confirm against the page layout).
    HtmlNode bookson2 = shileft.CssSelect("div.bookvson2").First();
    HtmlNode firstp = bookson2.CssSelect("p").First();
    // Remove through the node's own parent: the selector may match a nested <p>,
    // and RemoveChild only accepts a direct child of the node it is called on.
    firstp.ParentNode.RemoveChild(firstp);
    article.Content = ParserUtil.DeleteUselessChar(bookson2.InnerHtml);

    // Every div.son5 block links to an extension page that is fetched and attached.
    List<HtmlNode> son5s = shileft.CssSelect("div.son5").ToList();
    foreach (HtmlNode son5 in son5s)
    {
        string extLink = son5.CssSelect("a").First().Attributes["href"].Value;
        ArticleExt ext = GetArticleExt(extLink);
        article.ArticleExt.Add(ext);
    }

    return article;
}
/// <summary>
/// Downloads the book page identified by <paramref name="i"/> and parses it, together with
/// all of its articles, into a <see cref="Book"/>.
/// </summary>
/// <param name="i">Value substituted into the <c>BOOK_URL</c> format string to build the page URL.</param>
/// <returns>
/// A book with name, author name and intro filled in. When the page has several
/// <c>div.bookcont</c> containers each becomes a <c>BookPart</c> holding its own articles;
/// when it has exactly one, the articles are attached to the book directly.
/// </returns>
/// <exception cref="InvalidOperationException">
/// Thrown when an expected element is missing, including the case where the page contains
/// no <c>div.bookcont</c> container at all.
/// </exception>
private static Book GetBook(int i)
{
    Book book = new Book();
    string url = string.Format(BOOK_URL, i);

    HtmlDocument doc = ParserUtil.GetHtmlDocument(url);
    HtmlNode html = doc.DocumentNode;
    HtmlNode shileft = html.CssSelect("div.shileft").First();

    HtmlNode titleNode = shileft.CssSelect("div.son1").First().CssSelect("h1").First();
    book.Name = ParserUtil.DeleteUselessChar(titleNode.InnerText);

    // The first <p> of div.son2 carries the author line; the "作者:" ("Author:") label is stripped.
    HtmlNode son2 = shileft.CssSelect("div.son2").First();
    HtmlNode authorNode = son2.CssSelect("p").First();
    book.authorName = ParserUtil.DeleteUselessChar(authorNode.InnerText);
    book.authorName = book.authorName.Replace("作者:", "");

    // Whatever text is left in div.son2 after removing the div.pingfen block and the
    // author paragraph is used as the intro. Nodes are removed through their own parent
    // because the selector may match a nested element, and RemoveChild only accepts a
    // direct child of the node it is called on.
    HtmlNode pingfen = son2.CssSelect("div.pingfen").First();
    pingfen.ParentNode.RemoveChild(pingfen);
    authorNode.ParentNode.RemoveChild(authorNode);
    book.Intro = ParserUtil.DeleteUselessChar(son2.InnerText);

    // A single div.bookcont may be one part (BookPart) of the book,
    // or it may hold the content of the whole book.
    List<HtmlNode> parts = html.CssSelect("div.bookcont").ToList();
    if (parts.Count == 0)
    {
        // Same exception type that parts.First() used to raise, but with a usable message.
        throw new InvalidOperationException(
            string.Format("No div.bookcont element found on book page '{0}'.", url));
    }

    if (parts.Count > 1)
    {
        // The book is split into several parts.
        for (int k = 0; k < parts.Count; k++)
        {
            BookPart p = new BookPart();
            HtmlNode partTitle = parts[k].CssSelect("div.bookMl").First().CssSelect("strong").First();
            p.Name = ParserUtil.DeleteUselessChar(partTitle.InnerText);

            foreach (Article article in ParseArticlesIn(parts[k]))
            {
                p.Article.Add(article);
            }

            p.TheSequnce = k;
            book.BookPart.Add(p);
        }
    }
    else
    {
        // The book has no parts: its articles belong to the book itself.
        foreach (Article article in ParseArticlesIn(parts[0]))
        {
            book.Article.Add(article);
        }
    }

    return book;
}

/// <summary>
/// Fetches the article behind every anchor inside <paramref name="container"/>, in document
/// order, and assigns each article its zero-based position as <c>TheSequnce</c>.
/// </summary>
private static List<Article> ParseArticlesIn(HtmlNode container)
{
    List<string> articlesLinks = container.CssSelect("a")
        .Select(t => t.Attributes["href"].Value)
        .ToList();

    List<Article> articles = new List<Article>(articlesLinks.Count);
    for (int j = 0; j < articlesLinks.Count; j++)
    {
        Article article = GetArticle(articlesLinks[j]);
        article.TheSequnce = j;
        articles.Add(article);
    }

    return articles;
}