/// <summary>
/// Crawls the site's main page (<c>siteUrl</c>) and collects its hot news items.
/// </summary>
/// <returns>
/// The collected news items with duplicates removed: only the first item seen
/// for each <c>BaseUrl</c> is kept. The list is empty when the page could not be loaded.
/// </returns>
public override IList<NewsItem> GetNewsList()
{
    IList<NewsItem> collected = new List<NewsItem>();

    HtmlDocument page = ArticleParserHelper.GetHtmlDoc(siteUrl);
    if (page != null)
    {
        HtmlNode root = page.DocumentNode;

        // Note: crawling of the heading lists via GetHeadingNewsList
        // (e.g. "//div[@id='headingNews']/div[@class='hdNews hasPic cf']") is currently disabled.
        string[] hotNewsXPaths =
        {
            "//div[@class='item major']/div[@class='Q-tpList']", // top news with a single picture
            "//div[@class='item major']/div[@class='Q-pList']"   // hot news with multiple pictures
        };

        foreach (string xpath in hotNewsXPaths)
        {
            this.GetHotNewsList(root, collected, xpath);
        }
    }

    // Distinct by BaseUrl: keep the first occurrence of every URL.
    List<NewsItem> distinct = collected
        .GroupBy(item => item.BaseUrl)
        .Select(sameUrl => sameUrl.First())
        .ToList();

    return distinct;
}
/// <summary>
/// Downloads the article at <paramref name="url"/> and extracts the text of every
/// <c>p</c> element found under <c>div[@class='article-content']</c>.
/// </summary>
/// <param name="url">Address of the article page to load.</param>
/// <returns>
/// A two-element array: index 0 holds the first non-empty (trimmed) paragraph,
/// index 1 holds all trimmed paragraphs concatenated without a separator.
/// Both elements are empty strings when the page could not be loaded or no
/// paragraph matched.
/// </returns>
private string[] GetNewsBodyText(string url)
{
    string firstpara = string.Empty;
    string content = string.Empty;

    HtmlDocument doc = ArticleParserHelper.GetHtmlDoc(url);
    if (null != doc)
    {
        HtmlNodeCollection paragraphs = doc.DocumentNode.SelectNodes("//div[@class='article-content']//p");
        if (null != paragraphs)
        {
            // Single pass: trim each paragraph once, remember the first non-empty one,
            // and collect the pieces so the body is built with one concatenation
            // instead of repeated "+=" in a loop.
            List<string> pieces = new List<string>();
            foreach (HtmlNode paragraph in paragraphs)
            {
                string text = paragraph.InnerText.Trim();
                if (firstpara.Length == 0 && text.Length > 0)
                {
                    firstpara = text;
                }

                pieces.Add(text);
            }

            content = string.Concat(pieces.ToArray());
        }
    }

    return new string[] { firstpara, content };
}
/// <summary>
/// Crawls the comment page of a news item (<c>state.CommentUrl</c>) and extracts
/// the text and vote count of each comment.
/// </summary>
/// <param name="state">The news item whose comments should be fetched.</param>
/// <returns>
/// The parsed comments. The list is empty when <paramref name="state"/> is null,
/// the page could not be loaded, or no comment block was found. Comment blocks
/// without a content node are skipped; a missing or non-numeric vote is reported as 0.
/// </returns>
public override IList<Comment> GetComments(NewsItem state)
{
    List<Comment> comments = new List<Comment>();
    if (null == state)
    {
        return comments;
    }

    HtmlDocument doc = ArticleParserHelper.GetHtmlDoc(state.CommentUrl);
    if (null == doc)
    {
        return comments;
    }

    HtmlNodeCollection commentNodes = doc.DocumentNode.SelectNodes("//div[@class='comment-content']");
    if (null == commentNodes)
    {
        return comments;
    }

    foreach (HtmlNode commentNode in commentNodes)
    {
        // SelectSingleNode returns null when nothing matches; a block without
        // a content node carries no usable comment, so skip it instead of crashing.
        HtmlNode contentNode = commentNode.SelectSingleNode("./div[@class='content']");
        if (null == contentNode)
        {
            continue;
        }

        Comment c = new Comment();
        c.Cotent = contentNode.InnerText.Trim();

        // The vote link may be absent or contain non-numeric text; fall back to 0
        // rather than letting one malformed comment abort the whole crawl.
        HtmlNode voteNode = commentNode.SelectSingleNode("./div[@class='comment_actions clearfix']/span[@class='action']/a[@class='comment_digg ']");
        int vote = 0;
        if (null != voteNode)
        {
            if (!int.TryParse(
                    voteNode.InnerText.Trim(),
                    System.Globalization.NumberStyles.Integer,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out vote))
            {
                vote = 0;
            }
        }

        c.Vote = vote;
        comments.Add(c);
    }

    return comments;
}
/// <summary>
/// Crawl news keywords, first paragraph and body text.
/// </summary>
/// <param name="news">
/// The news item to enrich. Its <c>BaseUrl</c> is crawled and its <c>Keywords</c>,
/// <c>FirstPara</c> and <c>BodyText</c> are filled in. A null item is ignored.
/// </param>
private void GetMoreNewsInfo(NewsItem news)
{
    // Nothing to enrich: a replacement object created here would never reach the
    // caller, so return instead of sleeping and requesting a null URL.
    if (null == news)
    {
        return;
    }

    // Throttle requests; per the original author the delay is required to avoid
    // being blocked by the server.
    System.Threading.Thread.Sleep(1000);
    Console.WriteLine("Crawling url: {0}", news.BaseUrl);

    HtmlDocument doc = ArticleParserHelper.GetHtmlDoc(news.BaseUrl);
    if (null == doc)
    {
        return;
    }

    HtmlNode rootNode = doc.DocumentNode;

    // NOTE(review): the second argument looks like a keyword separator — confirm against GetNewsKeywords.
    news.Keywords = GetNewsKeywords(rootNode, " ");

    // GetNewsContent is expected to return { first paragraph, body text } — confirm against its definition.
    string[] content = GetNewsContent(rootNode);
    news.FirstPara = content[0];
    news.BodyText = content[1];
}