private IList<Comment> GetComments(string requestUrl)
{
    IList<Comment> comments = new List<Comment>();
    string html = ArticleParserHelper.GetHtmlStr(requestUrl);
    try
    {
        string result = Regex.Unescape(html); //Unescape \uXXXX sequences into Chinese characters
        string jsonResult = CleanJsonString(result);
        JObject obj = JObject.Parse(jsonResult);
        IList<JToken> jsonComments = obj["data"]["commentid"].Children().ToList();
        foreach (JToken token in jsonComments)
        {
            NewsComment_QQ cmt = JsonConvert.DeserializeObject<NewsComment_QQ>(token.ToString());
            comments.Add(new Comment()
            {
                Cotent = cmt.Content,
                Vote = Int32.Parse(cmt.Up)
            });
        }
    }
    catch (Exception)
    {
        //Swallow parse errors and fall through to return whatever was collected so far
    }
    return comments;
}
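// CleanJsonString is called above but not shown in this section. The sketch below is
// an assumption, not the project's actual implementation: it supposes the QQ comment
// endpoint wraps its payload in JSONP-style padding (e.g. "callback({...})") that has
// to be stripped before JObject.Parse can succeed.
private string CleanJsonString(string result)
{
    //Keep only the outermost JSON object, from the first '{' to the last '}'
    int start = result.IndexOf('{');
    int end = result.LastIndexOf('}');
    if (start < 0 || end <= start)
    {
        return result; //No JSON object found; return the input unchanged
    }
    return result.Substring(start, end - start + 1);
}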
public override IList<NewsItem> GetNewsList()
{
    IList<NewsItem> newsList = new List<NewsItem>();
    HtmlDocument doc = ArticleParserHelper.GetHtmlDoc(siteUrl);
    if (null != doc)
    {
        HtmlNode rootNode = doc.DocumentNode;
        //Get top 1 heading list
        //this.GetHeadingNewsList(rootNode, newsList);
        //Get 2nd heading list
        //this.GetHeadingNewsList(rootNode, newsList, "//div[@id='headingNews']/div[@class='hdNews hasPic cf']");
        //Get top news on the main page with one pic
        this.GetHotNewsList(rootNode, newsList, "//div[@class='item major']/div[@class='Q-tpList']");
        //Get hot top news on the main page with multiple pics
        this.GetHotNewsList(rootNode, newsList, "//div[@class='item major']/div[@class='Q-pList']");
    }
    //Distinct news: keep the first item for each BaseUrl
    return newsList.GroupBy(n => n.BaseUrl).Select(g => g.First()).ToList();
}
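// GetHotNewsList is called above but not shown in this section. A minimal sketch,
// assuming each node matched by the XPath contains an anchor whose text is the title
// and whose href is the article URL; the Title property and the inner markup are
// assumptions, while BaseUrl is the property the de-duplication above relies on.
private void GetHotNewsList(HtmlNode rootNode, IList<NewsItem> newsList, string xpath)
{
    HtmlNodeCollection nodes = rootNode.SelectNodes(xpath);
    if (null == nodes)
    {
        return; //XPath matched nothing; the page layout may have changed
    }
    foreach (HtmlNode node in nodes)
    {
        HtmlNode link = node.SelectSingleNode(".//a[@href]");
        if (null == link)
        {
            continue;
        }
        newsList.Add(new NewsItem()
        {
            Title = link.InnerText.Trim(), //Assumed property name
            BaseUrl = link.GetAttributeValue("href", string.Empty)
        });
    }
}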
private string[] GetNewsBodyText(string url)
{
    string firstpara = string.Empty;
    string content = string.Empty;
    HtmlDocument doc = ArticleParserHelper.GetHtmlDoc(url);
    if (null != doc)
    {
        HtmlNode rootNode = doc.DocumentNode;
        string xpathhdNews = "//div[@class='article-content']//p";
        HtmlNodeCollection newshdCollection = rootNode.SelectNodes(xpathhdNews);
        if (null == newshdCollection)
        {
            return new string[] { firstpara, content };
        }
        //First paragraph: the first <p> with non-empty text
        foreach (HtmlNode wrapperNode in newshdCollection)
        {
            if (!string.IsNullOrEmpty(wrapperNode.InnerText.Trim()))
            {
                firstpara = wrapperNode.InnerText.Trim();
                break;
            }
        }
        //Body text: all paragraphs concatenated
        System.Text.StringBuilder body = new System.Text.StringBuilder();
        foreach (HtmlNode wrapperNode in newshdCollection)
        {
            body.Append(wrapperNode.InnerText.Trim());
        }
        content = body.ToString();
    }
    return new string[] { firstpara, content };
}
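// Usage sketch for the method above (the call site and variable names are
// hypothetical): index 0 holds the first non-empty paragraph, index 1 the full body.
// string[] parts = GetNewsBodyText(newsUrl);
// string firstParagraph = parts[0];
// string bodyText = parts[1];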
public List<JToken> GetJsonNewsList()
{
    string max_behot_time = string.Empty;
    string uri = string.Empty;
    string jsonResult = string.Empty;
    int onceGet = InitInfo.ToutiaoOnceGetMaxCount;
    List<JToken> jsonNewsList = new List<JToken>();
    //Page through the feed, onceGet items per request, using max_behot_time as the cursor
    for (int i = 0; i < webSetting.CrawNewsCount / onceGet; i++)
    {
        uri = string.Format(SiteNewsUrl, max_behot_time, onceGet);
        jsonResult = ArticleParserHelper.GetHtmlStr(uri);
        JObject obj = JObject.Parse(jsonResult);
        jsonNewsList.AddRange(obj["data"].Children().ToList());
        max_behot_time = obj["next"]["max_behot_time"].ToString();
    }
    //Fetch the remainder that does not fill a whole page
    if (webSetting.CrawNewsCount % onceGet > 0)
    {
        uri = string.Format(SiteNewsUrl, max_behot_time, webSetting.CrawNewsCount % onceGet);
        jsonResult = ArticleParserHelper.GetHtmlStr(uri);
        JObject obj = JObject.Parse(jsonResult);
        jsonNewsList.AddRange(obj["data"].Children().ToList());
    }
    return jsonNewsList;
}
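// The cursor read above (obj["next"]["max_behot_time"]) throws a NullReferenceException
// if the feed omits the "next" object, e.g. on the last page. A hedged defensive
// variant, assuming the payload may omit either token; ReadNextCursor is a hypothetical helper:
private static string ReadNextCursor(JObject obj, string fallback)
{
    JToken next = obj["next"];
    JToken cursor = (null == next) ? null : next["max_behot_time"];
    return (null == cursor) ? fallback : cursor.ToString();
}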
public override IList<Comment> GetComments(NewsItem state)
{
    List<Comment> comments = new List<Comment>();
    HtmlDocument doc = ArticleParserHelper.GetHtmlDoc(state.CommentUrl);
    if (null != doc)
    {
        HtmlNode rootNode = doc.DocumentNode;
        string xpathhdNews = "//div[@class='comment-content']";
        HtmlNodeCollection newshdCollection = rootNode.SelectNodes(xpathhdNews);
        if (null == newshdCollection)
        {
            return comments;
        }
        foreach (HtmlNode wrapperNode in newshdCollection)
        {
            Comment c = new Comment();
            HtmlNode c_content = wrapperNode.SelectSingleNode("./div[@class='content']");
            c.Cotent = c_content.InnerText.Trim();
            //Note: the trailing space in 'comment_digg ' matches the class attribute in the site's markup
            HtmlNode c_vote = wrapperNode.SelectSingleNode("./div[@class='comment_actions clearfix']/span[@class='action']/a[@class='comment_digg ']");
            c.Vote = Convert.ToInt32(c_vote.InnerText.Trim());
            comments.Add(c);
        }
    }
    return comments;
}
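// Convert.ToInt32 above throws if the digg link carries non-numeric text (vote counts
// on such pages may render as text like "顶(12)"). A hedged fallback, assuming the
// count is the first run of digits in the node's text; ParseVote is a hypothetical helper:
private static int ParseVote(string text)
{
    System.Text.RegularExpressions.Match m =
        System.Text.RegularExpressions.Regex.Match(text ?? string.Empty, @"\d+");
    return m.Success ? int.Parse(m.Value) : 0;
}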
/// <summary>
/// Crawl news keywords, first paragraph and body text.
/// </summary>
/// <param name="news">The news item to enrich with more info.</param>
private void GetMoreNewsInfo(NewsItem news)
{
    if (null == news)
    {
        return; //Nothing to enrich
    }
    System.Threading.Thread.Sleep(1000); //Delay between requests so the server does not block us
    Console.WriteLine("Crawling url: {0}", news.BaseUrl);
    HtmlDocument doc = ArticleParserHelper.GetHtmlDoc(news.BaseUrl);
    if (null == doc)
    {
        return;
    }
    HtmlNode rootNode = doc.DocumentNode;
    news.Keywords = GetNewsKeywords(rootNode, " ");
    string[] content = GetNewsContent(rootNode);
    news.FirstPara = content[0];
    news.BodyText = content[1];
}
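// GetNewsKeywords is called above but not shown in this section. A minimal sketch,
// assuming the keywords sit in a standard <meta name="keywords"> tag and the second
// argument is the separator used to join them (both are assumptions about this project):
private string GetNewsKeywords(HtmlNode rootNode, string separator)
{
    HtmlNode meta = rootNode.SelectSingleNode("//meta[@name='keywords']");
    if (null == meta)
    {
        return string.Empty;
    }
    string keywords = meta.GetAttributeValue("content", string.Empty);
    //Normalize the site's own delimiters (ASCII and fullwidth commas) to the requested separator
    string[] parts = keywords.Split(new[] { ',', '，' }, StringSplitOptions.RemoveEmptyEntries);
    return string.Join(separator, parts);
}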
/// <summary>
/// Crawl news and comments, then save the result to a local file.
/// </summary>
public void Crawl()
{
    string content = string.Empty;
    if (!GetCrawlFlag())
    {
        return;
    }
    bool isCrawled = false;
    int tryCount = 0;
    while (!isCrawled && tryCount++ < InitInfo.RetryCount)
    {
        InitInfo.LogMessage(string.Format("Starting to crawl {0} site...", GetSiteName()));
        try
        {
            IList<NewsItem> news = GetNewsList();
            if (news != null && news.Count > 0)
            {
                //Choose the output format from the target file's extension
                switch (Path.GetExtension(GetFileName()).ToLower())
                {
                    case ".json":
                        content = CrawlAsJson(news);
                        break;
                    case ".xml":
                    default:
                        content = CrawlAsXml(news);
                        break;
                }
                if (!string.IsNullOrEmpty(content))
                {
                    isCrawled = true;
                }
            }
        }
        catch (Exception ex)
        {
            InitInfo.LogMessage("Failed to crawl. Exception: " + ex.GetBaseException().ToString());
        }
        if (!isCrawled)
        {
            InitInfo.LogMessage("Failed to crawl. Retry after 10 seconds...");
            System.Threading.Thread.Sleep(10 * 1000);
            continue;
        }
        string fileName = GetFileName();
        InitInfo.LogMessage("Save file to " + fileName);
        ArticleParserHelper.SaveToLocalFile(fileName, content);
        ArticleParserHelper.CopyFileAndUpdateLatestFile(fileName);
    }
    if (!isCrawled)
    {
        InitInfo.LogMessage("Error occurred while crawling news and comments!");
    }
}
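// CrawlAsJson is called above but not shown in this section. A hedged sketch, assuming
// it enriches each item (via GetMoreNewsInfo, shown earlier) and serializes the list
// with Newtonsoft.Json; the real method may also fetch comments per item:
private string CrawlAsJson(IList<NewsItem> news)
{
    foreach (NewsItem item in news)
    {
        GetMoreNewsInfo(item); //Fill keywords, first paragraph and body text
    }
    return JsonConvert.SerializeObject(news, Newtonsoft.Json.Formatting.Indented);
}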