예제 #1
0
        /// <summary>
        /// Downloads the comment feed at <paramref name="requestUrl"/> and converts every
        /// entry found under <c>data.commentid</c> into a <see cref="Comment"/>.
        /// </summary>
        /// <param name="requestUrl">URL passed to <c>ArticleParserHelper.GetHtmlStr</c>.</param>
        /// <returns>
        /// The comments that could be parsed. The list is empty when nothing was downloaded
        /// and may be partial when the payload is malformed; it is never null.
        /// </returns>
        private IList <Comment> GetComments(string requestUrl)
        {
            IList<Comment> comments = new List<Comment>();
            string html = ArticleParserHelper.GetHtmlStr(requestUrl);

            // Nothing downloaded: avoid relying on an exception to reach the empty result.
            if (string.IsNullOrEmpty(html))
            {
                return comments;
            }

            try
            {
                // Turn escaped sequences (e.g. \uXXXX) back into the characters they represent.
                string unescaped = Regex.Unescape(html);
                string jsonResult = CleanJsonString(unescaped);
                JObject payload = JObject.Parse(jsonResult);

                foreach (JToken token in payload["data"]["commentid"].Children().ToList())
                {
                    NewsComment_QQ cmt = JsonConvert.DeserializeObject<NewsComment_QQ>(token.ToString());
                    if (cmt == null)
                    {
                        continue;
                    }

                    // A missing or non-numeric vote count must not discard this comment or
                    // the ones after it; fall back to zero votes instead of throwing.
                    int vote;
                    if (!int.TryParse(cmt.Up, out vote))
                    {
                        vote = 0;
                    }

                    comments.Add(new Comment
                    {
                        Cotent = cmt.Content,
                        Vote   = vote
                    });
                }
            }
            catch (Exception)
            {
                // Best effort: a malformed payload yields whatever was collected so far.
            }

            return comments;
        }
예제 #2
0
        /// <summary>
        /// Loads the page at <c>siteUrl</c>, collects the hot-news entries from it and
        /// returns them with duplicates (same <c>BaseUrl</c>) removed.
        /// </summary>
        /// <returns>
        /// A new list holding the first item seen for each distinct <c>BaseUrl</c>;
        /// empty when the page could not be loaded.
        /// </returns>
        public override IList <NewsItem> GetNewsList()
        {
            IList<NewsItem> collected = new List<NewsItem>();
            HtmlDocument page = ArticleParserHelper.GetHtmlDoc(siteUrl);

            if (page != null)
            {
                HtmlNode root = page.DocumentNode;

                // NOTE: the original also carried two commented-out GetHeadingNewsList calls
                // (top heading list and "//div[@id='headingNews']/div[@class='hdNews hasPic cf']");
                // they remain disabled here.
                string[] hotNewsXPaths =
                {
                    "//div[@class='item major']/div[@class='Q-tpList']", // top news with one pic
                    "//div[@class='item major']/div[@class='Q-pList']"   // hot news with multiple pics
                };

                foreach (string xpath in hotNewsXPaths)
                {
                    this.GetHotNewsList(root, collected, xpath);
                }
            }

            // Distinct by BaseUrl, keeping the first occurrence of each.
            List<NewsItem> distinct = collected
                .GroupBy(item => item.BaseUrl)
                .Select(sameUrl => sameUrl.First())
                .ToList();

            return distinct;
        }
예제 #3
0
        /// <summary>
        /// Extracts the article text from the page at <paramref name="url"/> using the
        /// <c>&lt;p&gt;</c> elements below <c>div.article-content</c>.
        /// </summary>
        /// <param name="url">Address passed to <c>ArticleParserHelper.GetHtmlDoc</c>.</param>
        /// <returns>
        /// A two-element array: index 0 is the first non-blank paragraph (trimmed), index 1 is
        /// all trimmed paragraphs concatenated without a separator. Both are empty strings when
        /// the page cannot be loaded or contains no matching paragraphs.
        /// </returns>
        private string[] GetNewsBodyText(string url)
        {
            string firstpara = string.Empty;

            HtmlDocument doc = ArticleParserHelper.GetHtmlDoc(url);
            if (null == doc)
            {
                return new string[] { firstpara, string.Empty };
            }

            HtmlNodeCollection paragraphs = doc.DocumentNode.SelectNodes("//div[@class='article-content']//p");
            if (null == paragraphs)
            {
                return new string[] { firstpara, string.Empty };
            }

            // Single pass: trim each paragraph once, remember the first non-blank one and
            // accumulate the body in a builder instead of repeated string concatenation.
            System.Text.StringBuilder body = new System.Text.StringBuilder();
            foreach (HtmlNode paragraph in paragraphs)
            {
                string text = paragraph.InnerText.Trim();

                if (firstpara.Length == 0 && text.Length > 0)
                {
                    firstpara = text;
                }

                body.Append(text);
            }

            return new string[] { firstpara, body.ToString() };
        }
예제 #4
0
        /// <summary>
        /// Downloads news entries in batches until <c>webSetting.CrawNewsCount</c> items have
        /// been requested, asking for at most <c>InitInfo.ToutiaoOnceGetMaxCount</c> per request.
        /// </summary>
        /// <remarks>
        /// Each request URL is built from <c>SiteNewsUrl</c> with the paging cursor
        /// (<c>next.max_behot_time</c> of the previous response, empty for the first request)
        /// and the batch size. The last batch carries only the remaining item count.
        /// </remarks>
        /// <returns>The children of every response's <c>data</c> node, in download order.</returns>
        /// <exception cref="InvalidOperationException">
        /// The configured batch size is zero or negative.
        /// </exception>
        public List <JToken> GetJsonNewsList()
        {
            int onceGet = InitInfo.ToutiaoOnceGetMaxCount;
            if (onceGet <= 0)
            {
                // Previously surfaced as an unexplained DivideByZeroException (or an odd
                // single request for negative values); fail with a clear message instead.
                throw new InvalidOperationException("InitInfo.ToutiaoOnceGetMaxCount must be greater than zero.");
            }

            List<JToken> jsonNewsList = new List<JToken>();
            string max_behot_time = string.Empty;
            int remaining = webSetting.CrawNewsCount;

            while (remaining > 0)
            {
                int batchSize = Math.Min(onceGet, remaining);
                string uri = string.Format(SiteNewsUrl, max_behot_time, batchSize);
                string jsonResult = ArticleParserHelper.GetHtmlStr(uri);
                JObject obj = JObject.Parse(jsonResult);

                jsonNewsList.AddRange(obj["data"].Children());
                remaining -= batchSize;

                // The cursor is only needed when another request follows, so a final page
                // without a "next" node no longer causes a failure.
                if (remaining > 0)
                {
                    max_behot_time = obj["next"]["max_behot_time"].ToString();
                }
            }

            return jsonNewsList;
        }
예제 #5
0
        /// <summary>
        /// Loads the page at <c>state.CommentUrl</c> and reads one <see cref="Comment"/> from
        /// every <c>div.comment-content</c> element on it.
        /// </summary>
        /// <param name="state">News item whose <c>CommentUrl</c> is crawled.</param>
        /// <returns>
        /// The parsed comments. Empty when <paramref name="state"/> is null, the page cannot be
        /// loaded, or no comment elements are present; never null.
        /// </returns>
        public override IList <Comment> GetComments(NewsItem state)
        {
            List<Comment> comments = new List<Comment>();

            if (null == state)
            {
                return comments;
            }

            HtmlDocument doc = ArticleParserHelper.GetHtmlDoc(state.CommentUrl);
            if (null == doc)
            {
                return comments;
            }

            HtmlNodeCollection wrappers = doc.DocumentNode.SelectNodes("//div[@class='comment-content']");
            if (null == wrappers)
            {
                return comments;
            }

            foreach (HtmlNode wrapper in wrappers)
            {
                // A wrapper without a content node carries no comment text; skip it rather
                // than letting a null dereference abort the whole list.
                HtmlNode contentNode = wrapper.SelectSingleNode("./div[@class='content']");
                if (null == contentNode)
                {
                    continue;
                }

                // The trailing space inside 'comment_digg ' is reproduced from the original query.
                HtmlNode voteNode = wrapper.SelectSingleNode("./div[@class='comment_actions clearfix']/span[@class='action']/a[@class='comment_digg ']");

                // Missing or non-numeric vote text counts as zero votes instead of throwing.
                int vote = 0;
                if (null != voteNode && !int.TryParse(voteNode.InnerText.Trim(), out vote))
                {
                    vote = 0;
                }

                Comment comment = new Comment();
                comment.Cotent = contentNode.InnerText.Trim();
                comment.Vote   = vote;
                comments.Add(comment);
            }

            return comments;
        }
예제 #6
0
        /// <summary>
        /// Crawls the article page of a news item and fills in its keywords, first paragraph
        /// and body text.
        /// </summary>
        /// <param name="news">
        /// The news item to enrich; its <c>BaseUrl</c> is downloaded and its <c>Keywords</c>,
        /// <c>FirstPara</c> and <c>BodyText</c> are overwritten. When null, the method does nothing.
        /// </param>
        private void GetMoreNewsInfo(NewsItem news)
        {
            // A locally created replacement item would never reach the caller and has no URL
            // to crawl, so there is nothing useful to do (and no reason to wait) for null.
            if (null == news)
            {
                return;
            }

            // Throttle requests; per the original note the delay is required to avoid being
            // blocked by the server.
            System.Threading.Thread.Sleep(1000);
            Console.WriteLine("Crawling url: {0}", news.BaseUrl);

            HtmlDocument doc = ArticleParserHelper.GetHtmlDoc(news.BaseUrl);
            if (null == doc)
            {
                return;
            }

            HtmlNode rootNode = doc.DocumentNode;

            news.Keywords = GetNewsKeywords(rootNode, " ");

            // Index 0 is used as the first paragraph and index 1 as the body text.
            string[] content = GetNewsContent(rootNode);
            news.FirstPara = content[0];
            news.BodyText  = content[1];
        }
예제 #7
0
        /// <summary>
        /// Crawls news and comments and saves the result to a local file.
        /// </summary>
        /// <remarks>
        /// Does nothing when <c>GetCrawlFlag()</c> is false. Otherwise makes up to
        /// <c>InitInfo.RetryCount</c> attempts, waiting 10 seconds between attempts. An attempt
        /// succeeds when the news list is non-empty and serializes to non-empty content; the
        /// output is JSON when the target file extension is <c>.json</c> and XML otherwise.
        /// Exceptions raised while crawling are logged and count as a failed attempt;
        /// exceptions raised while saving the file are not caught here.
        /// </remarks>
        public void Crawl()
        {
            if (!GetCrawlFlag())
            {
                return;
            }

            string content   = string.Empty;
            bool   isCrawled = false;
            int    tryCount  = 0;

            while (!isCrawled && tryCount++ < InitInfo.RetryCount)
            {
                InitInfo.LogMessage(string.Format("Starting to crawl {0} site...", GetSiteName()));
                try
                {
                    IList<NewsItem> news = GetNewsList();
                    if (news != null && news.Count > 0)
                    {
                        // Invariant lowering keeps the extension match independent of the
                        // current culture; everything except ".json" is written as XML.
                        switch (Path.GetExtension(GetFileName()).ToLowerInvariant())
                        {
                        case ".json":
                            content = CrawlAsJson(news);
                            break;

                        default:
                            content = CrawlAsXml(news);
                            break;
                        }

                        isCrawled = !string.IsNullOrEmpty(content);
                    }
                }
                catch (Exception ex)
                {
                    InitInfo.LogMessage("Failed to crawl. Exceptoin: " + ex.GetBaseException().ToString());
                }

                if (isCrawled)
                {
                    string fileName = GetFileName();
                    InitInfo.LogMessage("Save file to " + fileName);
                    ArticleParserHelper.SaveToLocalFile(fileName, content);
                    ArticleParserHelper.CopyFileAndUpdateLatestFile(fileName);
                }
                else if (tryCount < InitInfo.RetryCount)
                {
                    // Only announce and wait for a retry when another attempt will actually
                    // follow; after the last attempt the method falls through to the final error.
                    InitInfo.LogMessage("Failed to crawl. Retry after 10 seconds...");
                    System.Threading.Thread.Sleep(10 * 1000);
                }
            }

            if (!isCrawled)
            {
                InitInfo.LogMessage("Error occurs when crawl news and comments!");
            }
        }