Example #1
        public override IList <NewsItem> GetNewsList()
        {
            IList <NewsItem> newsList = new List <NewsItem>();
            HtmlDocument     doc      = ArticleParserHelper.GetHtmlDoc(siteUrl);

            if (null != doc)
            {
                HtmlNode rootNode = doc.DocumentNode;

                //Get top 1 heading list
                //this.GetHeadingNewsList(rootNode, newsList);

                //Get 2nd heading list
                //this.GetHeadingNewsList(rootNode, newsList, "//div[@id='headingNews']/div[@class='hdNews hasPic cf']");

                //Get top news on the main page with one picture
                this.GetHotNewsList(rootNode, newsList,
                                    "//div[@class='item major']/div[@class='Q-tpList']");

                //Get hot top news on the main page with multiple pictures
                this.GetHotNewsList(rootNode, newsList,
                                    "//div[@class='item major']/div[@class='Q-pList']");
            }

            return(newsList.GroupBy(n => n.BaseUrl).Select(g => g.First()).ToList());  //keep only the first news item per BaseUrl (distinct news)
        }
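Every example in this listing calls ArticleParserHelper.GetHtmlDoc and treats a null result as "page could not be loaded". The helper itself is not shown here; the sketch below is only a plausible reconstruction, assuming it wraps HtmlAgilityPack's HtmlWeb and returns null on any download or parse failure rather than throwing.

        //Sketch only: assumes HtmlAgilityPack's HtmlWeb; the real helper may differ.
        using System;
        using HtmlAgilityPack;

        public static class ArticleParserHelper
        {
            public static HtmlDocument GetHtmlDoc(string url)
            {
                try
                {
                    HtmlWeb web = new HtmlWeb();
                    return(web.Load(url));  //download and parse the page
                }
                catch (Exception)
                {
                    return(null);  //callers check for null, as in the examples above
                }
            }
        }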
Example #2
        private string[] GetNewsBodyText(string url)
        {
            string       firstpara = string.Empty;
            string       content   = string.Empty;
            HtmlDocument doc       = ArticleParserHelper.GetHtmlDoc(url);

            if (null != doc)
            {
                HtmlNode rootNode    = doc.DocumentNode;
                string   xpathhdNews = "//div[@class='article-content']//p";

                HtmlNodeCollection newshdCollection = rootNode.SelectNodes(xpathhdNews);
                if (null == newshdCollection)
                {
                    return(new string[] { firstpara, content });
                }

                //Use the first non-empty paragraph as the lead paragraph
                foreach (HtmlNode wraperNode in newshdCollection)
                {
                    if (!string.IsNullOrEmpty(wraperNode.InnerText.Trim()))
                    {
                        firstpara = wraperNode.InnerText.Trim();
                        break;
                    }
                }

                //Concatenate every paragraph into the full body text
                foreach (HtmlNode wraperNode in newshdCollection)
                {
                    content += wraperNode.InnerText.Trim();
                }
            }

            return(new string[] { firstpara, content });
        }
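GetNewsBodyText returns a two-element array: the first non-empty paragraph and the concatenated body text. A hypothetical call site is sketched below; the NewsItem shape follows Example #4 (and the sketch at the end of this listing), and the URL is a placeholder, none of which comes from the original caller.

        //Hypothetical usage; NewsItem, its properties, and the URL are illustrative only.
        NewsItem news = new NewsItem { BaseUrl = "http://example.com/article.html" };
        string[] parts = GetNewsBodyText(news.BaseUrl);
        news.FirstPara = parts[0];  //first non-empty paragraph
        news.BodyText  = parts[1];  //all paragraphs concatenated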
Example #3
        public override IList <Comment> GetComments(NewsItem state)
        {
            List <Comment> comments = new List <Comment>();

            HtmlDocument doc = ArticleParserHelper.GetHtmlDoc(state.CommentUrl);

            if (null != doc)
            {
                HtmlNode rootNode    = doc.DocumentNode;
                string   xpathhdNews = "//div[@class='comment-content']";

                HtmlNodeCollection newshdCollection = rootNode.SelectNodes(xpathhdNews);
                if (null == newshdCollection)
                {
                    return(comments);
                }

                foreach (HtmlNode wraperNode in newshdCollection)
                {
                    Comment  c         = new Comment();
                    HtmlNode c_content = wraperNode.SelectSingleNode("./div[@class='content']");
                    if (null != c_content)
                    {
                        c.Cotent = c_content.InnerText.Trim();  //comment body text
                    }

                    HtmlNode c_vote = wraperNode.SelectSingleNode("./div[@class='comment_actions clearfix']/span[@class='action']/a[@class='comment_digg ']");
                    int      vote;
                    if (null != c_vote && int.TryParse(c_vote.InnerText.Trim(), out vote))
                    {
                        c.Vote = vote;  //up-vote ("digg") count; skipped when missing or non-numeric
                    }

                    comments.Add(c);
                }
            }

            return(comments);
        }
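The Comment type is not part of this listing. Judging only from the members assigned in Example #3, a minimal sketch would look like the following; the Cotent spelling is kept exactly as it appears in the source, and the real class may carry more members.

        //Minimal sketch inferred from Example #3; not the project's actual definition.
        public class Comment
        {
            public string Cotent { get; set; }  //comment body text (spelling as in the source)
            public int    Vote   { get; set; }  //up-vote ("digg") count
        }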
Example #4
        /// <summary>
        /// Crawl news keywords, first paragraph and body text.
        /// </summary>
        /// <param name="news">The news item to enrich with keywords, first paragraph and body text.</param>
        private void GetMoreNewsInfo(NewsItem news)
        {
            if (null == news)
            {
                return;  //nothing to enrich for a null item
            }

            System.Threading.Thread.Sleep(1000);  //throttle requests so the server does not block the crawler
            Console.WriteLine("Crawling url: {0}", news.BaseUrl);
            HtmlDocument doc = ArticleParserHelper.GetHtmlDoc(news.BaseUrl);

            if (null == doc)
            {
                return;
            }

            HtmlNode rootNode = doc.DocumentNode;

            news.Keywords = GetNewsKeywords(rootNode, " ");

            string[] content = GetNewsContent(rootNode);  //[0] = first paragraph, [1] = full body text
            news.FirstPara = content[0];
            news.BodyText  = content[1];
        }
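For orientation, the NewsItem members touched across Examples #1 to #4 suggest a shape roughly like the sketch below; the actual class is not included in this listing and may contain further members.

        //Sketch inferred only from the properties used in the examples above.
        public class NewsItem
        {
            public string BaseUrl    { get; set; }  //article URL, also used for deduplication (Example #1)
            public string CommentUrl { get; set; }  //comment page URL (Example #3)
            public string Keywords   { get; set; }  //keywords joined with a separator (Example #4)
            public string FirstPara  { get; set; }  //first non-empty paragraph of the article
            public string BodyText   { get; set; }  //full concatenated body text
        }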