/// <summary>
/// Collects the title, excerpt, content and image nodes of the article page at
/// <paramref name="articleUrl"/> using fixed XPath expressions.
/// </summary>
/// <param name="articleUrl">Address of the article page; handed to <c>Utilities.GetHtmlDocument</c>.</param>
/// <returns>
/// The populated <see cref="ArticleElementInfo"/>. The image and the content clean-up are only
/// performed when the content node was found.
/// </returns>
public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            var articleElement = new ArticleElementInfo();

            var doc = Utilities.GetHtmlDocument(articleUrl);

            // title
            articleElement.Title = Utilities._GetNode(doc, "//div[@class=\"chitiettin\"]", ".//h1");
            // excerpt
            articleElement.Excerpt = Utilities._GetNode(doc, "//div[@class=\"noidungchitiet\"]", ".//h2");
            // author is not extracted for this source
            articleElement.Author = null;
            // content
            articleElement.Content = Utilities._GetNode(doc, "//div[@class=\"noidungchitiet\"]");

            if (articleElement.Content != null)
            {
                // The excerpt <h2> is selected from the same div as the content, so it is stripped
                // from the content by its class name. The excerpt node or its class attribute can be
                // missing; in that case there is nothing to strip and the step is skipped.
                var excerptNode = articleElement.Excerpt;
                var classAttribute = excerptNode != null ? excerptNode.Attributes["class"] : null;
                if (classAttribute != null)
                {
                    articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, "//h2[@class=\"" + classAttribute.Value + "\"]");
                }

                articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, "//div[@class=\"thongtingame\"]");

                // get large image as thumbnail
                articleElement.Image = Utilities._GetNode(doc, "//div[@class=\"chitiettin\"]", ".//img");
            }

            return articleElement;
        }
Exemple #2
0
        /// <summary>
        /// Collects the title, content, lead image and excerpt of the article page at
        /// <paramref name="articleUrl"/> using fixed XPath expressions.
        /// </summary>
        /// <remarks>
        /// The excerpt is the first <c>p</c> found in the content, or the second one when the
        /// first is blank after trimming. When the content has no <c>p</c> at all the excerpt
        /// is left unset.
        /// </remarks>
        /// <param name="articleUrl">Address of the article page; handed to <c>Utilities.GetHtmlDocument</c>.</param>
        /// <returns>The populated <see cref="ArticleElementInfo"/>.</returns>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            var articleElement = new ArticleElementInfo();

            var doc = Utilities.GetHtmlDocument(articleUrl);

            // title
            articleElement.Title = Utilities._GetNode(doc, "//span[@class=\"news_title\"]");
            // author is not extracted for this source
            articleElement.Author = null;
            // content
            articleElement.Content = Utilities._GetNode(doc, "//td[@class=\"news_content\"]");

            if (articleElement.Content != null)
            {
                // get large image as thumbnail
                articleElement.Image = articleElement.Content.SelectSingleNode(".//img[1]");

                // SelectSingleNode yields null when nothing matches, so guard before reading InnerText.
                var firstParagraph = articleElement.Content.SelectSingleNode(".//p[1]");
                if (firstParagraph != null)
                {
                    if (firstParagraph.InnerText.Trim() != string.Empty)
                    {
                        articleElement.Excerpt = firstParagraph;

                        articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, "//*[@class=\"news_content\"]/p[1]");
                    }
                    else
                    {
                        // First paragraph is blank: fall back to the second one (may be null).
                        articleElement.Excerpt = articleElement.Content.SelectSingleNode(".//p[2]");

                        // NOTE(review): if _RemoveNodeForNode edits the node in place, p[2] is evaluated
                        // after p[1] is already gone — confirm it still targets the intended paragraph.
                        articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, "//*[@class=\"news_content\"]/p[1]");
                        articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, "//*[@class=\"news_content\"]/p[2]");
                    }
                }
            }

            return articleElement;
        }
Exemple #3
0
        /// <summary>
        /// Reads the article page at <paramref name="articleUrl"/> and picks out its title,
        /// excerpt, content and image nodes with fixed XPath expressions.
        /// </summary>
        /// <param name="articleUrl">Address of the article page; handed to <c>Utilities.GetHtmlDocument</c>.</param>
        /// <returns>
        /// The populated <see cref="ArticleElementInfo"/>; the image is only looked up when the
        /// content node was found.
        /// </returns>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            const string titlePath = "//h1[@class=\"title\"]";
            const string excerptPath = "//h2[@class=\"sapo\"]";
            const string contentPath = "//div[@class=\"detail-content\"]";
            const string imagePath = "//img[@class=\"img\"]";

            var result = new ArticleElementInfo();
            var page = Utilities.GetHtmlDocument(articleUrl);

            result.Title = Utilities._GetNode(page, titlePath);
            result.Excerpt = Utilities._GetNode(page, excerptPath);
            result.Author = null; // author is not extracted for this source
            result.Content = Utilities._GetNode(page, contentPath);

            // Without a content node there is nothing to clean up and no image is looked up.
            if (result.Content == null)
            {
                return result;
            }

            // Footer links, author line and source line are stripped from the content, in this order.
            var unwantedPaths = new[]
            {
                "//div[@type=\"link-content-footer\"]",
                "//p[@class=\"p-author\"]",
                "//p[@class=\"p-source\"]"
            };
            foreach (var unwantedPath in unwantedPaths)
            {
                result.Content = Utilities._RemoveNodeForNode(result.Content, unwantedPath);
            }

            // large image used as thumbnail
            result.Image = Utilities._GetNode(page, imagePath);

            return result;
        }
        /// <summary>
        /// Collects the title, excerpt, author, content, publish time, keywords and image of the
        /// article page at <paramref name="articleUrl"/> using fixed XPath expressions.
        /// </summary>
        /// <param name="articleUrl">Absolute address of the article page.</param>
        /// <returns>
        /// The populated <see cref="ArticleElementInfo"/>; the image lookup and content clean-up
        /// only happen when the content node was found.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="articleUrl"/> is null.</exception>
        /// <exception cref="UriFormatException"><paramref name="articleUrl"/> is not a valid absolute URI.</exception>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            // Constructed purely as an up-front validity check: the Uri constructor
            // throws for a null or malformed URL before the page is requested.
            new Uri(articleUrl);

            var articleElement = new ArticleElementInfo();
            var doc            = Utilities.GetHtmlDocument(articleUrl);

            // title
            articleElement.Title = Utilities._GetNode(doc, "//h1[@class=\"title_detail\"]", string.Empty);
            // excerpt
            articleElement.Excerpt = Utilities._GetNode(doc, "//div[@class=\"sapo_detail\"]", string.Empty);
            // author
            articleElement.Author = Utilities._GetNode(doc, "//p[@class=\"align-right\"]", string.Empty);
            // content
            articleElement.Content = Utilities._GetNode(doc, "//div[@id=\"content_detail_news\"]", string.Empty);
            // publish time
            articleElement.PublishedTime = Utilities._GetNode(doc, "//span[@class=\"time_index\"]", string.Empty);
            // keywords
            articleElement.Keyword = Utilities._GetNodes(doc, "//div[@class=\"tag_detail\"]//a//h3", string.Empty);

            if (articleElement.Content != null)
            {
                // get large image as thumbnail
                // NOTE(review): the XPath starts with "//", so it presumably resolves against the whole
                // document rather than the content subtree — confirm.
                articleElement.Image = articleElement.Content.SelectSingleNode("//meta[@property=\"og:image\"]");

                // Ad containers, author/signature lines, share widgets and scripts are stripped
                // from the content, in this order.
                string[] unwantedPaths =
                {
                    "//div[@id=\"bs-inread-container-wrapper\"]",
                    "//p[@class=\"align-right\"]",
                    "//p[@style=\"text-align: right;\"]",
                    "//div[@style=\"display: inline-block;width: 100%;overflow: hidden;\"]",
                    "//div[@class=\"share_detail pkg\"]",
                    "//div[@id=\"content_detail_news\"]//script",
                    "//div[@id=\"content_detail_news\"]//div[@style=\"display: inline-block;width: 100%;overflow: hidden;float: left;\"]"
                };
                foreach (string unwantedPath in unwantedPaths)
                {
                    articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, unwantedPath);
                }
            }

            return articleElement;
        }
        /// <summary>
        /// Collects the title, excerpt, author, content, publish time, keywords and image of the
        /// article page at <paramref name="articleUrl"/> using fixed XPath expressions.
        /// </summary>
        /// <param name="articleUrl">Absolute address of the article page.</param>
        /// <returns>
        /// The populated <see cref="ArticleElementInfo"/>; the image lookup and content clean-up
        /// only happen when the content node was found.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="articleUrl"/> is null.</exception>
        /// <exception cref="UriFormatException"><paramref name="articleUrl"/> is not a valid absolute URI.</exception>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            // Constructed purely as an up-front validity check: the Uri constructor
            // throws for a null or malformed URL before the page is requested.
            new Uri(articleUrl);

            var articleElement = new ArticleElementInfo();
            var doc            = Utilities.GetHtmlDocument(articleUrl);

            // title
            articleElement.Title = Utilities._GetNode(doc, "//h1", string.Empty);
            // excerpt
            articleElement.Excerpt = Utilities._GetNode(doc, "//h2", string.Empty);
            // author
            articleElement.Author = Utilities._GetNode(doc, "//span[@itemprop=\"author\"]", string.Empty);
            // content
            articleElement.Content = Utilities._GetNode(doc, "//div[@class=\"post-content fs15content pb10 pt10\"]", string.Empty);
            // publish time
            articleElement.PublishedTime = Utilities._GetNode(doc, "//span[@class=\"time f-elle-futura-book hidden-sm hidden-xs\"]", string.Empty);
            // keywords
            articleElement.Keyword = Utilities._GetNodes(doc, "//div[@class=\"col-md-10\"]//a[@class=\"tarhome fs10\"]", string.Empty);

            if (articleElement.Content != null)
            {
                // get large image as thumbnail (taken before the clean-up below)
                articleElement.Image = articleElement.Content.SelectSingleNode(".//img[1]");

                // Captions, spacer/related-post blocks, hidden divs and iframes are stripped
                // from the content, in this order.
                string[] unwantedPaths =
                {
                    ".//p//small",
                    ".//div[@class=\"mb10 mt10\"]",
                    ".//div[@class=\"row related-post-detail hidden-sm hidden-xs\"]",
                    ".//div[@style=\"height: 1px; width: 1px; display: none;\"]",
                    ".//p//iframe",
                    ".//iframe"
                };
                foreach (string unwantedPath in unwantedPaths)
                {
                    articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, unwantedPath);
                }
            }

            return articleElement;
        }
Exemple #6
0
        /// <summary>
        /// Reads the article page at <paramref name="articleUrl"/> and picks out its title,
        /// excerpt, content and image nodes with fixed XPath expressions.
        /// </summary>
        /// <param name="articleUrl">Address of the article page; handed to <c>Utilities.GetHtmlDocument</c>.</param>
        /// <returns>
        /// The populated <see cref="ArticleElementInfo"/>; the image is only looked up when the
        /// content node was found.
        /// </returns>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            const string titlePath = "//h1[@id=\"title\"]";
            const string excerptContainerPath = "//div[@class=\"content-article\"]";
            const string excerptPath = ".//div[@class=\"lead\"]";
            const string contentPath = "//div[@id=\"content\"]";

            var result = new ArticleElementInfo();
            var page = Utilities.GetHtmlDocument(articleUrl);

            result.Title = Utilities._GetNode(page, titlePath);
            result.Excerpt = Utilities._GetNode(page, excerptContainerPath, excerptPath);
            result.Author = null; // author is not extracted for this source
            result.Content = Utilities._GetNode(page, contentPath);

            // No content node means no thumbnail lookup either.
            if (result.Content == null)
            {
                return result;
            }

            // first image inside the content serves as the thumbnail
            result.Image = result.Content.SelectSingleNode(".//img[1]");

            return result;
        }
Exemple #7
0
        /// <summary>
        /// Reads the article page at <paramref name="articleUrl"/> and picks out its title,
        /// excerpt, content and image nodes with fixed XPath expressions.
        /// </summary>
        /// <param name="articleUrl">Address of the article page; handed to <c>Utilities.GetHtmlDocument</c>.</param>
        /// <returns>
        /// The populated <see cref="ArticleElementInfo"/>; the image is only looked up when the
        /// content node was found.
        /// </returns>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            const string titlePath = "//h1[@class=\"baiviet-title\"]";
            const string excerptPath = "//p[@class=\"baiviet-sapo\"]";
            // The class value is matched verbatim, including its leading space.
            const string contentPath = "//div[@class=\" text-conent\"]";
            const string imagePath = "//img[@class=\"news-image\"]";

            var result = new ArticleElementInfo();
            var page = Utilities.GetHtmlDocument(articleUrl);

            result.Title = Utilities._GetNode(page, titlePath);
            result.Excerpt = Utilities._GetNode(page, excerptPath);
            result.Author = null; // author is not extracted for this source
            result.Content = Utilities._GetNode(page, contentPath);

            // No content node means no thumbnail lookup either.
            if (result.Content == null)
            {
                return result;
            }

            // large image used as thumbnail
            result.Image = Utilities._GetNode(page, imagePath);

            return result;
        }
        /// <summary>
        /// Collects the title, excerpt, content, publish time and image of the article page at
        /// <paramref name="articleUrl"/> using fixed XPath expressions.
        /// </summary>
        /// <remarks>
        /// The publish time is read as text from <c>div#date</c>. When it matches the pattern
        /// "&lt;text&gt; n/n/n, n:n&lt;text&gt;" it is rewritten as "c-b-a h:m:00", where a/b/c are the
        /// three slash-separated numbers in their original order (presumably day/month/year —
        /// confirm against the site). Otherwise the raw text is kept. Either way it is wrapped
        /// in a <c>span</c> node.
        /// </remarks>
        /// <param name="articleUrl">Address of the article page; handed to <c>Utilities.GetHtmlDocument</c>.</param>
        /// <returns>The populated <see cref="ArticleElementInfo"/>.</returns>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            var articleElement = new ArticleElementInfo();

            var doc = Utilities.GetHtmlDocument(articleUrl);

            // title
            articleElement.Title = Utilities._GetNode(doc, "//h1[@id=\"title\"]");
            // excerpt
            articleElement.Excerpt = Utilities._GetNode(doc, "//*[@id=\"content\"]/div[1]");
            // author is not extracted for this source
            articleElement.Author = null;
            // content
            articleElement.Content = Utilities._GetNode(doc, "//div[@id=\"content\"]");

            // publish time: fall back to an empty string so a missing date node cannot
            // make the regex call throw on a null input.
            var pubdate = Utilities._GetNodeInnerText(doc, "//div[@id=\"date\"]") ?? string.Empty;
            var match = System.Text.RegularExpressions.Regex.Match(pubdate, @"(.+)\s(\d+)\/(\d+)\/(\d+),\s(\d+):(\d+)(.+)$", System.Text.RegularExpressions.RegexOptions.IgnoreCase);

            if (match.Success)
            {
                // Groups 2-4 are the slash-separated date numbers, 5 and 6 the hour and minute.
                pubdate = string.Format("{0}-{1}-{2} {3}:{4}:00", match.Groups[4].Value, match.Groups[3].Value, match.Groups[2].Value, match.Groups[5].Value, match.Groups[6].Value);
            }
            articleElement.PublishedTime = HtmlNode.CreateNode("<span>" + pubdate + "</span>");

            if (articleElement.Content != null)
            {
                // get large image as thumbnail (taken before the clean-up below)
                articleElement.Image = articleElement.Content.SelectSingleNode(".//img[1]");

                // NOTE(review): if _RemoveNodeForNode edits the node in place, the positional
                // p[2]/p[3] selectors are evaluated after earlier paragraphs are already gone —
                // confirm they still hit the intended paragraphs.
                string[] unwantedPaths =
                {
                    "//*[@id=\"content\"]/p[1]",
                    "//*[@id=\"content\"]/p[2]",
                    "//*[@id=\"content\"]/p[3]",
                    "//*[@id=\"content\"]/div[1]",
                    ".//table[@class=\"rl box leftside\"]",
                    ".//table[@class=\"rl center\"]"
                };
                foreach (string unwantedPath in unwantedPaths)
                {
                    articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, unwantedPath);
                }
            }

            return articleElement;
        }
        /// <summary>
        /// Collects the title, excerpt, content and image of the article page at
        /// <paramref name="articleUrl"/> using fixed XPath expressions.
        /// </summary>
        /// <remarks>
        /// The excerpt is built from the text of <c>h2.lead</c>; when that text contains a '-',
        /// everything up to and including the first '-' is dropped and the rest is trimmed.
        /// When <c>h2.lead</c> is missing the excerpt is left unset.
        /// </remarks>
        /// <param name="articleUrl">Address of the article page; handed to <c>Utilities.GetHtmlDocument</c>.</param>
        /// <returns>The populated <see cref="ArticleElementInfo"/>.</returns>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            var articleElement = new ArticleElementInfo();

            var doc = Utilities.GetHtmlDocument(articleUrl);

            // title
            articleElement.Title = Utilities._GetNode(doc, "//h1[@class=\"title\"]");

            // excerpt: guard against a page without the lead heading
            var leadNode = Utilities._GetNode(doc, "//h2[@class=\"lead\"]");
            if (leadNode != null)
            {
                string leadText = leadNode.InnerText;

                // Single ordinal scan for the first dash.
                int dashIndex = leadText.IndexOf('-');
                if (dashIndex >= 0)
                {
                    leadText = leadText.Substring(dashIndex + 1).Trim();
                }
                articleElement.Excerpt = Utilities._CreateNodeFromString(leadText);
            }

            // author is not extracted for this source
            articleElement.Author = null;
            // content
            articleElement.Content = Utilities._GetNode(doc, "//div[@id=\"detail\"]");

            if (articleElement.Content != null)
            {
                // get large image as thumbnail (taken before the clean-up below)
                articleElement.Image = articleElement.Content.SelectSingleNode(".//img[1]");

                // Side bar, repeated title/lead, related list, ad/player containers and the
                // trailing <center> element are stripped from the content, in this order.
                string[] unwantedPaths =
                {
                    "//*[@class=\"bar-left_th\"]",
                    "//h1[@class=\"title\"]",
                    "//h2[@class=\"lead\"]",
                    "//ul[@class=\"ul_relate\"]",
                    "//div[@id=\"AdAsia\"]",
                    "//div[@id=\"itvcplayer\"]",
                    "//ins",
                    "//div[@style=\"height:30px;margin-right:10px;float:right\"]",
                    "(//center)[last()]"
                };
                foreach (string unwantedPath in unwantedPaths)
                {
                    articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, unwantedPath);
                }
            }

            return articleElement;
        }
Exemple #10
0
        /// <summary>
        /// Collects the title, excerpt, content and image of the article page at
        /// <paramref name="articleUrl"/> using fixed XPath expressions.
        /// </summary>
        /// <remarks>
        /// The first image inside the content becomes the thumbnail and is detached from the
        /// content; content without any image is left as is.
        /// </remarks>
        /// <param name="articleUrl">Address of the article page; handed to <c>Utilities.GetHtmlDocument</c>.</param>
        /// <returns>The populated <see cref="ArticleElementInfo"/>.</returns>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            var articleElement = new ArticleElementInfo();

            var doc = Utilities.GetHtmlDocument(articleUrl);

            // title
            articleElement.Title = Utilities._GetNode(doc, "//h1[@id=\"title-h1\"]");
            // excerpt
            // NOTE(review): the inner XPath is "//h2" rather than ".//h2"; depending on how
            // Utilities._GetNode applies it, this may match an h2 outside the sapo div — confirm.
            articleElement.Excerpt = Utilities._GetNode(doc, "//div[@class=\"sapo-news-detail\"]", "//h2");
            // author is not extracted for this source
            articleElement.Author = null;
            // content
            articleElement.Content = Utilities._GetNode(doc, "//div[@id=\"content-id\"]");

            if (articleElement.Content != null)
            {
                // get large image as thumbnail, then take it out of the content.
                // SelectSingleNode yields null when the content has no image, so guard the removal.
                var leadImage = articleElement.Content.SelectSingleNode(".//img[1]");
                articleElement.Image = leadImage;
                if (leadImage != null)
                {
                    leadImage.Remove();
                }

                articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, "//div[@class=\"sapo-news-detail\"]");
            }

            return articleElement;
        }
Exemple #11
0
        /// <summary>
        /// Collects the title, excerpt, author, content, publish time, keywords and image of the
        /// article page at <paramref name="articleUrl"/> using fixed XPath expressions.
        /// </summary>
        /// <param name="articleUrl">Absolute address of the article page.</param>
        /// <returns>
        /// The populated <see cref="ArticleElementInfo"/>; the image lookup and content clean-up
        /// only happen when the content node was found.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="articleUrl"/> is null.</exception>
        /// <exception cref="UriFormatException"><paramref name="articleUrl"/> is not a valid absolute URI.</exception>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            // Constructed purely as an up-front validity check: the Uri constructor
            // throws for a null or malformed URL before the page is requested.
            new Uri(articleUrl);

            var articleElement = new ArticleElementInfo();
            var doc            = Utilities.GetHtmlDocument(articleUrl);

            // title
            articleElement.Title = Utilities._GetNode(doc, "//h1[@class=\"title\"]", string.Empty);
            // excerpt
            articleElement.Excerpt = Utilities._GetNode(doc, "//h2[@class=\"sapo\"]", string.Empty);
            // author
            articleElement.Author = Utilities._GetNode(doc, "//p[@class=\"dateandcat\"]", string.Empty);
            // content
            articleElement.Content = Utilities._GetNode(doc, "//div[@class=\"contentdetail\"]", string.Empty);
            // publish time
            articleElement.PublishedTime = Utilities._GetNode(doc, "//p[@class=\"dateandcat\"]//span", string.Empty);
            // keywords
            // NOTE(review): "undefined" looks like an unfilled selector placeholder; as an XPath it
            // only matches elements literally named "undefined" — confirm the intended selector.
            articleElement.Keyword = Utilities._GetNodes(doc, "undefined", string.Empty);

            if (articleElement.Content != null)
            {
                // get large image as thumbnail
                // NOTE(review): the XPath starts with "//", so it presumably resolves against the whole
                // document rather than the content subtree — confirm.
                articleElement.Image   = articleElement.Content.SelectSingleNode("//meta[@property=\"og:image\"]");
                articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, "//p//strong//span[@class=\"entity _586o\"]");
            }

            return articleElement;
        }
        /// <summary>
        /// Collects the title, excerpt, author, content, publish time, keywords and image of the
        /// article page at <paramref name="articleUrl"/> using fixed XPath expressions.
        /// </summary>
        /// <param name="articleUrl">Absolute address of the article page.</param>
        /// <returns>
        /// The populated <see cref="ArticleElementInfo"/>; the image is only looked up when the
        /// content node was found.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="articleUrl"/> is null.</exception>
        /// <exception cref="UriFormatException"><paramref name="articleUrl"/> is not a valid absolute URI.</exception>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            // Constructed purely as an up-front validity check: the Uri constructor
            // throws for a null or malformed URL before the page is requested.
            new Uri(articleUrl);

            var articleElement = new ArticleElementInfo();
            var doc            = Utilities.GetHtmlDocument(articleUrl);

            // title
            articleElement.Title = Utilities._GetNode(doc, "//h1", string.Empty);
            // excerpt
            articleElement.Excerpt = Utilities._GetNode(doc, "//p[@class=\"the-article-summary cms-desc\"]", string.Empty);
            // author
            articleElement.Author = Utilities._GetNode(doc, "//div[@class=\"the-article-credit\"]//p[@class=\"author\"]", string.Empty);
            // content
            articleElement.Content = Utilities._GetNode(doc, "//div[@class=\"the-article-body cms-body\"]", string.Empty);
            // publish time
            articleElement.PublishedTime = Utilities._GetNode(doc, "//ul[@class=\"the-article-meta\"]//li[@class=\"the-article-publish cms-date\"]", string.Empty);
            // keywords
            articleElement.Keyword = Utilities._GetNodes(doc, "//p[@class=\"the-article-tags\"]//a", string.Empty);

            if (articleElement.Content != null)
            {
                // get large image as thumbnail
                // NOTE(review): the XPath starts with "//", so it presumably resolves against the whole
                // document rather than the content subtree — confirm.
                articleElement.Image = articleElement.Content.SelectSingleNode("//meta[@property=\"og:image\"]");
            }

            return articleElement;
        }
Exemple #13
0
        /// <summary>
        /// Collects the title, excerpt, author, content, publish time, keywords and image of the
        /// article page at <paramref name="articleUrl"/> using fixed XPath expressions.
        /// </summary>
        /// <param name="articleUrl">Absolute address of the article page.</param>
        /// <returns>
        /// The populated <see cref="ArticleElementInfo"/>; the image lookup and content clean-up
        /// only happen when the content node was found.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="articleUrl"/> is null.</exception>
        /// <exception cref="UriFormatException"><paramref name="articleUrl"/> is not a valid absolute URI.</exception>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            // Constructed purely as an up-front validity check: the Uri constructor
            // throws for a null or malformed URL before the page is requested.
            new Uri(articleUrl);

            var articleElement = new ArticleElementInfo();
            var doc            = Utilities.GetHtmlDocument(articleUrl);

            // title
            articleElement.Title = Utilities._GetNode(doc, "//div[@class=\"details-wrap\"]//h1", string.Empty);
            // excerpt
            articleElement.Excerpt = Utilities._GetNode(doc, "//div[@class=\"sapo cms-desc\"]", string.Empty);
            // author
            articleElement.Author = Utilities._GetNode(doc, "//h4[@class=\"name cms-author\"]", string.Empty);
            // content
            articleElement.Content = Utilities._GetNode(doc, "//div[@id=\"main_detail\"]", string.Empty);
            // publish time
            articleElement.PublishedTime = Utilities._GetNode(doc, "//time[@class=\"cms-date\"]", string.Empty);
            // keywords
            articleElement.Keyword = Utilities._GetNodes(doc, "//ul[@class=\"tags clearfix\"]//li//a", string.Empty);

            if (articleElement.Content != null)
            {
                // get large image as thumbnail
                // NOTE(review): the XPath starts with "//", so it presumably resolves against the whole
                // document rather than the content subtree — confirm.
                articleElement.Image   = articleElement.Content.SelectSingleNode("//meta[@property=\"og:image\"]");
                articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, "//div[@id=\"LavaNetwork\"]");
            }

            return articleElement;
        }
Exemple #14
0
        /// <summary>
        /// Collects the title, excerpt, author, content, publish time, keywords and image of the
        /// article page at <paramref name="articleUrl"/> using fixed XPath expressions.
        /// </summary>
        /// <param name="articleUrl">Absolute address of the article page.</param>
        /// <returns>
        /// The populated <see cref="ArticleElementInfo"/>; the image lookup and content clean-up
        /// only happen when the content node was found.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="articleUrl"/> is null.</exception>
        /// <exception cref="UriFormatException"><paramref name="articleUrl"/> is not a valid absolute URI.</exception>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            // Constructed purely as an up-front validity check: the Uri constructor
            // throws for a null or malformed URL before the page is requested.
            new Uri(articleUrl);

            var articleElement = new ArticleElementInfo();
            var doc            = Utilities.GetHtmlDocument(articleUrl);

            // title
            articleElement.Title = Utilities._GetNode(doc, "//article//h1", string.Empty);
            // excerpt
            articleElement.Excerpt = Utilities._GetNode(doc, "//div[@class=\"news_desc\"]//h2", string.Empty);
            // author
            articleElement.Author = Utilities._GetNode(doc, "//a[@class=\"author\"]", string.Empty);
            // content
            articleElement.Content = Utilities._GetNode(doc, "//div[@class=\"box_content_detail\"]", string.Empty);
            // publish time
            // NOTE(review): this is the only lookup here using the two-argument _GetNode overload
            // (no string.Empty) — confirm the difference is intentional.
            articleElement.PublishedTime = Utilities._GetNode(doc, "//span[@class=\"time_up\"]");
            // keywords
            articleElement.Keyword = Utilities._GetNodes(doc, "//div[@class=\"box_tags\"]//a", string.Empty);

            if (articleElement.Content != null)
            {
                // get large image as thumbnail
                // NOTE(review): the XPath starts with "//", so it presumably resolves against the whole
                // document rather than the content subtree — confirm.
                articleElement.Image   = articleElement.Content.SelectSingleNode("//meta[@property=\"og:image\"]");
                articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, "//div[@class=\"box_retale_detail_delay\"]");
            }

            return articleElement;
        }