/// <summary>
        /// Loads the page at <paramref name="articleUrl"/> and picks out the article's
        /// title, excerpt, author, content, publish time, keyword and image nodes via XPath.
        /// </summary>
        /// <param name="articleUrl">Absolute URL of the article page.</param>
        /// <returns>The element holder filled with whatever the XPath lookups returned.</returns>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            // Host with dots turned into underscores; it is computed here but not read again in this method.
            string hostKey = new Uri(articleUrl).Host;
            if (!string.IsNullOrEmpty(hostKey))
            {
                hostKey = hostKey.Replace(".", "_").ToLower().Trim();
            }

            var result = new ArticleElementInfo();
            var page = Utilities.GetHtmlDocument(articleUrl);

            // Headline and lead paragraph.
            result.Title = Utilities._GetNode(page, "//h1[@class=\"news-title\"]", string.Empty);
            result.Excerpt = Utilities._GetNode(page, "//h2[@class=\"news-sapo\"]", string.Empty);

            // Author and publish time are both looked up under the same "news-info" paragraph.
            result.Author = Utilities._GetNode(page, "//p[@class=\"news-info\"]", string.Empty);

            // Article body.
            result.Content = Utilities._GetNode(page, "//div[@class=\"clearfix news-content\"]", string.Empty);

            result.PublishedTime = Utilities._GetNode(page, "//p[@class=\"news-info\"]", ".//time[@class=\"op-published\"]");

            // Tag links.
            result.Keyword = Utilities._GetNodes(page, "//div[@class=\"clearfix mgt20 tags\"]//h3//a", string.Empty);

            // Without a body there is no image lookup and nothing to clean up.
            if (result.Content == null)
            {
                return result;
            }

            // The og:image meta tag serves as the large thumbnail.
            result.Image = result.Content.SelectSingleNode("//meta[@property=\"og:image\"]");

            // Drop the footer link block from the body.
            result.Content = Utilities._RemoveNodeForNode(result.Content, "div[@class=\"VCSortableInPreviewMode link-content-footer IMSCurrentEditorEditObject\"]");

            return result;
        }
        /// <summary>
        /// Loads the page at <paramref name="articleUrl"/> and picks out the article's
        /// title, excerpt, author, content, publish time, keyword and image nodes via XPath.
        /// A relative og:image URL is rewritten to an absolute one under http://benh.vn.
        /// </summary>
        /// <param name="articleUrl">Absolute URL of the article page.</param>
        /// <returns>The element holder filled with whatever the XPath lookups returned.</returns>
        /// <exception cref="UriFormatException"><paramref name="articleUrl"/> is not a valid absolute URI.</exception>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            const string siteRoot = "http://benh.vn";

            // Kept only for its validation effect: a malformed URL still fails here, as it did before.
            new Uri(articleUrl);

            var articleElement = new ArticleElementInfo();
            var doc            = Utilities.GetHtmlDocument(articleUrl);

            //title
            articleElement.Title = Utilities._GetNode(doc, "//h1", string.Empty);
            //excerpt
            articleElement.Excerpt = Utilities._GetNode(doc, "//div[@class=\"news-desc\"]", string.Empty);
            //author
            articleElement.Author = Utilities._GetNode(doc, "//p[@class=\"undefined\"]", string.Empty);
            //content
            articleElement.Content = Utilities._GetNode(doc, "//div[@class=\"maincontent\"]", string.Empty);
            //publish time
            articleElement.PublishedTime = Utilities._GetNode(doc, "//div[@class=\"ns-time\"]", string.Empty);
            //keywords
            articleElement.Keyword = Utilities._GetNodes(doc, "//div[@class=\"box-tag\"]//div[@class=\"bt-content\"]//a", string.Empty);

            if (articleElement.Content != null)
            {
                // The og:image meta tag serves as the large thumbnail.
                articleElement.Image   = articleElement.Content.SelectSingleNode("//meta[@property=\"og:image\"]");
                articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, "//div[@class=\"maincontent\"]//div[@style=\"text-align:right\"]");

                // SelectSingleNode yields null when the page has no og:image tag, and the
                // attribute indexer yields null when "content" is missing, so both are checked
                // before the URL is touched.
                var imageUrl = articleElement.Image != null ? articleElement.Image.Attributes["content"] : null;
                if (imageUrl != null && imageUrl.Value != null && !imageUrl.Value.Contains("http"))
                {
                    // Make a site-relative path absolute, adding the slash only when it is absent.
                    string separator = imageUrl.Value.StartsWith("/", StringComparison.Ordinal) ? string.Empty : "/";
                    imageUrl.Value = siteRoot + separator + imageUrl.Value;
                }
            }

            return articleElement;
        }
        /// <summary>
        /// Loads the page at <paramref name="articleUrl"/> and picks out the article's
        /// title, excerpt, author, content, publish time, keyword and image nodes via XPath.
        /// A relative og:image URL is rewritten to an absolute one under http://benh.vn.
        /// </summary>
        /// <param name="articleUrl">Absolute URL of the article page.</param>
        /// <returns>The element holder filled with whatever the XPath lookups returned.</returns>
        /// <exception cref="UriFormatException"><paramref name="articleUrl"/> is not a valid absolute URI.</exception>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            // NOTE(review): this root is copied from the original code; confirm it is the right
            // host for the site this extractor targets.
            const string siteRoot = "http://benh.vn";

            // Kept only for its validation effect: a malformed URL still fails here, as it did before.
            new Uri(articleUrl);

            var articleElement = new ArticleElementInfo();
            var doc            = Utilities.GetHtmlDocument(articleUrl);

            //title
            articleElement.Title = Utilities._GetNode(doc, "//h1[@class=\"title-content\"]", string.Empty);
            //excerpt
            articleElement.Excerpt = Utilities._GetNode(doc, "//div[@itemtype=\"http://schema.org/WebPageElement\"]//div//strong//p", string.Empty);
            //author
            articleElement.Author = Utilities._GetNode(doc, "//p[@class=\"undefined\"]", string.Empty);
            //content
            articleElement.Content = Utilities._GetNode(doc, "//div[@itemprop=\"description\"]", string.Empty);
            //publish time
            articleElement.PublishedTime = Utilities._GetNode(doc, "//time[@class=\"clearfix\"]", string.Empty);
            //keywords
            articleElement.Keyword = Utilities._GetNodes(doc, "//ul[@class=\"td-tags td-post-small-box clearfix\"]//li//a", string.Empty);

            if (articleElement.Content != null)
            {
                // The og:image meta tag serves as the large thumbnail.
                articleElement.Image = articleElement.Content.SelectSingleNode("//meta[@property=\"og:image\"]");

                // SelectSingleNode yields null when the page has no og:image tag, and the
                // attribute indexer yields null when "content" is missing, so both are checked
                // before the URL is touched.
                var imageUrl = articleElement.Image != null ? articleElement.Image.Attributes["content"] : null;
                if (imageUrl != null && imageUrl.Value != null && !imageUrl.Value.Contains("http"))
                {
                    // Make a site-relative path absolute, adding the slash only when it is absent.
                    string separator = imageUrl.Value.StartsWith("/", StringComparison.Ordinal) ? string.Empty : "/";
                    imageUrl.Value = siteRoot + separator + imageUrl.Value;
                }
            }

            return articleElement;
        }
        /// <summary>
        /// Loads the page at <paramref name="articleUrl"/>, picks out the article's nodes via
        /// XPath, and removes a fixed list of unwanted sub-nodes from the content node.
        /// </summary>
        /// <param name="articleUrl">Absolute URL of the article page.</param>
        /// <returns>The element holder filled with whatever the XPath lookups returned.</returns>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            // Host with dots turned into underscores; it is computed here but not read again in this method.
            string hostKey = new Uri(articleUrl).Host;
            if (!string.IsNullOrEmpty(hostKey))
            {
                hostKey = hostKey.Replace(".", "_").ToLower().Trim();
            }

            var result = new ArticleElementInfo();
            var page = Utilities.GetHtmlDocument(articleUrl);

            result.Title = Utilities._GetNode(page, "//h1[@class=\"newstitle\"]", string.Empty);
            result.Excerpt = Utilities._GetNode(page, "//h2[@class=\"newsAbtract\"]", string.Empty);
            result.Author = Utilities._GetNode(page, "//div[@class=\"author\"]", string.Empty);
            result.Content = Utilities._GetNode(page, "//div[@class=\"content\"]", string.Empty);
            result.PublishedTime = Utilities._GetNode(page, "//span[@class=\"newsdate\"]", string.Empty);
            result.Keyword = Utilities._GetNodes(page, "//h3[@class=\"keyword\"]//a", string.Empty);

            // Without a body there is no image lookup and nothing to clean up.
            if (result.Content == null)
            {
                return result;
            }

            // The og:image meta tag serves as the large thumbnail.
            result.Image = result.Content.SelectSingleNode("//meta[@property=\"og:image\"]");

            // Sub-nodes stripped from the body, applied in this order.
            string[] unwantedXPaths =
            {
                "//div[@class=\"content\"]//h1",
                "//div[@class=\"content\"]//h2",
                "//div[@class=\"content\"]//div[@style=\"padding-top:10px; padding-bottom: 10px; background-color: #fff;border-bottom: solid 1px silver; border-top: solid 1px silver;\"]",
                "//div[@class=\"content\"]//iframe",
                "//div[@class=\"content\"]//h3",
                "//div[@class=\"content\"]//div[@class=\"faq-listnext10\"]",
                "//div[@class=\"content\"]//div[@class=\"mtp10\"]",
                "//div[@class=\"content\"]//table",
                "//div[@class=\"content\"]//div[@class=\"news-listnext10\"]",
                "//div[@class=\"content\"]//script"
            };

            foreach (string xpath in unwantedXPaths)
            {
                result.Content = Utilities._RemoveNodeForNode(result.Content, xpath);
            }

            return result;
        }
        /// <summary>
        /// Loads the page at <paramref name="articleUrl"/>, picks out the article's nodes via
        /// XPath, and removes a fixed list of unwanted sub-nodes from the content node.
        /// </summary>
        /// <param name="articleUrl">Absolute URL of the article page.</param>
        /// <returns>The element holder filled with whatever the XPath lookups returned.</returns>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            Uri parsedUrl = new Uri(articleUrl);

            // Host with dots turned into underscores; it is computed here but not read again in this method.
            string hostKey = parsedUrl.Host;
            if (!string.IsNullOrEmpty(hostKey))
            {
                hostKey = hostKey.Replace(".", "_").ToLower().Trim();
            }

            var info = new ArticleElementInfo();
            var html = Utilities.GetHtmlDocument(articleUrl);

            // Headline.
            info.Title = Utilities._GetNode(html, "//h1[@class=\"title\"]", string.Empty);
            // Lead: the bold span in the first paragraph of the detail block.
            info.Excerpt = Utilities._GetNode(html, "//div[@id=\"detail_content\"]//p[1]//span//b", string.Empty);
            // Author line: the right-aligned paragraph in the detail block.
            info.Author = Utilities._GetNode(html, "//div[@id=\"detail_content\"]//p[@style=\"TEXT-ALIGN: right\"]", string.Empty);
            // Article body.
            info.Content = Utilities._GetNode(html, "//div[@id=\"detail_content\"]", string.Empty);
            // Publish time.
            info.PublishedTime = Utilities._GetNode(html, "//div[@class=\"bar_video width_common detail_page\"]//p[@class=\"p_time\"]", string.Empty);
            // Tag links.
            info.Keyword = Utilities._GetNodes(html, "//div[@class=\"tag_detail width_common\"]//a[@class=\"item_tag\"]", string.Empty);

            if (info.Content != null)
            {
                // The og:image meta tag serves as the large thumbnail.
                info.Image = info.Content.SelectSingleNode("//meta[@property=\"og:image\"]");

                // Sub-nodes stripped from the body, applied in this order.
                string[] unwantedXPaths =
                {
                    "//div[@id=\"detail_content\"]//p[@style=\"margin:0px 0px;font-family: Helvetica Neue, Helvetica, Arial, sans-serif;font-weight: 500;\"]",
                    "//div[@id=\"detail_content\"]//p[@style=\"TEXT-ALIGN: right\"][2]",
                    "//div[@id=\"detail_content\"]//div[@style=\"text-align:center\"]",
                    "//div[@id=\"detail_content\"]//div[@class=\"like_detail_bottom social-skgd\"]",
                    "//div[@id=\"detail_content\"]//div[@class=\"tag_detail width_common\"]",
                    "//div[@id=\"detail_content\"]//div[@class=\"width_common noborder\"]",
                    "//div[@id=\"detail_content\"]//div[@class=\"width_common phantrang\"]",
                    "//div[@id=\"detail_content\"]//ins[@class=\"adsbygoogle\"]",
                    "//div[@id=\"detail_content\"]//script"
                };

                for (int index = 0; index < unwantedXPaths.Length; index++)
                {
                    info.Content = Utilities._RemoveNodeForNode(info.Content, unwantedXPaths[index]);
                }
            }

            return info;
        }
Exemple #6
0
        /// <summary>
        /// Loads the page at <paramref name="articleUrl"/>, picks out the article's nodes via
        /// XPath, and removes unwanted sub-nodes from the excerpt and content nodes.
        /// </summary>
        /// <param name="articleUrl">Absolute URL of the article page.</param>
        /// <returns>The element holder filled with whatever the XPath lookups returned.</returns>
        /// <exception cref="UriFormatException"><paramref name="articleUrl"/> is not a valid absolute URI.</exception>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            // Kept only for its validation effect: a malformed URL still fails here, as it did before.
            new Uri(articleUrl);

            var articleElement = new ArticleElementInfo();
            var doc            = Utilities.GetHtmlDocument(articleUrl);

            //title
            articleElement.Title = Utilities._GetNode(doc, "//h1[@class=\"article-title\"]", string.Empty);

            //excerpt
            articleElement.Excerpt = Utilities._GetNode(doc, "//blockquote[@class=\"article-intro\"]", string.Empty);
            // The excerpt gets the same null guard as the content below: the lookup may find
            // nothing, and the clean-up helper is only given a real node.
            if (articleElement.Excerpt != null)
            {
                articleElement.Excerpt = Utilities._RemoveNodeForNode(articleElement.Excerpt, "//blockquote[@class=\"article-intro\"]//div");
            }

            //author
            articleElement.Author = Utilities._GetNode(doc, "//p[@style=\"text-align:right\"]//strong", string.Empty);
            //content
            articleElement.Content = Utilities._GetNode(doc, "//section[@class=\"article-content\"]", string.Empty);
            //publish time
            articleElement.PublishedTime = Utilities._GetNode(doc, "//p[@class=\"time \"]", string.Empty);
            //keywords
            articleElement.Keyword = Utilities._GetNodes(doc, "//div[@class=\"tags\"]//span//a", string.Empty);

            if (articleElement.Content != null)
            {
                // The og:image meta tag serves as the large thumbnail.
                articleElement.Image   = articleElement.Content.SelectSingleNode("//meta[@property=\"og:image\"]");

                // Strip the related-links list, the two in-read containers, the source line and the tag box.
                articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, "//section[@class=\"article-content\"]//ul[@class=\"ul_relate\"]");
                articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, "//section[@class=\"article-content\"]//div[@id=\"bs-inread-container\"]");
                articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, "//section[@class=\"article-content\"]//div[@id=\"bsinread\"]");
                articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, "//section[@class=\"article-content\"]//div[@class=\"bysource\"]");
                articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, "//section[@class=\"article-content\"]//div[@class=\"tags\"]");
            }

            return articleElement;
        }
        /// <summary>
        /// Loads the page at <paramref name="articleUrl"/>, picks out the article's nodes via
        /// XPath, and removes a fixed list of unwanted sub-nodes from the content node.
        /// </summary>
        /// <param name="articleUrl">Absolute URL of the article page.</param>
        /// <returns>The element holder filled with whatever the XPath lookups returned.</returns>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            // Host with dots turned into underscores; it is computed here but not read again in this method.
            string hostKey = new Uri(articleUrl).Host;
            bool hasHost = !string.IsNullOrEmpty(hostKey);
            if (hasHost)
            {
                hostKey = hostKey.Replace(".", "_").ToLower().Trim();
            }

            var extracted = new ArticleElementInfo();
            var page = Utilities.GetHtmlDocument(articleUrl);

            // Headline.
            extracted.Title = Utilities._GetNode(page, "//h1[@class=\"entry-title\"]", string.Empty);
            // Lead: the bold text of the justified h4.
            extracted.Excerpt = Utilities._GetNode(page, "//h4[@style=\"text-align: justify;\"]//strong", string.Empty);
            // Author: the emphasised text of the right-aligned paragraph.
            extracted.Author = Utilities._GetNode(page, "//p[@style=\"text-align: right;\"]//em", string.Empty);
            // Article body.
            extracted.Content = Utilities._GetNode(page, "//div[@class=\"td-post-content td-pb-padding-side\"]", string.Empty);
            // Publish time.
            extracted.PublishedTime = Utilities._GetNode(page, "//div[@class=\"time_detail_news\"]", string.Empty);
            // Tag links.
            extracted.Keyword = Utilities._GetNodes(page, "//ul[@class=\"td-tags td-post-small-box clearfix\"]//li//a", string.Empty);

            // Without a body there is no image lookup and nothing to clean up.
            if (extracted.Content == null)
            {
                return extracted;
            }

            // The og:image meta tag serves as the large thumbnail.
            extracted.Image = extracted.Content.SelectSingleNode("//meta[@property=\"og:image\"]");

            // Sub-nodes stripped from the body, applied in this order.
            string[] unwantedXPaths =
            {
                "//div[@class=\"td-post-content td-pb-padding-side\"]//div",
                "//div[@class=\"td-post-content td-pb-padding-side\"]//h4",
                "//div[@class=\"td-post-content td-pb-padding-side\"]//p[@style=\"text-align: right;\"]//em",
                "//div[@class=\"td-post-content td-pb-padding-side\"]//iframe",
                "//div[@class=\"td-post-content td-pb-padding-side\"]//script"
            };

            foreach (string xpath in unwantedXPaths)
            {
                extracted.Content = Utilities._RemoveNodeForNode(extracted.Content, xpath);
            }

            return extracted;
        }
        /// <summary>
        /// Loads the page at <paramref name="articleUrl"/>, picks out the article's nodes via
        /// XPath, and removes unwanted sub-nodes from the excerpt and content nodes.
        /// </summary>
        /// <param name="articleUrl">Absolute URL of the article page.</param>
        /// <returns>The element holder filled with whatever the XPath lookups returned.</returns>
        /// <exception cref="UriFormatException"><paramref name="articleUrl"/> is not a valid absolute URI.</exception>
        public ArticleElementInfo GetArticleElement(string articleUrl)
        {
            // Kept only for its validation effect: a malformed URL still fails here, as it did before.
            new Uri(articleUrl);

            var articleElement = new ArticleElementInfo();
            var doc            = Utilities.GetHtmlDocument(articleUrl);

            //title
            articleElement.Title = Utilities._GetNode(doc, "//h1//div[@class=\"title_detail_news\"]", string.Empty);

            //excerpt
            articleElement.Excerpt = Utilities._GetNode(doc, "//div[@class=\"sapo_detail fr fontSlabB\"]", string.Empty);
            // The excerpt gets the same null guard as the content below: the lookup may find
            // nothing, and the clean-up helper is only given a real node.
            if (articleElement.Excerpt != null)
            {
                articleElement.Excerpt = Utilities._RemoveNodeForNode(articleElement.Excerpt, "//div[@class=\"sapo_detail fr fontSlabB\"]//div");
            }

            //author
            articleElement.Author = Utilities._GetNode(doc, "//div[@class=\"author_undefined\"]", string.Empty);
            //content
            articleElement.Content = Utilities._GetNode(doc, "//div[@id=\"cotent_detail\"]", string.Empty);
            //publish time
            articleElement.PublishedTime = Utilities._GetNode(doc, "//div[@class=\"time_detail_news f11\"]", string.Empty);
            //keywords
            articleElement.Keyword = Utilities._GetNodes(doc, "//div[@class=\"tag_detail mar_bottom15\"]//a", string.Empty);

            if (articleElement.Content != null)
            {
                // The og:image meta tag serves as the large thumbnail.
                articleElement.Image   = articleElement.Content.SelectSingleNode("//meta[@property=\"og:image\"]");

                // Strip <ins> and <script> elements from the body.
                articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, "//div[@id=\"cotent_detail\"]//ins");
                articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, "//div[@id=\"cotent_detail\"]//script");
            }

            return articleElement;
        }
Exemple #9
0
        /// <summary>
        /// Builds the detail record for the article at <paramref name="articleUrl"/>.
        /// </summary>
        /// <remarks>
        /// Runs <c>SiteDetect</c>, asks the current <c>Site</c> for the article's nodes, copies the
        /// HTML-decoded text of each node into a <see cref="RemoteArticleInfo"/>, and finally
        /// removes the site's replacement strings from the content.
        /// </remarks>
        /// <param name="articleUrl">Absolute URL of the article; its host becomes the record's domain.</param>
        /// <returns>
        /// The populated record, or <c>null</c> when no article element could be obtained or when
        /// an image node was found that carries neither a <c>src</c> nor a <c>content</c> attribute.
        /// </returns>
        public RemoteArticleInfo GetRemoteArticleInfo(string articleUrl)
        {
            SiteDetect(articleUrl);

            // The null-conditional covers a Site that is still unset after detection.
            ArticleElementInfo articleElement = Site?.GetArticleElement(articleUrl);

            if (articleElement == null)
            {
                return null;
            }

            if (articleElement.Content != null)
            {
                articleElement.Content = Utilities._ClearHtmlTag(articleElement.Content);
            }

            var remoteArticleInfo = new RemoteArticleInfo();

            //domain
            Uri uri = new Uri(articleUrl);
            remoteArticleInfo.domain = uri.Host;

            //url
            remoteArticleInfo.url = articleUrl;

            //title
            remoteArticleInfo.title = articleElement.Title != null
                ? HttpUtility.HtmlDecode(articleElement.Title.InnerText.Trim())
                : string.Empty;

            //excerpt: prefer the node's text; fall back to its "content" attribute (meta-style
            //nodes), and to an empty string when that attribute is missing too.
            if (articleElement.Excerpt != null)
            {
                string excerptText = HttpUtility.HtmlDecode(articleElement.Excerpt.InnerText.Trim());
                if (string.IsNullOrEmpty(excerptText))
                {
                    excerptText = articleElement.Excerpt.Attributes["content"]?.Value?.Trim() ?? string.Empty;
                }

                remoteArticleInfo.excerpt = excerptText;
            }
            else
            {
                remoteArticleInfo.excerpt = string.Empty;
            }

            //content
            remoteArticleInfo.content = articleElement.Content != null
                ? HttpUtility.HtmlDecode(articleElement.Content.InnerHtml)
                : string.Empty;

            //author
            remoteArticleInfo.author = articleElement.Author != null
                ? HttpUtility.HtmlDecode(articleElement.Author.InnerText.Trim())
                : string.Empty;

            //image: "src" wins, "content" is the fallback. Both lookups are null-safe so that a
            //node with neither attribute reaches the null check instead of throwing.
            if (articleElement.Image != null)
            {
                remoteArticleInfo.lead_image_url =
                    articleElement.Image.Attributes["src"]?.Value?.Trim()
                    ?? articleElement.Image.Attributes["content"]?.Value?.Trim();

                if (remoteArticleInfo.lead_image_url == null)
                {
                    return null;
                }
            }

            //publish time
            remoteArticleInfo.date_published = articleElement.PublishedTime != null
                ? HttpUtility.HtmlDecode(articleElement.PublishedTime.InnerText.Trim())
                : string.Empty;

            //keywords: comma-separated, stopping before the joined text would reach 255 characters.
            string keywords = string.Empty;
            if (articleElement.Keyword != null)
            {
                for (int i = 0; i < articleElement.Keyword.Count; i++)
                {
                    string rawKeyword = articleElement.Keyword[i].InnerText.Trim().Replace("#", "");

                    // The separator goes in front of every keyword but the first, so stopping
                    // early can never leave a dangling comma at the end.
                    string separator = keywords.Length > 0 ? "," : string.Empty;
                    if ((keywords + separator + rawKeyword).Length >= 255)
                    {
                        break;
                    }

                    // "&2" is turned into "&#2" before decoding, as in the original code,
                    // because the '#' characters were removed just above.
                    keywords += separator + HttpUtility.HtmlDecode(rawKeyword.Replace("&2", "&#2"));
                }
            }

            remoteArticleInfo.keyword = keywords;

            //replace
            remoteArticleInfo.content = Utilities._ClearContent(remoteArticleInfo.content);
            var replaceObj = Site.GetStringToReplace();

            if (replaceObj != null && remoteArticleInfo.content != null)
            {
                foreach (var obj in replaceObj)
                {
                    string unwanted = obj.ToString();

                    // String.Replace throws on an empty search string, so such entries are skipped.
                    if (!string.IsNullOrEmpty(unwanted))
                    {
                        remoteArticleInfo.content = remoteArticleInfo.content.Replace(unwanted, string.Empty);
                    }
                }
            }

            return remoteArticleInfo;
        }