/// <summary>
/// Loads the HTML document for <paramref name="articleUrl"/> through
/// <c>Utilities.GetHtmlDocument</c> and collects the nodes that hold the
/// article's title, excerpt, author, body, publish time, keywords and lead image.
/// </summary>
/// <param name="articleUrl">Absolute URL of the article page.</param>
/// <returns>
/// A new <see cref="ArticleElementInfo"/>; each member holds whatever the lookup
/// helper returned for its XPath expression.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="articleUrl"/> is null.</exception>
/// <exception cref="UriFormatException"><paramref name="articleUrl"/> is not a valid absolute URI.</exception>
public ArticleElementInfo GetArticleElement(string articleUrl)
{
    // Constructed only for validation: a malformed URL is rejected here, before
    // the page is requested. The host string formerly derived from it was never read.
    new Uri(articleUrl);

    var articleElement = new ArticleElementInfo();
    var doc = Utilities.GetHtmlDocument(articleUrl);

    articleElement.Title = Utilities._GetNode(doc, "//h1[@class=\"news-title\"]", string.Empty);
    articleElement.Excerpt = Utilities._GetNode(doc, "//h2[@class=\"news-sapo\"]", string.Empty);
    articleElement.Author = Utilities._GetNode(doc, "//p[@class=\"news-info\"]", string.Empty);
    articleElement.Content = Utilities._GetNode(doc, "//div[@class=\"clearfix news-content\"]", string.Empty);
    articleElement.PublishedTime = Utilities._GetNode(doc, "//p[@class=\"news-info\"]", ".//time[@class=\"op-published\"]");
    articleElement.Keyword = Utilities._GetNodes(doc, "//div[@class=\"clearfix mgt20 tags\"]//h3//a", string.Empty);

    if (articleElement.Content != null)
    {
        // The Open Graph image meta tag is used as the thumbnail.
        articleElement.Image = articleElement.Content.SelectSingleNode("//meta[@property=\"og:image\"]");

        // NOTE(review): this XPath has no leading "//", so it is presumably evaluated
        // relative to the content node - confirm that this is intended.
        articleElement.Content = Utilities._RemoveNodeForNode(
            articleElement.Content,
            "div[@class=\"VCSortableInPreviewMode link-content-footer IMSCurrentEditorEditObject\"]");
    }

    return articleElement;
}
/// <summary>
/// Loads the HTML document for <paramref name="articleUrl"/> through
/// <c>Utilities.GetHtmlDocument</c> and collects the nodes that hold the
/// article's title, excerpt, author, body, publish time, keywords and lead image.
/// A lead image URL that does not contain "http" is prefixed with "http://benh.vn".
/// </summary>
/// <param name="articleUrl">Absolute URL of the article page.</param>
/// <returns>
/// A new <see cref="ArticleElementInfo"/>; each member holds whatever the lookup
/// helper returned for its XPath expression.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="articleUrl"/> is null.</exception>
/// <exception cref="UriFormatException"><paramref name="articleUrl"/> is not a valid absolute URI.</exception>
public ArticleElementInfo GetArticleElement(string articleUrl)
{
    // Constructed only for validation: a malformed URL is rejected here, before
    // the page is requested. The host string formerly derived from it was never read.
    new Uri(articleUrl);

    var articleElement = new ArticleElementInfo();
    var doc = Utilities.GetHtmlDocument(articleUrl);

    articleElement.Title = Utilities._GetNode(doc, "//h1", string.Empty);
    articleElement.Excerpt = Utilities._GetNode(doc, "//div[@class=\"news-desc\"]", string.Empty);
    articleElement.Author = Utilities._GetNode(doc, "//p[@class=\"undefined\"]", string.Empty);
    articleElement.Content = Utilities._GetNode(doc, "//div[@class=\"maincontent\"]", string.Empty);
    articleElement.PublishedTime = Utilities._GetNode(doc, "//div[@class=\"ns-time\"]", string.Empty);
    articleElement.Keyword = Utilities._GetNodes(doc, "//div[@class=\"box-tag\"]//div[@class=\"bt-content\"]//a", string.Empty);

    if (articleElement.Content != null)
    {
        // The Open Graph image meta tag is used as the thumbnail.
        articleElement.Image = articleElement.Content.SelectSingleNode("//meta[@property=\"og:image\"]");
        articleElement.Content = Utilities._RemoveNodeForNode(
            articleElement.Content,
            "//div[@class=\"maincontent\"]//div[@style=\"text-align:right\"]");

        // A page without an og:image meta tag, or a tag without a content attribute,
        // used to end in a NullReferenceException here; now the fix-up is skipped.
        var imageUrl = articleElement.Image?.Attributes["content"];
        if (imageUrl != null && imageUrl.Value != null && !imageUrl.Value.Contains("http"))
        {
            // Avoid a doubled or missing slash between the host and the relative path.
            imageUrl.Value = imageUrl.Value.StartsWith("/", StringComparison.Ordinal)
                ? "http://benh.vn" + imageUrl.Value
                : "http://benh.vn/" + imageUrl.Value;
        }
    }

    return articleElement;
}
/// <summary>
/// Loads the HTML document for <paramref name="articleUrl"/> through
/// <c>Utilities.GetHtmlDocument</c> and collects the nodes that hold the
/// article's title, excerpt, author, body, publish time, keywords and lead image.
/// A lead image URL that does not contain "http" is prefixed with "http://benh.vn".
/// </summary>
/// <param name="articleUrl">Absolute URL of the article page.</param>
/// <returns>
/// A new <see cref="ArticleElementInfo"/>; each member holds whatever the lookup
/// helper returned for its XPath expression.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="articleUrl"/> is null.</exception>
/// <exception cref="UriFormatException"><paramref name="articleUrl"/> is not a valid absolute URI.</exception>
public ArticleElementInfo GetArticleElement(string articleUrl)
{
    // Constructed only for validation: a malformed URL is rejected here, before
    // the page is requested. The host string formerly derived from it was never read.
    new Uri(articleUrl);

    var articleElement = new ArticleElementInfo();
    var doc = Utilities.GetHtmlDocument(articleUrl);

    articleElement.Title = Utilities._GetNode(doc, "//h1[@class=\"title-content\"]", string.Empty);
    articleElement.Excerpt = Utilities._GetNode(doc, "//div[@itemtype=\"http://schema.org/WebPageElement\"]//div//strong//p", string.Empty);
    articleElement.Author = Utilities._GetNode(doc, "//p[@class=\"undefined\"]", string.Empty);
    articleElement.Content = Utilities._GetNode(doc, "//div[@itemprop=\"description\"]", string.Empty);
    articleElement.PublishedTime = Utilities._GetNode(doc, "//time[@class=\"clearfix\"]", string.Empty);
    articleElement.Keyword = Utilities._GetNodes(doc, "//ul[@class=\"td-tags td-post-small-box clearfix\"]//li//a", string.Empty);

    if (articleElement.Content != null)
    {
        // The Open Graph image meta tag is used as the thumbnail.
        articleElement.Image = articleElement.Content.SelectSingleNode("//meta[@property=\"og:image\"]");

        // A page without an og:image meta tag, or a tag without a content attribute,
        // used to end in a NullReferenceException here; now the fix-up is skipped.
        var imageUrl = articleElement.Image?.Attributes["content"];
        if (imageUrl != null && imageUrl.Value != null && !imageUrl.Value.Contains("http"))
        {
            // NOTE(review): the host is hard-coded - confirm it is the right one for this site.
            imageUrl.Value = imageUrl.Value.StartsWith("/", StringComparison.Ordinal)
                ? "http://benh.vn" + imageUrl.Value
                : "http://benh.vn/" + imageUrl.Value;
        }
    }

    return articleElement;
}
/// <summary>
/// Loads the HTML document for <paramref name="articleUrl"/> through
/// <c>Utilities.GetHtmlDocument</c> and collects the nodes that hold the
/// article's title, excerpt, author, body, publish time, keywords and lead image,
/// then strips a fixed list of unwanted elements from the body.
/// </summary>
/// <param name="articleUrl">Absolute URL of the article page.</param>
/// <returns>
/// A new <see cref="ArticleElementInfo"/>; each member holds whatever the lookup
/// helper returned for its XPath expression.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="articleUrl"/> is null.</exception>
/// <exception cref="UriFormatException"><paramref name="articleUrl"/> is not a valid absolute URI.</exception>
public ArticleElementInfo GetArticleElement(string articleUrl)
{
    // Constructed only for validation: a malformed URL is rejected here, before
    // the page is requested. The host string formerly derived from it was never read.
    new Uri(articleUrl);

    var articleElement = new ArticleElementInfo();
    var doc = Utilities.GetHtmlDocument(articleUrl);

    articleElement.Title = Utilities._GetNode(doc, "//h1[@class=\"newstitle\"]", string.Empty);
    articleElement.Excerpt = Utilities._GetNode(doc, "//h2[@class=\"newsAbtract\"]", string.Empty);
    articleElement.Author = Utilities._GetNode(doc, "//div[@class=\"author\"]", string.Empty);
    articleElement.Content = Utilities._GetNode(doc, "//div[@class=\"content\"]", string.Empty);
    articleElement.PublishedTime = Utilities._GetNode(doc, "//span[@class=\"newsdate\"]", string.Empty);
    articleElement.Keyword = Utilities._GetNodes(doc, "//h3[@class=\"keyword\"]//a", string.Empty);

    if (articleElement.Content != null)
    {
        // The Open Graph image meta tag is used as the thumbnail.
        articleElement.Image = articleElement.Content.SelectSingleNode("//meta[@property=\"og:image\"]");

        // Elements removed from the body, applied in this order.
        string[] unwantedXPaths =
        {
            "//div[@class=\"content\"]//h1",
            "//div[@class=\"content\"]//h2",
            "//div[@class=\"content\"]//div[@style=\"padding-top:10px; padding-bottom: 10px; background-color: #fff;border-bottom: solid 1px silver; border-top: solid 1px silver;\"]",
            "//div[@class=\"content\"]//iframe",
            "//div[@class=\"content\"]//h3",
            "//div[@class=\"content\"]//div[@class=\"faq-listnext10\"]",
            "//div[@class=\"content\"]//div[@class=\"mtp10\"]",
            "//div[@class=\"content\"]//table",
            "//div[@class=\"content\"]//div[@class=\"news-listnext10\"]",
            "//div[@class=\"content\"]//script",
        };

        foreach (string xpath in unwantedXPaths)
        {
            articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, xpath);
        }
    }

    return articleElement;
}
/// <summary>
/// Loads the HTML document for <paramref name="articleUrl"/> through
/// <c>Utilities.GetHtmlDocument</c> and collects the nodes that hold the
/// article's title, excerpt, author, body, publish time, keywords and lead image,
/// then strips a fixed list of unwanted elements from the body.
/// </summary>
/// <param name="articleUrl">Absolute URL of the article page.</param>
/// <returns>
/// A new <see cref="ArticleElementInfo"/>; each member holds whatever the lookup
/// helper returned for its XPath expression.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="articleUrl"/> is null.</exception>
/// <exception cref="UriFormatException"><paramref name="articleUrl"/> is not a valid absolute URI.</exception>
public ArticleElementInfo GetArticleElement(string articleUrl)
{
    // Constructed only for validation: a malformed URL is rejected here, before
    // the page is requested. The host string formerly derived from it was never read.
    new Uri(articleUrl);

    var articleElement = new ArticleElementInfo();
    var doc = Utilities.GetHtmlDocument(articleUrl);

    articleElement.Title = Utilities._GetNode(doc, "//h1[@class=\"title\"]", string.Empty);
    articleElement.Excerpt = Utilities._GetNode(doc, "//div[@id=\"detail_content\"]//p[1]//span//b", string.Empty);
    articleElement.Author = Utilities._GetNode(doc, "//div[@id=\"detail_content\"]//p[@style=\"TEXT-ALIGN: right\"]", string.Empty);
    articleElement.Content = Utilities._GetNode(doc, "//div[@id=\"detail_content\"]", string.Empty);
    articleElement.PublishedTime = Utilities._GetNode(doc, "//div[@class=\"bar_video width_common detail_page\"]//p[@class=\"p_time\"]", string.Empty);
    articleElement.Keyword = Utilities._GetNodes(doc, "//div[@class=\"tag_detail width_common\"]//a[@class=\"item_tag\"]", string.Empty);

    if (articleElement.Content != null)
    {
        // The Open Graph image meta tag is used as the thumbnail.
        articleElement.Image = articleElement.Content.SelectSingleNode("//meta[@property=\"og:image\"]");

        // Elements removed from the body, applied in this order.
        string[] unwantedXPaths =
        {
            "//div[@id=\"detail_content\"]//p[@style=\"margin:0px 0px;font-family: Helvetica Neue, Helvetica, Arial, sans-serif;font-weight: 500;\"]",
            "//div[@id=\"detail_content\"]//p[@style=\"TEXT-ALIGN: right\"][2]",
            "//div[@id=\"detail_content\"]//div[@style=\"text-align:center\"]",
            "//div[@id=\"detail_content\"]//div[@class=\"like_detail_bottom social-skgd\"]",
            "//div[@id=\"detail_content\"]//div[@class=\"tag_detail width_common\"]",
            "//div[@id=\"detail_content\"]//div[@class=\"width_common noborder\"]",
            "//div[@id=\"detail_content\"]//div[@class=\"width_common phantrang\"]",
            "//div[@id=\"detail_content\"]//ins[@class=\"adsbygoogle\"]",
            "//div[@id=\"detail_content\"]//script",
        };

        foreach (string xpath in unwantedXPaths)
        {
            articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, xpath);
        }
    }

    return articleElement;
}
/// <summary>
/// Loads the HTML document for <paramref name="articleUrl"/> through
/// <c>Utilities.GetHtmlDocument</c> and collects the nodes that hold the
/// article's title, excerpt, author, body, publish time, keywords and lead image,
/// then strips a fixed list of unwanted elements from the excerpt and the body.
/// </summary>
/// <param name="articleUrl">Absolute URL of the article page.</param>
/// <returns>
/// A new <see cref="ArticleElementInfo"/>; each member holds whatever the lookup
/// helper returned for its XPath expression.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="articleUrl"/> is null.</exception>
/// <exception cref="UriFormatException"><paramref name="articleUrl"/> is not a valid absolute URI.</exception>
public ArticleElementInfo GetArticleElement(string articleUrl)
{
    // Constructed only for validation: a malformed URL is rejected here, before
    // the page is requested. The host string formerly derived from it was never read.
    new Uri(articleUrl);

    var articleElement = new ArticleElementInfo();
    var doc = Utilities.GetHtmlDocument(articleUrl);

    articleElement.Title = Utilities._GetNode(doc, "//h1[@class=\"article-title\"]", string.Empty);

    articleElement.Excerpt = Utilities._GetNode(doc, "//blockquote[@class=\"article-intro\"]", string.Empty);
    // The body lookup below is null-checked, so the same helper may yield null here;
    // only clean the excerpt when one was actually found.
    if (articleElement.Excerpt != null)
    {
        articleElement.Excerpt = Utilities._RemoveNodeForNode(
            articleElement.Excerpt,
            "//blockquote[@class=\"article-intro\"]//div");
    }

    articleElement.Author = Utilities._GetNode(doc, "//p[@style=\"text-align:right\"]//strong", string.Empty);
    articleElement.Content = Utilities._GetNode(doc, "//section[@class=\"article-content\"]", string.Empty);
    articleElement.PublishedTime = Utilities._GetNode(doc, "//p[@class=\"time \"]", string.Empty);
    articleElement.Keyword = Utilities._GetNodes(doc, "//div[@class=\"tags\"]//span//a", string.Empty);

    if (articleElement.Content != null)
    {
        // The Open Graph image meta tag is used as the thumbnail.
        articleElement.Image = articleElement.Content.SelectSingleNode("//meta[@property=\"og:image\"]");

        // Elements removed from the body, applied in this order.
        string[] unwantedXPaths =
        {
            "//section[@class=\"article-content\"]//ul[@class=\"ul_relate\"]",
            "//section[@class=\"article-content\"]//div[@id=\"bs-inread-container\"]",
            "//section[@class=\"article-content\"]//div[@id=\"bsinread\"]",
            "//section[@class=\"article-content\"]//div[@class=\"bysource\"]",
            "//section[@class=\"article-content\"]//div[@class=\"tags\"]",
        };

        foreach (string xpath in unwantedXPaths)
        {
            articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, xpath);
        }
    }

    return articleElement;
}
/// <summary>
/// Loads the HTML document for <paramref name="articleUrl"/> through
/// <c>Utilities.GetHtmlDocument</c> and collects the nodes that hold the
/// article's title, excerpt, author, body, publish time, keywords and lead image,
/// then strips a fixed list of unwanted elements from the body.
/// </summary>
/// <param name="articleUrl">Absolute URL of the article page.</param>
/// <returns>
/// A new <see cref="ArticleElementInfo"/>; each member holds whatever the lookup
/// helper returned for its XPath expression.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="articleUrl"/> is null.</exception>
/// <exception cref="UriFormatException"><paramref name="articleUrl"/> is not a valid absolute URI.</exception>
public ArticleElementInfo GetArticleElement(string articleUrl)
{
    // Constructed only for validation: a malformed URL is rejected here, before
    // the page is requested. The host string formerly derived from it was never read.
    new Uri(articleUrl);

    var articleElement = new ArticleElementInfo();
    var doc = Utilities.GetHtmlDocument(articleUrl);

    articleElement.Title = Utilities._GetNode(doc, "//h1[@class=\"entry-title\"]", string.Empty);
    articleElement.Excerpt = Utilities._GetNode(doc, "//h4[@style=\"text-align: justify;\"]//strong", string.Empty);
    articleElement.Author = Utilities._GetNode(doc, "//p[@style=\"text-align: right;\"]//em", string.Empty);
    articleElement.Content = Utilities._GetNode(doc, "//div[@class=\"td-post-content td-pb-padding-side\"]", string.Empty);
    articleElement.PublishedTime = Utilities._GetNode(doc, "//div[@class=\"time_detail_news\"]", string.Empty);
    articleElement.Keyword = Utilities._GetNodes(doc, "//ul[@class=\"td-tags td-post-small-box clearfix\"]//li//a", string.Empty);

    if (articleElement.Content != null)
    {
        // The Open Graph image meta tag is used as the thumbnail.
        articleElement.Image = articleElement.Content.SelectSingleNode("//meta[@property=\"og:image\"]");

        // Elements removed from the body, applied in this order.
        string[] unwantedXPaths =
        {
            "//div[@class=\"td-post-content td-pb-padding-side\"]//div",
            "//div[@class=\"td-post-content td-pb-padding-side\"]//h4",
            "//div[@class=\"td-post-content td-pb-padding-side\"]//p[@style=\"text-align: right;\"]//em",
            "//div[@class=\"td-post-content td-pb-padding-side\"]//iframe",
            "//div[@class=\"td-post-content td-pb-padding-side\"]//script",
        };

        foreach (string xpath in unwantedXPaths)
        {
            articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, xpath);
        }
    }

    return articleElement;
}
/// <summary>
/// Loads the HTML document for <paramref name="articleUrl"/> through
/// <c>Utilities.GetHtmlDocument</c> and collects the nodes that hold the
/// article's title, excerpt, author, body, publish time, keywords and lead image,
/// then strips unwanted elements from the excerpt and the body.
/// </summary>
/// <param name="articleUrl">Absolute URL of the article page.</param>
/// <returns>
/// A new <see cref="ArticleElementInfo"/>; each member holds whatever the lookup
/// helper returned for its XPath expression.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="articleUrl"/> is null.</exception>
/// <exception cref="UriFormatException"><paramref name="articleUrl"/> is not a valid absolute URI.</exception>
public ArticleElementInfo GetArticleElement(string articleUrl)
{
    // Constructed only for validation: a malformed URL is rejected here, before
    // the page is requested. The host string formerly derived from it was never read.
    new Uri(articleUrl);

    var articleElement = new ArticleElementInfo();
    var doc = Utilities.GetHtmlDocument(articleUrl);

    articleElement.Title = Utilities._GetNode(doc, "//h1//div[@class=\"title_detail_news\"]", string.Empty);

    articleElement.Excerpt = Utilities._GetNode(doc, "//div[@class=\"sapo_detail fr fontSlabB\"]", string.Empty);
    // The body lookup below is null-checked, so the same helper may yield null here;
    // only clean the excerpt when one was actually found.
    if (articleElement.Excerpt != null)
    {
        articleElement.Excerpt = Utilities._RemoveNodeForNode(
            articleElement.Excerpt,
            "//div[@class=\"sapo_detail fr fontSlabB\"]//div");
    }

    articleElement.Author = Utilities._GetNode(doc, "//div[@class=\"author_undefined\"]", string.Empty);
    articleElement.Content = Utilities._GetNode(doc, "//div[@id=\"cotent_detail\"]", string.Empty);
    articleElement.PublishedTime = Utilities._GetNode(doc, "//div[@class=\"time_detail_news f11\"]", string.Empty);
    articleElement.Keyword = Utilities._GetNodes(doc, "//div[@class=\"tag_detail mar_bottom15\"]//a", string.Empty);

    if (articleElement.Content != null)
    {
        // The Open Graph image meta tag is used as the thumbnail.
        articleElement.Image = articleElement.Content.SelectSingleNode("//meta[@property=\"og:image\"]");

        articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, "//div[@id=\"cotent_detail\"]//ins");
        articleElement.Content = Utilities._RemoveNodeForNode(articleElement.Content, "//div[@id=\"cotent_detail\"]//script");
    }

    return articleElement;
}
/// <summary>
/// Builds the flat, text-only description of the article at
/// <paramref name="articleUrl"/> from the nodes returned by the current
/// <c>Site</c>'s <c>GetArticleElement</c>.
/// </summary>
/// <param name="articleUrl">Absolute URL of the article page.</param>
/// <returns>
/// The populated <see cref="RemoteArticleInfo"/>, or <c>null</c> when no article
/// elements could be obtained (including when <c>Site</c> is null), or when an
/// image node was found but carries neither a <c>src</c> nor a <c>content</c> attribute.
/// </returns>
public RemoteArticleInfo GetRemoteArticleInfo(string articleUrl)
{
    // Upper bound (exclusive) on the length of the comma-separated keyword string.
    const int MaxKeywordLength = 255;

    // NOTE(review): presumably selects the Site handler used below - confirm.
    SiteDetect(articleUrl);

    ArticleElementInfo articleElement = Site?.GetArticleElement(articleUrl);
    if (articleElement == null)
    {
        return null;
    }

    if (articleElement.Content != null)
    {
        articleElement.Content = Utilities._ClearHtmlTag(articleElement.Content);
    }

    var remoteArticleInfo = new RemoteArticleInfo();
    remoteArticleInfo.domain = new Uri(articleUrl).Host;
    remoteArticleInfo.url = articleUrl;

    remoteArticleInfo.title = articleElement.Title != null
        ? HttpUtility.HtmlDecode(articleElement.Title.InnerText.Trim())
        : string.Empty;

    // Excerpt: prefer the node text; fall back to its "content" attribute (the shape
    // of a meta tag). A node with neither now yields an empty excerpt instead of a
    // NullReferenceException.
    if (articleElement.Excerpt != null)
    {
        string excerptText = HttpUtility.HtmlDecode(articleElement.Excerpt.InnerText.Trim());
        remoteArticleInfo.excerpt = string.IsNullOrEmpty(excerptText)
            ? (articleElement.Excerpt.Attributes["content"]?.Value?.Trim() ?? string.Empty)
            : excerptText;
    }
    else
    {
        remoteArticleInfo.excerpt = string.Empty;
    }

    remoteArticleInfo.content = articleElement.Content != null
        ? HttpUtility.HtmlDecode(articleElement.Content.InnerHtml)
        : string.Empty;

    remoteArticleInfo.author = articleElement.Author != null
        ? HttpUtility.HtmlDecode(articleElement.Author.InnerText.Trim())
        : string.Empty;

    // Lead image: "src" first, then "content". The second lookup used to be
    // dereferenced unconditionally, so the null check below could never fire.
    if (articleElement.Image != null)
    {
        remoteArticleInfo.lead_image_url =
            articleElement.Image.Attributes["src"]?.Value?.Trim()
            ?? articleElement.Image.Attributes["content"]?.Value?.Trim();

        if (remoteArticleInfo.lead_image_url == null)
        {
            return null;
        }
    }

    remoteArticleInfo.date_published = articleElement.PublishedTime != null
        ? HttpUtility.HtmlDecode(articleElement.PublishedTime.InnerText.Trim())
        : string.Empty;

    // Keywords: comma-separated, stopping before the string would reach
    // MaxKeywordLength. The separator goes in front of each later entry, so stopping
    // early no longer leaves a trailing comma, and an empty collection gives an empty
    // string just like a missing one.
    string keywords = string.Empty;
    if (articleElement.Keyword != null)
    {
        for (int i = 0; i < articleElement.Keyword.Count; i++)
        {
            string rawKeyword = articleElement.Keyword[i].InnerText.Trim().Replace("#", "");
            string separator = i == 0 ? string.Empty : ",";

            // The limit is measured on the un-decoded text, as before.
            if ((keywords + separator + rawKeyword).Length >= MaxKeywordLength)
            {
                break;
            }

            keywords += separator + HttpUtility.HtmlDecode(rawKeyword.Replace("&2", ""));
        }
    }
    remoteArticleInfo.keyword = keywords;

    // Final clean-up, then removal of the site-specific boilerplate strings.
    remoteArticleInfo.content = Utilities._ClearContent(remoteArticleInfo.content);
    var replaceObj = Site.GetStringToReplace();
    if (replaceObj != null && remoteArticleInfo.content != null)
    {
        foreach (var obj in replaceObj)
        {
            string unwanted = obj.ToString();

            // string.Replace throws ArgumentException for an empty search string.
            if (string.IsNullOrEmpty(unwanted))
            {
                continue;
            }

            remoteArticleInfo.content = remoteArticleInfo.content.Replace(unwanted, string.Empty);
        }
    }

    return remoteArticleInfo;
}