/// <summary>
/// Gets the content of one article: downloads the page, collects its tags,
/// publication date and body, and stores the HTML-encoded result on
/// <paramref name="item"/>.
/// </summary>
/// <param name="item">News item to populate (tag list, DatePublished, Content).</param>
/// <param name="url">
/// Article URL. When it does not contain "http://" it is treated as a path
/// and prefixed with "http://dantri.com.vn".
/// </param>
/// <remarks>
/// Every page element is optional: a missing tag bar, date or body node is
/// skipped instead of raising a NullReferenceException. When the body node is
/// missing, <c>item.Content</c> is left untouched.
/// </remarks>
public static async Task LoadContentFrom(NewsItem item, string url)
{
    if (!url.Contains("http://"))
    {
        url = "http://dantri.com.vn" + url;
    }

    string html = await HtmlDownloader.loadFromUrl(url);
    HtmlDocument page = new HtmlDocument();
    page.LoadHtml(html);

    HtmlNode nodeContent = page.DocumentNode.SelectSingleNode("//div[@class='fon34 mt3 mr2 fon43']");
    HtmlNode nodeTag = page.DocumentNode.SelectSingleNode("//div[@class='news-tag']");

    // Collect the tags first, while the anchors still carry their real href values.
    if (nodeTag != null)
    {
        var allNodeInTag = nodeTag.SelectNodes(nodeTag.XPath + "//a[@href]");
        if (allNodeInTag != null)
        {
            foreach (HtmlNode node in allNodeInTag)
            {
                ItemTag tag = new ItemTag()
                {
                    Title = HtmlDownloader.removeHtml(node.InnerText),
                    Link = node.Attributes["href"].Value
                };
                item.addToTagList(tag);
            }
        }
    }

    // Publication date; ptichDate is defined elsewhere and is handed the item
    // right after DatePublished is set.
    HtmlNode nodeTime = page.DocumentNode.SelectSingleNode("//span[@class='fr fon7 mr2']");
    if (nodeTime != null)
    {
        item.DatePublished = HtmlDownloader.removeHtml(nodeTime.InnerText);
        ptichDate(item);
    }

    if (nodeContent == null)
    {
        // No article body on this page: nothing to build the content from.
        return;
    }

    // Disable all links inside the article body.
    var allNodeInContent = nodeContent.SelectNodes(nodeContent.XPath + "//*[@href]");
    if (allNodeInContent != null)
    {
        foreach (HtmlNode node in allNodeInContent)
        {
            node.SetAttributeValue("href", ""); // remove reference link
        }
    }

    // Drop everything from the tag bar onwards when it is embedded in the body.
    int positionToDel = nodeContent.InnerHtml.IndexOf("<div class=\"news-tag\">");
    if (positionToDel > 0)
    {
        nodeContent.InnerHtml = nodeContent.InnerHtml.Substring(0, positionToDel);
    }

    // Strip any remaining copy of the tag bar. The original dereferenced nodeTag
    // here without a null check; string.Replace also rejects an empty search string.
    string body = nodeContent.InnerHtml;
    string tagHtml = nodeTag != null ? nodeTag.OuterHtml : null;
    if (!string.IsNullOrEmpty(tagHtml))
    {
        body = body.Replace(tagHtml, "");
    }

    string content = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head>";
    content += "<p>" + item.DatePublished + "</p>";
    content += "<p><b>Tóm tắt nội dung:</b> <i>" + item.ShortContent + "</i></p>";
    content += body;
    item.Content = WebUtility.HtmlEncode(content);
}
/// <summary>
/// Loads content from ngoisao.net: reads date, tags and lead text from the
/// already-parsed article page, then downloads the print view of the article
/// (looked up by the <c>tt_article_id</c> meta tag) to obtain the body.
/// </summary>
/// <param name="page">Parsed article page.</param>
/// <param name="item">News item to populate.</param>
/// <remarks>
/// When the article id or the print-view body cannot be found, the method
/// returns with whatever metadata was collected and leaves
/// <c>item.Content</c> untouched instead of throwing a NullReferenceException.
/// </remarks>
public static async Task LoadFromNgoiSao(HtmlDocument page, NewsItem item)
{
    // Load time.
    HtmlNode nodeTime = page.DocumentNode.SelectSingleNode("//span[@class='spanDateTime fl']");
    if (nodeTime != null)
    {
        item.DatePublished = HtmlDownloader.removeHtml(nodeTime.InnerText);
        ptichDate(item);
    }

    // Load item tags.
    HtmlNode nodeTag = page.DocumentNode.SelectSingleNode("//div[@class='wordTag']");
    if (nodeTag != null)
    {
        var allNodeInTag = nodeTag.SelectNodes(nodeTag.XPath + "//a[@href]");
        if (allNodeInTag != null)
        {
            foreach (HtmlNode node in allNodeInTag)
            {
                ItemTag tag = new ItemTag()
                {
                    Title = HtmlDownloader.removeHtml(node.InnerText),
                    Link = node.Attributes["href"].Value
                };
                item.addToTagList(tag);
            }
        }
    }

    // Load the item's succinct content (lead paragraph).
    HtmlNode nodeDescription = page.DocumentNode.SelectSingleNode("//h2[@class='lead']");
    if (nodeDescription != null)
    {
        item.ShortContent = HtmlDownloader.removeHtml(nodeDescription.InnerText);
    }

    // The print view is addressed by article id; without the id there is no body to fetch.
    HtmlNode nodeId = page.DocumentNode.SelectSingleNode("//meta[@name='tt_article_id']");
    if (nodeId == null || nodeId.Attributes["content"] == null)
    {
        return;
    }
    string id = nodeId.Attributes["content"].Value;

    string html = await HtmlDownloader.loadFromUrl("http://ngoisao.net/detail/print?id=" + id);
    HtmlDocument printPage = new HtmlDocument();
    printPage.LoadHtml(html);

    // Load page content.
    HtmlNode nodeContent = printPage.DocumentNode.SelectSingleNode("//div[@class='fck_detail']");
    if (nodeContent == null)
    {
        return;
    }

    var allNodeInContent = nodeContent.SelectNodes(nodeContent.XPath + "//*[@href]");
    if (allNodeInContent != null)
    {
        foreach (HtmlNode node in allNodeInContent)
        {
            node.SetAttributeValue("href", ""); // remove reference link
        }
    }

    string content = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head>";
    content += "<p>" + item.DatePublished + "</p>";
    content += "<p><b>Tóm tắt nội dung:</b> <i>" + item.ShortContent + "</i></p>";
    content += nodeContent.InnerHtml;
    item.Content = WebUtility.HtmlEncode(content);
}
/// <summary>
/// Downloads an article and populates <paramref name="item"/> with its tags,
/// lead text, publication date and HTML-encoded body.
/// </summary>
/// <param name="item">News item to populate.</param>
/// <param name="url">
/// Article URL. When it does not contain "http://" it is treated as a path
/// and prefixed with "http://vnexpress.net". URLs containing "ione.vnexpress"
/// or "ngoisao.net" are delegated to <c>LoadFromIOne</c> and
/// <c>LoadFromNgoiSao</c> respectively.
/// </param>
/// <remarks>
/// Every page element is optional; a missing node is skipped. When the
/// <c>fck_detail</c> body node is missing, <c>item.Content</c> is left untouched.
/// </remarks>
public static async Task LoadContentFrom(NewsItem item, string url)
{
    if (!url.Contains("http://"))
    {
        url = "http://vnexpress.net" + url;
    }

    string html = await HtmlDownloader.loadFromUrl(url);
    HtmlDocument page = new HtmlDocument();
    page.LoadHtml(html);

    // Sister sites use different markup and have dedicated loaders.
    if (url.Contains("ione.vnexpress"))
    {
        LoadFromIOne(page, item);
        return;
    }
    if (url.Contains("ngoisao.net"))
    {
        await LoadFromNgoiSao(page, item);
        return;
    }

    // Main-site pages and the remaining pages use different class names for
    // the tag bar, lead text and timestamp.
    bool isMainSite = url.Contains("http://vnexpress.net");

    HtmlNode nodeTag = page.DocumentNode.SelectSingleNode(
        isMainSite ? "//div[@class='tag-pos']" : "//div[@class='content_tagbar']");
    if (nodeTag != null)
    {
        var allNodeInTag = nodeTag.SelectNodes(nodeTag.XPath + "//a[@href]");
        if (allNodeInTag != null)
        {
            foreach (HtmlNode node in allNodeInTag)
            {
                ItemTag tag = new ItemTag()
                {
                    Title = HtmlDownloader.removeHtml(node.InnerText),
                    Link = node.Attributes["href"].Value
                };
                item.addToTagList(tag);
            }
        }
    }

    // The original only null-checked the lead node on the main-site branch;
    // both layouts are guarded here.
    HtmlNode nodeDescription = page.DocumentNode.SelectSingleNode(
        isMainSite ? "//h2[@class='Lead']" : "//div[@class='short_intro']");
    if (nodeDescription != null)
    {
        item.ShortContent = HtmlDownloader.removeHtml(nodeDescription.InnerText);
    }

    HtmlNode nodeTime = page.DocumentNode.SelectSingleNode(
        isMainSite ? "//span[@class='spanTime']" : "//div[@class='time txt_666 left txt_11']");
    if (nodeTime != null)
    {
        item.DatePublished = HtmlDownloader.removeHtml(nodeTime.InnerText);
        ptichDate(item);
    }

    HtmlNode nodeContent = page.DocumentNode.SelectSingleNode("//div[@class='fck_detail']");
    if (nodeContent == null)
    {
        return;
    }

    // Disable all links inside the article body.
    var allNodeInContent = nodeContent.SelectNodes(nodeContent.XPath + "//*[@href]");
    if (allNodeInContent != null)
    {
        foreach (HtmlNode node in allNodeInContent)
        {
            node.SetAttributeValue("href", ""); // remove reference link
        }
    }

    string content = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head>";
    content += "<p>" + item.DatePublished + "</p>";
    content += "<p><b>Tóm tắt nội dung:</b> <i>" + item.ShortContent + "</i></p>";
    content += nodeContent.InnerHtml;
    item.Content = WebUtility.HtmlEncode(content);
}
/// <summary>
/// Loads content from ione.vnexpress.net: reads the timestamp, tags, lead
/// text and body from the already-parsed page and stores them on
/// <paramref name="item"/>.
/// </summary>
/// <param name="page">Parsed article page.</param>
/// <param name="item">News item to populate.</param>
/// <remarks>
/// Every page element is optional; a missing node is skipped. When the
/// <c>content</c> body node is missing, <c>item.Content</c> is left untouched.
/// </remarks>
private static void LoadFromIOne(HtmlDocument page, NewsItem item)
{
    // Load time.
    HtmlNode nodeTime = page.DocumentNode.SelectSingleNode("//div[@class='time left']");
    if (nodeTime != null)
    {
        string date = HtmlDownloader.removeHtml(nodeTime.InnerText);
        item.DatePublished = date;

        date = date.Replace("AM", "");
        date = date.Replace("PM", "");

        // The text is expected as two '|'-separated parts; the second part is
        // split on '/' (presumably "time | d/M/yyyy" - confirm against the site).
        string[] time = date.Split('|');
        if (time.Length == 2)
        {
            time[0] = HtmlDownloader.removeHtml(time[0]);
            time[1] = HtmlDownloader.removeHtml(time[1]);

            string[] day = time[1].Split('/');
            if (day.Length == 3)
            {
                // Zero-pad single-character components.
                if (day[1].Length == 1)
                {
                    day[1] = "0" + day[1];
                }
                if (day[0].Length == 1)
                {
                    // Fixed: the original padded with day[1], overwriting the
                    // first component with the second one.
                    day[0] = "0" + day[0];
                }
                // Reorder the components last-to-first.
                time[1] = day[2] + day[1] + day[0];
            }
            item.DateStandard = time[1] + time[0];
        }
    }

    // Load item tags.
    HtmlNode nodeTag = page.DocumentNode.SelectSingleNode("//div[@class='left w600 list_tags']");
    if (nodeTag != null)
    {
        var allNodeInTag = nodeTag.SelectNodes(nodeTag.XPath + "//a[@href]");
        if (allNodeInTag != null)
        {
            foreach (HtmlNode node in allNodeInTag)
            {
                ItemTag tag = new ItemTag()
                {
                    Title = HtmlDownloader.removeHtml(node.InnerText),
                    Link = node.Attributes["href"].Value
                };
                item.addToTagList(tag);
            }
        }
    }

    // Load the item's succinct content (lead paragraph).
    HtmlNode nodeDescription = page.DocumentNode.SelectSingleNode("//div[@class='lead']");
    if (nodeDescription != null)
    {
        item.ShortContent = HtmlDownloader.removeHtml(nodeDescription.InnerText);
    }

    // Load item content.
    HtmlNode nodeContent = page.DocumentNode.SelectSingleNode("//div[@class='content']");
    if (nodeContent == null)
    {
        return;
    }

    var allNodeInContent = nodeContent.SelectNodes(nodeContent.XPath + "//*[@href]");
    if (allNodeInContent != null)
    {
        foreach (HtmlNode node in allNodeInContent)
        {
            node.SetAttributeValue("href", ""); // remove reference link
        }
    }

    string content = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head>";
    content += "<p>" + item.DatePublished + "</p>";
    content += "<p><b>Tóm tắt nội dung:</b> <i>" + item.ShortContent + "</i></p>";
    content += nodeContent.InnerHtml;
    item.Content = WebUtility.HtmlEncode(content);
}