/// <summary>
/// Gets the content of a single dantri.com.vn article: downloads the page, collects its tags,
/// reads the publication date and builds the HTML stored in <c>item.Content</c>.
/// </summary>
/// <param name="item">Article to populate; its tag list, DatePublished and Content are written.</param>
/// <param name="url">
/// Article URL. When it does not contain "http://" it is treated as site-relative and
/// prefixed with "http://dantri.com.vn".
/// </param>
/// <remarks>
/// Any of the looked-up nodes may be missing when the site layout changes. Missing tag or time
/// nodes are skipped; when the article body itself is missing, <c>item.Content</c> is left untouched.
/// </remarks>
public static async Task LoadContentFrom(NewsItem item, string url)
{
    if (!url.Contains("http://"))
    {
        url = "http://dantri.com.vn" + url;
    }

    string html = await HtmlDownloader.loadFromUrl(url);
    HtmlDocument page = new HtmlDocument();
    page.LoadHtml(html);

    HtmlNode nodeContent = page.DocumentNode.SelectSingleNode("//div[@class='fon34 mt3 mr2 fon43']");
    HtmlNode nodeTag = page.DocumentNode.SelectSingleNode("//div[@class='news-tag']");

    // Collect the article tags (anchor text + target) from the tag block, if the page has one.
    if (nodeTag != null)
    {
        var allNodeInTag = nodeTag.SelectNodes(nodeTag.XPath + "//a[@href]");
        if (allNodeInTag != null)
        {
            foreach (HtmlNode node in allNodeInTag)
            {
                ItemTag tag = new ItemTag()
                {
                    Title = HtmlDownloader.removeHtml(node.InnerText),
                    Link = node.Attributes["href"].Value
                };
                item.addToTagList(tag);
            }
        }
    }

    if (nodeContent != null)
    {
        // Disable all links inside the article body by blanking their href.
        var allNodeInContent = nodeContent.SelectNodes(nodeContent.XPath + "//*[@href]");
        if (allNodeInContent != null)
        {
            foreach (HtmlNode node in allNodeInContent)
            {
                node.SetAttributeValue("href", "");
            }
        }

        // Cut the body at the start of the tag block so nothing after it is kept.
        int positionToDel = nodeContent.InnerHtml.IndexOf("<div class=\"news-tag\">");
        if (positionToDel > 0)
        {
            nodeContent.InnerHtml = nodeContent.InnerHtml.Substring(0, positionToDel);
        }
    }

    // The time node is looked up after the body was trimmed, same order as before.
    HtmlNode nodeTime = page.DocumentNode.SelectSingleNode("//span[@class='fr fon7 mr2']");
    if (nodeTime != null)
    {
        item.DatePublished = HtmlDownloader.removeHtml(nodeTime.InnerText);
        // NOTE(review): ptichDate is defined elsewhere; presumably it post-processes DatePublished - confirm.
        ptichDate(item);
    }

    if (nodeContent == null)
    {
        // No article body on this page: nothing to build.
        return;
    }

    // Remove the tag block from the body only when the page actually has one;
    // previously nodeTag was dereferenced here even when it was null.
    string bodyHtml = nodeContent.InnerHtml;
    if (nodeTag != null)
    {
        bodyHtml = bodyHtml.Replace(nodeTag.OuterHtml, "");
    }

    item.Content = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head>";
    item.Content += "<p>" + item.DatePublished + "</p>";
    item.Content += "<p><b>Tóm tắt nội dung:</b> <i>" + item.ShortContent + "</i></p>";
    item.Content += bodyHtml;
    item.Content = WebUtility.HtmlEncode(item.Content);
}
/// <summary>
/// Searches timkiem.vnexpress.net for <paramref name="keyword"/> and fills
/// <paramref name="plist"/> with one <c>NewsItem</c> per result.
/// </summary>
/// <param name="plist">Result list; it is cleared first and then filled.</param>
/// <param name="keyword">Search text; it is URL-escaped before being placed in the query string.</param>
/// <remarks>
/// Shows a message box when nothing is found or when an exception occurs; exceptions are not
/// propagated to the caller. A result entry without a link is skipped, and missing optional
/// parts (image, time, summary) are left unset instead of aborting the whole search.
/// </remarks>
public static async Task DoSearch(NewsItemList plist, string keyword)
{
    try
    {
        plist.Clear();

        // Escape the keyword so spaces, '&', '#', '+' and non-ASCII text cannot break the query string.
        // NOTE(review): assumes callers pass the raw keyword, not an already-escaped one - confirm.
        string url = "http://timkiem.vnexpress.net/?q=" + Uri.EscapeDataString(keyword ?? string.Empty);
        string html = await HtmlDownloader.loadFromUrl(url);

        HtmlDocument page = new HtmlDocument();
        page.LoadHtml(html);

        var allResultNodes = page.DocumentNode.SelectNodes("//li[@class='block_search_result_text']");
        if (allResultNodes == null)
        {
            MessageBox.Show("Sorry, no articles found!");
            return;
        }

        foreach (HtmlNode itemNode in allResultNodes)
        {
            HtmlNode nodeLink = itemNode.SelectSingleNode(itemNode.XPath + "//a[@href]");
            if (nodeLink == null)
            {
                // Without a link the article cannot be opened; skip this entry only.
                continue;
            }

            NewsItem article = new NewsItem();
            article.Source = "Vnexpress";

            HtmlNode nodeImage = itemNode.SelectSingleNode(itemNode.XPath + "//img[@src]");
            if (nodeImage != null)
            {
                article.ImageLink = nodeImage.Attributes["src"].Value;
            }

            // The title is taken from the link's alt attribute; fall back to the link text when it is absent.
            var altAttribute = nodeLink.Attributes["alt"];
            if (altAttribute != null)
            {
                article.Name = HtmlDownloader.removeHtml(altAttribute.Value);
            }
            else
            {
                article.Name = HtmlDownloader.removeHtml(nodeLink.InnerText);
            }
            article.LinkUrl = nodeLink.Attributes["href"].Value;

            HtmlNode nodeTime = itemNode.SelectSingleNode(itemNode.XPath + "//p[@class='txt_gray txt_11 ex_hi']");
            if (nodeTime != null)
            {
                article.DatePublished = HtmlDownloader.removeHtml(nodeTime.InnerHtml);
            }

            HtmlNode nodeShortContent = itemNode.SelectSingleNode(itemNode.XPath + "//span[@class='hightlight']");
            if (nodeShortContent != null)
            {
                article.ShortContent = HtmlDownloader.removeHtml(nodeShortContent.InnerText);
            }

            plist.Add(article);
        }

        if (plist.Count == 0)
        {
            MessageBox.Show("Sorry, no articles found!");
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show("System got an error at DoSearch function with message:\n" + ex.Message);
        return;
    }
}
/// <summary>
/// Loads an article hosted on ngoisao.net: reads time, tags and summary from the already
/// downloaded <paramref name="page"/>, then downloads the print view of the article to get its body.
/// </summary>
/// <param name="page">Parsed article page.</param>
/// <param name="item">Article to populate; DatePublished, tag list, ShortContent and Content are written.</param>
/// <remarks>
/// When the article id meta tag or the body of the print view cannot be found,
/// <c>item.Content</c> is left untouched instead of failing with a null reference.
/// </remarks>
public static async Task LoadFromNgoiSao(HtmlDocument page, NewsItem item)
{
    // Publication time.
    HtmlNode nodeTime = page.DocumentNode.SelectSingleNode("//span[@class='spanDateTime fl']");
    if (nodeTime != null)
    {
        item.DatePublished = HtmlDownloader.removeHtml(nodeTime.InnerText);
        // NOTE(review): ptichDate is defined elsewhere; presumably it post-processes DatePublished - confirm.
        ptichDate(item);
    }

    // Article tags.
    HtmlNode nodeTag = page.DocumentNode.SelectSingleNode("//div[@class='wordTag']");
    if (nodeTag != null)
    {
        var allNodeInTag = nodeTag.SelectNodes(nodeTag.XPath + "//a[@href]");
        if (allNodeInTag != null)
        {
            foreach (HtmlNode node in allNodeInTag)
            {
                ItemTag tag = new ItemTag()
                {
                    Title = HtmlDownloader.removeHtml(node.InnerText),
                    Link = node.Attributes["href"].Value
                };
                item.addToTagList(tag);
            }
        }
    }

    // Short (lead) content.
    HtmlNode nodeDescription = page.DocumentNode.SelectSingleNode("//h2[@class='lead']");
    if (nodeDescription != null)
    {
        item.ShortContent = HtmlDownloader.removeHtml(nodeDescription.InnerText);
    }

    // The article id from the meta tag is needed to request the print view.
    HtmlNode nodeId = page.DocumentNode.SelectSingleNode("//meta[@name='tt_article_id']");
    if (nodeId == null || nodeId.Attributes["content"] == null)
    {
        return;
    }

    string id = nodeId.Attributes["content"].Value;
    string html = await HtmlDownloader.loadFromUrl("http://ngoisao.net/detail/print?id=" + id);
    HtmlDocument printPage = new HtmlDocument();
    printPage.LoadHtml(html);

    // Article body from the print view.
    HtmlNode nodeContent = printPage.DocumentNode.SelectSingleNode("//div[@class='fck_detail']");
    if (nodeContent == null)
    {
        return;
    }

    // Disable all links inside the article body by blanking their href.
    var allNodeInContent = nodeContent.SelectNodes(nodeContent.XPath + "//*[@href]");
    if (allNodeInContent != null)
    {
        foreach (HtmlNode node in allNodeInContent)
        {
            node.SetAttributeValue("href", "");
        }
    }

    item.Content = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head>";
    item.Content += "<p>" + item.DatePublished + "</p>";
    item.Content += "<p><b>Tóm tắt nội dung:</b> <i>" + item.ShortContent + "</i></p>";
    item.Content += nodeContent.InnerHtml;
    item.Content = WebUtility.HtmlEncode(item.Content);
}
/// <summary>
/// Loads the article list of a vnexpress.net category page into <paramref name="plist"/>.
/// </summary>
/// <param name="plist">Target list; it is cleared first and then filled.</param>
/// <param name="url">Category URL; when it does not contain "vnexpress.net" it is prefixed with "http://vnexpress.net".</param>
/// <param name="source">Source label stored on every article (and on <paramref name="hottest"/>).</param>
/// <param name="hottest">
/// Optional item for the headline article; it is passed to <c>getHottestNew</c> and, when not null,
/// added to the list before the regular articles.
/// </param>
/// <remarks>
/// Shows a message box when an exception occurs; exceptions are not propagated to the caller.
/// A page without any article nodes yields only the headline item (if any), and an article node
/// without a title link is skipped instead of aborting the whole load.
/// </remarks>
public static async Task LoadItemsFromPage(NewsItemList plist, string url, string source, NewsItem hottest = null)
{
    try
    {
        if (!url.Contains("vnexpress.net"))
        {
            url = "http://vnexpress.net" + url;
        }

        plist.Clear();
        if (hottest != null)
        {
            hottest.Source = source;
        }

        string html = await HtmlDownloader.loadFromUrl(url);
        HtmlDocument page = new HtmlDocument();
        page.LoadHtml(html);

        var allItemNodes = page.DocumentNode.SelectNodes("//div[@class='folder-news']");

        getHottestNew(page, hottest);
        if (hottest != null)
        {
            plist.Add(hottest);
        }

        // SelectNodes yields null when nothing matches; there is nothing more to add in that case.
        if (allItemNodes == null)
        {
            return;
        }

        foreach (HtmlNode itemNode in allItemNodes)
        {
            HtmlNode nodeTitle = itemNode.SelectSingleNode(itemNode.XPath + "//a[@class='link-title14' and @href]");
            if (nodeTitle == null)
            {
                // No title link: the entry has neither a name nor a URL, so skip it.
                continue;
            }

            NewsItem article = new NewsItem();
            article.Source = source;

            HtmlNode nodeImage = itemNode.SelectSingleNode(itemNode.XPath + "//img[@src]");
            if (nodeImage != null)
            {
                article.ImageLink = nodeImage.Attributes["src"].Value;
            }

            article.Name = HtmlDownloader.removeHtml(nodeTitle.InnerText);
            article.LinkUrl = nodeTitle.Attributes["href"].Value;

            HtmlNode nodeTime = itemNode.SelectSingleNode(itemNode.XPath + "//span[@class='timeListHome']");
            if (nodeTime != null)
            {
                article.DatePublished = HtmlDownloader.removeHtml(nodeTime.InnerText);
            }

            HtmlNode nodeShortContent = itemNode.SelectSingleNode(itemNode.XPath + "//h3[@class='h3Lead']");
            if (nodeShortContent != null)
            {
                // Keep only the text before the first "<br", which cuts the related links out.
                string shortContent = nodeShortContent.InnerHtml;
                int cutIndex = shortContent.IndexOf("<br");
                if (cutIndex > 0)
                {
                    shortContent = shortContent.Substring(0, cutIndex);
                }

                // Strip '>' characters from the lead before the markup is removed.
                shortContent = shortContent.Replace(">", "");
                article.ShortContent = HtmlDownloader.removeHtml(shortContent);
            }

            plist.Add(article);
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show("System got an error at LoadItemsFromPage function with message:\n" + ex.Message);
        return;
    }
}
/// <summary>
/// Downloads a VnExpress-family article page and fills <paramref name="item"/> with its tags,
/// summary, publication date and HTML content.
/// </summary>
/// <param name="item">Article to populate.</param>
/// <param name="url">
/// Article URL. When it does not contain "http://" it is treated as site-relative and
/// prefixed with "http://vnexpress.net".
/// </param>
/// <remarks>
/// URLs containing "ione.vnexpress" are delegated to <c>LoadFromIOne</c> and URLs containing
/// "ngoisao.net" to <c>LoadFromNgoiSao</c>. For all other URLs the selectors depend on whether the
/// URL contains "http://vnexpress.net". Missing nodes are skipped; when the article body is missing,
/// <c>item.Content</c> is left untouched.
/// </remarks>
public static async Task LoadContentFrom(NewsItem item, string url)
{
    if (!url.Contains("http://"))
    {
        url = "http://vnexpress.net" + url;
    }

    string html = await HtmlDownloader.loadFromUrl(url);
    HtmlDocument page = new HtmlDocument();
    page.LoadHtml(html);

    if (url.Contains("ione.vnexpress"))
    {
        // NOTE(review): LoadFromIOne is not awaited here; if it returns a Task its errors go unobserved - confirm.
        LoadFromIOne(page, item);
        return;
    }

    if (url.Contains("ngoisao.net"))
    {
        await LoadFromNgoiSao(page, item);
        return;
    }

    // The main site and the other sub-sites use different class names for the same parts.
    bool isMainSite = url.Contains("http://vnexpress.net");

    HtmlNode nodeTag = isMainSite
        ? page.DocumentNode.SelectSingleNode("//div[@class='tag-pos']")
        : page.DocumentNode.SelectSingleNode("//div[@class='content_tagbar']");

    if (nodeTag != null)
    {
        var allNodeInTag = nodeTag.SelectNodes(nodeTag.XPath + "//a[@href]");
        if (allNodeInTag != null)
        {
            foreach (HtmlNode node in allNodeInTag)
            {
                ItemTag tag = new ItemTag()
                {
                    Title = HtmlDownloader.removeHtml(node.InnerText),
                    Link = node.Attributes["href"].Value
                };
                item.addToTagList(tag);
            }
        }
    }

    HtmlNode nodeDescription;
    HtmlNode nodeTime;
    if (isMainSite)
    {
        nodeDescription = page.DocumentNode.SelectSingleNode("//h2[@class='Lead']");
        nodeTime = page.DocumentNode.SelectSingleNode("//span[@class='spanTime']");
    }
    else
    {
        nodeDescription = page.DocumentNode.SelectSingleNode("//div[@class='short_intro']");
        nodeTime = page.DocumentNode.SelectSingleNode("//div[@class='time txt_666 left txt_11']");
    }

    // Both variants now tolerate a missing summary; the sub-site branch used to dereference it unchecked.
    if (nodeDescription != null)
    {
        item.ShortContent = HtmlDownloader.removeHtml(nodeDescription.InnerText);
    }

    if (nodeTime != null)
    {
        item.DatePublished = HtmlDownloader.removeHtml(nodeTime.InnerText);
        // NOTE(review): ptichDate is defined elsewhere; presumably it post-processes DatePublished - confirm.
        ptichDate(item);
    }

    HtmlNode nodeContent = page.DocumentNode.SelectSingleNode("//div[@class='fck_detail']");
    if (nodeContent == null)
    {
        return;
    }

    // Disable all links inside the article body by blanking their href.
    var allNodeInContent = nodeContent.SelectNodes(nodeContent.XPath + "//*[@href]");
    if (allNodeInContent != null)
    {
        foreach (HtmlNode node in allNodeInContent)
        {
            node.SetAttributeValue("href", "");
        }
    }

    item.Content = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head>";
    item.Content += "<p>" + item.DatePublished + "</p>";
    item.Content += "<p><b>Tóm tắt nội dung:</b> <i>" + item.ShortContent + "</i></p>";
    item.Content += nodeContent.InnerHtml;
    item.Content = WebUtility.HtmlEncode(item.Content);
}
/// <summary>
/// Gets the list of articles from a dantri.com.vn category URL and puts them into <paramref name="plist"/>.
/// </summary>
/// <param name="plist">Target list; it is cleared first and then filled.</param>
/// <param name="url">Category URL; when it does not contain "http://" it is prefixed with "http://dantri.com.vn".</param>
/// <param name="source">Source label stored on every article (and on <paramref name="hottest"/>).</param>
/// <param name="hottest">
/// Optional item for the headline article; it is added to the list first when
/// <c>getHottestNew</c> returns true and the item is not null.
/// </param>
/// <remarks>
/// Shows a message box when an exception occurs; exceptions are not propagated to the caller.
/// Entries without any anchor or without a link URL are skipped instead of aborting the whole
/// load, and entries whose link contains "tuyensinh.dantri" are skipped as before.
/// </remarks>
public static async Task LoadItemsFromPage(NewsItemList plist, string url, string source, NewsItem hottest = null)
{
    try
    {
        if (!url.Contains("http://"))
        {
            url = "http://dantri.com.vn" + url;
        }

        plist.Clear();
        if (hottest != null)
        {
            hottest.Source = source;
        }

        string html = await HtmlDownloader.loadFromUrl(url);
        HtmlDocument page = new HtmlDocument();
        page.LoadHtml(html);

        var allItemNodes = page.DocumentNode.SelectNodes("//div[@class='mt3 clearfix']");

        // getHottestNew is always called; its result decides whether the headline item is listed.
        if (getHottestNew(page, hottest) && hottest != null)
        {
            plist.Add(hottest);
        }

        if (allItemNodes == null)
        {
            return;
        }

        foreach (HtmlNode itemNode in allItemNodes)
        {
            var anchorNodes = itemNode.SelectNodes(itemNode.XPath + "//a");
            if (anchorNodes == null)
            {
                // No anchors at all: neither a link nor a title can be found for this entry.
                continue;
            }

            NewsItem article = new NewsItem();
            article.Source = source;

            HtmlNode nodeImage = itemNode.SelectSingleNode(itemNode.XPath + "//img[@src]");
            if (nodeImage != null)
            {
                article.ImageLink = nodeImage.Attributes["src"].Value;
            }

            // The first href found becomes the link. The name comes from the first title attribute,
            // or else from the first non-image anchor whose text is longer than 10 characters.
            foreach (HtmlNode node in anchorNodes)
            {
                if (node.Attributes.Contains("href") && string.IsNullOrEmpty(article.LinkUrl))
                {
                    article.LinkUrl = node.Attributes["href"].Value;
                }

                if (node.Attributes.Contains("title") && string.IsNullOrEmpty(article.Name))
                {
                    article.Name = HtmlDownloader.removeHtml(node.Attributes["title"].Value);
                }

                if (!node.InnerHtml.Contains("<img")
                    && node.InnerText.Length > 10
                    && string.IsNullOrEmpty(article.Name))
                {
                    article.Name = HtmlDownloader.removeHtml(node.InnerText);
                }
            }

            // Skip entries that ended up without a link, and entries pointing to tuyensinh.dantri.
            if (string.IsNullOrEmpty(article.LinkUrl) || article.LinkUrl.Contains("tuyensinh.dantri"))
            {
                continue;
            }

            HtmlNode nodeShortContent = itemNode.SelectSingleNode(itemNode.XPath + "//div[@class='fon5 wid324 fl']");
            if (nodeShortContent == null)
            {
                nodeShortContent = itemNode.SelectSingleNode(itemNode.XPath + "//div[@class='fon5 fl']");
            }

            if (nodeShortContent != null)
            {
                // Keep only the text before the first "<br", which cuts the related links out.
                string shortContent = nodeShortContent.InnerHtml;
                int cutIndex = shortContent.IndexOf("<br");
                if (cutIndex > 0)
                {
                    shortContent = shortContent.Substring(0, cutIndex);
                }

                // Strip '>' characters from the summary before the markup is removed.
                shortContent = shortContent.Replace(">", "");
                article.ShortContent = HtmlDownloader.removeHtml(shortContent);
            }

            plist.Add(article);
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show("System got an error at LoadItemsFromPage function with message:\n" + ex.Message);
        return;
    }
}