示例#1
0
        /// <summary>
        /// Downloads a dantri.com.vn article page and fills <paramref name="item"/> with its
        /// tags, publication date and HTML-encoded content.
        /// </summary>
        /// <param name="item">News item to populate; its ShortContent is read when the content is built.</param>
        /// <param name="url">Absolute article URL, or a path relative to http://dantri.com.vn.</param>
        /// <returns>A task that completes once the item has been populated.</returns>
        public static async Task LoadContentFrom(NewsItem item, string url)
        {
            // Only site-relative paths get the host prepended; prefixing an https URL would corrupt it.
            if (!url.Contains("http://") && !url.Contains("https://"))
            {
                url = "http://dantri.com.vn" + url;
            }

            string html = await HtmlDownloader.loadFromUrl(url);

            HtmlDocument page = new HtmlDocument();
            page.LoadHtml(html);

            HtmlNode nodeContent = page.DocumentNode.SelectSingleNode("//div[@class='fon34 mt3 mr2 fon43']");
            HtmlNode nodeTag     = page.DocumentNode.SelectSingleNode("//div[@class='news-tag']");

            // Collect every link inside the tag block as an item tag.
            if (nodeTag != null)
            {
                var tagLinks = nodeTag.SelectNodes(nodeTag.XPath + "//a[@href]");
                if (tagLinks != null)
                {
                    foreach (HtmlNode node in tagLinks)
                    {
                        item.addToTagList(new ItemTag()
                        {
                            Title = HtmlDownloader.removeHtml(node.InnerText),
                            Link  = node.Attributes["href"].Value
                        });
                    }
                }
            }

            // The time node is optional: a layout change must not abort the whole load.
            HtmlNode nodeTime = page.DocumentNode.SelectSingleNode("//span[@class='fr fon7 mr2']");
            if (nodeTime != null)
            {
                item.DatePublished = HtmlDownloader.removeHtml(nodeTime.InnerText);
                // ptichDate is defined elsewhere; presumably it derives a normalized date from DatePublished.
                ptichDate(item);
            }

            if (nodeContent == null)
            {
                // No article body was found: leave item.Content untouched instead of throwing.
                return;
            }

            // Blank out every href so links inside the article are not navigable.
            var linkedNodes = nodeContent.SelectNodes(nodeContent.XPath + "//*[@href]");
            if (linkedNodes != null)
            {
                foreach (HtmlNode node in linkedNodes)
                {
                    node.SetAttributeValue("href", "");
                }
            }

            // Cut the body off where the tag block starts, when the block is embedded in the content.
            int positionToDel = nodeContent.InnerHtml.IndexOf("<div class=\"news-tag\">");
            if (positionToDel > 0)
            {
                nodeContent.InnerHtml = nodeContent.InnerHtml.Substring(0, positionToDel);
            }

            // Strip any remaining copy of the tag block; nodeTag may be absent on pages without tags.
            string body = nodeContent.InnerHtml;
            if (nodeTag != null)
            {
                body = body.Replace(nodeTag.OuterHtml, "");
            }

            string content = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head>";
            content += "<p>" + item.DatePublished + "</p>";
            content += "<p><b>Tóm tắt nội dung:</b> <i>" + item.ShortContent + "</i></p>";
            content += body;

            item.Content = WebUtility.HtmlEncode(content);
        }
示例#2
0
        /// <summary>
        /// Fills <paramref name="item"/> from an already-parsed ngoisao.net article page.
        /// The date, tags and summary come from <paramref name="page"/>; the article body is
        /// downloaded separately from the site's print view, using the article id found in the page.
        /// </summary>
        /// <param name="page">Parsed article page.</param>
        /// <param name="item">News item to populate.</param>
        /// <returns>A task that completes once the item has been populated.</returns>
        public static async Task LoadFromNgoiSao(HtmlDocument page, NewsItem item)
        {
            // Publication time.
            HtmlNode nodeTime = page.DocumentNode.SelectSingleNode("//span[@class='spanDateTime fl']");
            if (nodeTime != null)
            {
                item.DatePublished = HtmlDownloader.removeHtml(nodeTime.InnerText);
                // ptichDate is defined elsewhere; presumably it derives a normalized date from DatePublished.
                ptichDate(item);
            }

            // Tags: every link inside the tag block.
            HtmlNode nodeTag = page.DocumentNode.SelectSingleNode("//div[@class='wordTag']");
            if (nodeTag != null)
            {
                var tagLinks = nodeTag.SelectNodes(nodeTag.XPath + "//a[@href]");
                if (tagLinks != null)
                {
                    foreach (HtmlNode node in tagLinks)
                    {
                        item.addToTagList(new ItemTag()
                        {
                            Title = HtmlDownloader.removeHtml(node.InnerText),
                            Link  = node.Attributes["href"].Value
                        });
                    }
                }
            }

            // Succinct content (lead paragraph).
            HtmlNode nodeDescription = page.DocumentNode.SelectSingleNode("//h2[@class='lead']");
            if (nodeDescription != null)
            {
                item.ShortContent = HtmlDownloader.removeHtml(nodeDescription.InnerText);
            }

            // The print view is addressed by article id; without the id there is nothing to download.
            HtmlNode nodeId = page.DocumentNode.SelectSingleNode("//meta[@name='tt_article_id']");
            if (nodeId == null || nodeId.Attributes["content"] == null)
            {
                return;
            }

            string id   = nodeId.Attributes["content"].Value;
            string html = await HtmlDownloader.loadFromUrl("http://ngoisao.net/detail/print?id=" + id);

            HtmlDocument printPage = new HtmlDocument();
            printPage.LoadHtml(html);

            // Article body from the print view.
            HtmlNode nodeContent = printPage.DocumentNode.SelectSingleNode("//div[@class='fck_detail']");
            if (nodeContent == null)
            {
                // No article body was found: leave item.Content untouched instead of throwing.
                return;
            }

            // Blank out every href so links inside the article are not navigable.
            var linkedNodes = nodeContent.SelectNodes(nodeContent.XPath + "//*[@href]");
            if (linkedNodes != null)
            {
                foreach (HtmlNode node in linkedNodes)
                {
                    node.SetAttributeValue("href", "");
                }
            }

            string content = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head>";
            content += "<p>" + item.DatePublished + "</p>";
            content += "<p><b>Tóm tắt nội dung:</b> <i>" + item.ShortContent + "</i></p>";
            content += nodeContent.InnerHtml;

            item.Content = WebUtility.HtmlEncode(content);
        }
示例#3
0
        /// <summary>
        /// Downloads an article page and fills <paramref name="item"/> with its tags, summary,
        /// publication date and HTML-encoded content. Pages from ione.vnexpress and ngoisao.net
        /// are delegated to their dedicated loaders; everything else is parsed here.
        /// </summary>
        /// <param name="item">News item to populate.</param>
        /// <param name="url">Absolute article URL, or a path relative to http://vnexpress.net.</param>
        /// <returns>A task that completes once the item has been populated.</returns>
        public static async Task LoadContentFrom(NewsItem item, string url)
        {
            // Only site-relative paths get the host prepended; prefixing an https URL would corrupt it.
            if (!url.Contains("http://") && !url.Contains("https://"))
            {
                url = "http://vnexpress.net" + url;
            }

            string html = await HtmlDownloader.loadFromUrl(url);

            HtmlDocument page = new HtmlDocument();
            page.LoadHtml(html);

            if (url.Contains("ione.vnexpress"))
            {
                LoadFromIOne(page, item);
                return;
            }

            if (url.Contains("ngoisao.net"))
            {
                await LoadFromNgoiSao(page, item);
                return;
            }

            // Main-site pages and the remaining pages use different class names for the same parts.
            bool isMainSite = url.Contains("http://vnexpress.net");

            // Tags: every link inside the tag block.
            HtmlNode nodeTag = page.DocumentNode.SelectSingleNode(
                isMainSite ? "//div[@class='tag-pos']" : "//div[@class='content_tagbar']");
            if (nodeTag != null)
            {
                var tagLinks = nodeTag.SelectNodes(nodeTag.XPath + "//a[@href]");
                if (tagLinks != null)
                {
                    foreach (HtmlNode node in tagLinks)
                    {
                        item.addToTagList(new ItemTag()
                        {
                            Title = HtmlDownloader.removeHtml(node.InnerText),
                            Link  = node.Attributes["href"].Value
                        });
                    }
                }
            }

            // Summary and publication time; both are optional in either layout.
            HtmlNode nodeDescription = page.DocumentNode.SelectSingleNode(
                isMainSite ? "//h2[@class='Lead']" : "//div[@class='short_intro']");
            if (nodeDescription != null)
            {
                item.ShortContent = HtmlDownloader.removeHtml(nodeDescription.InnerText);
            }

            HtmlNode nodeTime = page.DocumentNode.SelectSingleNode(
                isMainSite ? "//span[@class='spanTime']" : "//div[@class='time txt_666 left txt_11']");
            if (nodeTime != null)
            {
                item.DatePublished = HtmlDownloader.removeHtml(nodeTime.InnerText);
                // ptichDate is defined elsewhere; presumably it derives a normalized date from DatePublished.
                ptichDate(item);
            }

            HtmlNode nodeContent = page.DocumentNode.SelectSingleNode("//div[@class='fck_detail']");
            if (nodeContent == null)
            {
                // No article body was found: item.Content is left untouched.
                return;
            }

            // Blank out every href so links inside the article are not navigable.
            var linkedNodes = nodeContent.SelectNodes(nodeContent.XPath + "//*[@href]");
            if (linkedNodes != null)
            {
                foreach (HtmlNode node in linkedNodes)
                {
                    node.SetAttributeValue("href", "");
                }
            }

            string content = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head>";
            content += "<p>" + item.DatePublished + "</p>";
            content += "<p><b>Tóm tắt nội dung:</b> <i>" + item.ShortContent + "</i></p>";
            content += nodeContent.InnerHtml;

            item.Content = WebUtility.HtmlEncode(content);
        }
示例#4
0
        /// <summary>
        /// Fills <paramref name="item"/> from an already-parsed ione.vnexpress.net article page:
        /// publication date (raw and normalized), tags, summary and HTML-encoded content.
        /// </summary>
        /// <param name="page">Parsed article page.</param>
        /// <param name="item">News item to populate.</param>
        private static void LoadFromIOne(HtmlDocument page, NewsItem item)
        {
            // Publication time.
            HtmlNode nodeTime = page.DocumentNode.SelectSingleNode("//div[@class='time left']");
            if (nodeTime != null)
            {
                string date = HtmlDownloader.removeHtml(nodeTime.InnerText);
                item.DatePublished = date;

                // NOTE(review): the parsing below assumes text shaped like "<time> | d/M/yyyy" — confirm against the site.
                date = date.Replace("AM", "");
                date = date.Replace("PM", "");
                string[] time = date.Split('|');
                if (time.Length == 2)
                {
                    time[0] = HtmlDownloader.removeHtml(time[0]);
                    time[1] = HtmlDownloader.removeHtml(time[1]);
                    string[] day = time[1].Split('/');
                    if (day.Length == 3)
                    {
                        // Zero-pad single-digit month and day so the concatenated date has a fixed width.
                        if (day[1].Length == 1)
                        {
                            day[1] = "0" + day[1];
                        }
                        if (day[0].Length == 1)
                        {
                            // Pad the day with itself; the previous code used day[1] and overwrote the day with the month.
                            day[0] = "0" + day[0];
                        }
                        // Reorder the three parts: third, second, first.
                        time[1] = day[2] + day[1] + day[0];
                    }
                    item.DateStandard = time[1] + time[0];
                }
            }

            // Tags: every link inside the tag block, when the page has one.
            HtmlNode nodeTag = page.DocumentNode.SelectSingleNode("//div[@class='left w600 list_tags']");
            if (nodeTag != null)
            {
                var tagLinks = nodeTag.SelectNodes(nodeTag.XPath + "//a[@href]");
                if (tagLinks != null)
                {
                    foreach (HtmlNode node in tagLinks)
                    {
                        item.addToTagList(new ItemTag()
                        {
                            Title = HtmlDownloader.removeHtml(node.InnerText),
                            Link  = node.Attributes["href"].Value
                        });
                    }
                }
            }

            // Succinct content (lead paragraph); optional.
            HtmlNode nodeDescription = page.DocumentNode.SelectSingleNode("//div[@class='lead']");
            if (nodeDescription != null)
            {
                item.ShortContent = HtmlDownloader.removeHtml(nodeDescription.InnerText);
            }

            // Article body.
            HtmlNode nodeContent = page.DocumentNode.SelectSingleNode("//div[@class='content']");
            if (nodeContent == null)
            {
                // No article body was found: leave item.Content untouched instead of throwing.
                return;
            }

            // Blank out every href so links inside the article are not navigable.
            var linkedNodes = nodeContent.SelectNodes(nodeContent.XPath + "//*[@href]");
            if (linkedNodes != null)
            {
                foreach (HtmlNode node in linkedNodes)
                {
                    node.SetAttributeValue("href", "");
                }
            }

            string content = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"></head>";
            content += "<p>" + item.DatePublished + "</p>";
            content += "<p><b>Tóm tắt nội dung:</b> <i>" + item.ShortContent + "</i></p>";
            content += nodeContent.InnerHtml;

            item.Content = WebUtility.HtmlEncode(content);
        }