Пример #1
0
        /// <summary>
        /// Transcodes the page at <paramref name="url"/> with NReadability and shows the result
        /// in the three rich text boxes: title in <c>richTextBox1</c>, the transcoder's second
        /// out value in <c>richTextBox2</c>, and the article text in <c>richTextBox3</c>.
        /// </summary>
        /// <remarks>
        /// Nothing is displayed when the transcoder reports that no main content was extracted.
        /// Missing nodes (or a null out string) are shown as empty text instead of raising a
        /// <see cref="System.NullReferenceException"/>.
        /// </remarks>
        /// <param name="url">Address of the page to fetch and transcode.</param>
        void GetNewsText(string url)
        {
            NReadabilityWebTranscoder transcoder = new NReadability.NReadabilityWebTranscoder();
            bool   extracted;
            string detail;

            string page = transcoder.Transcode(url, DomSerializationParams.CreateDefault(), out extracted, out detail);

            if (!extracted)
            {
                return;
            }

            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(page);

            // SelectSingleNode yields null when the XPath matches nothing, so each result
            // is checked before InnerText is read.
            var bodyNode  = doc.DocumentNode.SelectSingleNode("//div[@id='readInner']");
            var titleNode = doc.DocumentNode.SelectSingleNode("//title");

            string body  = bodyNode != null ? bodyNode.InnerText : "";
            string title = titleNode != null ? titleNode.InnerText : "";

            richTextBox1.Text = title.Trim();
            // NOTE(review): the out string is presumably the "next page" URL and may be null
            // when there is none - confirm against the NReadability overload in use.
            richTextBox2.Text = (detail ?? "").Trim();
            richTextBox3.Text = body.Trim();
        }
        /// <summary>
        /// Transcodes the page at <paramref name="url"/> with NReadability and returns the inner
        /// HTML of the <c>div</c> element whose id is <c>readInner</c>.
        /// </summary>
        /// <param name="url">Address of the page to fetch and transcode.</param>
        /// <returns>The inner HTML of the single <c>readInner</c> div.</returns>
        /// <exception cref="System.InvalidOperationException">
        /// The transcoded document contains no <c>readInner</c> div, or more than one.
        /// </exception>
        public string GetContent(string url)
        {
            // The overload requires the out flag; its value is not consulted here.
            bool mainContentFound;
            string transcodedHtml = new NReadabilityWebTranscoder().Transcode(url, out mainContentFound);

            var parsed = new HtmlAgilityPack.HtmlDocument();
            parsed.LoadHtml(transcodedHtml);

            var readInnerDiv = parsed.DocumentNode
                .Descendants("div")
                .Where(div => div.Id == "readInner")
                .Single();

            return readInnerDiv.InnerHtml;
        }
		/// <summary>
		/// Initializes a new instance of the <see cref="WebPageDataService"/> class and
		/// immediately transcodes the page at <paramref name="incomingConnectionURL"/> with NReadability.
		/// </summary>
		/// <remarks>
		/// Any exception raised while transcoding is logged and swallowed; in that case none of
		/// the extracted-content assignments inside the try block are performed.
		/// </remarks>
		/// <param name="incomingConnectionURL">URL of the web page to fetch and transcode.</param>
		/// <param name="logger">Logger that receives transcoding errors.</param>
		public WebPageDataService (string incomingConnectionURL, Logger logger)
		{
			Log = logger;
			ConnectionURL = incomingConnectionURL;

			// Page scraping is done with NReadability. Earlier idea, kept for reference: use a
			// REST service for images etc. and NReadability for the article itself
			// (https://www.mashape.com/pbkwee/html2text + http://scraper.io/).

			try {
				var transcoder = new NReadabilityWebTranscoder ();
				var request = new WebTranscodingInput (ConnectionURL);
				var outcome = transcoder.Transcode (request);

				ExtractedContent = outcome.ExtractedContent;
				ExtractedTitle = outcome.ExtractedTitle;
				// No image is extracted here; the property is set to an empty string.
				ExtractedImage = "";
			} catch (Exception failure) {
				Log.ErrorException ("Error", failure);
			}
		}
Пример #4
0
        /// <summary>
        /// Transcodes the page at <paramref name="url"/> with NReadability, shows the result in
        /// the rich text boxes and returns the article text.
        /// </summary>
        /// <param name="url">Address of the page to fetch and transcode.</param>
        /// <returns>
        /// The inner text of the <c>readInner</c> div, or an empty string when the transcoder
        /// reports that no main content was extracted or the div is absent.
        /// </returns>
        private string getcontent(string url)
        {
            var    t = new NReadability.NReadabilityWebTranscoder();
            bool   extracted;
            string detail = "";
            string page   = t.Transcode(url, DomSerializationParams.CreateDefault(), out extracted, out detail);

            if (!extracted)
            {
                return("");
            }

            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(page);

            // SelectSingleNode yields null when nothing matches; treat that like a failed
            // extraction instead of dereferencing null. The former unused title lookup was
            // removed because it could throw for a value that was never used.
            var contentNode = doc.DocumentNode.SelectSingleNode("//div[@id='readInner']");
            if (contentNode == null)
            {
                return("");
            }

            string text = contentNode.InnerText;
            richTextBox2.Text = detail;
            richTextBox1.Text = text;
            return(text);
        }
Пример #5
0
        /// <summary>
        /// Runs NReadability over <paramref name="url"/> using a <c>UrlFetcher</c> built from
        /// <paramref name="content"/>, and packs title, og:image URL and article text into a
        /// <c>CleanText</c>.
        /// </summary>
        /// <param name="url">Address passed to the transcoder and stored in the result.</param>
        /// <param name="content">Value handed to the <c>UrlFetcher</c> constructor.</param>
        /// <returns>
        /// The extracted data; parts missing from the document are empty strings. When the
        /// transcoder reports failure or any exception is thrown, the title is
        /// "Content not found" (on exception the <c>Image</c> field carries the exception text).
        /// </returns>
        private CleanText getCleanText_Old(string url, string content)
        {
            var webTranscoder = new NReadabilityWebTranscoder(new NReadabilityTranscoder(), new UrlFetcher(content));
            try
            {
                bool extracted;
                string html = webTranscoder.Transcode(url, out extracted);

                if (!extracted)
                {
                    return new CleanText { Title = "Content not found", Image = "", Content = "", Url = url, FetchDate = DateTime.Now };
                }

                var parsed = new HtmlAgilityPack.HtmlDocument();
                parsed.LoadHtml(html);
                var root = parsed.DocumentNode;

                // Page title, empty when the document has no <title>.
                var titleNode = root.SelectSingleNode("//title");
                string pageTitle = titleNode != null ? titleNode.InnerText : "";

                // Open Graph image, empty when the meta tag or its content attribute is absent.
                string imageUrl = "";
                var ogImageNode = root.SelectSingleNode("//meta[@property='og:image']");
                if (ogImageNode != null)
                {
                    var contentAttribute = ogImageNode.Attributes["content"];
                    if (contentAttribute != null)
                    {
                        imageUrl = contentAttribute.Value;
                    }
                }

                // Article text, empty when the readInner container is absent.
                var articleNode = root.SelectSingleNode("//div[@id='readInner']");
                string articleText = articleNode != null ? articleNode.InnerText : "";

                return new CleanText { Title = pageTitle, Image = imageUrl, Content = articleText, Url = url, FetchDate = DateTime.Now };
            }
            catch (Exception failure)
            {
                return new CleanText { Title = "Content not found", Image = failure.ToStringBetter(), Content = "", Url = url, FetchDate = DateTime.Now };
            }
        }
Пример #6
0
        /// <summary>
        /// Builds an HTML fragment that embeds the resource found at <paramref name="url"/>.
        /// </summary>
        /// <remarks>
        /// Strategies are tried in this order:
        /// 1. YouTube URLs become an iframe pointing at the video id.
        /// 2. The embed.ly oEmbed service: photos become an img tag (capped at width 700),
        ///    other types return the service's "html" value.
        /// 3. NReadability fetching the URL itself.
        /// 4. Only if step 3 throws: download, sanitize and transcode the markup.
        /// 5. Only if step 4 throws: additionally repair the markup with Tidy before transcoding.
        /// Failures of the network and parsing steps are deliberately swallowed (best effort).
        /// </remarks>
        /// <param name="url">Address of the resource to embed.</param>
        /// <param name="title">
        /// Receives the best title found; starts out as <paramref name="url"/>.
        /// </param>
        /// <returns>The HTML fragment, or an empty string when every strategy failed.</returns>
        private static string embed(string url, out string title)
        {
            title = url;
            Console.WriteLine("Embeding " + url);
            if (url.Contains("youtube.com/"))
            {
                Regex s = new Regex(@"^.*((youtu.be\/)|(v\/)|(embed\/)|(watch\?))\??v?=?([^#\&\?]*).*");
                Match ss = s.Match(url);
                if (ss.Success)
                {
                    // Group 6 is the video id.
                    return " <iframe width=\"560\" height=\"349\" src=\"http://www.youtube.com/embed/" + ss.Groups[6].Value + "\" frameborder=\"0\" allowfullscreen></iframe> ";
                }
            }

            // The target travels as a query-string value, so it must be escaped; otherwise
            // any '&', '#' or '?' inside it would be read as part of the oEmbed request itself.
            string embedlyurl = "http://api.embed.ly/1/oembed?url=" + Uri.EscapeDataString(url);
            string embedoutput = "";
            try
            {
                embedoutput = c.DownloadString(embedlyurl);
            }
            catch (Exception)
            {
                // Best effort: with no oEmbed answer the regexes below simply do not match
                // and the readability fallbacks take over.
            }
            Regex html = new Regex(@"""html"": ""(.*?)\"",");
            Regex urlre = new Regex(@"""url"": ""(.*?)\"",.*?""width"": (.*?),");
            Regex type = new Regex(@"""type"": ""(.*?)""");
            Regex titles = new Regex(@"""title"": ""(.*?)""");
            Match embtype = type.Match(embedoutput);
            if (embtype.Success)
            {
                Match embtitle = titles.Match(embedoutput);
                if (embtitle.Success)
                {
                    title = embtitle.Groups[1].Value;
                }

                if (embtype.Groups[1].Value == "photo")
                {
                    Match emburl = urlre.Match(embedoutput);
                    if (emburl.Success)
                    {
                        int iwidth = 0;
                        int.TryParse(emburl.Groups[2].Value, out iwidth);
                        if (iwidth > 700)
                        {
                            // Cap wide photos at 700 pixels.
                            return "<img src=\"" + emburl.Groups[1].Value + "\" width=\"700\"/>";
                        }
                        return "<img src=\"" + emburl.Groups[1].Value + "\"/>";
                    }
                }
                else
                {
                    Match embhtml = html.Match(embedoutput);
                    if (embhtml.Success)
                    {
                        // NOTE(review): this is the raw JSON string content; escape sequences
                        // are not decoded here - confirm callers expect that.
                        return embhtml.Groups[1].Value;
                    }
                }
            }

            try
            {
                NReadabilityWebTranscoder rdw = new NReadabilityWebTranscoder();

                bool extracted = false;
                string transcoded = rdw.Transcode(url, out extracted);
                if (extracted)
                {
                    transcoded = StripToBodyWithoutH1(transcoded);
                    string realhtml = c.DownloadString(url);
                    Regex regexs = new Regex("<title>(.*?)</title>",
                        RegexOptions.IgnoreCase);
                    Match match = regexs.Match(realhtml);
                    if (match.Success)
                    {
                        title = match.Groups[1].Value;
                    }
                    return transcoded;
                }
            }
            catch (Exception)
            {
                // First fallback: download the markup ourselves and sanitize it before transcoding.
                try
                {
                    string realhtml = c.DownloadString(url);
                    title = ReadHeadTitle(realhtml, title);
                    realhtml = SanitizeXmlString(realhtml);
                    bool extracted = false;

                    NReadabilityTranscoder rd = new NReadabilityTranscoder();
                    string transcoded = rd.Transcode(realhtml, url, out extracted);
                    if (extracted)
                    {
                        return StripToBodyWithoutH1(transcoded);
                    }
                }
                catch (Exception)
                {
                    // Second fallback: also let Tidy repair the markup into XHTML first.
                    try
                    {
                        string realhtml = SanitizeXmlString(c.DownloadString(url));
                        title = ReadHeadTitle(realhtml, title);
                        bool extracted = false;
                        using (Document doc = Document.FromString(realhtml))
                        {
                            doc.ShowWarnings = false;
                            doc.Quiet = true;
                            doc.OutputXhtml = true;
                            doc.CleanAndRepair();
                            realhtml = doc.Save();
                        }
                        NReadabilityTranscoder rd = new NReadabilityTranscoder();
                        string transcoded = rd.Transcode(realhtml, url, out extracted);
                        if (extracted)
                        {
                            return StripToBodyWithoutH1(transcoded);
                        }
                    }
                    catch (Exception)
                    {
                        // Best effort: every strategy failed, fall through to the empty result.
                    }
                }
            }
            return "";
        }

        /// <summary>
        /// Reduces transcoded markup to a bare html/body wrapper around the body content (when a
        /// body element is found) and removes every h1 element.
        /// </summary>
        private static string StripToBodyWithoutH1(string transcoded)
        {
            Match bodym = new Regex("<body>(.*?)</body>", RegexOptions.Singleline).Match(transcoded);
            if (bodym.Success)
            {
                transcoded = "<html><body>" + bodym.Groups[1].Value + "</body></html>";
            }
            return new Regex("(<h1>.*?</h1>)", RegexOptions.Singleline).Replace(transcoded, "");
        }

        /// <summary>
        /// Returns the text of the title element inside the head of <paramref name="html"/>, or
        /// <paramref name="fallback"/> when the pattern does not match.
        /// </summary>
        private static string ReadHeadTitle(string html, string fallback)
        {
            Regex regexs = new Regex(".*<head>.*<title>(.*)</title>.*</head>.*",
                RegexOptions.IgnoreCase);
            Match match = regexs.Match(html);
            // Group 1 is the title text; group 0 would be the whole matched markup.
            return match.Success ? match.Groups[1].Value : fallback;
        }
Пример #7
0
        /// <summary>
        /// Transcodes the page at <paramref name="url"/> with NReadability and returns its
        /// title, og:image URL and article text.
        /// </summary>
        /// <param name="url">Address of the page to fetch and transcode.</param>
        /// <returns>
        /// A tuple of (title, image URL, article text). A missing title or og:image yields an
        /// empty string for that item. On failure the first item is "#FAIL#"; the second item
        /// then carries a diagnostic (empty when the transcoder merely reported no content).
        /// </returns>
        private static Tuple<string, string, string> getCleanText(string url)
        {
            var transcoder = new NReadabilityWebTranscoder();
            try
            {
                bool success;
                string text = transcoder.Transcode(url, out success);

                if (!success)
                {
                    return new Tuple<string, string, string>("#FAIL#", "", "");
                }

                HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(text);

                // The article container is the one mandatory part: report its absence as an
                // explicit failure rather than through an incidental NullReferenceException.
                var contentNode = doc.DocumentNode.SelectSingleNode("//div[@id='readInner']");
                if (contentNode == null)
                {
                    return new Tuple<string, string, string>("#FAIL#", "Content container 'readInner' not found", "");
                }

                // Title and image are optional metadata; their absence must not discard the article.
                var titleNode = doc.DocumentNode.SelectSingleNode("//title");
                var title = titleNode != null ? titleNode.InnerText : "";

                var imgUrl = "";
                var imgNode = doc.DocumentNode.SelectSingleNode("//meta[@property='og:image']");
                if (imgNode != null)
                {
                    var contentAttribute = imgNode.Attributes["content"];
                    if (contentAttribute != null)
                    {
                        imgUrl = contentAttribute.Value;
                    }
                }

                return new Tuple<string, string, string>(title, imgUrl, contentNode.InnerText);
            }
            catch (Exception ex)
            {
                return new Tuple<string, string, string>("#FAIL#", ex.ToStringBetter(), "");
            }
        }