/// <summary>
/// Transcodes the page at <paramref name="url"/> with NReadability and shows the
/// result in the form's three rich-text boxes: the page title in
/// <c>richTextBox1</c>, the transcoder's second out value in <c>richTextBox2</c>,
/// and the plain text of the <c>readInner</c> div in <c>richTextBox3</c>.
/// </summary>
/// <param name="url">Address of the page to transcode.</param>
/// <remarks>
/// Nothing is displayed when the transcoder's success flag comes back false.
/// A missing title element, a missing <c>readInner</c> div, or a null second out
/// value is shown as empty text instead of raising a NullReferenceException.
/// </remarks>
void GetNewsText(string url)
{
    var transcoder = new NReadability.NReadabilityWebTranscoder();
    bool extracted;
    string detail; // NOTE(review): meaning of this out value is defined by NReadability - confirm
    string page = transcoder.Transcode(url, DomSerializationParams.CreateDefault(), out extracted, out detail);
    if (!extracted)
    {
        return;
    }

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(page);

    // SelectSingleNode yields null when the XPath matches nothing, so each
    // lookup is checked before its text is read.
    var bodyNode = doc.DocumentNode.SelectSingleNode("//div[@id='readInner']");
    var titleNode = doc.DocumentNode.SelectSingleNode("//title");

    string body = bodyNode != null ? bodyNode.InnerText : "";
    string title = titleNode != null ? titleNode.InnerText : "";

    richTextBox1.Text = title.Trim();
    richTextBox2.Text = (detail ?? "").Trim();
    richTextBox3.Text = body.Trim();
}
/// <summary>
/// Transcodes the page at <paramref name="url"/> with NReadability and returns the
/// inner HTML of the <c>div</c> whose id is <c>readInner</c>.
/// </summary>
/// <param name="url">Address of the page to transcode.</param>
/// <returns>Inner HTML of the single matching <c>div</c>.</returns>
/// <remarks>
/// The transcoder's success flag is received but not inspected. <c>Single</c>
/// throws when the transcoded document does not contain exactly one matching div.
/// </remarks>
public string GetContent(string url)
{
    bool mainContentExtracted;
    NReadabilityWebTranscoder transcoder = new NReadabilityWebTranscoder();
    string transcodedHtml = transcoder.Transcode(url, out mainContentExtracted);

    HtmlAgilityPack.HtmlDocument parsed = new HtmlAgilityPack.HtmlDocument();
    parsed.LoadHtml(transcodedHtml);

    var readInnerDiv = parsed.DocumentNode
        .Descendants("div")
        .Single(div => div.Id == "readInner");

    return readInnerDiv.InnerHtml;
}
/// <summary>
/// Initializes a new instance of the <see cref="WebPageDataService"/> class and
/// immediately transcodes the page at the given URL with NReadability.
/// </summary>
/// <param name="incomingConnectionURL">URL of the page to transcode; stored in <c>ConnectionURL</c>.</param>
/// <param name="logger">Logger stored in <c>Log</c> and used to report a failed transcode.</param>
/// <remarks>
/// Any exception raised while transcoding is logged and swallowed; members that
/// had not been assigned at that point are left untouched.
/// </remarks>
public WebPageDataService (string incomingConnectionURL, Logger logger)
{
	this.Log = logger;
	this.ConnectionURL = incomingConnectionURL;

	// DONE: implement a page scraper via NReadability or an API.
	// Idea: use a REST service for the image etc. and NReadability for the article itself:
	// https://www.mashape.com/pbkwee/html2text + http://scraper.io/
	try {
		var transcoder = new NReadabilityWebTranscoder ();
		var input = new WebTranscodingInput (this.ConnectionURL);
		var result = transcoder.Transcode (input);

		this.ExtractedContent = result.ExtractedContent;
		this.ExtractedTitle = result.ExtractedTitle;
		// No image is extracted here; the member is set to an empty string.
		this.ExtractedImage = "";
	} catch (Exception ex) {
		Log.ErrorException ("Error", ex);
	}
}
/// <summary>
/// Transcodes the page at <paramref name="url"/> with NReadability, shows the
/// result in the form, and returns the plain text of the <c>readInner</c> div.
/// </summary>
/// <param name="url">Address of the page to transcode.</param>
/// <returns>
/// The inner text of the <c>readInner</c> div, or an empty string when the
/// transcoder's success flag is false or the div is not present.
/// </returns>
/// <remarks>
/// On success <c>richTextBox2</c> receives the transcoder's second out value and
/// <c>richTextBox1</c> receives the returned text.
/// </remarks>
private string getcontent(string url)
{
    var transcoder = new NReadability.NReadabilityWebTranscoder();
    bool extracted;
    string detail = ""; // NOTE(review): meaning of this out value is defined by NReadability - confirm
    string page = transcoder.Transcode(url, DomSerializationParams.CreateDefault(), out extracted, out detail);
    if (!extracted)
    {
        return "";
    }

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(page);

    // SelectSingleNode yields null when the XPath matches nothing; treat a
    // missing content div as empty text instead of dereferencing null.
    // (The previous unused title lookup was dropped: it could only throw.)
    var contentNode = doc.DocumentNode.SelectSingleNode("//div[@id='readInner']");
    string text = contentNode != null ? contentNode.InnerText : "";

    richTextBox2.Text = detail;
    richTextBox1.Text = text;
    return text;
}
/// <summary>
/// Transcodes a page with NReadability and packages its title, <c>og:image</c>
/// URL and main text into a <c>CleanText</c>.
/// </summary>
/// <param name="url">Page address; passed to the transcoder and copied to <c>Url</c>.</param>
/// <param name="content">
/// Value handed to the <c>UrlFetcher</c> given to the transcoder
/// (presumably pre-fetched page content - confirm against <c>UrlFetcher</c>).
/// </param>
/// <returns>
/// A populated <c>CleanText</c> on success. When extraction fails or throws, the
/// title is "Content not found"; in the exception case <c>Image</c> carries the
/// exception text.
/// </returns>
private CleanText getCleanText_Old(string url, string content)
{
    var transcoder = new NReadabilityWebTranscoder(new NReadabilityTranscoder(), new UrlFetcher(content));

    try
    {
        bool extracted;
        string html = transcoder.Transcode(url, out extracted);
        if (!extracted)
        {
            return new CleanText { Title = "Content not found", Image = "", Content = "", Url = url, FetchDate = DateTime.Now };
        }

        var parsed = new HtmlAgilityPack.HtmlDocument();
        parsed.LoadHtml(html);
        var root = parsed.DocumentNode;

        // Each piece is optional: a missing node or attribute leaves "".
        string pageTitle = "";
        var titleNode = root.SelectSingleNode("//title");
        if (titleNode != null)
        {
            pageTitle = titleNode.InnerText;
        }

        string imageUrl = "";
        var imageNode = root.SelectSingleNode("//meta[@property='og:image']");
        if (imageNode != null)
        {
            var contentAttribute = imageNode.Attributes["content"];
            if (contentAttribute != null)
            {
                imageUrl = contentAttribute.Value;
            }
        }

        string bodyText = "";
        var bodyNode = root.SelectSingleNode("//div[@id='readInner']");
        if (bodyNode != null)
        {
            bodyText = bodyNode.InnerText;
        }

        return new CleanText { Title = pageTitle, Image = imageUrl, Content = bodyText, Url = url, FetchDate = DateTime.Now };
    }
    catch (Exception ex)
    {
        return new CleanText { Title = "Content not found", Image = ex.ToStringBetter(), Content = "", Url = url, FetchDate = DateTime.Now };
    }
}
/// <summary>
/// Produces an HTML fragment that embeds, or reproduces in readable form, the
/// resource at <paramref name="url"/>.
/// </summary>
/// <param name="url">Address of the resource to embed.</param>
/// <param name="title">
/// Receives a title for the resource. It starts out as <paramref name="url"/> and
/// is replaced when the oEmbed response or the page's title element supplies one.
/// </param>
/// <returns>
/// An iframe for recognised YouTube links, an <c>img</c> tag or the oEmbed
/// <c>html</c> value when the oEmbed response provides one, otherwise the
/// NReadability-transcoded page body; an empty string when none of these succeed.
/// </returns>
/// <remarks>
/// The readable-page stage has three attempts. The later attempts run only when
/// the previous one throws; an attempt that completes without extracting content
/// ends the method with an empty string.
/// </remarks>
private static string embed(string url, out string title)
{
    // Pattern used by the fallback attempts to find the title inside the head.
    const string headTitlePattern = ".*<head>.*<title>(.*)</title>.*</head>.*";

    title = url;
    Console.WriteLine("Embeding " + url);

    if (url.Contains("youtube.com/"))
    {
        Regex idPattern = new Regex(@"^.*((youtu.be\/)|(v\/)|(embed\/)|(watch\?))\??v?=?([^#\&\?]*).*");
        Match idMatch = idPattern.Match(url);
        if (idMatch.Success)
        {
            string vidid = idMatch.Groups[6].Value;
            return " <iframe width=\"560\" height=\"349\" src=\"http://www.youtube.com/embed/" + vidid + "\" frameborder=\"0\" allowfullscreen></iframe> ";
        }
    }

    string embedoutput = "";
    try
    {
        // The target URL is a query-string value, so it has to be escaped;
        // otherwise any '&' or '#' inside it truncates the value sent to the service.
        string embedlyurl = "http://api.embed.ly/1/oembed?url=" + Uri.EscapeDataString(url);
        embedoutput = c.DownloadString(embedlyurl);
    }
    catch (Exception)
    {
        // Deliberate best effort: without an oEmbed answer we fall through
        // to the readable-page stage below.
    }

    Match embtype = new Regex(@"""type"": ""(.*?)""").Match(embedoutput);
    if (embtype.Success)
    {
        Match embtitle = new Regex(@"""title"": ""(.*?)""").Match(embedoutput);
        if (embtitle.Success)
        {
            title = embtitle.Groups[1].Value;
        }

        if (embtype.Groups[1].Value == "photo")
        {
            Match emburl = new Regex(@"""url"": ""(.*?)\"",.*?""width"": (.*?),").Match(embedoutput);
            if (emburl.Success)
            {
                int iwidth = 0;
                int.TryParse(emburl.Groups[2].Value, out iwidth);

                // Images wider than 700 are pinned to a width of 700.
                if (iwidth > 700)
                {
                    return "<img src=\"" + emburl.Groups[1].Value + "\" width=\"700\"/>";
                }
                return "<img src=\"" + emburl.Groups[1].Value + "\"/>";
            }
        }
        else
        {
            Match embhtml = new Regex(@"""html"": ""(.*?)\"",").Match(embedoutput);
            if (embhtml.Success)
            {
                return embhtml.Groups[1].Value;
            }
        }
    }

    try
    {
        // Attempt 1: let NReadability fetch and transcode the page itself.
        bool extracted = false;
        NReadabilityWebTranscoder rdw = new NReadabilityWebTranscoder();
        string transcoded = rdw.Transcode(url, out extracted);
        if (extracted)
        {
            transcoded = StripToBodyWithoutH1(transcoded);
            title = MatchTitleOrDefault(c.DownloadString(url), "<title>(.*?)</title>", title);
            return transcoded;
        }
    }
    catch (Exception)
    {
        try
        {
            // Attempt 2: download the page ourselves, sanitize it, then transcode.
            string realhtml = c.DownloadString(url);
            title = MatchTitleOrDefault(realhtml, headTitlePattern, title);
            realhtml = SanitizeXmlString(realhtml);

            bool extracted = false;
            NReadabilityTranscoder rd = new NReadabilityTranscoder();
            string transcoded = rd.Transcode(realhtml, url, out extracted);
            if (extracted)
            {
                return StripToBodyWithoutH1(transcoded);
            }
        }
        catch (Exception)
        {
            try
            {
                // Attempt 3: as attempt 2, but repair the markup before transcoding.
                string realhtml = SanitizeXmlString(c.DownloadString(url));
                title = MatchTitleOrDefault(realhtml, headTitlePattern, title);

                using (Document doc = Document.FromString(realhtml))
                {
                    doc.ShowWarnings = false;
                    doc.Quiet = true;
                    doc.OutputXhtml = true;
                    doc.CleanAndRepair();
                    realhtml = doc.Save();
                }

                bool extracted = false;
                NReadabilityTranscoder rd = new NReadabilityTranscoder();
                string transcoded = rd.Transcode(realhtml, url, out extracted);
                if (extracted)
                {
                    return StripToBodyWithoutH1(transcoded);
                }
            }
            catch (Exception)
            {
                // Last attempt failed as well; report "nothing to embed" below.
            }
        }
    }

    return "";
}

// Reduces transcoded output to a bare html/body wrapper around the body content (when a
// body element is found) and removes every h1 element.
private static string StripToBodyWithoutH1(string transcoded)
{
    Match bodym = new Regex("<body>(.*?)</body>", RegexOptions.Singleline).Match(transcoded);
    if (bodym.Success)
    {
        transcoded = "<html><body>" + bodym.Groups[1].Value + "</body></html>";
    }

    return new Regex("(<h1>.*?</h1>)", RegexOptions.Singleline).Replace(transcoded, "");
}

// Returns capture group 1 of a case-insensitive match of pattern in html, or fallback
// when there is no match. Group 0 would be the entire match, not the title text.
private static string MatchTitleOrDefault(string html, string pattern, string fallback)
{
    Match match = new Regex(pattern, RegexOptions.IgnoreCase).Match(html);
    return match.Success ? match.Groups[1].Value : fallback;
}
/// <summary>
/// Transcodes the page at <paramref name="url"/> with NReadability and returns its
/// title, <c>og:image</c> URL and main text.
/// </summary>
/// <param name="url">Address of the page to transcode.</param>
/// <returns>
/// A tuple of (title, image URL, main text). When extraction fails the first item
/// is "#FAIL#" and the others are empty; when anything throws - including a
/// missing title or <c>readInner</c> node - the first item is "#FAIL#" and the
/// second carries the exception text.
/// </returns>
private static Tuple<string, string, string> getCleanText(string url)
{
    var transcoder = new NReadabilityWebTranscoder();

    try
    {
        bool extracted;
        string html = transcoder.Transcode(url, out extracted);
        if (!extracted)
        {
            return new Tuple<string, string, string>("#FAIL#", "", "");
        }

        var parsed = new HtmlAgilityPack.HtmlDocument();
        parsed.LoadHtml(html);
        var root = parsed.DocumentNode;

        // Title and main text are required: if either node is absent the
        // dereference throws and the catch below reports the failure.
        string pageTitle = root.SelectSingleNode("//title").InnerText;

        // The image is optional, but when the meta tag exists its content
        // attribute is read unconditionally.
        var imageNode = root.SelectSingleNode("//meta[@property='og:image']");
        string imageUrl = imageNode == null ? "" : imageNode.Attributes["content"].Value;

        string bodyText = root.SelectSingleNode("//div[@id='readInner']").InnerText;

        return new Tuple<string, string, string>(pageTitle, imageUrl, bodyText);
    }
    catch (Exception ex)
    {
        return new Tuple<string, string, string>("#FAIL#", ex.ToStringBetter(), "");
    }
}