/// <summary>
/// Creates an NReadabilityWebTranscoder from explicitly supplied collaborators:
/// the transcoder, the URL fetcher and the page separator factory.
/// A new SgmlDomSerializer is created for the instance.
/// </summary>
/// <param name="transcoder">The NReadabilityTranscoder stored by this instance.</param>
/// <param name="urlFetcher">The IUrlFetcher stored by this instance for downloading content.</param>
/// <param name="pageSeparatorBuilder">Function producing the HTML fragment used as a page separator; it receives the page number.</param>
public NReadabilityWebTranscoder(NReadabilityTranscoder transcoder, IUrlFetcher urlFetcher, Func<int, string> pageSeparatorBuilder)
{
    // Internally created helper first, then the caller-supplied collaborators.
    this._sgmlDomSerializer = new SgmlDomSerializer();

    this._transcoder = transcoder;
    this._urlFetcher = urlFetcher;
    this._pageSeparatorBuilder = pageSeparatorBuilder;
}
/// <summary>
/// Creates an NReadabilityWebTranscoder from a caller-supplied transcoder and URL fetcher,
/// using _DefaultPageSeparatorBuilder for page separators.
/// This overload is mostly used for testing.
/// </summary>
/// <param name="transcoder">The NReadabilityTranscoder stored by this instance.</param>
/// <param name="urlFetcher">The IUrlFetcher stored by this instance for downloading content.</param>
public NReadabilityWebTranscoder(NReadabilityTranscoder transcoder, IUrlFetcher urlFetcher)
{
    // Internally chosen pieces first, then the caller-supplied collaborators.
    this._sgmlDomSerializer = new SgmlDomSerializer();
    this._pageSeparatorBuilder = _DefaultPageSeparatorBuilder;

    this._transcoder = transcoder;
    this._urlFetcher = urlFetcher;
}
/// <summary>
/// Saves the resource at <paramref name="url"/> under <paramref name="targetDir"/>,
/// unless IsFileNeeded reports that the file is not needed.
/// Targets whose extension is ".html" are downloaded as text, run through
/// NReadabilityTranscoder, and only the extracted content is written;
/// every other target is saved with DownloadFile.
/// </summary>
/// <param name="url">Address of the resource to download.</param>
/// <param name="targetDir">Directory the file is written to.</param>
/// <param name="targetFname">Desired file name; RemoveColon is applied to it before use.</param>
public void Download(string url, string targetDir, string targetFname)
{
    string fname = targetFname.RemoveColon();

    // Trim the combined path according to the course's Max_path_part_len.
    string filepath = Utilities.TrimPathPart(
        Path.Combine(targetDir, fname),
        _futureleanCourse.Max_path_part_len);

    // The length is taken from the client's current response headers.
    int contentLength = GetContentLength(_futureleanCourse._client.ResponseHeaders);

    if (!IsFileNeeded(filepath, contentLength, fname))
    {
        return;
    }

    if (Path.GetExtension(filepath) != ".html")
    {
        _futureleanCourse._client.DownloadFile(url, filepath);
        return;
    }

    // HTML pages: keep only the content extracted by the transcoder.
    string rawHtml = _futureleanCourse._client.DownloadString(url);
    TranscodingResult extraction = new NReadabilityTranscoder().Transcode(new TranscodingInput(rawHtml));
    File.WriteAllText(filepath, extraction.ExtractedContent);
}
/// <summary>
/// Reads the file at <paramref name="filepath"/> and returns the result of
/// running its text through NReadabilityTranscoder.Transcode.
/// </summary>
/// <param name="filepath">Path of the HTML file to read.</param>
/// <returns>The string returned by Transcode for the file's content.</returns>
public string GetArticle(string filepath)
{
    string html;

    // using blocks release the file handle even when reading throws;
    // the previous manual Close() calls were skipped on exceptions.
    using (FileStream file = File.OpenRead(filepath))
    using (StreamReader reader = new StreamReader(file))
    {
        html = reader.ReadToEnd();
    }

    // Transcode requires the out argument; as before, the flag is not inspected.
    bool success;
    NReadabilityTranscoder util = new NReadabilityTranscoder();
    return util.Transcode(html, out success);
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, extracts its content with
/// NReadabilityTranscoder and returns the inner text of the extracted
/// document's body element.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>InnerText of the node selected by "//body" in the extracted HTML.</returns>
private static String GetWebpageContents(String url)
{
    var transcoder = new NReadabilityTranscoder();

    using (var client = new WebClient())
    {
        var input = new TranscodingInput(client.DownloadString(url));
        var readableHtml = transcoder.Transcode(input).ExtractedContent;

        // Parse the extracted markup so that only the text of the body is returned.
        var document = new HtmlDocument();
        document.LoadHtml(readableHtml);

        var body = document.DocumentNode.SelectSingleNode("//body");
        return body.InnerText;
    }
}
/// <summary>
/// Fetches the HTML for <paramref name="url"/>, runs it through
/// NReadabilityTranscoder, loads the result into a NonDispBrowser and
/// returns the title of the loaded document.
/// </summary>
/// <param name="url">Address of the page whose title is wanted.</param>
/// <returns>
/// The document title, or an empty string when transcoding or navigation fails.
/// Exceptions thrown while fetching the HTML source are not caught here.
/// </returns>
public static string getTitle(string url)
{
    bool mainContentExtracted;

    // Result
    string title = "";

    // Transcoder
    NReadabilityTranscoder nReadabilityTranscoder = new NReadabilityTranscoder();

    // Parser
    HtmlParser hp = new HtmlParser();

    // Virtual browser. The using block disposes it on every path, including
    // when getHtmlSource throws (that call used to sit outside the try/finally,
    // so the browser was never disposed in that case).
    using (NonDispBrowser nb = new NonDispBrowser())
    {
        // Fetch the HTML
        string source = hp.getHtmlSource(url);

        try
        {
            nb.NavigateAndWaitFromSource(hp.getHtmlPlainTextFromSourceWB(nReadabilityTranscoder.Transcode(source, out mainContentExtracted)));
            title = nb.Document.Title;
        }
        catch
        {
            // Best effort: any failure here leaves the title empty.
        }
    }

    // Return the result
    return title;
}
/// <summary>
/// Returns plain text for the page at <paramref name="url"/>.
/// The page is first run through NReadabilityTranscoder and loaded into a
/// NonDispBrowser; the body text with the document title removed is returned
/// when it is non-empty. Otherwise the raw source is cleaned with
/// HtmlParser.htmlTagRegularRemove and HtmlParser.htmlGomiRegularRemove.
/// </summary>
/// <param name="url">Address of the page to convert.</param>
/// <returns>
/// The extracted text, or an empty string when transcoding or navigation throws.
/// Exceptions thrown while fetching the HTML source are not caught here.
/// </returns>
public static string transeForJapa(string url)
{
    bool mainContentExtracted;

    // Result
    string result = "";

    // Transcoder
    NReadabilityTranscoder nReadabilityTranscoder = new NReadabilityTranscoder();

    // Parser
    HtmlParser hp = new HtmlParser();

    // Virtual browser
    using (NonDispBrowser nb = new NonDispBrowser())
    {
        // Fetch the HTML
        string source = hp.getHtmlSource(url);

        try
        {
            // First try to take the body from the summarized (transcoded) document
            nb.NavigateAndWaitFromSource(hp.getHtmlPlainTextFromSourceWB(nReadabilityTranscoder.Transcode(source, out mainContentExtracted)));

            string title = nb.Document.Title;
            string bodyText = nb.Document.Body.InnerText ?? "";

            // String.Replace throws when the value to replace is null or empty.
            // That exception used to be swallowed below, so every page without
            // a title produced an empty result. Only strip a non-empty title.
            result = string.IsNullOrEmpty(title) ? bodyText : bodyText.Replace(title, "");

            if (result != "")
            {
                return result;
            }

            // Nothing usable in the transcoded body: fall back to stripping the raw source
            result = HtmlParser.htmlGomiRegularRemove(HtmlParser.htmlTagRegularRemove(source));
        }
        catch
        {
            // Best effort: on failure return whatever has been collected so far.
        }
    }

    // Return the result
    return result;
}
/// <summary>
/// Runs <paramref name="content"/> through NReadabilityTranscoder and packs the
/// title, the og:image URL and the inner HTML of the found content element
/// into a CleanText.
/// </summary>
/// <param name="url">Source address; copied to the Url of the result.</param>
/// <param name="content">HTML to transcode.</param>
/// <returns>
/// A populated CleanText when content was extracted. Otherwise a CleanText
/// titled "Content not found"; when an exception occurred, its Image field
/// carries the exception message.
/// </returns>
private CleanText getCleanText(string url, string content)
{
    var transcoder = new NReadabilityTranscoder();

    try
    {
        TranscodingResult textRes = transcoder.Transcode(new TranscodingInput(content));

        if (!textRes.ContentExtracted)
        {
            return new CleanText
            {
                Title = "Content not found",
                Image = "",
                Content = "",
                Url = url,
                FetchDate = DateTime.Now
            };
        }

        var title = "";
        if (textRes.TitleExtracted)
        {
            title = textRes.ExtractedTitle;
        }
        else
        {
            // FirstOrDefault, not First: a document without a <title> element must
            // not throw, otherwise the null check below can never be reached.
            var titleNode = transcoder.FoundDocument.GetElementsByTagName("title").FirstOrDefault();
            if (titleNode != null)
            {
                title = titleNode.Value;
            }
        }

        // The og:image meta tag is optional. With First() a page without it threw
        // and the whole extraction was reported as "Content not found".
        var imgUrl = "";
        var imgNode = transcoder.FoundDocument
            .GetElementsByTagName("meta")
            .FirstOrDefault(e => e.GetAttributeValue("property", "") == "og:image");
        if (imgNode != null)
        {
            imgUrl = imgNode.GetAttributeValue("content", "");
        }

        var mainText = "";
        if (transcoder.FoundContentElement != null)
        {
            mainText = transcoder.FoundContentElement.GetInnerHtml();
        }

        return new CleanText
        {
            Title = title,
            Image = imgUrl,
            Content = mainText,
            Url = url,
            FetchDate = DateTime.Now
        };
    }
    catch (Exception ex)
    {
        // Existing contract: the failure reason is reported through the Image field.
        return new CleanText
        {
            Title = "Content not found",
            Image = ex.Message,
            Content = "",
            Url = url,
            FetchDate = DateTime.Now
        };
    }
}
/// <summary>
/// Checks that a transformer assigned to ImageSourceTranformer rewrites the src
/// attribute of an image and that the original value is kept in the attribute
/// named by OriginalValueAttributeName.
/// </summary>
public void TestImageSourceTransformer()
{
    const string originalSrc = "http://example.com/some_image.jpg";
    const string fillerParagraphs = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";

    Func<AttributeTransformationInput, AttributeTransformationResult> transformer = input =>
    {
        return new AttributeTransformationResult
        {
            TransformedValue = string.Format("http://imageresizer.com/u={0}", input.AttributeValue),
            OriginalValueAttributeName = "origsrc",
        };
    };

    // The expected value comes from the same transformer that the transcoder will call.
    var probe = new AttributeTransformationInput { AttributeValue = originalSrc, Element = null };
    var expectedSrc = transformer(probe).TransformedValue;

    // The image sits between two runs of filler paragraphs.
    var html = string.Concat(
        "<html><body>",
        fillerParagraphs,
        "<p><img src=\"", originalSrc, "\" /></p>",
        fillerParagraphs,
        "</body></html>");

    var transcoder = new NReadabilityTranscoder();
    transcoder.ImageSourceTranformer = transformer;

    bool extracted;
    var output = transcoder.Transcode(html, "http://immortal.pl/", out extracted);

    Assert.IsTrue(extracted);
    Assert.IsTrue(output.Contains("src=\"" + expectedSrc + "\""));
    Assert.IsTrue(output.Contains("origsrc=\"" + originalSrc + "\""));
}
/// <summary>
/// Checks that a transformer assigned to AnchorHrefTranformer rewrites the href
/// attribute of a link and that the original value is kept in the attribute
/// named by OriginalValueAttributeName.
/// </summary>
public void TestAnchorHrefTransformer()
{
    const string originalHref = "http://example.com/some_article.html";
    const string fillerParagraphs = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";

    Func<AttributeTransformationInput, AttributeTransformationResult> transformer = input =>
    {
        return new AttributeTransformationResult
        {
            TransformedValue = string.Format("http://redirector.com/u={0}", input.AttributeValue),
            OriginalValueAttributeName = "orighref",
        };
    };

    // The expected value comes from the same transformer that the transcoder will call.
    var probe = new AttributeTransformationInput { AttributeValue = originalHref, Element = null };
    var expectedHref = transformer(probe).TransformedValue;

    // The link sits between two runs of filler paragraphs.
    var html = string.Concat(
        "<html><body>",
        fillerParagraphs,
        "<p><a href=\"", originalHref, "\">Some article</a></p>",
        fillerParagraphs,
        "</body></html>");

    var transcoder = new NReadabilityTranscoder();
    transcoder.AnchorHrefTranformer = transformer;

    bool extracted;
    var output = transcoder.Transcode(html, "http://immortal.pl/", out extracted);

    Assert.IsTrue(extracted);
    Assert.IsTrue(output.Contains("href=\"" + expectedHref + "\""));
    Assert.IsTrue(output.Contains("orighref=\"" + originalHref + "\""));
}
/// <summary>
/// Replaces _nReadabilityTranscoder with a newly constructed NReadabilityTranscoder.
/// </summary>
public void SetUp()
{
    var freshTranscoder = new NReadabilityTranscoder();
    _nReadabilityTranscoder = freshTranscoder;
}
/// <summary>
/// Builds an HTML snippet that embeds the resource at <paramref name="url"/>.
/// The sources are tried in this order:
/// 1. a YouTube iframe when the URL contains "youtube.com/" and a video id can be parsed;
/// 2. the embed.ly oEmbed response (an img tag for photos, the "html" field otherwise);
/// 3. NReadabilityWebTranscoder on the URL;
/// 4. if step 3 throws, NReadabilityTranscoder on the downloaded, sanitized page;
/// 5. if step 4 throws, the same after the page has been cleaned by Tidy.
/// </summary>
/// <param name="url">Address of the resource to embed.</param>
/// <param name="title">
/// Receives the title found in the oEmbed response or in the page's title element;
/// stays equal to <paramref name="url"/> when none is found.
/// </param>
/// <returns>The HTML snippet, or an empty string when nothing could be produced.</returns>
private static string embed(string url, out string title)
{
    title = url;
    Console.WriteLine("Embeding " + url);

    if (url.Contains("youtube.com/"))
    {
        Match video = Regex.Match(url, @"^.*((youtu.be\/)|(v\/)|(embed\/)|(watch\?))\??v?=?([^#\&\?]*).*");
        if (video.Success)
        {
            // Group 6 is the video id.
            return " <iframe width=\"560\" height=\"349\" src=\"http://www.youtube.com/embed/" + video.Groups[6].Value + "\" frameborder=\"0\" allowfullscreen></iframe> ";
        }
    }

    // The target URL is a query-string value and must be escaped; unescaped, any
    // '&' or '#' in it would be read as part of the embed.ly request itself.
    string embedlyurl = "http://api.embed.ly/1/oembed?url=" + Uri.EscapeDataString(url);
    string embedoutput = "";
    try
    {
        embedoutput = c.DownloadString(embedlyurl);
    }
    catch (Exception)
    {
        // Best effort: without an oEmbed answer the readability fallbacks below are used.
    }

    Match embtype = Regex.Match(embedoutput, @"""type"": ""(.*?)""");
    if (embtype.Success)
    {
        Match embtitle = Regex.Match(embedoutput, @"""title"": ""(.*?)""");
        if (embtitle.Success)
        {
            title = embtitle.Groups[1].Value;
        }

        if (embtype.Groups[1].Value == "photo")
        {
            Match emburl = Regex.Match(embedoutput, @"""url"": ""(.*?)\"",.*?""width"": (.*?),");
            if (emburl.Success)
            {
                // TryParse leaves 0 when the width is not a number, which selects the unscaled form.
                int iwidth;
                int.TryParse(emburl.Groups[2].Value, out iwidth);

                // Images wider than 700 are emitted with an explicit width of 700.
                if (iwidth > 700)
                {
                    return "<img src=\"" + emburl.Groups[1].Value + "\" width=\"700\"/>";
                }
                return "<img src=\"" + emburl.Groups[1].Value + "\"/>";
            }
        }
        else
        {
            Match embhtml = Regex.Match(embedoutput, @"""html"": ""(.*?)\"",");
            if (embhtml.Success)
            {
                return embhtml.Groups[1].Value;
            }
        }
    }

    // Readability fallbacks: the next strategy runs only when the previous one threw.
    // A strategy that completes without extracting content ends the search with "".
    try
    {
        return embedFromWebTranscoder(url, ref title);
    }
    catch (Exception)
    {
    }

    try
    {
        return embedFromSanitizedPage(url, ref title);
    }
    catch (Exception)
    {
    }

    try
    {
        return embedFromTidiedPage(url, ref title);
    }
    catch (Exception)
    {
    }

    return "";
}

// Transcodes the URL with NReadabilityWebTranscoder and, on success, takes the title from the page's title element.
private static string embedFromWebTranscoder(string url, ref string title)
{
    bool extracted;
    string transcoded = new NReadabilityWebTranscoder().Transcode(url, out extracted);
    if (!extracted)
    {
        return "";
    }

    transcoded = embedCleanTranscoded(transcoded);

    string pageTitle = embedFindTitle(c.DownloadString(url), "<title>(.*?)</title>");
    if (pageTitle != null)
    {
        title = pageTitle;
    }
    return transcoded;
}

// Downloads the page, passes it through SanitizeXmlString and transcodes it with NReadabilityTranscoder.
private static string embedFromSanitizedPage(string url, ref string title)
{
    string realhtml = c.DownloadString(url);
    title = embedFindTitle(realhtml, ".*<head>.*<title>(.*)</title>.*</head>.*") ?? title;

    realhtml = SanitizeXmlString(realhtml);

    bool extracted;
    string transcoded = new NReadabilityTranscoder().Transcode(realhtml, url, out extracted);
    return extracted ? embedCleanTranscoded(transcoded) : "";
}

// Same as embedFromSanitizedPage, but the sanitized page is additionally repaired by Tidy before transcoding.
private static string embedFromTidiedPage(string url, ref string title)
{
    string realhtml = SanitizeXmlString(c.DownloadString(url));
    title = embedFindTitle(realhtml, ".*<head>.*<title>(.*)</title>.*</head>.*") ?? title;

    using (Document doc = Document.FromString(realhtml))
    {
        doc.ShowWarnings = false;
        doc.Quiet = true;
        doc.OutputXhtml = true;
        doc.CleanAndRepair();
        realhtml = doc.Save();
    }

    bool extracted;
    string transcoded = new NReadabilityTranscoder().Transcode(realhtml, url, out extracted);
    return extracted ? embedCleanTranscoded(transcoded) : "";
}

// Reduces transcoder output to a bare html/body wrapper around the body content and removes every h1 element.
private static string embedCleanTranscoded(string transcoded)
{
    Match bodym = Regex.Match(transcoded, "<body>(.*?)</body>", RegexOptions.Singleline);
    if (bodym.Success)
    {
        transcoded = "<html><body>" + bodym.Groups[1].Value + "</body></html>";
    }
    return Regex.Replace(transcoded, "(<h1>.*?</h1>)", "", RegexOptions.Singleline);
}

// Returns capture group 1 of the pattern (the title text) or null when the pattern does not match.
// Group 1 is used on purpose: group 0 is the whole match, which the fallback paths
// previously assigned to the title by mistake.
private static string embedFindTitle(string html, string pattern)
{
    Match match = Regex.Match(html, pattern, RegexOptions.IgnoreCase);
    return match.Success ? match.Groups[1].Value : null;
}
/// <summary>
/// Creates an NReadabilityWebTranscoder around a caller-supplied
/// NReadabilityTranscoder, pairing it with a newly created UrlFetcher.
/// </summary>
/// <param name="transcoder">The NReadabilityTranscoder to use.</param>
public NReadabilityWebTranscoder(NReadabilityTranscoder transcoder)
    : this(transcoder: transcoder, urlFetcher: new UrlFetcher())
{
    // All initialization happens in the constructor this one delegates to.
}
/// <summary>
/// Creates an NReadabilityWebTranscoder from a caller-supplied
/// NReadabilityTranscoder and IUrlFetcher, using _DefaultPageSeparatorBuilder
/// for page separators.
/// </summary>
/// <param name="transcoder">The NReadabilityTranscoder to use.</param>
/// <param name="urlFetcher">The IUrlFetcher used to download content.</param>
public NReadabilityWebTranscoder(NReadabilityTranscoder transcoder, IUrlFetcher urlFetcher)
    : this(transcoder: transcoder, urlFetcher: urlFetcher, pageSeparatorBuilder: _DefaultPageSeparatorBuilder)
{
    // All initialization happens in the constructor this one delegates to.
}
/// <summary>
/// Reacts to a new URL value: downloads the page to a local file, converts it
/// with NReadabilityTranscoder into a ".xaml" file next to the download (unless
/// that file already exists), and shows the resulting FlowDocument in _container.
/// Progress and failures are reported through ReasonPhrase.
/// </summary>
/// <param name="e">Change notification whose NewValue is the URL string.</param>
private async void OnUrlChangedImp(DependencyPropertyChangedEventArgs e)
{
    var u = (string)e.NewValue;
    if (string.IsNullOrEmpty(u))
    {
        return;
    }

    UiInvoke(() => ReasonPhrase = Properties.Resources.DownloadinPrompt);
    var fp = await HttpDownloadToLocalFile.DownloadAsync(u, "html", ".htm", "text/html", 1 * 1024 * 1024);
    if (string.IsNullOrEmpty(fp) || !File.Exists(fp))
    {
        UiInvoke(() => ReasonPhrase = Properties.Resources.DownloadFailedPrompt);
        return;
    }

    // The converted document is stored next to the download and reused when present.
    var fdocn = fp + ".xaml";
    if (!File.Exists(fdocn))
    {
        // The charset label is read from the ".enc" file next to the download.
        var charset = File.ReadAllText(fp + ".enc");
        if (string.IsNullOrEmpty(charset))
        {
            charset = DetectEncoding(fp);
        }

        // "zh-cn" is not recognized by Encoding.GetEncoding
        if ("zh-cn".Equals(charset, StringComparison.InvariantCultureIgnoreCase))
        {
            charset = "gb2312";
        }

        // Code page 936 is the default when no charset is known. It is also used when
        // the label is not a recognized encoding name: GetEncoding throws
        // ArgumentException for those, and an exception escaping an async void method
        // cannot be handled by the caller.
        Encoding enc;
        try
        {
            enc = string.IsNullOrEmpty(charset) ? Encoding.GetEncoding(936) : Encoding.GetEncoding(charset);
        }
        catch (ArgumentException)
        {
            enc = Encoding.GetEncoding(936);
        }

        UiInvoke(() => ReasonPhrase = Properties.Resources.FormattingPrompt);
        var result = new NReadabilityTranscoder().Transcode(new TranscodingInput(File.ReadAllText(fp, enc))
        {
            Url = u,
            BackupFilePath = fp,
        });

        if (result.ContentExtracted)
        {
            File.WriteAllText(fdocn, result.ExtractedContent);
            UiInvoke(() => ReasonPhrase = Properties.Resources.ReadyPrompt);
        }
        else
        {
            UiInvoke(() => ReasonPhrase = Properties.Resources.ConvertingFailedPrompt);
            return;
        }
    }

    UiInvoke(() =>
    {
        FlowDocument fdoc;

        // Dispose the stream once the document is loaded; it was previously left
        // open, which kept a handle on the .xaml file.
        using (var xamlStream = File.OpenRead(fdocn))
        {
            fdoc = (FlowDocument)XamlReader.Load(xamlStream);
        }

        if (fdoc == null)
        {
            ReasonPhrase = Properties.Resources.ConvertingFailedPrompt;
            return;
        }

        _container.Children.Clear();
        var fv = new FlowDocumentScrollViewer { Document = fdoc };
        _container.Children.Add(fv);
    });
}