/// <summary>
 ///  Creates a new NReadabilityWebTranscoder from caller-supplied collaborators:
 ///  a pre-built NReadabilityTranscoder, an IUrlFetcher and a page separator factory.
 /// </summary>
 /// <param name="transcoder">The NReadabilityTranscoder to store on this instance.</param>
 /// <param name="urlFetcher">The IUrlFetcher instance to download content.</param>
 /// <param name="pageSeparatorBuilder">A function that creates a HTML fragment for page separator. It takes the page number as an argument.</param>
 public NReadabilityWebTranscoder(NReadabilityTranscoder transcoder, IUrlFetcher urlFetcher, Func<int, string> pageSeparatorBuilder)
 {
     // The arguments are stored as given; no validation happens here.
     _pageSeparatorBuilder = pageSeparatorBuilder;
     _urlFetcher = urlFetcher;
     _transcoder = transcoder;

     // The DOM serializer is always created internally rather than injected.
     _sgmlDomSerializer = new SgmlDomSerializer();
 }
 /// <summary>
 ///  Creates a new NReadabilityWebTranscoder from a pre-built NReadabilityTranscoder
 ///  and a custom IUrlFetcher, using the default page separator builder.
 ///  This overload is mostly used for testing.
 /// </summary>
 /// <param name="transcoder">The NReadabilityTranscoder to store on this instance.</param>
 /// <param name="urlFetcher">The IUrlFetcher instance to download content.</param>
 public NReadabilityWebTranscoder(NReadabilityTranscoder transcoder, IUrlFetcher urlFetcher)
 {
     // No separator builder is supplied by the caller, so the default one is used.
     _pageSeparatorBuilder = _DefaultPageSeparatorBuilder;
     _urlFetcher = urlFetcher;
     _transcoder = transcoder;

     // The DOM serializer is always created internally rather than injected.
     _sgmlDomSerializer = new SgmlDomSerializer();
 }
        /// <summary>
        /// Saves the resource at <paramref name="url"/> under <paramref name="targetDir"/>.
        /// Targets whose path ends in ".html" are downloaded as text, run through
        /// NReadabilityTranscoder and the extracted content is written to disk; every
        /// other target is downloaded to the file unchanged.
        /// </summary>
        /// <param name="url">Address of the resource to download.</param>
        /// <param name="targetDir">Directory the file is written into.</param>
        /// <param name="targetFname">Desired file name; RemoveColon() is applied before use.</param>
        public void Download(string url, string targetDir, string targetFname)
        {
            string fname = targetFname.RemoveColon();

            // Build the destination path and trim it so it respects the course's maximum path part length.
            string filepath = Utilities.TrimPathPart(
                Path.Combine(targetDir, fname),
                _futureleanCourse.Max_path_part_len);

            // NOTE(review): the headers are read before this method issues any request,
            // so they presumably describe an earlier request made with the same client - confirm.
            WebHeaderCollection headers = _futureleanCourse._client.ResponseHeaders;
            int expectedLength = GetContentLength(headers);

            if (!IsFileNeeded(filepath, expectedLength, fname))
            {
                return;
            }

            if (Path.GetExtension(filepath) != ".html")
            {
                _futureleanCourse._client.DownloadFile(url, filepath);
                return;
            }

            // HTML pages are reduced to their extracted content before being saved.
            string rawHtml = _futureleanCourse._client.DownloadString(url);
            TranscodingResult extracted = new NReadabilityTranscoder().Transcode(new TranscodingInput(rawHtml));
            File.WriteAllText(filepath, extracted.ExtractedContent);
        }
Exemplo n.º 4
0
 /// <summary>
 /// Reads the HTML file at <paramref name="filepath"/> and returns the result of
 /// running it through NReadabilityTranscoder.
 /// </summary>
 /// <param name="filepath">Path of the HTML file to read.</param>
 /// <returns>The string returned by NReadabilityTranscoder.Transcode for the file's text.</returns>
 public string GetArticle(string filepath)
 {
     string html;

     // 'using' guarantees the stream and reader are released even when reading throws.
     using (FileStream file = File.OpenRead(filepath))
     using (StreamReader reader = new StreamReader(file))
     {
         html = reader.ReadToEnd();
     }

     // The extraction flag is required by the API but is not reported to the caller.
     bool success;
     NReadabilityTranscoder transcoder = new NReadabilityTranscoder();
     return transcoder.Transcode(html, out success);
 }
Exemplo n.º 5
0
 /// <summary>
 /// Downloads the page at <paramref name="url"/>, extracts its content with
 /// NReadabilityTranscoder and returns the inner text of the extracted document's body element.
 /// </summary>
 /// <param name="url">Address of the page to download.</param>
 /// <returns>The InnerText of the "//body" node of the extracted HTML.</returns>
 private static String GetWebpageContents(String url)
 {
     NReadabilityTranscoder transcoder = new NReadabilityTranscoder();

     using (WebClient client = new WebClient())
     {
         String downloaded = client.DownloadString(url);
         TranscodingResult transcoded = transcoder.Transcode(new TranscodingInput(downloaded));

         // Re-parse the extracted markup so the body text can be pulled out without tags.
         HtmlDocument document = new HtmlDocument();
         document.LoadHtml(transcoded.ExtractedContent);

         var body = document.DocumentNode.SelectSingleNode("//body");
         return body.InnerText;
     }
 }
Exemplo n.º 6
0
        /// <summary>
        /// Fetches the HTML at <paramref name="url"/>, transcodes it with NReadabilityTranscoder,
        /// loads the result into a non-displayed browser and returns the resulting document title.
        /// </summary>
        /// <param name="url">Address of the page whose title is wanted.</param>
        /// <returns>
        /// The document title, or an empty string when transcoding or navigation fails.
        /// Exceptions thrown while fetching the HTML source still propagate to the caller.
        /// </returns>
        public static string getTitle(string url)
        {
            // Required by the Transcode overload; the flag itself is not used.
            bool mainContentExtracted;

            // Result
            string title = "";

            // Transcoder
            NReadabilityTranscoder nReadabilityTranscoder = new NReadabilityTranscoder();
            // Parser
            HtmlParser hp = new HtmlParser();

            // Virtual browser. 'using' disposes it on every exit path, including the case
            // where fetching the HTML below throws before the try block is entered.
            using (NonDispBrowser nb = new NonDispBrowser())
            {
                // Fetch the HTML
                string source = hp.getHtmlSource(url);

                try
                {
                    nb.NavigateAndWaitFromSource(hp.getHtmlPlainTextFromSourceWB(nReadabilityTranscoder.Transcode(source, out mainContentExtracted)));
                    title = nb.Document.Title;
                }
                catch
                {
                    // Deliberate best effort: any failure leaves the title empty.
                }
            }

            // Return the result
            return title;
        }
Exemplo n.º 7
0
        /// <summary>
        /// Returns plain text for the page at <paramref name="url"/>. The transcoded page is
        /// loaded into a non-displayed browser and its body text, with the document title
        /// removed, is returned. When that text is empty the raw HTML source is stripped with
        /// the HtmlParser regular-expression helpers instead.
        /// </summary>
        /// <param name="url">Address of the page to convert.</param>
        /// <returns>
        /// The extracted text, or an empty string when transcoding, navigation or the
        /// fallback stripping throws. Exceptions thrown while fetching the HTML source
        /// still propagate to the caller.
        /// </returns>
        public static string transeForJapa(string url)
        {
            // Required by the Transcode overload; the flag itself is not used.
            bool mainContentExtracted;

            // Result
            string result = "";

            // Transcoder
            NReadabilityTranscoder nReadabilityTranscoder = new NReadabilityTranscoder();
            // Parser
            HtmlParser hp = new HtmlParser();

            // Virtual browser
            using (NonDispBrowser nb = new NonDispBrowser())
            {
                // Fetch the HTML
                string source = hp.getHtmlSource(url);

                try
                {
                    // First try to take the body from the summarised (transcoded) data
                    nb.NavigateAndWaitFromSource(hp.getHtmlPlainTextFromSourceWB(nReadabilityTranscoder.Transcode(source, out mainContentExtracted)));
                    string title = nb.Document.Title;
                    string bodyText = nb.Document.Body.InnerText ?? "";

                    // string.Replace throws ArgumentException for an empty search string, which
                    // previously sent untitled pages into the empty catch and skipped the
                    // fallback below. Only strip the title when there is one.
                    result = string.IsNullOrEmpty(title) ? bodyText : bodyText.Replace(title, "");

                    if (result != "") { return result; }

                    // Nothing usable came out of the browser: strip tags from the raw source instead.
                    result = HtmlParser.htmlGomiRegularRemove(HtmlParser.htmlTagRegularRemove(source));
                }
                catch
                {
                    // Deliberate best effort: on failure whatever is in 'result' is returned.
                }
            }

            // Return the result
            return result;
        }
Exemplo n.º 8
0
        /// <summary>
        /// Transcodes <paramref name="content"/> with NReadabilityTranscoder and packages the
        /// title, the og:image URL and the main content HTML into a CleanText.
        /// </summary>
        /// <param name="url">URL the content came from; copied into the result unchanged.</param>
        /// <param name="content">Raw HTML to transcode.</param>
        /// <returns>
        /// A populated CleanText when content was extracted. Otherwise a CleanText titled
        /// "Content not found"; when an exception occurred its message is placed in Image.
        /// </returns>
        private CleanText getCleanText(string url, string content)
        {
            var transcoder = new NReadabilityTranscoder();
            try
            {
                TranscodingResult textRes = transcoder.Transcode(new TranscodingInput(content));

                if (!textRes.ContentExtracted)
                {
                    return new CleanText { Title = "Content not found", Image = "", Content = "", Url = url, FetchDate = DateTime.Now };
                }

                var title = "";
                if (textRes.TitleExtracted)
                {
                    title = textRes.ExtractedTitle;
                }
                else
                {
                    // FirstOrDefault, not First: a document without a <title> element must yield
                    // null here rather than throw and discard the extracted content.
                    var titleNode = transcoder.FoundDocument.GetElementsByTagName("title").FirstOrDefault();
                    if (titleNode != null)
                    {
                        title = titleNode.Value;
                    }
                }

                // Same reasoning: most pages have no og:image meta tag, and that is not an error.
                var imgUrl = "";
                var imgNode = transcoder.FoundDocument.GetElementsByTagName("meta")
                    .FirstOrDefault(e => e.GetAttributeValue("property", "") == "og:image");
                if (imgNode != null)
                {
                    imgUrl = imgNode.GetAttributeValue("content", "");
                }

                var mainText = "";
                if (transcoder.FoundContentElement != null)
                {
                    mainText = transcoder.FoundContentElement.GetInnerHtml();
                }

                return new CleanText { Title = title, Image = imgUrl, Content = mainText, Url = url, FetchDate = DateTime.Now };
            }
            catch (Exception ex)
            {
                // The exception message is surfaced through the Image field, as callers already expect.
                return new CleanText { Title = "Content not found", Image = ex.Message, Content = "", Url = url, FetchDate = DateTime.Now };
            }
        }
        /// <summary>
        /// Checks that a transformer assigned to ImageSourceTranformer rewrites the src of an
        /// img element in the transcoded output and that the original value is kept in the
        /// attribute named by OriginalValueAttributeName.
        /// </summary>
        public void TestImageSourceTransformer()
        {
            const string sourceUrl = "http://example.com/some_image.jpg";
            const string filler = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";

            // Transformer under test: routes the image through a resizer and records the original src.
            Func<AttributeTransformationInput, AttributeTransformationResult> transformer = input =>
            {
                var transformed = new AttributeTransformationResult();
                transformed.TransformedValue = string.Format("http://imageresizer.com/u={0}", input.AttributeValue);
                transformed.OriginalValueAttributeName = "origsrc";
                return transformed;
            };

            // The expected value is derived by running the transformer directly on the original src.
            var probe = new AttributeTransformationInput { AttributeValue = sourceUrl, Element = null };
            string rewrittenUrl = transformer(probe).TransformedValue;

            // The image is surrounded by filler paragraphs to give the extractor enough text.
            string page = "<html><body>" + filler + "<p><img src=\"" + sourceUrl + "\" /></p>" + filler + "</body></html>";

            var transcoder = new NReadabilityTranscoder();
            transcoder.ImageSourceTranformer = transformer;

            bool extracted;
            string output = transcoder.Transcode(page, "http://immortal.pl/", out extracted);

            Assert.IsTrue(extracted);
            Assert.IsTrue(output.Contains("src=\"" + rewrittenUrl + "\""));
            Assert.IsTrue(output.Contains("origsrc=\"" + sourceUrl + "\""));
        }
        /// <summary>
        /// Checks that a transformer assigned to AnchorHrefTranformer rewrites the href of an
        /// anchor in the transcoded output and that the original value is kept in the
        /// attribute named by OriginalValueAttributeName.
        /// </summary>
        public void TestAnchorHrefTransformer()
        {
            const string sourceHref = "http://example.com/some_article.html";
            const string filler = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";

            // Transformer under test: routes the link through a redirector and records the original href.
            Func<AttributeTransformationInput, AttributeTransformationResult> transformer = input =>
            {
                var transformed = new AttributeTransformationResult();
                transformed.TransformedValue = string.Format("http://redirector.com/u={0}", input.AttributeValue);
                transformed.OriginalValueAttributeName = "orighref";
                return transformed;
            };

            // The expected value is derived by running the transformer directly on the original href.
            var probe = new AttributeTransformationInput { AttributeValue = sourceHref, Element = null };
            string rewrittenHref = transformer(probe).TransformedValue;

            // The anchor is surrounded by filler paragraphs to give the extractor enough text.
            string page = "<html><body>" + filler + "<p><a href=\"" + sourceHref + "\">Some article</a></p>" + filler + "</body></html>";

            var transcoder = new NReadabilityTranscoder();
            transcoder.AnchorHrefTranformer = transformer;

            bool extracted;
            string output = transcoder.Transcode(page, "http://immortal.pl/", out extracted);

            Assert.IsTrue(extracted);
            Assert.IsTrue(output.Contains("href=\"" + rewrittenHref + "\""));
            Assert.IsTrue(output.Contains("orighref=\"" + sourceHref + "\""));
        }
 /// <summary>
 /// Replaces the shared transcoder field with a freshly constructed NReadabilityTranscoder.
 /// NOTE(review): presumably a per-test fixture setup method whose attribute is not
 /// visible in this excerpt - confirm.
 /// </summary>
 public void SetUp()
 {
     _nReadabilityTranscoder = new NReadabilityTranscoder();
 }
Exemplo n.º 12
0
        /// <summary>
        /// Produces an HTML fragment for <paramref name="url"/>, trying in order:
        /// a YouTube iframe, the embed.ly oEmbed response (photo or html), and finally
        /// readability extraction of the page itself with two progressively more
        /// defensive fallbacks (sanitised HTML, then sanitised and tidied HTML).
        /// </summary>
        /// <param name="url">Address of the resource to embed.</param>
        /// <param name="title">
        /// Receives a title for the resource. Starts as <paramref name="url"/> and is replaced
        /// by the embed.ly title or the page's title element when one is found.
        /// </param>
        /// <returns>The HTML fragment, or an empty string when nothing could be produced.</returns>
        private static string embed(string url, out string title)
        {
            title = url;
            Console.WriteLine("Embeding " + url);

            // 1. YouTube links are turned directly into an iframe player.
            if (url.Contains("youtube.com/"))
            {
                Regex s = new Regex(@"^.*((youtu.be\/)|(v\/)|(embed\/)|(watch\?))\??v?=?([^#\&\?]*).*");
                Match ss = s.Match(url);
                if (ss.Success)
                {
                    string vidid = ss.Groups[6].Value;
                    return " <iframe width=\"560\" height=\"349\" src=\"http://www.youtube.com/embed/" + vidid + "\" frameborder=\"0\" allowfullscreen></iframe> ";
                }
            }

            // 2. Ask embed.ly. The target URL is a query-string value, so it has to be escaped;
            //    otherwise any '&' or '#' in it truncates the value seen by the service.
            string embedlyurl = "http://api.embed.ly/1/oembed?url=" + Uri.EscapeDataString(url);
            string embedoutput = "";
            try
            {
                embedoutput = c.DownloadString(embedlyurl);
            }
            catch (Exception)
            {
                // Best effort: with an empty response none of the patterns below match
                // and control falls through to the readability path.
            }

            Regex html = new Regex(@"""html"": ""(.*?)\"",");
            Regex urlre = new Regex(@"""url"": ""(.*?)\"",.*?""width"": (.*?),");
            Regex type = new Regex(@"""type"": ""(.*?)""");
            Regex titles = new Regex(@"""title"": ""(.*?)""");
            Match embtype = type.Match(embedoutput);
            if (embtype.Success)
            {
                Match embtitle = titles.Match(embedoutput);
                if (embtitle.Success)
                {
                    title = embtitle.Groups[1].Value;
                }

                if (embtype.Groups[1].Value == "photo")
                {
                    Match emburl = urlre.Match(embedoutput);
                    if (emburl.Success)
                    {
                        // Images wider than 700 are capped at 700; an unparsable width counts as 0.
                        int iwidth = 0;
                        int.TryParse(emburl.Groups[2].Value, out iwidth);
                        if (iwidth > 700)
                        {
                            return "<img src=\"" + emburl.Groups[1].Value + "\" width=\"700\"/>";
                        }
                        return "<img src=\"" + emburl.Groups[1].Value + "\"/>";
                    }
                }
                else
                {
                    Match embhtml = html.Match(embedoutput);
                    if (embhtml.Success)
                    {
                        return embhtml.Groups[1].Value;
                    }
                }
            }

            // 3. Readability extraction. Each fallback runs only when the previous attempt threw.
            try
            {
                NReadabilityWebTranscoder rdw = new NReadabilityWebTranscoder();
                bool extracted = false;
                string transcoded = rdw.Transcode(url, out extracted);
                if (extracted)
                {
                    transcoded = embedStripToBody(transcoded);

                    // The page is downloaded once more only to read its title element.
                    string realhtml = c.DownloadString(url);
                    Regex regexs = new Regex("<title>(.*?)</title>",
                        RegexOptions.IgnoreCase);
                    Match match = regexs.Match(realhtml);
                    if (match.Success)
                    {
                        title = match.Groups[1].Value;
                    }
                    return transcoded;
                }
            }
            catch (Exception)
            {
                try
                {
                    // Fallback A: download the page ourselves and sanitise it before transcoding.
                    string realhtml = c.DownloadString(url);
                    title = embedTitleFromHead(realhtml, title);
                    realhtml = SanitizeXmlString(realhtml);

                    string transcoded = embedTranscodeHtml(realhtml, url);
                    if (transcoded != null)
                    {
                        return transcoded;
                    }
                }
                catch (Exception)
                {
                    try
                    {
                        // Fallback B: additionally clean and repair the markup before transcoding.
                        string realhtml = SanitizeXmlString(c.DownloadString(url));
                        title = embedTitleFromHead(realhtml, title);
                        using (Document doc = Document.FromString(realhtml))
                        {
                            doc.ShowWarnings = false;
                            doc.Quiet = true;
                            doc.OutputXhtml = true;
                            doc.CleanAndRepair();
                            realhtml = doc.Save();
                        }

                        string transcoded = embedTranscodeHtml(realhtml, url);
                        if (transcoded != null)
                        {
                            return transcoded;
                        }
                    }
                    catch (Exception)
                    {
                        // Every strategy failed; fall through to the empty result.
                    }
                }
            }
            return "";
        }

        /// <summary>
        /// Returns the text captured inside the title element of <paramref name="html"/>,
        /// or <paramref name="fallback"/> when the head/title pattern does not match.
        /// </summary>
        private static string embedTitleFromHead(string html, string fallback)
        {
            Regex regexs = new Regex(".*<head>.*<title>(.*)</title>.*</head>.*",
                RegexOptions.IgnoreCase);
            Match match = regexs.Match(html);

            // Group 1 is the captured title text; group 0 would be the entire matched span.
            return match.Success ? match.Groups[1].Value : fallback;
        }

        /// <summary>
        /// Transcodes <paramref name="html"/> with NReadabilityTranscoder and returns the
        /// stripped result, or null when no main content was extracted.
        /// </summary>
        private static string embedTranscodeHtml(string html, string url)
        {
            bool extracted = false;
            NReadabilityTranscoder rd = new NReadabilityTranscoder();
            string transcoded = rd.Transcode(html, url, out extracted);
            return extracted ? embedStripToBody(transcoded) : null;
        }

        /// <summary>
        /// Reduces transcoded output to a bare html/body wrapper around the body content
        /// (when a body element is found) and removes every h1 element.
        /// </summary>
        private static string embedStripToBody(string transcoded)
        {
            Regex body = new Regex("<body>(.*?)</body>", RegexOptions.Singleline);
            Match bodym = body.Match(transcoded);
            if (bodym.Success)
            {
                transcoded = "<html><body>" + bodym.Groups[1].Value + "</body></html>";
            }
            Regex header = new Regex("(<h1>.*?</h1>)", RegexOptions.Singleline);
            return header.Replace(transcoded, "");
        }
 /// <summary>
 /// Initializes a new instance of NReadabilityWebTranscoder.
 /// Allows passing in a custom-constructed NReadabilityTranscoder; a new UrlFetcher
 /// is created and forwarded, together with the transcoder, to the two-argument constructor.
 /// </summary>
 /// <param name="transcoder">A NReadabilityTranscoder.</param>
 public NReadabilityWebTranscoder(NReadabilityTranscoder transcoder)
     : this(transcoder, new UrlFetcher())
 {
 }
 /// <summary>
 ///  Initializes a new instance of NReadabilityWebTranscoder.
 ///  Allows passing in a custom-constructed NReadabilityTranscoder
 ///  and a custom IUrlFetcher. Both are forwarded to the three-argument constructor
 ///  together with the default page separator builder.
 /// </summary>
 /// <param name="transcoder">A NReadabilityTranscoder.</param>
 /// <param name="urlFetcher">IUrlFetcher instance to download content.</param>
 public NReadabilityWebTranscoder(NReadabilityTranscoder transcoder, IUrlFetcher urlFetcher)
     : this(transcoder, urlFetcher, _DefaultPageSeparatorBuilder)
 {
 }
        /// <summary>
        /// Reacts to a change of the URL property: downloads the page to a local file,
        /// transcodes it with NReadabilityTranscoder into a ".xaml" file next to the download
        /// (skipped when that file already exists) and shows the resulting FlowDocument in
        /// the container. Progress and failures are reported through ReasonPhrase.
        /// </summary>
        /// <param name="e">Change arguments; NewValue is cast to the new URL string.</param>
        private async void OnUrlChangedImp(DependencyPropertyChangedEventArgs e)
        {
            var u = (string)e.NewValue;
            if (string.IsNullOrEmpty(u))
                return;
            UiInvoke(() => ReasonPhrase = Properties.Resources.DownloadinPrompt);
            var fp = await HttpDownloadToLocalFile.DownloadAsync(u, "html", ".htm","text/html", 1*1024* 1024);
            if(string.IsNullOrEmpty(fp) || !File.Exists(fp))
            {
                UiInvoke(() => ReasonPhrase = Properties.Resources.DownloadFailedPrompt);
                return;
            }
            var fdocn = fp + ".xaml";
            if(!File.Exists(fdocn))
            {
                // The charset is read from a ".enc" file next to the download. A missing file
                // is treated like an empty one so that detection below takes over.
                var encFile = fp + ".enc";
                var charset = File.Exists(encFile) ? File.ReadAllText(encFile) : null;
                if(string.IsNullOrEmpty(charset))
                {
                    charset = DetectEncoding(fp);
                }
                // "zh-cn" is not recognised by Encoding.GetEncoding
                if("zh-cn".Equals(charset,StringComparison.InvariantCultureIgnoreCase))
                {
                    charset = "gb2312";
                }

                Encoding enc;
                try
                {
                    enc = string.IsNullOrEmpty(charset) ? Encoding.GetEncoding(936) : Encoding.GetEncoding(charset);
                }
                catch (ArgumentException)
                {
                    // Unknown charset name: use the same default as for an empty charset rather
                    // than letting the exception escape this async void method.
                    enc = Encoding.GetEncoding(936);
                }

                UiInvoke(()=>ReasonPhrase = Properties.Resources.FormattingPrompt);
                var result = new NReadabilityTranscoder().Transcode(new TranscodingInput(File.ReadAllText(fp,enc))
                {
                    Url = u,
                    BackupFilePath = fp,
                });
                if(result.ContentExtracted)
                {
                    File.WriteAllText(fdocn,result.ExtractedContent);
                    UiInvoke(()=>ReasonPhrase = Properties.Resources.ReadyPrompt);
                }else
                {
                    UiInvoke(()=>ReasonPhrase = Properties.Resources.ConvertingFailedPrompt);
                    return;
                }

            }
            UiInvoke(() =>
            {
                FlowDocument fdoc;
                // Dispose the file stream once the document has been loaded.
                using (var xamlStream = File.OpenRead(fdocn))
                {
                    fdoc = (FlowDocument)XamlReader.Load(xamlStream);
                }
                if (fdoc == null)
                {
                    ReasonPhrase = Properties.Resources.ConvertingFailedPrompt;
                    return;
                }
                _container.Children.Clear();
                var fv = new FlowDocumentScrollViewer { Document = fdoc };
                _container.Children.Add(fv);
            });

        }