/// <summary>
/// Fetches <paramref name="url"/> into a file under <paramref name="targetDir"/>, but only when
/// <c>IsFileNeeded</c> says the file is needed. Files whose path ends in <c>.html</c> are
/// passed through <c>NReadabilityTranscoder</c> and only the extracted content is written.
/// </summary>
/// <param name="url">Address of the resource to download.</param>
/// <param name="targetDir">Directory the file is placed in.</param>
/// <param name="targetFname">Requested file name; <c>RemoveColon</c> is applied before it is used.</param>
public void Download(string url, string targetDir, string targetFname)
        {
            string fname = targetFname.RemoveColon();

            // Combine directory and name, then trim the result using the course's Max_path_part_len.
            string filepath = Utilities.TrimPathPart(
                Path.Combine(targetDir, fname),
                _futureleanCourse.Max_path_part_len);

            // NOTE(review): the headers are read before this method issues any request of its own,
            // so they presumably come from an earlier response on the shared client - confirm.
            WebHeaderCollection headers = _futureleanCourse._client.ResponseHeaders;
            int contentLength = GetContentLength(headers);

            if (!IsFileNeeded(filepath, contentLength, fname))
            {
                return;
            }

            // NOTE(review): the extension check is case-sensitive (".HTML" takes the raw-download path).
            if (Path.GetExtension(filepath) != ".html")
            {
                _futureleanCourse._client.DownloadFile(url, filepath);
                return;
            }

            // HTML page: download the markup, transcode it, and save only the extracted content.
            string rawHtml = _futureleanCourse._client.DownloadString(url);
            TranscodingResult result = new NReadabilityTranscoder().Transcode(new TranscodingInput(rawHtml));
            File.WriteAllText(filepath, result.ExtractedContent);
        }
Example #2
0
 /// <summary>
 /// Downloads the page at <paramref name="url"/>, extracts its content with
 /// <c>NReadabilityTranscoder</c> and returns that content as text.
 /// </summary>
 /// <param name="url">Address of the page to download.</param>
 /// <returns>
 /// The inner text of the extracted document's <c>body</c> element, or the inner text of the
 /// whole extracted document when no <c>body</c> element is present.
 /// </returns>
 private static String GetWebpageContents(String url)
 {
     var nreadabilityTranscoder = new NReadabilityTranscoder();
     using (var wc = new WebClient())
     {
         var rawHtml = wc.DownloadString(url);
         var transcodingInput = new TranscodingInput(rawHtml);
         var extractedHtml = nreadabilityTranscoder.Transcode(transcodingInput).ExtractedContent;

         var pageHtml = new HtmlDocument();
         pageHtml.LoadHtml(extractedHtml);

         // SelectSingleNode yields null when the XPath matches nothing; fall back to the
         // document root instead of dereferencing a null node.
         var bodyNode = pageHtml.DocumentNode.SelectSingleNode("//body");
         if (bodyNode == null)
         {
             return pageHtml.DocumentNode.InnerText;
         }

         return bodyNode.InnerText;
     }
 }
        /// <summary>
        /// Runs article extraction over the HTML carried by <paramref name="transcodingInput"/>.
        /// </summary>
        /// <param name="transcodingInput">Input parameters, including the HTML content to process.</param>
        /// <returns>The transcoding result, holding the serialized extracted content, the title and the next-page URL.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="transcodingInput"/> is null.</exception>
        public TranscodingResult Transcode(TranscodingInput transcodingInput)
        {
            if (transcodingInput == null)
            {
                throw new ArgumentNullException("transcodingInput");
            }

            bool hasContent;
            string title;
            string nextUrl;

            // Extract the article into an XML document; the flags and strings come back via out parameters.
            XDocument extractedDocument = TranscodeToXml(
                transcodingInput.HtmlContent,
                transcodingInput.Url,
                out hasContent,
                out title,
                out nextUrl);

            // Serialize the extracted document using the caller-supplied serialization parameters.
            string serialized = _sgmlDomSerializer.SerializeDocument(
                extractedDocument,
                transcodingInput.DomSerializationParams);

            // The title counts as extracted only when it is neither null nor empty.
            TranscodingResult result = new TranscodingResult(hasContent, !string.IsNullOrEmpty(title));
            result.ExtractedContent = serialized;
            result.ExtractedTitle = title;
            result.NextPageUrl = nextUrl;
            return result;
        }
        /// <summary>
        /// Pre-cleans the HTML carried by <paramref name="transcodingInput"/> and then runs article extraction on it.
        /// </summary>
        /// <param name="transcodingInput">Input parameters, including the HTML content to process.</param>
        /// <returns>The transcoding result, holding the serialized extracted content, the title and the next-page URL.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="transcodingInput"/> is null.</exception>
        public TranscodingResult Transcode(TranscodingInput transcodingInput)
        {
            if (transcodingInput == null)
            {
                throw new ArgumentNullException("transcodingInput");
            }

            // Before parsing the HTML, strip redundant whitespace and irregular nbsp; / <br> sequences:
            // matches of _BreakBeforeParagraphRegex become "<p", then matches of _KillBreaksRegex become "<br />".
            string cleanedHtml = _KillBreaksRegex.Replace(
                _BreakBeforeParagraphRegex.Replace(transcodingInput.HtmlContent, "<p"),
                "<br />");

            bool hasContent;
            string title;
            string nextUrl;

            // Extract the article into an XML document; the backup file path is forwarded as the last argument.
            XDocument extractedDocument = TranscodeToXml(
                cleanedHtml,
                transcodingInput.Url,
                out hasContent,
                out title,
                out nextUrl,
                transcodingInput.BackupFilePath);

            // Serialize the extracted document using the caller-supplied serialization parameters.
            string serialized = _sgmlDomSerializer.SerializeDocument(
                extractedDocument,
                transcodingInput.DomSerializationParams);

            // The title counts as extracted only when it is neither null nor empty.
            TranscodingResult result = new TranscodingResult(hasContent, !string.IsNullOrEmpty(title));
            result.ExtractedContent = serialized;
            result.ExtractedTitle = title;
            result.NextPageUrl = nextUrl;
            return result;
        }