/// <summary>
/// Downloads the resource at <paramref name="url"/> into <paramref name="targetDir"/>,
/// skipping the transfer when <c>IsFileNeeded</c> reports the file is not needed.
/// Files whose extension is <c>.html</c> (compared case-insensitively) are fetched as text,
/// run through <c>NReadabilityTranscoder</c>, and the extracted content is written to disk;
/// every other file is saved via <c>DownloadFile</c>.
/// </summary>
/// <param name="url">Address of the resource to download.</param>
/// <param name="targetDir">Directory the file is written into.</param>
/// <param name="targetFname">Desired file name; colons are removed via <c>RemoveColon</c> before use.</param>
public void Download(string url, string targetDir, string targetFname)
{
    string fname = targetFname.RemoveColon();
    string filepath = Path.Combine(targetDir, fname);

    // Ensure the path respects the course's maximum path-part length.
    filepath = Utilities.TrimPathPart(filepath, _futureleanCourse.Max_path_part_len);

    // NOTE(review): these headers are read before this method issues any request for `url`,
    // so they presumably belong to an earlier request made by the caller - confirm.
    WebHeaderCollection responseHeaders = _futureleanCourse._client.ResponseHeaders;
    int contentLength = GetContentLength(responseHeaders);

    if (!IsFileNeeded(filepath, contentLength, fname))
    {
        return;
    }

    // Compare the extension case-insensitively so ".HTML" / ".Html" files are also transcoded.
    bool isHtml = string.Equals(
        Path.GetExtension(filepath),
        ".html",
        System.StringComparison.OrdinalIgnoreCase);

    if (isHtml)
    {
        string content = _futureleanCourse._client.DownloadString(url);

        NReadabilityTranscoder transcoder = new NReadabilityTranscoder();
        TranscodingInput tiInput = new TranscodingInput(content);
        TranscodingResult transcodedContent = transcoder.Transcode(tiInput);

        File.WriteAllText(filepath, transcodedContent.ExtractedContent);
    }
    else
    {
        _futureleanCourse._client.DownloadFile(url, filepath);
    }
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, extracts its content with
/// <c>NReadabilityTranscoder</c>, and returns the inner text of the extracted document's
/// <c>body</c> element. When the extracted document has no <c>body</c> element, the inner
/// text of the whole document is returned instead.
/// </summary>
/// <param name="url">Address of the web page to fetch.</param>
/// <returns>The text content of the extracted page.</returns>
private static String GetWebpageContents(String url)
{
    var nreadabilityTranscoder = new NReadabilityTranscoder();

    using (var wc = new WebClient())
    {
        var rawHtml = wc.DownloadString(url);
        var transcodingInput = new TranscodingInput(rawHtml);
        var extractedHtml = nreadabilityTranscoder.Transcode(transcodingInput).ExtractedContent;

        var pageHtml = new HtmlDocument();
        pageHtml.LoadHtml(extractedHtml);

        // SelectSingleNode yields null when nothing matches the XPath; fall back to the
        // document root so a body-less result does not raise a NullReferenceException.
        var bodyNode = pageHtml.DocumentNode.SelectSingleNode("//body");
        return (bodyNode ?? pageHtml.DocumentNode).InnerText;
    }
}
/// <summary>
/// Extracts the article content of an HTML page and serializes it.
/// </summary>
/// <param name="transcodingInput">Input parameters, including the HTML content to process.</param>
/// <returns>
/// A result carrying the serialized extracted content, the extracted title and the next-page URL,
/// constructed with the content-extracted flag and a flag telling whether the title is non-empty.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="transcodingInput"/> is null.</exception>
public TranscodingResult Transcode(TranscodingInput transcodingInput)
{
    if (transcodingInput == null)
    {
        throw new ArgumentNullException("transcodingInput");
    }

    bool hasContent;
    string title;
    string nextUrl;

    XDocument xmlDocument = TranscodeToXml(
        transcodingInput.HtmlContent,
        transcodingInput.Url,
        out hasContent,
        out title,
        out nextUrl);

    string serialized = _sgmlDomSerializer.SerializeDocument(
        xmlDocument,
        transcodingInput.DomSerializationParams);

    // A title counts as extracted only when it is neither null nor empty.
    bool hasTitle = !string.IsNullOrEmpty(title);

    TranscodingResult result = new TranscodingResult(hasContent, hasTitle);
    result.ExtractedContent = serialized;
    result.ExtractedTitle = title;
    result.NextPageUrl = nextUrl;
    return result;
}
/// <summary>
/// Extracts the article content of an HTML page and serializes it, after pre-cleaning the markup.
/// </summary>
/// <param name="transcodingInput">Input parameters, including the HTML content to process.</param>
/// <returns>
/// A result carrying the serialized extracted content, the extracted title and the next-page URL,
/// constructed with the content-extracted flag and a flag telling whether the title is non-empty.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="transcodingInput"/> is null.</exception>
public TranscodingResult Transcode(TranscodingInput transcodingInput)
{
    if (transcodingInput == null)
    {
        throw new ArgumentNullException("transcodingInput");
    }

    // Before parsing the HTML, strip redundant whitespace and irregular nbsp; / <br> sequences:
    // matches of _BreakBeforeParagraphRegex become "<p", then matches of _KillBreaksRegex
    // become "<br />".
    string cleanedHtml = _KillBreaksRegex.Replace(
        _BreakBeforeParagraphRegex.Replace(transcodingInput.HtmlContent, "<p"),
        "<br />");

    bool hasContent;
    string title;
    string nextUrl;

    XDocument xmlDocument = TranscodeToXml(
        cleanedHtml,
        transcodingInput.Url,
        out hasContent,
        out title,
        out nextUrl,
        transcodingInput.BackupFilePath);

    string serialized = _sgmlDomSerializer.SerializeDocument(
        xmlDocument,
        transcodingInput.DomSerializationParams);

    // A title counts as extracted only when it is neither null nor empty.
    bool hasTitle = !string.IsNullOrEmpty(title);

    TranscodingResult result = new TranscodingResult(hasContent, hasTitle);
    result.ExtractedContent = serialized;
    result.ExtractedTitle = title;
    result.NextPageUrl = nextUrl;
    return result;
}