/// <summary>
/// Saves the resource at <paramref name="url"/> to <paramref name="targetDir"/> under a sanitised
/// version of <paramref name="targetFname"/>, unless <c>IsFileNeeded</c> reports the file is not needed.
/// </summary>
/// <remarks>
/// Targets whose extension is exactly <c>.html</c> are downloaded as text, run through
/// <c>NReadabilityTranscoder</c>, and only the extracted content is written. Every other target is
/// stored unchanged via <c>DownloadFile</c>.
/// </remarks>
/// <param name="url">Address of the resource to download.</param>
/// <param name="targetDir">Directory the file is written to.</param>
/// <param name="targetFname">Desired file name; passed through <c>RemoveColon()</c> first.</param>
public void Download(string url, string targetDir, string targetFname)
{
    string fname = targetFname.RemoveColon();

    // Ensure the path respects mppl (the course's Max_path_part_len setting).
    string filepath = Utilities.TrimPathPart(
        Path.Combine(targetDir, fname),
        _futureleanCourse.Max_path_part_len);

    // The headers are read from the client as it currently stands; this method has not issued
    // a request of its own at this point.
    WebHeaderCollection headers = _futureleanCourse._client.ResponseHeaders;
    int contentLength = GetContentLength(headers);

    if (!IsFileNeeded(filepath, contentLength, fname))
    {
        return;
    }

    if (Path.GetExtension(filepath) != ".html")
    {
        _futureleanCourse._client.DownloadFile(url, filepath);
        return;
    }

    string pageSource = _futureleanCourse._client.DownloadString(url);
    var readability = new NReadabilityTranscoder();
    TranscodingResult readable = readability.Transcode(new TranscodingInput(pageSource));
    File.WriteAllText(filepath, readable.ExtractedContent);
}
/// <summary>
/// Checks that a transformer assigned to <c>ImageSourceTranformer</c> is applied to <c>img</c>
/// elements: the transcoded markup must contain the transformed <c>src</c> value and must keep the
/// original value in an <c>origsrc</c> attribute.
/// </summary>
public void TestImageSourceTransformer()
{
    // arrange
    const string originalSrcValue = "http://example.com/some_image.jpg";
    const string dummyParagraphs = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";

    Func<AttributeTransformationInput, AttributeTransformationResult> imgSrcTransformer = input =>
    {
        var transformation = new AttributeTransformationResult();
        transformation.TransformedValue = string.Format("http://imageresizer.com/u={0}", input.AttributeValue);
        transformation.OriginalValueAttributeName = "origsrc";
        return transformation;
    };

    // Run the transformer once by hand to learn the value the transcoder is expected to emit.
    var probe = new AttributeTransformationInput();
    probe.AttributeValue = originalSrcValue;
    probe.Element = null;
    string expectedSrcValue = imgSrcTransformer(probe).TransformedValue;

    // The image sits between two runs of filler paragraphs.
    string htmlContent =
        "<html><body>"
        + dummyParagraphs
        + "<p><img src=\"" + originalSrcValue + "\" /></p>"
        + dummyParagraphs
        + "</body></html>";

    var transcoder = new NReadabilityTranscoder();
    transcoder.ImageSourceTranformer = imgSrcTransformer;

    // act
    bool mainContentExtracted;
    string transcodedContent = transcoder.Transcode(htmlContent, "http://immortal.pl/", out mainContentExtracted);

    // assert
    Assert.IsTrue(mainContentExtracted);
    Assert.IsTrue(transcodedContent.Contains("src=\"" + expectedSrcValue + "\""));
    Assert.IsTrue(transcodedContent.Contains("origsrc=\"" + originalSrcValue + "\""));
}
/// <summary>
/// Checks that a transformer assigned to <c>AnchorHrefTranformer</c> is applied to <c>a</c>
/// elements: the transcoded markup must contain the transformed <c>href</c> value and must keep the
/// original value in an <c>orighref</c> attribute.
/// </summary>
public void TestAnchorHrefTransformer()
{
    // arrange
    const string originalHrefValue = "http://example.com/some_article.html";
    const string dummyParagraphs = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";

    Func<AttributeTransformationInput, AttributeTransformationResult> anchorHrefTransformer = input =>
    {
        var transformation = new AttributeTransformationResult();
        transformation.TransformedValue = string.Format("http://redirector.com/u={0}", input.AttributeValue);
        transformation.OriginalValueAttributeName = "orighref";
        return transformation;
    };

    // Run the transformer once by hand to learn the value the transcoder is expected to emit.
    var probe = new AttributeTransformationInput();
    probe.AttributeValue = originalHrefValue;
    probe.Element = null;
    string expectedHrefValue = anchorHrefTransformer(probe).TransformedValue;

    // The link sits between two runs of filler paragraphs.
    string htmlContent =
        "<html><body>"
        + dummyParagraphs
        + "<p><a href=\"" + originalHrefValue + "\">Some article</a></p>"
        + dummyParagraphs
        + "</body></html>";

    var transcoder = new NReadabilityTranscoder();
    transcoder.AnchorHrefTranformer = anchorHrefTransformer;

    // act
    bool mainContentExtracted;
    string transcodedContent = transcoder.Transcode(htmlContent, "http://immortal.pl/", out mainContentExtracted);

    // assert
    Assert.IsTrue(mainContentExtracted);
    Assert.IsTrue(transcodedContent.Contains("href=\"" + expectedHrefValue + "\""));
    Assert.IsTrue(transcodedContent.Contains("orighref=\"" + originalHrefValue + "\""));
}
/// <summary>
/// Reads the whole of <paramref name="textStream"/> as text, keeps it in <c>_rawHTML</c>, and hands
/// it to <c>_transcoder</c> together with serialization settings derived from <paramref name="options"/>.
/// </summary>
/// <param name="uri">Supplied to the transcoder as the input URL (via <c>ToString()</c>).</param>
/// <param name="textStream">Stream holding the markup; it is rewound to position 0 before reading.</param>
/// <param name="options">Flags that are mapped onto the DOM serialization parameters.</param>
/// <param name="encoding">Encoding used to decode the stream; UTF-8 is used when <c>null</c>.</param>
/// <returns>Whatever <c>_transcoder.Transcode</c> returns for the assembled input.</returns>
protected TranscodingResult ExtractReadableInformation(
    Uri uri,
    Stream textStream,
    ReadOptions options,
    Encoding encoding = null)
{
    // Decode the stream from its beginning.
    textStream.Position = 0;
    Encoding effectiveEncoding = encoding ?? Encoding.UTF8;
    var reader = new StreamReader(textStream, effectiveEncoding);
    _rawHTML = reader.ReadToEnd();

    // Describe the input document.
    var input = new TranscodingInput(_rawHTML);
    input.Url = uri.ToString();

    // Map the caller's options onto the serializer; several "has X" options are stored inverted.
    var serialization = new DomSerializationParams();
    serialization.BodyOnly = !options.HasHeaderTags;
    serialization.NoHeadline = !options.HasHeadline;
    serialization.PrettyPrint = options.PrettyPrint;
    serialization.DontIncludeContentTypeMetaElement = true;
    serialization.DontIncludeMobileSpecificMetaElements = true;
    serialization.DontIncludeDocTypeMetaElement = false;
    serialization.DontIncludeGeneratorMetaElement = true;
    serialization.ReplaceImagesWithPlaceholders = options.ReplaceImagesWithPlaceholders;
    input.DomSerializationParams = serialization;

    // Transcode the HTML.
    return _transcoder.Transcode(input);
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, reduces it to its readable content with
/// <c>NReadabilityTranscoder</c>, and returns the inner text of the resulting <c>body</c> element.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>The <c>InnerText</c> of the node selected by <c>//body</c> in the extracted markup.</returns>
private static String GetWebpageContents(String url)
{
    var readability = new NReadabilityTranscoder();

    using (var webClient = new WebClient())
    {
        var input = new TranscodingInput(webClient.DownloadString(url));
        var result = readability.Transcode(input);

        // Re-parse the extracted markup so the body text can be read without tags.
        var extractedPage = new HtmlDocument();
        extractedPage.LoadHtml(result.ExtractedContent);

        var body = extractedPage.DocumentNode.SelectSingleNode("//body");
        return body.InnerText;
    }
}
/// <summary>
/// Command-line entry point: transcodes the file named by the first argument and writes the
/// extracted content to the file named by the second argument.
/// </summary>
/// <param name="args">Exactly two values: input file path, then output file path.</param>
private static void Main(string[] args)
{
    bool hasBothPaths = args != null && args.Length == 2;
    if (!hasBothPaths)
    {
        DisplayUsage();
        Environment.Exit(1);
    }

    string inputFile = args[0];
    string outputFile = args[1];

    var transcoder = new NReadabilityTranscoder();
    string sourceHtml = File.ReadAllText(inputFile);
    var result = transcoder.Transcode(new TranscodingInput(sourceHtml));

    File.WriteAllText(outputFile, result.ExtractedContent);
}
/// <summary>
/// Command-line entry point: transcodes the file named by the first argument and writes the
/// transcoded markup to the file named by the second argument.
/// </summary>
/// <param name="args">Exactly two values: input file path, then output file path.</param>
private static void Main(string[] args)
{
    bool hasBothPaths = args != null && args.Length == 2;
    if (!hasBothPaths)
    {
        DisplayUsage();
        Environment.Exit(1);
    }

    string inputFile = args[0];
    string outputFile = args[1];

    var transcoder = new NReadabilityTranscoder();
    string sourceHtml = File.ReadAllText(inputFile);

    // The extraction flag is required by the API but is not inspected here.
    bool mainContentExtracted;
    string transcodedHtml = transcoder.Transcode(sourceHtml, out mainContentExtracted);

    File.WriteAllText(outputFile, transcodedHtml);
}
/// <summary>
/// Downloads the page at <paramref name="q"/>, optionally reduces it to its readable content,
/// injects a <c>&lt;base&gt;</c> element pointing at the source page, and returns the markup.
/// When <paramref name="e"/> is supplied the result is also handed to <c>SendMailAsync</c>.
/// </summary>
/// <param name="q">URL of the page to fetch. A null or empty value yields <c>NotFound()</c>.</param>
/// <param name="e">
/// When non-empty, passed as the first argument to <c>SendMailAsync</c> along with the extracted
/// title, the final content and <paramref name="q"/>.
/// </param>
/// <param name="f">When exactly <c>"y"</c> the downloaded markup is kept as-is; otherwise the extracted content is used.</param>
/// <returns>
/// <c>Ok(content)</c> on success, <c>NotFound()</c> when <paramref name="q"/> is missing, or
/// <c>BadRequest</c> carrying the exception message when anything in the pipeline throws.
/// </returns>
public async Task<IActionResult> Get([FromQuery] string q, [FromQuery] string e, [FromQuery] string f)
{
    var transcoder = new NReadabilityTranscoder();

    if (string.IsNullOrEmpty(q))
    {
        return NotFound();
    }

    try
    {
        // NOTE(review): q comes straight from the query string and is fetched server-side;
        // consider restricting the allowed schemes/hosts.
        string content;
        using (var wc = new WebClient())
        {
            wc.Encoding = Encoding.UTF8;

            // Await the download instead of blocking a request thread inside an async action.
            content = await wc.DownloadStringTaskAsync(q);
        }

        var transcodedContent = transcoder.Transcode(new TranscodingInput(content));
        if (f != "y")
        {
            content = transcodedContent.ExtractedContent;
        }

        // Fix relative path error: give the document a base URL pointing at the original page.
        content = InsertBaseHref(content, q);

        if (!string.IsNullOrEmpty(e))
        {
            await SendMailAsync(e, transcodedContent.ExtractedTitle, content, q);
        }

        return Ok(content);
    }
    catch (Exception ex)
    {
        return BadRequest(ex.Message);
    }
}

/// <summary>
/// Inserts <c>&lt;base href='…' /&gt;</c> right after the first <c>&lt;head</c> opening tag of
/// <paramref name="content"/>. The markup is returned unchanged when no such tag, or no closing
/// <c>&gt;</c> for it, is present.
/// </summary>
private static string InsertBaseHref(string content, string baseUrl)
{
    // Ordinal search: markup is not linguistic text. An index of 0 is a valid match.
    int headStart = content.IndexOf("<head", StringComparison.Ordinal);
    if (headStart < 0)
    {
        return content;
    }

    // Without a closing bracket there is no safe insertion point.
    int headTagEnd = content.IndexOf('>', headStart);
    if (headTagEnd < 0)
    {
        return content;
    }

    // The URL is caller-supplied, so encode it before placing it inside a quoted attribute.
    string baseElement = string.Format("<base href='{0}' />", WebUtility.HtmlEncode(baseUrl));
    return content.Insert(headTagEnd + 1, baseElement);
}
/// <summary>
/// Runs <paramref name="content"/> through <c>NReadabilityTranscoder</c> and packages the title,
/// the <c>og:image</c> URL and the inner HTML of the detected content element into a <c>CleanText</c>.
/// </summary>
/// <param name="url">Stored verbatim in the result's <c>Url</c>.</param>
/// <param name="content">Markup to transcode.</param>
/// <returns>
/// A populated <c>CleanText</c> when content was extracted. Otherwise a record whose <c>Title</c> is
/// <c>"#FAIL#"</c>; when the failure was an exception, its message is stored in <c>Image</c>.
/// </returns>
private static CleanText getCleanText(string url, string content)
{
    var transcoder = new NReadabilityTranscoder();
    try
    {
        TranscodingResult textRes = transcoder.Transcode(new TranscodingInput(content));
        if (!textRes.ContentExtracted)
        {
            return new CleanText { Title = "#FAIL#", Image = "", Content = "", Url = url, FetchDate = DateTime.Now };
        }

        var title = "";
        if (textRes.TitleExtracted)
        {
            title = textRes.ExtractedTitle;
        }
        else
        {
            // FirstOrDefault: a document without a <title> must not abort the whole extraction.
            var titleNode = transcoder.FoundDocument.GetElementsByTagName("title").FirstOrDefault();
            if (titleNode != null)
            {
                title = titleNode.Value;
            }
        }

        // The preview image is optional; pages without an og:image meta simply get an empty URL.
        var imgUrl = "";
        var imgNode = transcoder.FoundDocument
            .GetElementsByTagName("meta")
            .FirstOrDefault(e => e.GetAttributeValue("property", "") == "og:image");
        if (imgNode != null)
        {
            imgUrl = imgNode.GetAttributeValue("content", "");
        }

        var mainText = "";
        if (transcoder.FoundContentElement != null)
        {
            mainText = transcoder.FoundContentElement.GetInnerHtml();
        }

        return new CleanText { Title = title, Image = imgUrl, Content = mainText, Url = url, FetchDate = DateTime.Now };
    }
    catch (Exception ex)
    {
        // Failures are reported in-band: the exception message travels in the Image field.
        return new CleanText { Title = "#FAIL#", Image = ex.Message, Content = "", Url = url, FetchDate = DateTime.Now };
    }
}
/// <summary>
/// Transcodes the sample page <c>SampleInput\SampleInput_NN.html</c> (NN is the zero-padded sample
/// number), writes the result to <c>SampleOutput/SampleOutput_NN.html</c> as UTF-8, and asserts that
/// sample-specific phrases are present in the transcoded markup and that main content was extracted.
/// </summary>
/// <remarks>
/// Sample 7 has a <c>case</c> in the switch but is not listed in the <c>[Values]</c> attribute.
/// An unknown sample number throws <c>NotSupportedException</c> as a reminder to add asserts for it.
/// </remarks>
/// <param name="sampleInputNumber">Number of the sample input file to run.</param>
// TODO: if time, add test case 7 (the sample is already in the repo but needs fixing) public void TestSampleInputs([Values(1, 2, 3, 4, 5, 6, 8, 9)] int sampleInputNumber) { string sampleInputNumberStr = sampleInputNumber.ToString().PadLeft(2, '0'); string content = File.ReadAllText(string.Format(@"SampleInput\SampleInput_{0}.html", sampleInputNumberStr)); bool mainContentExtracted; string transcodedContent = _nReadabilityTranscoder.Transcode(content, out mainContentExtracted); const string outputDir = "SampleOutput"; if (!Directory.Exists(outputDir)) { Directory.CreateDirectory(outputDir); } File.WriteAllText( Path.Combine(outputDir, string.Format("SampleOutput_{0}.html", sampleInputNumberStr)), transcodedContent, Encoding.UTF8); switch (sampleInputNumber) { case 1: // washingtonpost.com - "Court Puts Off Decision On Indefinite Detention" Assert.IsTrue(transcodedContent.Contains("The Supreme Court yesterday vacated a lower")); Assert.IsTrue(transcodedContent.Contains("The justices did not rule on the merits")); Assert.IsTrue(transcodedContent.Contains("But the government said the issues were now")); break; case 2: // devBlogi.pl - "Po co nam testerzy?" 
Assert.IsTrue(transcodedContent.Contains("Moja siostra sprawiła swoim dzieciom szczeniaczka")); Assert.IsTrue(transcodedContent.Contains("Z tresowaniem psów jest tak, że reakcja musi być")); Assert.IsTrue(transcodedContent.Contains("Korzystając z okazji, chcielibyśmy dowiedzieć się")); break; case 3: // codinghorror.com - "Welcome Back Comments" Assert.IsTrue(transcodedContent.Contains("I apologize for the scarcity of updates lately.")); Assert.IsTrue(transcodedContent.Contains("Most of all, I blame myself.")); Assert.IsTrue(transcodedContent.Contains("And, most of all, thanks to")); break; case 4: // sample page; only with paragraphs Assert.IsTrue(transcodedContent.Contains("Lorem ipsum dolor sit amet, consectetur adipiscing elit.")); Assert.IsTrue(transcodedContent.Contains("Mauris nec massa ante, id fringilla nisi.")); Assert.IsTrue(transcodedContent.Contains("Nulla facilisi. Proin lacinia venenatis elit, nec ornare elit varius eu.")); Assert.IsTrue(transcodedContent.Contains("Duis vitae ultricies nibh.")); Assert.IsTrue(transcodedContent.Contains("Vestibulum dictum iaculis nisl, lobortis luctus justo porttitor eu.")); break; case 5: // mnmlist.com - "clear distractions" Assert.IsTrue(transcodedContent.Contains("When it comes to minimalism in")); Assert.IsTrue(transcodedContent.Contains("Here’s how:")); Assert.IsTrue(transcodedContent.Contains("Set limits on your work hours. If your time is limited, you’ll find ways to make the most of that limited time.")); break; case 6: // sample page; nbsp Assert.IsTrue(transcodedContent.Contains("1. 
Item 1.")); // there's a non-breaking space here break; case 7: // http://nplusonemag.com/treasure-island Assert.IsTrue(transcodedContent.Contains("stretched out storylines")); Assert.IsTrue(transcodedContent.Contains("It is no longer a smart social move to brag about not owning a television.")); Assert.IsTrue(transcodedContent.Contains("Of course, some habits can be hard to give up completely.")); break; case 8: // NYTimes leading paragraph Assert.IsTrue(transcodedContent.Contains("freed from house arrest on Saturday, setting her on the path")); Assert.IsTrue(transcodedContent.Contains("confrontation with the generals who had kept her out of the public eye")); Assert.IsTrue(transcodedContent.Contains("Western capitals was one of celebration")); break; case 9: // http://www.udidahan.com/2010/08/31/race-conditions-dont-exist/ - rich sidebar should not be identified as main content Assert.IsTrue(transcodedContent.Contains("Not in the business world anyway.")); Assert.IsTrue(transcodedContent.Contains("we could look at modeling the acceptance")); Assert.IsTrue(transcodedContent.Contains("Keep an eye out.")); break; default: throw new NotSupportedException("Unknown sample input number (" + sampleInputNumber + "). Have you added another sample input? If so, then add appropriate asserts here as well."); } Assert.IsTrue(mainContentExtracted); }
/// <summary>
/// Checks that a transformer assigned to <c>ImageSourceTranformer</c> is applied to <c>img</c>
/// elements: the extracted content must contain the transformed <c>src</c> value and must keep the
/// original value in an <c>origsrc</c> attribute.
/// </summary>
public void TestImageSourceTransformer()
{
    // arrange
    const string originalSrcValue = "http://example.com/some_image.jpg";
    const string dummyParagraphs = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";

    Func<AttributeTransformationInput, AttributeTransformationResult> imgSrcTransformer = input =>
    {
        var transformation = new AttributeTransformationResult();
        transformation.TransformedValue = string.Format("http://imageresizer.com/u={0}", input.AttributeValue);
        transformation.OriginalValueAttributeName = "origsrc";
        return transformation;
    };

    // Run the transformer once by hand to learn the value the transcoder is expected to emit.
    var probe = new AttributeTransformationInput();
    probe.AttributeValue = originalSrcValue;
    probe.Element = null;
    string expectedSrcValue = imgSrcTransformer(probe).TransformedValue;

    // The image sits between two runs of filler paragraphs.
    string htmlContent =
        "<html><body>"
        + dummyParagraphs
        + "<p><img src=\"" + originalSrcValue + "\" /></p>"
        + dummyParagraphs
        + "</body></html>";

    var transcoder = new NReadabilityTranscoder();
    transcoder.ImageSourceTranformer = imgSrcTransformer;

    var transcodingInput = new TranscodingInput(htmlContent);
    transcodingInput.Url = "http://immortal.pl/";

    // act
    TranscodingResult outcome = transcoder.Transcode(transcodingInput);

    // assert
    Assert.IsTrue(outcome.ContentExtracted);
    Assert.IsTrue(outcome.ExtractedContent.Contains("src=\"" + expectedSrcValue + "\""));
    Assert.IsTrue(outcome.ExtractedContent.Contains("origsrc=\"" + originalSrcValue + "\""));
}
/// <summary>
/// Checks that a transformer assigned to <c>AnchorHrefTranformer</c> is applied to <c>a</c>
/// elements: the extracted content must contain the transformed <c>href</c> value and must keep the
/// original value in an <c>orighref</c> attribute.
/// </summary>
public void TestAnchorHrefTransformer()
{
    // arrange
    const string originalHrefValue = "http://example.com/some_article.html";
    const string dummyParagraphs = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";

    Func<AttributeTransformationInput, AttributeTransformationResult> anchorHrefTransformer = input =>
    {
        var transformation = new AttributeTransformationResult();
        transformation.TransformedValue = string.Format("http://redirector.com/u={0}", input.AttributeValue);
        transformation.OriginalValueAttributeName = "orighref";
        return transformation;
    };

    // Run the transformer once by hand to learn the value the transcoder is expected to emit.
    var probe = new AttributeTransformationInput();
    probe.AttributeValue = originalHrefValue;
    probe.Element = null;
    string expectedHrefValue = anchorHrefTransformer(probe).TransformedValue;

    // The link sits between two runs of filler paragraphs.
    string htmlContent =
        "<html><body>"
        + dummyParagraphs
        + "<p><a href=\"" + originalHrefValue + "\">Some article</a></p>"
        + dummyParagraphs
        + "</body></html>";

    var transcoder = new NReadabilityTranscoder();
    transcoder.AnchorHrefTranformer = anchorHrefTransformer;

    var transcodingInput = new TranscodingInput(htmlContent);
    transcodingInput.Url = "http://immortal.pl/";

    // act
    TranscodingResult outcome = transcoder.Transcode(transcodingInput);

    // assert
    Assert.IsTrue(outcome.ContentExtracted);
    Assert.IsTrue(outcome.ExtractedContent.Contains("href=\"" + expectedHrefValue + "\""));
    Assert.IsTrue(outcome.ExtractedContent.Contains("orighref=\"" + originalHrefValue + "\""));
}
/// <summary>
/// Builds a <c>BaiduNews</c> entry for every <c>div.result</c> element on the page, queues each
/// entry's URL as a target request, and publishes the list under the <c>"News"</c> result key.
/// When the page yields no entries, the page content is run through <c>NReadabilityTranscoder</c>
/// and published under <c>"UpdateNews"</c> instead.
/// </summary>
/// <param name="page">The page being processed.</param>
protected override void Handle(Page page)
{
    var resultNodes = page.Selectable.SelectList(Selectors.XPath("//div[@class='result']")).Nodes();
    var newsItems = new List<BaiduNews>();

    // Comma-join the request's extra values into a single keyword string.
    var keyword = page.Request.Extras.Aggregate(
        "",
        (joined, extra) => string.IsNullOrEmpty(joined) ? extra.Value : $"{joined},{extra.Value}");

    foreach (var resultNode in resultNodes)
    {
        var rawTitle = resultNode.Select(Selectors.XPath("h3[@class='c-title']/a")).GetValue();
        var title = rawTitle.Replace("<em>", "").Replace("</em>", "");
        var url = resultNode.Select(Selectors.XPath("h3[@class='c-title']/a/@href")).GetValue();
        var author = resultNode.Select(Selectors.XPath(".//div/p[@class='c-author']/text()")).GetValue();

        var time = string.Empty;
        try
        {
            // NOTE(review): the +12 offset is a magic number tied to the author line's layout - confirm.
            var timeStart = author.IndexOf(" ", StringComparison.Ordinal) + 12;
            time = author.Substring(timeStart);
        }
        catch (Exception e)
        {
            // Log for diagnosis, then let the failure propagate.
            Console.WriteLine(e);
            throw;
        }

        var news = new BaiduNews
        {
            Keyword = keyword,
            Title = title,
            Time = time,
            Url = url
        };
        page.AddTargetRequest(url, increaseDeep: false);
        newsItems.Add(news);
    }

    page.AddResultItem("News", newsItems);

    if (newsItems.Any())
    {
        return;
    }

    // No result entries were found: extract the page's readable text instead.
    var transcoder = new NReadabilityTranscoder();
    var input = new TranscodingInput(page.Content);
    var text = "";
    try
    {
        var extraction = transcoder.Transcode(input);
        var extractedDocument = new HtmlDocument { OptionAutoCloseOnEnd = true };
        extractedDocument.LoadHtml(extraction.ExtractedContent);
        var contentNode = extractedDocument.DocumentNode.SelectSingleNode("//div/div/div/div");
        text = contentNode.InnerText.Trim('\r', '\n', ' ');
    }
    catch (Exception e)
    {
        // Best effort: on failure the item is still published, with empty text.
        Console.WriteLine(e);
    }

    page.AddResultItem("UpdateNews", new UpdateNews
    {
        Html = page.Content,
        Text = text,
        Url = page.Url
    });
}
/// <summary>
/// Fetches <paramref name="url"/>, decodes the body as UTF-8 or GBK, extracts the readable text with
/// <c>NReadabilityTranscoder</c>, and stores the markup and text on the matching
/// <c>[dbo].[BaiduNews]</c> row, then increments <c>NewsCount</c> on the joined
/// <c>[dbo].[Monitor]</c> rows.
/// </summary>
/// <remarks>
/// Returns without doing anything when the response status is not 200 OK or the decoded body is
/// empty. Exceptions raised during extraction or the database updates are written to
/// <paramref name="context"/> and not rethrown.
/// </remarks>
/// <param name="url">Address to download; also the key used in both SQL statements.</param>
/// <param name="context">Job context that receives the extracted text and any error output.</param>
public async Task Dowload(string url, PerformContext context)
{
    // NOTE(review): a new HttpClient per call is generally discouraged; a shared instance would
    // need a field outside this method.
    using (var client = new HttpClient())
    {
        client.DefaultRequestHeaders.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36");

        byte[] bytes;
        using (var response = await client.GetAsync(url))
        {
            if (response.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            // Read the complete body. A single Stream.ReadAsync call is not guaranteed to fill
            // the buffer, and Stream.Length is not supported by every stream.
            bytes = await response.Content.ReadAsByteArrayAsync();
        }

        var isUTF8 = IsTextUTF8(ref bytes);

        // GBK is only available through the code-pages provider.
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
        Encoding encoding = isUTF8 ? Encoding.UTF8 : Encoding.GetEncoding("GBK");

        var html = encoding.GetString(bytes);
        if (string.IsNullOrEmpty(html))
        {
            return;
        }

        var transcoder = new NReadabilityTranscoder();
        var input = new TranscodingInput(html);
        try
        {
            // NOTE(review): the built document is not used afterwards; the call is kept so any
            // exception it raises still ends up in the catch below - confirm whether it is needed.
            SgmlDomBuilder builder = new SgmlDomBuilder();
            builder.BuildDocument(html);

            var result = transcoder.Transcode(input);
            var document = new HtmlDocument { OptionAutoCloseOnEnd = true };
            document.LoadHtml(result.ExtractedContent);

            // When the expected nesting is absent the node is null; the resulting exception is
            // reported by the catch below and no database update happens.
            var node = document.DocumentNode.SelectSingleNode("//div/div/div/div");
            var text = node.InnerText.Trim('\r', '\n', ' ', '\t');

            // The message below reads "Extracted content:".
            context.WriteLine("抽取内容为:");
            context.WriteLine(text);

            const string cmdText = @"UPDATE [dbo].[BaiduNews] SET [Html]=@Html,[Text]=@Text WHERE [Url]=@Url";
            await _connection.ExecuteAsync(cmdText, new { Html = html, Text = text, Url = url });
            await _connection.ExecuteAsync(
                @"UPDATE a SET a.[NewsCount]=a.[NewsCount]+1 FROM [dbo].[Monitor] a JOIN [dbo].[BaiduNews] b ON a.[Tag]=b.[Keyword] WHERE b.[Url]=@Url",
                new { Url = url });
        }
        catch (Exception e)
        {
            context.WriteLine(e);
        }
    }
}