/// <summary>
/// Downloads <paramref name="url"/> into <paramref name="targetDir"/> under a
/// name derived from <paramref name="targetFname"/>, unless
/// <c>IsFileNeeded</c> reports that the file is not needed.
/// Targets with the ".html" extension are passed through NReadability and only
/// the extracted content is written; everything else is downloaded as-is.
/// </summary>
/// <param name="url">Address of the resource to fetch.</param>
/// <param name="targetDir">Directory the file is written to.</param>
/// <param name="targetFname">Desired file name; colons are removed first.</param>
public void Download(string url, string targetDir, string targetFname)
{
    string sanitizedName = targetFname.RemoveColon();

    // Trim the combined path so it respects the course's Max_path_part_len.
    string destination = Utilities.TrimPathPart(
        Path.Combine(targetDir, sanitizedName),
        _futureleanCourse.Max_path_part_len);

    // The content length is taken from the headers currently held by the shared client.
    WebHeaderCollection headers = _futureleanCourse._client.ResponseHeaders;
    int expectedLength = GetContentLength(headers);

    if (!IsFileNeeded(destination, expectedLength, sanitizedName))
    {
        return;
    }

    if (Path.GetExtension(destination) != ".html")
    {
        // Non-HTML resources are stored untouched.
        _futureleanCourse._client.DownloadFile(url, destination);
        return;
    }

    // HTML pages: keep only the readable part of the document.
    string rawHtml = _futureleanCourse._client.DownloadString(url);
    var readabilityTranscoder = new NReadabilityTranscoder();
    TranscodingResult readable = readabilityTranscoder.Transcode(new TranscodingInput(rawHtml));
    File.WriteAllText(destination, readable.ExtractedContent);
}
/// <summary>
/// Reads the whole of <paramref name="textStream"/> as text, stores it in
/// <c>_rawHTML</c> and transcodes it into its readable form.
/// </summary>
/// <param name="uri">The URI of the page; passed to the transcoder as the input URL.</param>
/// <param name="textStream">The stream holding the raw HTML. It is rewound to position 0 before reading.</param>
/// <param name="options">The options controlling how the resulting DOM is serialized.</param>
/// <param name="encoding">The encoding used to decode the stream; UTF-8 when <c>null</c>.</param>
/// <returns>The result produced by the transcoder for the decoded HTML.</returns>
protected TranscodingResult ExtractReadableInformation(
    Uri uri,
    Stream textStream,
    ReadOptions options,
    Encoding encoding = null)
{
    // Decode the complete response stream from its beginning.
    textStream.Position = 0;
    Encoding effectiveEncoding = encoding ?? Encoding.UTF8;
    var reader = new StreamReader(textStream, effectiveEncoding);
    _rawHTML = reader.ReadToEnd();

    // Describe the input and how the output DOM should be serialized.
    var input = new TranscodingInput(_rawHTML);
    input.Url = uri.ToString();

    var serialization = new DomSerializationParams();
    serialization.BodyOnly = !options.HasHeaderTags;
    serialization.NoHeadline = !options.HasHeadline;
    serialization.PrettyPrint = options.PrettyPrint;
    serialization.DontIncludeContentTypeMetaElement = true;
    serialization.DontIncludeMobileSpecificMetaElements = true;
    serialization.DontIncludeDocTypeMetaElement = false;
    serialization.DontIncludeGeneratorMetaElement = true;
    serialization.ReplaceImagesWithPlaceholders = options.ReplaceImagesWithPlaceholders;
    input.DomSerializationParams = serialization;

    // Run the actual transcoding.
    TranscodingResult transcoded = _transcoder.Transcode(input);
    return transcoded;
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, extracts its readable content
/// with NReadability and returns the inner text of the resulting body element.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>The inner text of the extracted document's body node.</returns>
private static String GetWebpageContents(String url)
{
    var transcoder = new NReadabilityTranscoder();

    using (var webClient = new WebClient())
    {
        string downloadedHtml = webClient.DownloadString(url);

        // Reduce the page to its readable part.
        TranscodingResult transcoded = transcoder.Transcode(new TranscodingInput(downloadedHtml));
        string readableHtml = transcoded.ExtractedContent;

        // Parse the extracted markup so that only the body text is returned.
        var document = new HtmlDocument();
        document.LoadHtml(readableHtml);
        var bodyNode = document.DocumentNode.SelectSingleNode("//body");
        return bodyNode.InnerText;
    }
}
/// <summary>
/// Checks that line breaks, tabs and surrounding whitespace inside the
/// title element are cleaned up in the extracted title.
/// </summary>
public void Transcode_cleans_up_title_after_extracting_it()
{
    // arrange
    const string rawTitle = "Гостиница\n- \r Ги \t де \n\n \r Мопассан \r\n";
    const string cleanedTitle = "Гостиница - Ги де Мопассан";
    const string page = "<html><head><title>" + rawTitle + "</title></head><body></body></html>";

    // act
    TranscodingResult outcome = _nReadabilityTranscoder.Transcode(new TranscodingInput(page));

    // assert
    Assert.IsTrue(outcome.TitleExtracted);
    Assert.AreEqual(cleanedTitle, outcome.ExtractedTitle);
}
/// <summary>
/// Checks that the title is taken from the title element of the head,
/// including non-ASCII characters.
/// </summary>
public void Transcode_can_extract_title_from_header()
{
    // arrange
    const string headTitle = "Some title ąęłóżźńć";
    const string page = "<html><head><title>" + headTitle + "</title></head><body></body></html>";

    // act
    TranscodingResult outcome = _nReadabilityTranscoder.Transcode(new TranscodingInput(page));

    // assert
    Assert.IsTrue(outcome.TitleExtracted);
    Assert.AreEqual(headTitle, outcome.ExtractedTitle);
}
/// <summary>
/// Checks that, for a page without a title element, the title is taken from
/// an h2 heading placed in front of the article paragraphs.
/// </summary>
public void Transcode_can_extract_title_from_body_h2()
{
    // arrange
    const string headingText = "Some title ąęłóżźńć";
    const string fillerParagraphs = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";
    const string page = "<html><body><div id=\"main\"><h2>" + headingText + "</h2>" + fillerParagraphs + "</div></body></html>";

    // act
    TranscodingResult outcome = _nReadabilityTranscoder.Transcode(new TranscodingInput(page));

    // assert
    Assert.IsTrue(outcome.TitleExtracted);
    Assert.AreEqual(headingText, outcome.ExtractedTitle);
}
/// <summary>
/// Transcodes one of the numbered sample pages from the "SampleInput" directory,
/// writes the extracted content to the "SampleOutput" directory and verifies that
/// text fragments known to belong to the article are present in the output.
/// </summary>
/// <param name="sampleInputNumber">
/// Number of the sample file. Every number supplied to the test needs a matching
/// <c>case</c> below; an unknown number makes the test throw.
/// </param>
public void TestSampleInputs([Values(1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 14, 15, 16)]int sampleInputNumber)
{
    // arrange
    string sampleInputNumberStr = sampleInputNumber.ToString().PadLeft(2, '0');

    // Path.Combine instead of a hard-coded '\' so the test also finds its input
    // on platforms whose directory separator is not a backslash.
    string inputPath = Path.Combine("SampleInput", string.Format("SampleInput_{0}.html", sampleInputNumberStr));
    string content = File.ReadAllText(inputPath);

    var transcodingInput = new TranscodingInput(content);
    transcodingInput.Url = GetSampleInputUrl(sampleInputNumber);

    // act
    TranscodingResult transcodingResult = _nReadabilityTranscoder.Transcode(transcodingInput);

    // assert
    Assert.IsTrue(transcodingResult.ContentExtracted);

    // Keep a copy of the output on disk for manual inspection.
    // Directory.CreateDirectory does nothing when the directory already exists.
    const string outputDir = "SampleOutput";
    Directory.CreateDirectory(outputDir);

    string extractedContent = transcodingResult.ExtractedContent;

    File.WriteAllText(
        Path.Combine(outputDir, string.Format("SampleOutput_{0}.html", sampleInputNumberStr)),
        extractedContent,
        Encoding.UTF8);

    switch (sampleInputNumber)
    {
        case 1: // washingtonpost.com - "Court Puts Off Decision On Indefinite Detention"
            AssertContainsAllFragments(
                extractedContent,
                "The Supreme Court yesterday vacated a lower",
                "The justices did not rule on the merits",
                "But the government said the issues were now");
            break;

        case 2: // devBlogi.pl - "Po co nam testerzy?"
            AssertContainsAllFragments(
                extractedContent,
                "Moja siostra sprawiła swoim dzieciom szczeniaczka",
                "Z tresowaniem psów jest tak, że reakcja musi być",
                "Korzystając z okazji, chcielibyśmy dowiedzieć się");
            break;

        case 3: // codinghorror.com - "Welcome Back Comments"
            AssertContainsAllFragments(
                extractedContent,
                "I apologize for the scarcity of updates lately.",
                "Most of all, I blame myself.",
                "And, most of all, thanks to");
            break;

        case 4: // sample page; only with paragraphs
            AssertContainsAllFragments(
                extractedContent,
                "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
                "Mauris nec massa ante, id fringilla nisi.",
                "Nulla facilisi. Proin lacinia venenatis elit, nec ornare elit varius eu.",
                "Duis vitae ultricies nibh.",
                "Vestibulum dictum iaculis nisl, lobortis luctus justo porttitor eu.");
            break;

        case 5: // mnmlist.com - "clear distractions"
            AssertContainsAllFragments(
                extractedContent,
                "When it comes to minimalism in",
                "Here’s how:",
                "Set limits on your work hours. If your time is limited, you’ll find ways to make the most of that limited time.");
            break;

        case 6: // sample page; nbsp
            // The character after "1." is a non-breaking space. It is written as an
            // escape so editors and whitespace normalization cannot silently turn it
            // into a regular space.
            AssertContainsAllFragments(extractedContent, "1.\u00A0Item 1.");
            break;

        case 7: // http://nplusonemag.com/treasure-island
            AssertContainsAllFragments(
                extractedContent,
                "stretched out storylines",
                "It is no longer a smart social move to brag about not owning a television.",
                "Of course, some habits can be hard to give up completely.");
            break;

        case 8: // NYTimes leading paragraph
            AssertContainsAllFragments(
                extractedContent,
                "freed from house arrest on Saturday, setting her on the path",
                "confrontation with the generals who had kept her out of the public eye",
                "Western capitals was one of celebration");
            break;

        case 9: // http://www.udidahan.com/2010/08/31/race-conditions-dont-exist/ - rich sidebar should not be identified as main content
            AssertContainsAllFragments(
                extractedContent,
                "Not in the business world anyway.",
                "we could look at modeling the acceptance",
                "Keep an eye out.");
            break;

        case 10: // http://www.slate.com/articles/technology/technology/2011/10/steve_jobs_biography_the_new_book_doesn_t_explain_what_made_the_.single.html
            AssertContainsAllFragments(
                extractedContent,
                "In the aftermath of his resignation and then his death",
                "It turns out, though, that he was much worse than you ever suspected.",
                "But Isaacson has compiled so many instances",
                "Yet Jobs also said that he wanted a biographer",
                "He embodied so many contradictions",
                "When friends and colleagues offer theories about Jobs",
                "Isaacson tries valiantly to add some depth to the profile.",
                "Jobs also seemed to suspect that he wasn",
                "Instead of offering any substantive explanations",
                "death prompted a flurry of hagiographic tributes",
                "last 15 years of life, something in him changed");
            break;

        case 11: // http://www.slate.com/articles/news_and_politics/foreigners/2011/10/jordan_s_king_abdullah_interviewed_the_arab_spring_is_a_disaster.single.html
            AssertContainsAllFragments(
                extractedContent,
                "How do you see",
                "I went to Egypt after visiting",
                "How did your visit to Egypt go?",
                "We had a very good meeting.",
                "I think it is astounding that Tantawi",
                "The feeling I got from the Egyptian leadership",
                "From the streets",
                "No, from the West.",
                "They saw that Mubarak was sacrificed",
                "So they are being very cautious in the decisions they are taking.",
                "Do you and other leaders",
                "I think everybody is wary",
                "And Jordan?",
                "I think two things make Jordan stand out.",
                "Do you think President Bashi",
                "We have had very limited defectors",
                "Does that mean you have talked to",
                "I spoke to Bashar al-Assad twice in the springtime.",
                "People are asking about an alternative",
                "I think nobody has an answer to Syria.",
                "Do you think they can win?",
                "My view is when you use violence on your people",
                "What is your assessment of Libya",
                "It took everybody by surprise.",
                "So you think the death of Colonel Gaddafi",
                "There is an old saying that peace",
                "I heard that Hamas leader Khalid Mashal",
                "If he comes here, it is part of looking at Palestinian reconciliation.",
                "You support Palestinian President Mahmoud Abba",
                "It is out of desperation and frustration that they are going to the U.N.",
                "Like our elections?",
                "It is a disaster. You have seen what has happened in Egypt",
                "The Israelis are worried the Egyptians will break the treaty.",
                "That is a very, very strong possibility.",
                "Do you intend to support Jordan",
                "We have a peace treaty with Israel and we will continue",
                "A lot of Israelis think your recent statements",
                "know if they are hostile. What I am saying is they are missing an opportunity",
                "I always look at the glass half full and I",
                "What did you think of Israel Prime Minister Benjamin Netanyahu",
                "It is politics at the end of the day.",
                "It was strange for Israel to be negotiating with Hamas.",
                "I think all of us have been asking each othe",
                "You just appointed a new prime minister.",
                "Yes, for the past six months we have listened to what people want",
                "If you look five years down the line, do you see yourself relinquishing some power to the parliament",
                "Probably sooner. We haven",
                "You will still appoint the Senate",
                "There are two options. If there is a new parliament next year",
                "I think we are facing the same challenges as everyone in the West.",
                "Once you have people rioting in the streets, how do you get foreign",
                "But you made a deal with the Saudis",
                "The Saudis have come through very strongly this year but",
                "And that is because you are having problems getting",
                "We are having problems because the gas pipeline keeps",
                "There are reports that over the next five years if you join the GCC",
                "There is going to be a package hopefully of at least a billion");
            break;

        // TODO IMM HI: fix (problem with nested divs)
        case 12: // http://www.telegraph.co.uk/comment/personal-view/8841737/What-Gilad-Shalit-tells-us-about-the-respect-for-life-in-Europe-Israel-and-Palestine.html
        case 13: // same URL as 12 but processed by Instapaper first
            // Both samples are expected to yield the same paragraphs.
            AssertMatchesAllPatterns(
                extractedContent,
                "<p>\\s*One of the supreme ironies among the European moral stances",
                "<p>\\s*And yet when that same Europe turns its gaze on the Middle East",
                "<p>\\s*Normally, this would not be even worth mentioning.",
                "<p>\\s*Israel first outlawed the death penalty in 1954",
                "<p>\\s*Note that Israel passed this law five years",
                "<p>\\s*If the Israelis had hundreds of terrorists",
                "<p>\\s*Palestine, on the other hand",
                "<p>\\s*The trade of over a thousand Palestinians",
                "<p>\\s*If a European, concerned about the nature",
                "<p>\\s*So instead of helping Europeans",
                "at the return of prisoners, and",
                "in order to present the moral equivalence of all the",
                "<p>\\s*In acquiescing with a narrative in which hatred and murder",
                "<p>\\s*It may seem cost-free to Westerners");
            break;

        case 14: // http://www.theverge.com/2012/5/25/3042640/samsung-galaxy-s-iii-review
            AssertContainsAllFragments(
                extractedContent,
                "Samsung stops teasing and finally delivers its flagship Android device",
                "The extra-large size of this phone, even with its great ergonomics, may prove to be");
            break;

        case 15: // http://www.theverge.com/2012/6/21/3032067/casio-bluetooth-g-shock-watch-gb6900-review
            AssertContainsAllFragments(
                extractedContent,
                "Bank devices in the 80s, but in recent years it hasn't been quite the innovator it once was. ",
                "the Verge score is based on the average of the subscores below");
            break;

        case 16:
            AssertContainsAllFragments(
                extractedContent,
                "Header Level 1",
                "Header Level 2",
                "Header Level 3",
                "Header Level 4",
                "Header Level 5",
                "Header Level 6");
            break;

        default:
            throw new NotSupportedException("Unknown sample input number (" + sampleInputNumber + "). Have you added another sample input? If so, then add appropriate asserts here as well.");
    }
}

// Asserts that every fragment occurs literally in the content; the failure message names the missing fragment.
private static void AssertContainsAllFragments(string content, params string[] fragments)
{
    foreach (string fragment in fragments)
    {
        Assert.IsTrue(
            content.Contains(fragment),
            "Extracted content does not contain the expected fragment: " + fragment);
    }
}

// Asserts that every regular expression matches somewhere in the content; the failure message names the failing pattern.
private static void AssertMatchesAllPatterns(string content, params string[] patterns)
{
    foreach (string pattern in patterns)
    {
        Assert.IsTrue(
            Regex.IsMatch(content, pattern),
            "Extracted content does not match the expected pattern: " + pattern);
    }
}
/// <summary>
/// Checks that a link consisting only of a query string ("?hello") is resolved
/// against the page URL, both when the page URL has no query string and when it
/// already carries one of its own.
/// </summary>
public void TestReplacingQueryStringLinkUrls()
{
    // arrange
    string fillerParagraphs = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";
    string page = "<html><body>" + fillerParagraphs + "<p><a href=\"?hello\">link</a></p>" + fillerParagraphs + "</body></html>";

    // The resolved link is expected to be the same for both base URLs:
    // the query string of the base URL is replaced by the one from the link.
    string[] baseUrls =
    {
        "http://wikipedia.org/wiki/baseArticle",
        "http://wikipedia.org/wiki/baseArticle?goodbye",
    };

    foreach (string baseUrl in baseUrls)
    {
        var input = new TranscodingInput(page);
        input.Url = baseUrl;

        // act
        TranscodingResult outcome = _nReadabilityTranscoder.Transcode(input);

        // assert
        Assert.IsTrue(outcome.ContentExtracted);
        Assert.IsTrue(outcome.ExtractedContent.Contains("href=\"http://wikipedia.org/wiki/baseArticle?hello\""));
    }
}
/// <summary>
/// Processes a crawled page. Every "result" div is turned into a
/// <c>BaiduNews</c> entry whose URL is also queued as a target request; the list
/// is stored under the "News" result key. When the page yields no results it is
/// treated as an article page: its readable text is extracted and stored under
/// the "UpdateNews" result key.
/// </summary>
/// <param name="page">The page being processed.</param>
protected override void Handle(Page page)
{
    var resultNodes = page.Selectable.SelectList(Selectors.XPath("//div[@class='result']")).Nodes();
    var newsItems = new List<BaiduNews>();

    // Join all request extras into one comma-separated keyword string.
    var keyword = page.Request.Extras.Aggregate(
        "",
        (joined, extra) => string.IsNullOrEmpty(joined) ? extra.Value : $"{joined},{extra.Value}");

    foreach (var resultNode in resultNodes)
    {
        // Strip the <em> highlighting markup from the title.
        var title = resultNode.Select(Selectors.XPath("h3[@class='c-title']/a")).GetValue()
            .Replace("<em>", "")
            .Replace("</em>", "");
        var link = resultNode.Select(Selectors.XPath("h3[@class='c-title']/a/@href")).GetValue();
        var authorLine = resultNode.Select(Selectors.XPath(".//div/p[@class='c-author']/text()")).GetValue();

        var time = string.Empty;
        try
        {
            // The time is whatever starts 12 characters after the first space (or at
            // index 11 when there is no space, because IndexOf returns -1).
            int firstSpace = authorLine.IndexOf(" ", StringComparison.Ordinal);
            time = authorLine.Substring(firstSpace + 12);
        }
        catch (Exception e)
        {
            // Log and let the failure propagate to the caller.
            Console.WriteLine(e);
            throw;
        }

        var item = new BaiduNews();
        item.Keyword = keyword;
        item.Title = title;
        item.Time = time;
        item.Url = link;

        page.AddTargetRequest(link, increaseDeep: false);
        newsItems.Add(item);
    }

    page.AddResultItem("News", newsItems);

    if (newsItems.Count > 0)
    {
        return;
    }

    // No search results on this page: extract the readable article text instead.
    var transcoder = new NReadabilityTranscoder();
    var input = new TranscodingInput(page.Content);
    var text = "";

    try
    {
        var transcoded = transcoder.Transcode(input);

        var document = new HtmlDocument();
        document.OptionAutoCloseOnEnd = true;
        document.LoadHtml(transcoded.ExtractedContent);

        var contentNode = document.DocumentNode.SelectSingleNode("//div/div/div/div");
        text = contentNode.InnerText.Trim('\r', '\n', ' ');
    }
    catch (Exception e)
    {
        // Best effort: on any failure the text stays empty and only the error is logged.
        Console.WriteLine(e);
    }

    var update = new UpdateNews();
    update.Html = page.Content;
    update.Text = text;
    update.Url = page.Url;
    page.AddResultItem("UpdateNews", update);
}
/// <summary>
/// Checks that a custom image-source transformer is applied to img elements:
/// the src attribute carries the transformed value and the original value is
/// kept in the attribute named by the transformer ("origsrc").
/// </summary>
public void TestImageSourceTransformer()
{
    // arrange
    const string sourceImageUrl = "http://example.com/some_image.jpg";

    Func<AttributeTransformationInput, AttributeTransformationResult> transformer =
        input =>
        {
            var transformation = new AttributeTransformationResult();
            transformation.TransformedValue = string.Format("http://imageresizer.com/u={0}", input.AttributeValue);
            transformation.OriginalValueAttributeName = "origsrc";
            return transformation;
        };

    // Run the transformer once by hand to learn the value expected in the output.
    var probe = new AttributeTransformationInput { AttributeValue = sourceImageUrl, Element = null };
    string transformedImageUrl = transformer(probe).TransformedValue;

    string fillerParagraphs = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";
    string page = "<html><body>" + fillerParagraphs + "<p><img src=\"" + sourceImageUrl + "\" /></p>" + fillerParagraphs + "</body></html>";

    var transcoder = new NReadabilityTranscoder();
    transcoder.ImageSourceTranformer = transformer;

    var transcodingInput = new TranscodingInput(page);
    transcodingInput.Url = "http://immortal.pl/";

    // act
    TranscodingResult outcome = transcoder.Transcode(transcodingInput);

    // assert
    Assert.IsTrue(outcome.ContentExtracted);
    Assert.IsTrue(outcome.ExtractedContent.Contains("src=\"" + transformedImageUrl + "\""));
    Assert.IsTrue(outcome.ExtractedContent.Contains("origsrc=\"" + sourceImageUrl + "\""));
}
/// <summary>
/// Checks that a document with an empty body is reported as having no
/// extracted content.
/// </summary>
public void TestEmptyArticle()
{
    // arrange
    const string emptyPage = "<html><body></body></html>";
    var input = new TranscodingInput(emptyPage);
    input.Url = "http://wikipedia.org/wiki/baseArticle";

    // act
    TranscodingResult outcome = _nReadabilityTranscoder.Transcode(input);

    // assert
    Assert.IsFalse(outcome.ContentExtracted);
}
/// <summary>
/// Checks that a custom anchor-href transformer is applied to links: the href
/// attribute carries the transformed value and the original value is kept in
/// the attribute named by the transformer ("orighref").
/// </summary>
public void TestAnchorHrefTransformer()
{
    // arrange
    const string sourceHref = "http://example.com/some_article.html";

    Func<AttributeTransformationInput, AttributeTransformationResult> transformer =
        input =>
        {
            var transformation = new AttributeTransformationResult();
            transformation.TransformedValue = string.Format("http://redirector.com/u={0}", input.AttributeValue);
            transformation.OriginalValueAttributeName = "orighref";
            return transformation;
        };

    // Run the transformer once by hand to learn the value expected in the output.
    var probe = new AttributeTransformationInput { AttributeValue = sourceHref, Element = null };
    string transformedHref = transformer(probe).TransformedValue;

    string fillerParagraphs = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";
    string page = "<html><body>" + fillerParagraphs + "<p><a href=\"" + sourceHref + "\">Some article</a></p>" + fillerParagraphs + "</body></html>";

    var transcoder = new NReadabilityTranscoder();
    transcoder.AnchorHrefTranformer = transformer;

    var transcodingInput = new TranscodingInput(page);
    transcodingInput.Url = "http://immortal.pl/";

    // act
    TranscodingResult outcome = transcoder.Transcode(transcodingInput);

    // assert
    Assert.IsTrue(outcome.ContentExtracted);
    Assert.IsTrue(outcome.ExtractedContent.Contains("href=\"" + transformedHref + "\""));
    Assert.IsTrue(outcome.ExtractedContent.Contains("orighref=\"" + sourceHref + "\""));
}
/// <summary>
/// Checks that the transcoded output contains a "Generator" meta element,
/// even for trivial plain-text input.
/// </summary>
public void Output_contains_meta_generator_element()
{
    // arrange
    var input = new TranscodingInput("test");

    // act
    string output = _nReadabilityTranscoder.Transcode(input).ExtractedContent;

    // assert
    Assert.IsTrue(output.Contains("meta name=\"Generator\""));
}
/// <summary>
/// Checks that a viewport meta element present in the source document does not
/// appear in the extracted content.
/// </summary>
public void MetaViewportElementShouldBeRemoved()
{
    // arrange
    const string viewportMeta = "<meta name=\"viewport\" content=\"width=1000\" />";
    const string page = "<html><head>" + viewportMeta + "</head><body><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p></body></html>";

    var input = new TranscodingInput(page);
    input.Url = "http://wikipedia.org/wiki/baseArticle";

    // act
    TranscodingResult outcome = _nReadabilityTranscoder.Transcode(input);

    // assert
    Assert.IsTrue(outcome.ContentExtracted);
    Assert.IsFalse(outcome.ExtractedContent.Contains(viewportMeta));
}
/// <summary>
/// Shared helper: transcodes a page containing a single image and asserts that
/// the image's src attribute in the output equals the expected URL.
/// </summary>
/// <param name="srcAttribute">Value of the src attribute in the source document.</param>
/// <param name="url">URL of the page, handed to the transcoder as the input URL.</param>
/// <param name="expectedImageUrl">Value the src attribute is expected to have in the output.</param>
private void TestReplacingImageUrl(string srcAttribute, string url, string expectedImageUrl)
{
    // arrange
    string fillerParagraphs = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";
    string page = "<html><body>" + fillerParagraphs + "<p><img src=\"" + srcAttribute + "\" /></p>" + fillerParagraphs + "</body></html>";

    var input = new TranscodingInput(page);
    input.Url = url;

    // act
    TranscodingResult outcome = _nReadabilityTranscoder.Transcode(input);

    // assert
    Assert.IsTrue(outcome.ContentExtracted);

    bool hasExpectedSrc = outcome.ExtractedContent.Contains("src=\"" + expectedImageUrl + "\"");
    string failureMessage = string.Format(
        "Image url replacement failed. Src attribute: {0}, base url: {1}, expected image url: {2}",
        srcAttribute,
        url,
        expectedImageUrl);
    Assert.IsTrue(hasExpectedSrc, failureMessage);
}
/// <summary>
/// Checks that the extracted content contains the mobile-specific
/// "HandheldFriendly" meta element.
/// </summary>
public void TestMobileHeaders()
{
    // arrange
    string fillerParagraphs = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p><p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";
    string page = "<html><body>" + fillerParagraphs + "</body></html>";

    var input = new TranscodingInput(page);
    input.Url = "http://wikipedia.org/wiki/baseArticle";

    // act
    TranscodingResult outcome = _nReadabilityTranscoder.Transcode(input);

    // assert
    Assert.IsTrue(outcome.ContentExtracted);
    Assert.IsTrue(outcome.ExtractedContent.Contains("<meta name=\"HandheldFriendly\" content=\"true\" />"));
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, decodes it as UTF-8 or GBK,
/// extracts its readable text with NReadability and stores both the raw HTML and
/// the text on the matching <c>[dbo].[BaiduNews]</c> row; it then increments
/// <c>NewsCount</c> on the related <c>[dbo].[Monitor]</c> rows.
/// Nothing is done when the response status is not 200 OK or the body is empty.
/// Failures during extraction or the database updates are written to
/// <paramref name="context"/> and not rethrown.
/// </summary>
/// <param name="url">Address of the news page; also the key of the row that is updated.</param>
/// <param name="context">Job context used for progress and error output.</param>
public async Task Dowload(string url, PerformContext context)
{
    using (var client = new HttpClient())
    {
        client.DefaultRequestHeaders.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36");

        byte[] bytes;
        using (var response = await client.GetAsync(url))
        {
            if (response.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            // Read the complete body. A single Stream.ReadAsync call may return fewer
            // bytes than requested, and Stream.Length is not available on every
            // stream, so the content is buffered through HttpContent instead.
            bytes = await response.Content.ReadAsByteArrayAsync();
        }

        // Pick the decoder: UTF-8 when the bytes look like UTF-8, GBK otherwise.
        var isUTF8 = IsTextUTF8(ref bytes);
        // GBK is only resolvable once the code-pages provider has been registered.
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
        Encoding encoding = isUTF8 ? Encoding.UTF8 : Encoding.GetEncoding("GBK");

        var html = encoding.GetString(bytes);
        if (string.IsNullOrEmpty(html))
        {
            return;
        }

        var transcoder = new NReadabilityTranscoder();
        var input = new TranscodingInput(html);

        try
        {
            // The built document itself is not used. The call is kept because a parse
            // failure here skips the updates below via the catch.
            // NOTE(review): confirm this pre-parse is intended; otherwise it can go.
            SgmlDomBuilder builder = new SgmlDomBuilder();
            builder.BuildDocument(html);

            var result = transcoder.Transcode(input);

            var document = new HtmlDocument { OptionAutoCloseOnEnd = true };
            document.LoadHtml(result.ExtractedContent);

            // The article text is taken from the first div nested four levels deep.
            // A missing node surfaces as an exception that the catch below logs.
            var node = document.DocumentNode.SelectSingleNode("//div/div/div/div");
            var text = node.InnerText.Trim('\r', '\n', ' ', '\t');

            context.WriteLine("抽取内容为:");
            context.WriteLine(text);

            const string cmdText = @"UPDATE [dbo].[BaiduNews] SET [Html]=@Html,[Text]=@Text WHERE [Url]=@Url";
            await _connection.ExecuteAsync(cmdText, new { Html = html, Text = text, Url = url });

            await _connection.ExecuteAsync(
                @"UPDATE a SET a.[NewsCount]=a.[NewsCount]+1 FROM [dbo].[Monitor] a JOIN [dbo].[BaiduNews] b ON a.[Tag]=b.[Keyword] WHERE b.[Url]=@Url",
                new { Url = url });
        }
        catch (Exception e)
        {
            // Best effort: report the failure to the job output and carry on.
            context.WriteLine(e);
        }
    }
}