Пример #1
0
        /// <summary>
        /// Saves the resource at <paramref name="url"/> under <paramref name="targetDir"/>.
        /// Targets whose extension is ".html" are downloaded as text and reduced to their
        /// readable content with NReadability before being written; everything else is
        /// downloaded straight to disk. Nothing happens when <c>IsFileNeeded</c> says the
        /// file is not required.
        /// </summary>
        /// <param name="url">Address of the resource to fetch.</param>
        /// <param name="targetDir">Directory the file is placed in.</param>
        /// <param name="targetFname">Desired file name; passed through <c>RemoveColon</c> before use.</param>
        public void Download(string url, string targetDir, string targetFname)
        {
            string cleanName = targetFname.RemoveColon();

            // Trim the combined path so that it respects the course's maximum path-part length.
            string destination = Utilities.TrimPathPart(
                Path.Combine(targetDir, cleanName),
                _futureleanCourse.Max_path_part_len);

            // The content length is derived from the client's current response headers.
            int expectedLength = GetContentLength(_futureleanCourse._client.ResponseHeaders);

            if (!IsFileNeeded(destination, expectedLength, cleanName))
            {
                return;
            }

            if (Path.GetExtension(destination) != ".html")
            {
                _futureleanCourse._client.DownloadFile(url, destination);
                return;
            }

            // HTML pages are transcoded and only the extracted content is saved.
            string rawPage = _futureleanCourse._client.DownloadString(url);
            TranscodingResult readable = new NReadabilityTranscoder().Transcode(new TranscodingInput(rawPage));
            File.WriteAllText(destination, readable.ExtractedContent);
        }
Пример #2
0
        /// <summary>
        /// Reads the whole of <paramref name="textStream"/> as text, keeps it in <c>_rawHTML</c>
        /// and runs it through the transcoder.
        /// </summary>
        /// <param name="uri">Address of the document; its string form becomes the transcoding input URL.</param>
        /// <param name="textStream">Stream holding the HTML; it is rewound to position 0 before reading.</param>
        /// <param name="options">Read options that drive part of the DOM serialization settings.</param>
        /// <param name="encoding">Encoding handed to the stream reader; UTF-8 when <c>null</c>.</param>
        /// <returns>The result produced by the transcoder for the decoded HTML.</returns>
        protected TranscodingResult ExtractReadableInformation(
            Uri uri,
            Stream textStream,
            ReadOptions options,
            Encoding encoding = null)
        {
            // Rewind and decode the complete stream.
            textStream.Position = 0;
            Encoding effectiveEncoding = encoding ?? Encoding.UTF8;
            var reader = new StreamReader(textStream, effectiveEncoding);
            _rawHTML = reader.ReadToEnd();

            var input = new TranscodingInput(_rawHTML);
            input.Url = uri.ToString();

            // Four flags follow the caller's options; the remaining meta-element flags are fixed.
            var serialization = new DomSerializationParams();
            serialization.BodyOnly = !options.HasHeaderTags;
            serialization.NoHeadline = !options.HasHeadline;
            serialization.PrettyPrint = options.PrettyPrint;
            serialization.DontIncludeContentTypeMetaElement = true;
            serialization.DontIncludeMobileSpecificMetaElements = true;
            serialization.DontIncludeDocTypeMetaElement = false;
            serialization.DontIncludeGeneratorMetaElement = true;
            serialization.ReplaceImagesWithPlaceholders = options.ReplaceImagesWithPlaceholders;
            input.DomSerializationParams = serialization;

            // Process/transcode the HTML.
            return _transcoder.Transcode(input);
        }
Пример #3
0
        /// <summary>
        /// Downloads a page, extracts its readable content with NReadability and returns the
        /// text of the body element of the extracted HTML.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>The <c>InnerText</c> of the <c>//body</c> node of the extracted HTML.</returns>
        private static String GetWebpageContents(String url)
        {
            var transcoder = new NReadabilityTranscoder();

            using (var client = new WebClient())
            {
                TranscodingInput input = new TranscodingInput(client.DownloadString(url));
                string readableHtml = transcoder.Transcode(input).ExtractedContent;

                // Parse the extracted markup so that only the body text is returned.
                var document = new HtmlDocument();
                document.LoadHtml(readableHtml);

                var body = document.DocumentNode.SelectSingleNode("//body");
                return body.InnerText;
            }
        }
        /// <summary>
        /// Feeds a document whose title element is littered with spaces, tabs and line breaks
        /// and expects the extracted title to come back with single spaces and no padding.
        /// </summary>
        public void Transcode_cleans_up_title_after_extracting_it()
        {
            // arrange
            const string rawTitle = "Гостиница\n-  \r Ги  \t  де \n\n \r Мопассан \r\n";
            const string cleanedTitle = "Гостиница - Ги де Мопассан";
            const string page = "<html><head><title>" + rawTitle + "</title></head><body></body></html>";

            var input = new TranscodingInput(page);

            // act
            var result = _nReadabilityTranscoder.Transcode(input);

            // assert
            Assert.IsTrue(result.TitleExtracted);
            Assert.AreEqual(cleanedTitle, result.ExtractedTitle);
        }
        /// <summary>
        /// Expects the text of the title element in the document head to be reported
        /// unchanged as the extracted title.
        /// </summary>
        public void Transcode_can_extract_title_from_header()
        {
            // arrange
            const string headTitle = "Some title ąęłóżźńć";
            const string page = "<html><head><title>" + headTitle + "</title></head><body></body></html>";

            var input = new TranscodingInput(page);

            // act
            var result = _nReadabilityTranscoder.Transcode(input);

            // assert
            Assert.IsTrue(result.TitleExtracted);
            Assert.AreEqual(headTitle, result.ExtractedTitle);
        }
        /// <summary>
        /// Expects the title to be taken from an h2 heading inside the body when the
        /// document has no head/title element.
        /// </summary>
        public void Transcode_can_extract_title_from_body_h2()
        {
            // arrange
            const string headingTitle = "Some title ąęłóżźńć";

            // Five identical filler paragraphs follow the heading.
            const string paragraph = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";
            const string filler = paragraph + paragraph + paragraph + paragraph + paragraph;
            const string page = "<html><body><div id=\"main\"><h2>" + headingTitle + "</h2>" + filler + "</div></body></html>";

            var input = new TranscodingInput(page);

            // act
            var result = _nReadabilityTranscoder.Transcode(input);

            // assert
            Assert.IsTrue(result.TitleExtracted);
            Assert.AreEqual(headingTitle, result.ExtractedTitle);
        }
        /// <summary>
        /// Transcodes one of the numbered sample HTML inputs, writes the extracted content to the
        /// "SampleOutput" directory and checks that the text fragments expected for that sample
        /// are present in the extracted content.
        /// </summary>
        /// <param name="sampleInputNumber">Number of the sample input file to process.</param>
        /// <exception cref="NotSupportedException">No assertions are defined for the given sample number.</exception>
        public void TestSampleInputs([Values(1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 14, 15, 16)]int sampleInputNumber)
        {
            // arrange
            string sampleInputNumberStr = sampleInputNumber.ToString().PadLeft(2, '0');

            // Built with Path.Combine (like the output path below) rather than a hard-coded '\' separator.
            string inputPath = Path.Combine("SampleInput", string.Format("SampleInput_{0}.html", sampleInputNumberStr));
            string content = File.ReadAllText(inputPath);

            var transcodingInput = new TranscodingInput(content)
            {
                Url = GetSampleInputUrl(sampleInputNumber),
            };

            // act
            TranscodingResult transcodingResult = _nReadabilityTranscoder.Transcode(transcodingInput);

            // assert
            Assert.IsTrue(transcodingResult.ContentExtracted);

            const string outputDir = "SampleOutput";

            // CreateDirectory does nothing when the directory already exists.
            Directory.CreateDirectory(outputDir);

            string extractedContent = transcodingResult.ExtractedContent;

            // Keep a copy of the extracted content on disk for manual inspection.
            File.WriteAllText(
                Path.Combine(outputDir, string.Format("SampleOutput_{0}.html", sampleInputNumberStr)),
                extractedContent,
                Encoding.UTF8);

            // Samples 7, 12 and 13 are not in the [Values] list above; their expectations are kept for when they are re-enabled.
            switch (sampleInputNumber)
            {
                case 1: // washingtonpost.com - "Court Puts Off Decision On Indefinite Detention"
                    AssertContainsAll(
                        extractedContent,
                        "The Supreme Court yesterday vacated a lower",
                        "The justices did not rule on the merits",
                        "But the government said the issues were now");
                    break;

                case 2: // devBlogi.pl - "Po co nam testerzy?"
                    AssertContainsAll(
                        extractedContent,
                        "Moja siostra sprawiła swoim dzieciom szczeniaczka",
                        "Z tresowaniem psów jest tak, że reakcja musi być",
                        "Korzystając z okazji, chcielibyśmy dowiedzieć się");
                    break;

                case 3: // codinghorror.com - "Welcome Back Comments"
                    AssertContainsAll(
                        extractedContent,
                        "I apologize for the scarcity of updates lately.",
                        "Most of all, I blame myself.",
                        "And, most of all, thanks to");
                    break;

                case 4: // sample page; only with paragraphs
                    AssertContainsAll(
                        extractedContent,
                        "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
                        "Mauris nec massa ante, id fringilla nisi.",
                        "Nulla facilisi. Proin lacinia venenatis elit, nec ornare elit varius eu.",
                        "Duis vitae ultricies nibh.",
                        "Vestibulum dictum iaculis nisl, lobortis luctus justo porttitor eu.");
                    break;

                case 5: // mnmlist.com - "clear distractions"
                    AssertContainsAll(
                        extractedContent,
                        "When it comes to minimalism in",
                        "Here’s how:",
                        "Set limits on your work hours. If your time is limited, you’ll find ways to make the most of that limited time.");
                    break;

                case 6: // sample page; nbsp
                    AssertContainsAll(extractedContent, "1.  Item 1."); // there's a non-breaking space here
                    break;

                case 7: // http://nplusonemag.com/treasure-island
                    AssertContainsAll(
                        extractedContent,
                        "stretched out storylines",
                        "It is no longer a smart social move to brag about not owning a television.",
                        "Of course, some habits can be hard to give up completely.");
                    break;

                case 8:  // NYTimes leading paragraph
                    AssertContainsAll(
                        extractedContent,
                        "freed from house arrest on Saturday, setting her on the path",
                        "confrontation with the generals who had kept her out of the public eye",
                        "Western capitals was one of celebration");
                    break;

                case 9:  // http://www.udidahan.com/2010/08/31/race-conditions-dont-exist/ - rich sidebar should not be identified as main content
                    AssertContainsAll(
                        extractedContent,
                        "Not in the business world anyway.",
                        "we could look at modeling the acceptance",
                        "Keep an eye out.");
                    break;

                case 10:  // http://www.slate.com/articles/technology/technology/2011/10/steve_jobs_biography_the_new_book_doesn_t_explain_what_made_the_.single.html
                    AssertContainsAll(
                        extractedContent,
                        "In the aftermath of his resignation and then his death",
                        "It turns out, though, that he was much worse than you ever suspected.",
                        "But Isaacson has compiled so many instances",
                        "Yet Jobs also said that he wanted a biographer",
                        "He embodied so many contradictions",
                        "When friends and colleagues offer theories about Jobs",
                        "Isaacson tries valiantly to add some depth to the profile.",
                        "Jobs also seemed to suspect that he wasn",
                        "Instead of offering any substantive explanations",
                        "death prompted a flurry of hagiographic tributes",
                        "last 15 years of life, something in him changed");
                    break;

                case 11: // http://www.slate.com/articles/news_and_politics/foreigners/2011/10/jordan_s_king_abdullah_interviewed_the_arab_spring_is_a_disaster.single.html
                    AssertContainsAll(
                        extractedContent,
                        "How do you see",
                        "I went to Egypt after visiting",
                        "How did your visit to Egypt go?",
                        "We had a very good meeting.",
                        "I think it is astounding that Tantawi",
                        "The feeling I got from the Egyptian leadership",
                        "From the streets",
                        "No, from the West.",
                        "They saw that Mubarak was sacrificed",
                        "So they are being very cautious in the decisions they are taking.",
                        "Do you and other leaders",
                        "I think everybody is wary",
                        "And Jordan?",
                        "I think two things make Jordan stand out.",
                        "Do you think President Bashi",
                        "We have had very limited defectors",
                        "Does that mean you have talked to",
                        "I spoke to Bashar al-Assad twice in the springtime.",
                        "People are asking about an alternative",
                        "I think nobody has an answer to Syria.",
                        "Do you think they can win?",
                        "My view is when you use violence on your people",
                        "What is your assessment of Libya",
                        "It took everybody by surprise.",
                        "So you think the death of Colonel Gaddafi",
                        "There is an old saying that peace",
                        "I heard that Hamas leader Khalid Mashal",
                        "If he comes here, it is part of looking at Palestinian reconciliation.",
                        "You support Palestinian President Mahmoud Abba",
                        "It is out of desperation and frustration that they are going to the U.N.",
                        "Like our elections?",
                        "It is a disaster. You have seen what has happened in Egypt",
                        "The Israelis are worried the Egyptians will break the treaty.",
                        "That is a very, very strong possibility.",
                        "Do you intend to support Jordan",
                        "We have a peace treaty with Israel and we will continue",
                        "A lot of Israelis think your recent statements",
                        "know if they are hostile. What I am saying is they are missing an opportunity",
                        "I always look at the glass half full and I",
                        "What did you think of Israel Prime Minister Benjamin Netanyahu",
                        "It is politics at the end of the day.",
                        "It was strange for Israel to be negotiating with Hamas.",
                        "I think all of us have been asking each othe",
                        "You just appointed a new prime minister.",
                        "Yes, for the past six months we have listened to what people want",
                        "If you look five years down the line, do you see yourself relinquishing some power to the parliament",
                        "Probably sooner. We haven",
                        "You will still appoint the Senate",
                        "There are two options. If there is a new parliament next year",
                        "I think we are facing the same challenges as everyone in the West.",
                        "Once you have people rioting in the streets, how do you get foreign",
                        "But you made a deal with the Saudis",
                        "The Saudis have come through very strongly this year but",
                        "And that is because you are having problems getting",
                        "We are having problems because the gas pipeline keeps",
                        "There are reports that over the next five years if you join the GCC",
                        "There is going to be a package hopefully of at least a billion");
                    break;

                // TODO IMM HI: fix (problem with nested divs)
                case 12:  // http://www.telegraph.co.uk/comment/personal-view/8841737/What-Gilad-Shalit-tells-us-about-the-respect-for-life-in-Europe-Israel-and-Palestine.html
                case 13:  // same URL as 12 but processed by Instapaper first
                    AssertMatchesAll(
                        extractedContent,
                        "<p>\\s*One of the supreme ironies among the European moral stances",
                        "<p>\\s*And yet when that same Europe turns its gaze on the Middle East",
                        "<p>\\s*Normally, this would not be even worth mentioning.",
                        "<p>\\s*Israel first outlawed the death penalty in 1954",
                        "<p>\\s*Note that Israel passed this law five years",
                        "<p>\\s*If the Israelis had hundreds of terrorists",
                        "<p>\\s*Palestine, on the other hand",
                        "<p>\\s*The trade of over a thousand Palestinians",
                        "<p>\\s*If a European, concerned about the nature",
                        "<p>\\s*So instead of helping Europeans",
                        "at the return of prisoners, and",
                        "in order to present the moral equivalence of all the",
                        "<p>\\s*In acquiescing with a narrative in which hatred and murder",
                        "<p>\\s*It may seem cost-free to Westerners");
                    break;

                case 14: // http://www.theverge.com/2012/5/25/3042640/samsung-galaxy-s-iii-review
                    AssertContainsAll(
                        extractedContent,
                        "Samsung stops teasing and finally delivers its flagship Android device",
                        "The extra-large size of this phone, even with its great ergonomics, may prove to be");
                    break;

                case 15: // http://www.theverge.com/2012/6/21/3032067/casio-bluetooth-g-shock-watch-gb6900-review
                    AssertContainsAll(
                        extractedContent,
                        "Bank devices in the 80s, but in recent years it hasn't been quite the innovator it once was. ",
                        "the Verge score is based on the average of the subscores below");
                    break;

                case 16:
                    AssertContainsAll(
                        extractedContent,
                        "Header Level 1",
                        "Header Level 2",
                        "Header Level 3",
                        "Header Level 4",
                        "Header Level 5",
                        "Header Level 6");
                    break;

                default:
                    throw new NotSupportedException("Unknown sample input number (" + sampleInputNumber + "). Have you added another sample input? If so, then add appropriate asserts here as well.");
            }
        }

        /// <summary>
        /// Asserts, in order, that every fragment occurs in <paramref name="content"/>;
        /// the failure message names the first fragment that is missing.
        /// </summary>
        private static void AssertContainsAll(string content, params string[] fragments)
        {
            foreach (string fragment in fragments)
            {
                Assert.IsTrue(content.Contains(fragment), "Extracted content does not contain: " + fragment);
            }
        }

        /// <summary>
        /// Asserts, in order, that every regular expression matches <paramref name="content"/>;
        /// the failure message names the first pattern that does not match.
        /// </summary>
        private static void AssertMatchesAll(string content, params string[] patterns)
        {
            foreach (string pattern in patterns)
            {
                Assert.IsTrue(Regex.IsMatch(content, pattern), "Extracted content does not match: " + pattern);
            }
        }
        /// <summary>
        /// Expects a query-string-only link ("?hello") to be resolved against the page URL,
        /// both when the page URL has no query string and when it already carries one.
        /// </summary>
        public void TestReplacingQueryStringLinkUrls()
        {
            // arrange
            const string paragraph = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";
            const string filler = paragraph + paragraph + paragraph + paragraph + paragraph;
            const string expectedHref = "href=\"http://wikipedia.org/wiki/baseArticle?hello\"";

            string page = "<html><body>" + filler + "<p><a href=\"?hello\">link</a></p>" + filler + "</body></html>";

            // The same document is transcoded once per base URL; the resolved link must be identical.
            string[] baseUrls =
            {
                "http://wikipedia.org/wiki/baseArticle",
                "http://wikipedia.org/wiki/baseArticle?goodbye",
            };

            foreach (string baseUrl in baseUrls)
            {
                var input = new TranscodingInput(page) { Url = baseUrl };

                // act
                TranscodingResult result = _nReadabilityTranscoder.Transcode(input);

                // assert
                Assert.IsTrue(result.ContentExtracted);
                Assert.IsTrue(result.ExtractedContent.Contains(expectedHref));
            }
        }
Пример #9
0
        /// <summary>
        /// Processes a downloaded page. Every <c>div.result</c> block is turned into a
        /// <c>BaiduNews</c> entry and its link is queued as a target request; the entries are
        /// stored under the "News" result key. When the page yields no entries, its readable
        /// text is extracted with NReadability and stored under the "UpdateNews" result key.
        /// </summary>
        /// <param name="page">The page to process.</param>
        protected override void Handle(Page page)
        {
            // Text following this marker in the author paragraph is stored as the entry's Time.
            const string timeSeparator = "&nbsp;&nbsp;";

            var elements = page.Selectable.SelectList(Selectors.XPath("//div[@class='result']")).Nodes();
            var results  = new List <BaiduNews>();

            // Comma-joined values of the request extras.
            var keyword  = page.Request.Extras.Aggregate("", (current, kv) => string.IsNullOrEmpty(current) ? kv.Value : $"{current},{kv.Value}");

            foreach (var element in elements)
            {
                var title  = element.Select(Selectors.XPath("h3[@class='c-title']/a")).GetValue().Replace("<em>", "").Replace("</em>", "");
                var url    = element.Select(Selectors.XPath("h3[@class='c-title']/a/@href")).GetValue();
                var author = element.Select(Selectors.XPath(".//div/p[@class='c-author']/text()")).GetValue();

                // IndexOf returns -1 when the marker is absent; in that case (or when there is no
                // author text at all) the time is left empty instead of slicing at a bogus offset.
                var time = string.Empty;
                if (author != null)
                {
                    var separatorIndex = author.IndexOf(timeSeparator, StringComparison.Ordinal);
                    if (separatorIndex >= 0)
                    {
                        time = author.Substring(separatorIndex + timeSeparator.Length);
                    }
                }

                var news = new BaiduNews
                {
                    Keyword = keyword,
                    Title   = title,
                    Time    = time,
                    Url     = url
                };
                page.AddTargetRequest(url, increaseDeep: false);

                results.Add(news);
            }
            page.AddResultItem("News", results);

            if (results.Count > 0)
            {
                return;
            }

            // No result blocks on this page: fall back to extracting its readable text.
            var transcoder = new NReadabilityTranscoder();
            var input      = new TranscodingInput(page.Content);
            var text       = "";
            try
            {
                var result   = transcoder.Transcode(input);
                var document = new HtmlDocument {
                    OptionAutoCloseOnEnd = true
                };
                document.LoadHtml(result.ExtractedContent);

                // The text is left empty when the expected container node is missing.
                var node = document.DocumentNode.SelectSingleNode("//div/div/div/div");
                if (node != null)
                {
                    text = node.InnerText.Trim('\r', '\n', ' ');
                }
            }
            catch (Exception e)
            {
                // Best effort: a failed extraction is logged and the item is stored with empty text.
                Console.WriteLine(e);
            }

            page.AddResultItem("UpdateNews", new UpdateNews
            {
                Html = page.Content,
                Text = text,
                Url  = page.Url
            });
        }
        /// <summary>
        /// Installs an image-source transformer on a fresh transcoder and expects the extracted
        /// content to carry the transformed <c>src</c> value plus the original value in the
        /// attribute named by the transformer ("origsrc").
        /// </summary>
        public void TestImageSourceTransformer()
        {
            // arrange
            const string originalSrc = "http://example.com/some_image.jpg";
            const string paragraph = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";
            const string filler = paragraph + paragraph + paragraph + paragraph + paragraph;

            Func<AttributeTransformationInput, AttributeTransformationResult> resizeSrc = attribute =>
            {
                var transformed = new AttributeTransformationResult();
                transformed.TransformedValue = string.Format("http://imageresizer.com/u={0}", attribute.AttributeValue);
                transformed.OriginalValueAttributeName = "origsrc";
                return transformed;
            };

            // The expected value is whatever the transformer itself yields for the original source.
            var probe = new AttributeTransformationInput { AttributeValue = originalSrc, Element = null };
            string expectedSrc = resizeSrc(probe).TransformedValue;

            string page = "<html><body>" + filler + "<p><img src=\"" + originalSrc + "\" /></p>" + filler + "</body></html>";

            var transcoder = new NReadabilityTranscoder();
            transcoder.ImageSourceTranformer = resizeSrc;

            var input = new TranscodingInput(page);
            input.Url = "http://immortal.pl/";

            // act
            TranscodingResult result = transcoder.Transcode(input);

            // assert
            Assert.IsTrue(result.ContentExtracted);
            Assert.IsTrue(result.ExtractedContent.Contains("src=\"" + expectedSrc + "\""));
            Assert.IsTrue(result.ExtractedContent.Contains("origsrc=\"" + originalSrc + "\""));
        }
        /// <summary>
        /// Expects the transcoder to report that no content was extracted from a document
        /// whose body is empty.
        /// </summary>
        public void TestEmptyArticle()
        {
            // arrange
            var input = new TranscodingInput("<html><body></body></html>");
            input.Url = "http://wikipedia.org/wiki/baseArticle";

            // act
            var result = _nReadabilityTranscoder.Transcode(input);

            // assert
            Assert.IsFalse(result.ContentExtracted);
        }
        /// <summary>
        /// Checks that a custom anchor href transformer assigned to the transcoder is applied:
        /// the extracted content must contain the transformed href and must keep the original
        /// value in the attribute named by the transformer ("orighref").
        /// </summary>
        public void TestAnchorHrefTransformer()
        {
            // arrange
            const string paragraph = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";
            const string filler = paragraph + paragraph + paragraph + paragraph + paragraph;
            const string articleHref = "http://example.com/some_article.html";

            // Rewrites every href to go through a redirector and records where the original value is kept.
            Func<AttributeTransformationInput, AttributeTransformationResult> redirectHref =
                hrefInput =>
                {
                    return new AttributeTransformationResult
                    {
                        TransformedValue = string.Format("http://redirector.com/u={0}", hrefInput.AttributeValue),
                        OriginalValueAttributeName = "orighref",
                    };
                };

            // Run the transformer once by hand to learn the value the output is expected to carry.
            var probe = new AttributeTransformationInput { AttributeValue = articleHref, Element = null };
            string redirectedHref = redirectHref(probe).TransformedValue;

            string page = "<html><body>" + filler + "<p><a href=\"" + articleHref + "\">Some article</a></p>" + filler + "</body></html>";

            var transcoder = new NReadabilityTranscoder { AnchorHrefTranformer = redirectHref };
            var input = new TranscodingInput(page) { Url = "http://immortal.pl/" };

            // act
            TranscodingResult outcome = transcoder.Transcode(input);

            // assert
            Assert.IsTrue(outcome.ContentExtracted);

            string expectedHrefAttribute = "href=\"" + redirectedHref + "\"";
            Assert.IsTrue(outcome.ExtractedContent.Contains(expectedHrefAttribute));

            string expectedOriginalAttribute = "orighref=\"" + articleHref + "\"";
            Assert.IsTrue(outcome.ExtractedContent.Contains(expectedOriginalAttribute));
        }
        /// <summary>
        /// The extracted content produced for a plain-text input must contain a
        /// meta element named "Generator".
        /// </summary>
        public void Output_contains_meta_generator_element()
        {
            // arrange
            const string generatorMarker = "meta name=\"Generator\"";
            TranscodingInput plainTextInput = new TranscodingInput("test");

            // act
            TranscodingResult outcome = _nReadabilityTranscoder.Transcode(plainTextInput);

            // assert
            bool hasGeneratorMeta = outcome.ExtractedContent.Contains(generatorMarker);
            Assert.IsTrue(hasGeneratorMeta);
        }
        /// <summary>
        /// A viewport meta element present in the head of the input must not appear
        /// verbatim in the extracted content.
        /// </summary>
        public void MetaViewportElementShouldBeRemoved()
        {
            // arrange
            const string viewportMeta = "<meta name=\"viewport\" content=\"width=1000\" />";
            const string paragraph = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";
            const string page =
                "<html><head>" + viewportMeta + "</head><body>"
                + paragraph + paragraph + paragraph + paragraph + paragraph
                + "</body></html>";

            var input = new TranscodingInput(page) { Url = "http://wikipedia.org/wiki/baseArticle" };

            // act
            TranscodingResult outcome = _nReadabilityTranscoder.Transcode(input);

            // assert
            Assert.IsTrue(outcome.ContentExtracted);

            bool viewportStillPresent = outcome.ExtractedContent.Contains(viewportMeta);
            Assert.IsFalse(viewportStillPresent);
        }
        /// <summary>
        /// Shared assertion helper: transcodes a page containing a single image and checks
        /// that the image's src attribute in the extracted content equals the expected URL.
        /// </summary>
        /// <param name="srcAttribute">Value placed in the src attribute of the image in the input HTML.</param>
        /// <param name="url">Value assigned to the transcoding input's Url.</param>
        /// <param name="expectedImageUrl">The src value the extracted content is expected to contain.</param>
        private void TestReplacingImageUrl(string srcAttribute, string url, string expectedImageUrl)
        {
            // arrange
            const string paragraph = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";
            const string filler = paragraph + paragraph + paragraph + paragraph + paragraph;

            string imageParagraph = "<p><img src=\"" + srcAttribute + "\" /></p>";
            string page = "<html><body>" + filler + imageParagraph + filler + "</body></html>";

            var input = new TranscodingInput(page) { Url = url };

            // act
            TranscodingResult outcome = _nReadabilityTranscoder.Transcode(input);

            // assert
            Assert.IsTrue(outcome.ContentExtracted);

            string expectedSrcAttribute = "src=\"" + expectedImageUrl + "\"";
            string failureMessage = string.Format(
                "Image url replacement failed. Src attribute: {0}, base url: {1}, expected image url: {2}",
                srcAttribute,
                url,
                expectedImageUrl);

            Assert.IsTrue(outcome.ExtractedContent.Contains(expectedSrcAttribute), failureMessage);
        }
        /// <summary>
        /// The extracted content for an ordinary article must include the
        /// HandheldFriendly meta element with content "true".
        /// </summary>
        public void TestMobileHeaders()
        {
            // arrange
            const string paragraph = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";
            const string handheldMeta = "<meta name=\"HandheldFriendly\" content=\"true\" />";

            string page = "<html><body>" + paragraph + paragraph + paragraph + paragraph + paragraph + "</body></html>";
            var input = new TranscodingInput(page) { Url = "http://wikipedia.org/wiki/baseArticle" };

            // act
            TranscodingResult outcome = _nReadabilityTranscoder.Transcode(input);

            // assert
            Assert.IsTrue(outcome.ContentExtracted);

            bool hasHandheldMeta = outcome.ExtractedContent.Contains(handheldMeta);
            Assert.IsTrue(hasHandheldMeta);
        }
Пример #17
0
        /// <summary>
        /// Fetches the page at <paramref name="url"/>, decodes it as UTF-8 or GBK, runs it through
        /// the readability transcoder, and stores the raw HTML plus the extracted text in
        /// [dbo].[BaiduNews]; it then increments the matching [dbo].[Monitor] news counter.
        /// </summary>
        /// <remarks>
        /// The method returns without touching the database when the response status is not
        /// 200 OK, when the decoded HTML is empty, or when the extracted document has no node
        /// matching <c>//div/div/div/div</c>. Exceptions thrown during extraction or the database
        /// updates are written to <paramref name="context"/> and not rethrown.
        /// </remarks>
        /// <param name="url">Address of the page to fetch; also used as the key of the row to update.</param>
        /// <param name="context">Context whose <c>WriteLine</c> receives progress and error output.</param>
        public async Task Dowload(string url, PerformContext context)
        {
            // NOTE(review): a new HttpClient per call can exhaust sockets under load; consider a shared instance.
            using (var client = new HttpClient())
            {
                client.DefaultRequestHeaders.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36");

                byte[] bytes;
                using (var response = await client.GetAsync(url))
                {
                    if (response.StatusCode != HttpStatusCode.OK)
                    {
                        return;
                    }

                    // Let HttpContent hand back the complete body. A single Stream.ReadAsync into a
                    // buffer sized by Stream.Length only works while the content happens to be a
                    // fully buffered, seekable stream, and may otherwise return a partial read.
                    bytes = await response.Content.ReadAsByteArrayAsync();
                }

                // Raw bytes are sniffed because the charset is not taken from the response headers:
                // UTF-8 when the bytes look like UTF-8, otherwise GBK.
                var isUTF8 = IsTextUTF8(ref bytes);

                // GBK is only resolvable after the code-pages provider has been registered.
                Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
                Encoding encoding = isUTF8 ? Encoding.UTF8 : Encoding.GetEncoding("GBK");

                var html = encoding.GetString(bytes);
                if (string.IsNullOrEmpty(html))
                {
                    return;
                }

                var transcoder = new NReadabilityTranscoder();
                var input      = new TranscodingInput(html);
                try
                {
                    // NOTE(review): the built document is not used; the call is kept because it may
                    // throw on input it cannot parse, which skips the update below — confirm whether
                    // that is intended before removing it.
                    new SgmlDomBuilder().BuildDocument(html);

                    var result = transcoder.Transcode(input);

                    var document = new HtmlDocument {
                        OptionAutoCloseOnEnd = true
                    };
                    document.LoadHtml(result.ExtractedContent);

                    // SelectSingleNode returns null when nothing matches; report that explicitly
                    // instead of surfacing it as a NullReferenceException.
                    var node = document.DocumentNode.SelectSingleNode("//div/div/div/div");
                    if (node == null)
                    {
                        context.WriteLine("No node matched //div/div/div/div in the extracted content; skipping database update.");
                        return;
                    }

                    var text = node.InnerText.Trim('\r', '\n', ' ', '\t');
                    context.WriteLine("抽取内容为:");
                    context.WriteLine(text);

                    const string cmdText = @"UPDATE [dbo].[BaiduNews] SET [Html]=@Html,[Text]=@Text WHERE [Url]=@Url";

                    await _connection.ExecuteAsync(cmdText, new { Html = html, Text = text, Url = url });

                    await _connection.ExecuteAsync(
                        @"UPDATE a SET a.[NewsCount]=a.[NewsCount]+1 FROM [dbo].[Monitor] a JOIN [dbo].[BaiduNews] b ON a.[Tag]=b.[Keyword] WHERE b.[Url]=@Url",
                        new { Url = url });
                }
                catch (Exception e)
                {
                    // Best effort: a failure on one page is reported to the context and not propagated.
                    context.WriteLine(e);
                }
            }
        }