/// <summary>
/// Checks that a transformer assigned to <c>ImageSourceTranformer</c> rewrites the <c>src</c>
/// attribute of an image and that the original value is kept in the <c>origsrc</c> attribute.
/// </summary>
public void TestImageSourceTransformer()
        {
            const string imageUrl  = "http://example.com/some_image.jpg";
            const string paragraph = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";
            const string filler    = paragraph + paragraph + paragraph + paragraph + paragraph;

            // Transformer under test: routes every image through a (fictional) resizer URL.
            Func<AttributeTransformationInput, AttributeTransformationResult> resizeImageSource =
                attribute =>
                {
                    var transformation = new AttributeTransformationResult();
                    transformation.TransformedValue           = string.Format("http://imageresizer.com/u={0}", attribute.AttributeValue);
                    transformation.OriginalValueAttributeName = "origsrc";
                    return transformation;
                };

            // Run the transformer once by hand to learn the value the transcoder is expected to emit.
            var probe = new AttributeTransformationInput();
            probe.AttributeValue = imageUrl;
            probe.Element        = null;
            string rewrittenUrl = resizeImageSource(probe).TransformedValue;

            // The image is surrounded by filler paragraphs on both sides.
            string page = "<html><body>" + filler + "<p><img src=\"" + imageUrl + "\" /></p>" + filler + "</body></html>";

            var transcoder = new NReadabilityTranscoder();
            transcoder.ImageSourceTranformer = resizeImageSource;

            bool   extracted;
            string output = transcoder.Transcode(page, "http://immortal.pl/", out extracted);

            string expectedSrcAttribute     = "src=\"" + rewrittenUrl + "\"";
            string expectedOrigSrcAttribute = "origsrc=\"" + imageUrl + "\"";

            Assert.IsTrue(extracted);
            Assert.IsTrue(output.Contains(expectedSrcAttribute));
            Assert.IsTrue(output.Contains(expectedOrigSrcAttribute));
        }
Beispiel #2
0
        /// <summary>
        /// Downloads <paramref name="url"/> into <paramref name="targetDir"/> when
        /// <c>IsFileNeeded</c> says the file is required. Targets with an <c>.html</c> extension
        /// are passed through NReadabilityTranscoder and only the extracted content is written;
        /// everything else is downloaded as-is.
        /// </summary>
        /// <param name="url">Address of the resource to download.</param>
        /// <param name="targetDir">Directory the file is written to.</param>
        /// <param name="targetFname">Desired file name; colons are removed via <c>RemoveColon</c>.</param>
        public void Download(string url, string targetDir, string targetFname)
        {
            string sanitizedName = targetFname.RemoveColon();

            // ensure it respects mppl (Max_path_part_len)
            string destination = Utilities.TrimPathPart(
                Path.Combine(targetDir, sanitizedName),
                _futureleanCourse.Max_path_part_len);

            // The length is taken from whatever response headers the shared client currently holds.
            WebHeaderCollection headers = _futureleanCourse._client.ResponseHeaders;
            int expectedLength = GetContentLength(headers);

            if (!IsFileNeeded(destination, expectedLength, sanitizedName))
            {
                return;
            }

            if (Path.GetExtension(destination) != ".html")
            {
                // Non-HTML resources are stored unmodified.
                _futureleanCourse._client.DownloadFile(url, destination);
                return;
            }

            // HTML pages: keep only the readable content.
            string rawHtml = _futureleanCourse._client.DownloadString(url);
            var transcoder = new NReadabilityTranscoder();
            TranscodingResult readable = transcoder.Transcode(new TranscodingInput(rawHtml));
            File.WriteAllText(destination, readable.ExtractedContent);
        }
Beispiel #3
0
 /// <summary>
 ///  Creates a NReadabilityWebTranscoder from a caller-supplied NReadabilityTranscoder
 ///  and IUrlFetcher. A fresh SgmlDomSerializer is created and the default page
 ///  separator builder is used. This overload is mostly used for testing.
 /// </summary>
 /// <param name="transcoder">The NReadabilityTranscoder to use.</param>
 /// <param name="urlFetcher">The IUrlFetcher instance used to download content.</param>
 public NReadabilityWebTranscoder(NReadabilityTranscoder transcoder, IUrlFetcher urlFetcher)
 {
     // Collaborators created or defaulted by this overload.
     this._sgmlDomSerializer    = new SgmlDomSerializer();
     this._pageSeparatorBuilder = _DefaultPageSeparatorBuilder;

     // Collaborators supplied by the caller.
     this._transcoder = transcoder;
     this._urlFetcher = urlFetcher;
 }
 /// <summary>
 ///  Creates a NReadabilityWebTranscoder from a caller-supplied NReadabilityTranscoder,
 ///  IUrlFetcher and page separator builder. A fresh SgmlDomSerializer is created.
 /// </summary>
 /// <param name="transcoder">The NReadabilityTranscoder to use.</param>
 /// <param name="urlFetcher">The IUrlFetcher instance used to download content.</param>
 /// <param name="pageSeparatorBuilder">Function that builds the HTML fragment for a page separator; it receives the page number.</param>
 public NReadabilityWebTranscoder(NReadabilityTranscoder transcoder, IUrlFetcher urlFetcher, Func<int, string> pageSeparatorBuilder)
 {
     // Collaborator created by this constructor.
     this._sgmlDomSerializer = new SgmlDomSerializer();

     // Collaborators supplied by the caller.
     this._transcoder           = transcoder;
     this._urlFetcher           = urlFetcher;
     this._pageSeparatorBuilder = pageSeparatorBuilder;
 }
        /// <summary>
        /// Checks that a transformer assigned to <c>AnchorHrefTranformer</c> rewrites the <c>href</c>
        /// attribute of a link and that the original value is kept in the <c>orighref</c> attribute.
        /// </summary>
        public void TestAnchorHrefTransformer()
        {
            const string articleUrl = "http://example.com/some_article.html";
            const string paragraph  = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";
            const string filler     = paragraph + paragraph + paragraph + paragraph + paragraph;

            // Transformer under test: routes every link through a (fictional) redirector URL.
            Func<AttributeTransformationInput, AttributeTransformationResult> redirectAnchor =
                attribute =>
                {
                    var transformation = new AttributeTransformationResult();
                    transformation.TransformedValue           = string.Format("http://redirector.com/u={0}", attribute.AttributeValue);
                    transformation.OriginalValueAttributeName = "orighref";
                    return transformation;
                };

            // Run the transformer once by hand to learn the value the transcoder is expected to emit.
            var probe = new AttributeTransformationInput();
            probe.AttributeValue = articleUrl;
            probe.Element        = null;
            string rewrittenUrl = redirectAnchor(probe).TransformedValue;

            // The link is surrounded by filler paragraphs on both sides.
            string page = "<html><body>" + filler + "<p><a href=\"" + articleUrl + "\">Some article</a></p>" + filler + "</body></html>";

            var transcoder = new NReadabilityTranscoder();
            transcoder.AnchorHrefTranformer = redirectAnchor;

            bool   extracted;
            string output = transcoder.Transcode(page, "http://immortal.pl/", out extracted);

            string expectedHrefAttribute     = "href=\"" + rewrittenUrl + "\"";
            string expectedOrigHrefAttribute = "orighref=\"" + articleUrl + "\"";

            Assert.IsTrue(extracted);
            Assert.IsTrue(output.Contains(expectedHrefAttribute));
            Assert.IsTrue(output.Contains(expectedOrigHrefAttribute));
        }
Beispiel #6
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/>, extracts its content with
        /// NReadabilityTranscoder and returns the inner text of the resulting <c>body</c> element.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>The inner text of the <c>//body</c> node of the extracted HTML.</returns>
        private static String GetWebpageContents(String url)
        {
            var transcoder = new NReadabilityTranscoder();

            string downloadedHtml;
            using (var client = new WebClient())
            {
                downloadedHtml = client.DownloadString(url);
            }

            TranscodingResult transcoded = transcoder.Transcode(new TranscodingInput(downloadedHtml));

            // Re-parse the extracted markup so the tags can be dropped.
            var parsed = new HtmlDocument();
            parsed.LoadHtml(transcoded.ExtractedContent);

            var bodyNode = parsed.DocumentNode.SelectSingleNode("//body");
            return bodyNode.InnerText;
        }
Beispiel #7
0
        /// <summary>
        /// Initializes a new instance of the <see cref="Reader"/> class: sets up the transcoder,
        /// the custom encoder and an <see cref="HttpClient"/> configured from <paramref name="options"/>.
        /// </summary>
        /// <param name="options">The HTTP options; <c>HttpOptions.CreateDefault()</c> is used when null.</param>
        public Reader(HttpOptions options)
        {
            // transcoder with fixed reading settings
            _transcoder = new NReadabilityTranscoder(
                dontStripUnlikelys: false,
                dontNormalizeSpacesInTextContent: true,
                dontWeightClasses: false,
                readingStyle: ReadingStyle.Ebook,
                readingMargin: ReadingMargin.Narrow,
                readingSize: ReadingSize.Medium);

            // fall back to the default HTTP options when the caller supplied none
            options  = options ?? HttpOptions.CreateDefault();
            _options = options;

            // custom encoder
            _encoder = new Encodings.Encoder(true);

            // HTTP client: a caller-provided handler wins, otherwise one with
            // decompression and auto-redirect enabled is created
            var messageHandler = options.CustomHttpHandler ?? new HttpClientHandler()
            {
                AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip,
                AllowAutoRedirect      = true
            };
            _httpClient = new HttpClient(messageHandler);

            // the timeout option is interpreted as seconds
            var timeoutSeconds = options.RequestTimeout;
            if (timeoutSeconds.HasValue)
            {
                _httpClient.Timeout = TimeSpan.FromSeconds(timeoutSeconds.Value);
            }

            var defaultHeaders = _httpClient.DefaultRequestHeaders;

            // accepted content types and encodings
            defaultHeaders.TryAddWithoutValidation("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            defaultHeaders.TryAddWithoutValidation("Accept-Encoding", "gzip,deflate");

            // user agent template: mobile or desktop
            string userAgentTemplate;
            if (options.UseMobileUserAgent)
            {
                userAgentTemplate = options.UserAgentMobile;
            }
            else
            {
                userAgentTemplate = options.UserAgent;
            }

            // the assembly version is the value of the second comma-separated part of the
            // assembly's full name ("Version=x.y.z.w")
            string assemblyFullName = typeof(Reader).GetTypeInfo().Assembly.FullName;
            string versionPart      = assemblyFullName.Split(',')[1];
            string assemblyVersion  = versionPart.Split('=')[1];

            defaultHeaders.TryAddWithoutValidation("User-Agent", String.Format(userAgentTemplate, "; ReadSharp/" + assemblyVersion));
        }
Beispiel #8
0
    /// <summary>
    /// Entry point: reads the HTML file named by the first argument, transcodes it and
    /// writes the extracted content to the file named by the second argument.
    /// Shows the usage text and exits with code 1 unless exactly two arguments are given.
    /// </summary>
    /// <param name="args">Input file path followed by output file path.</param>
    private static void Main(string[] args)
    {
      bool hasBothPaths = args != null && args.Length == 2;
      if (!hasBothPaths)
      {
        DisplayUsage();
        Environment.Exit(1);
      }

      string sourcePath = args[0];
      string destinationPath = args[1];

      var transcoder = new NReadabilityTranscoder();

      string sourceHtml = File.ReadAllText(sourcePath);
      TranscodingResult transcoded = transcoder.Transcode(new TranscodingInput(sourceHtml));

      File.WriteAllText(destinationPath, transcoded.ExtractedContent);
    }
Beispiel #9
0
        /// <summary>
        /// Initializes a new instance of the <see cref="Reader" /> class: creates the transcoder
        /// via <c>CreateTranscoder</c>, the custom encoder and an <see cref="HttpClient"/>
        /// configured from <paramref name="options"/>.
        /// </summary>
        /// <param name="options">The HTTP options; <c>HttpOptions.CreateDefault()</c> is used when null.</param>
        /// <param name="transcoderOptions">The transcoder options; a new <c>TranscoderOptions</c> is used when null.</param>
        public Reader(HttpOptions options = null, TranscoderOptions transcoderOptions = null)
        {
            // fall back to defaults for anything the caller left out
            options           = options ?? HttpOptions.CreateDefault();
            transcoderOptions = transcoderOptions ?? new TranscoderOptions();

            _options    = options;
            _transcoder = CreateTranscoder(transcoderOptions);

            // custom encoder
            _encoder = new Encodings.Encoder(true);

            // HTTP client: a caller-provided handler wins, otherwise one with
            // decompression and auto-redirect enabled is created
            var messageHandler = options.CustomHttpHandler ?? new HttpClientHandler()
            {
                AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip,
                AllowAutoRedirect      = true
            };
            _httpClient = new HttpClient(messageHandler);

            // the timeout option is interpreted as seconds
            var timeoutSeconds = options.RequestTimeout;
            if (timeoutSeconds.HasValue)
            {
                _httpClient.Timeout = TimeSpan.FromSeconds(timeoutSeconds.Value);
            }

            var defaultHeaders = _httpClient.DefaultRequestHeaders;

            // accepted content types and encodings
            defaultHeaders.TryAddWithoutValidation("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            defaultHeaders.TryAddWithoutValidation("Accept-Encoding", "gzip,deflate");

            // user agent template: mobile or desktop
            string userAgentTemplate;
            if (options.UseMobileUserAgent)
            {
                userAgentTemplate = options.UserAgentMobile;
            }
            else
            {
                userAgentTemplate = options.UserAgent;
            }

            // the assembly version is the value of the second comma-separated part of the
            // assembly's full name ("Version=x.y.z.w")
            string assemblyFullName = typeof(Reader).GetTypeInfo().Assembly.FullName;
            string versionPart      = assemblyFullName.Split(',')[1];
            string assemblyVersion  = versionPart.Split('=')[1];

            defaultHeaders.TryAddWithoutValidation("User-Agent", String.Format(userAgentTemplate, "; ReadSharp/" + assemblyVersion));
        }
        /// <summary>
        /// Entry point: reads the HTML file named by the first argument, transcodes it and
        /// writes the result to the file named by the second argument.
        /// Shows the usage text and exits with code 1 unless exactly two arguments are given.
        /// </summary>
        /// <param name="args">Input file path followed by output file path.</param>
        private static void Main(string[] args)
        {
            bool hasBothPaths = args != null && args.Length == 2;
            if (!hasBothPaths)
            {
                DisplayUsage();
                Environment.Exit(1);
            }

            string sourcePath      = args[0];
            string destinationPath = args[1];

            var transcoder = new NReadabilityTranscoder();

            // The extraction flag is required by the API but not used here.
            bool   contentFound;
            string sourceHtml = File.ReadAllText(sourcePath);
            string transcoded = transcoder.Transcode(sourceHtml, out contentFound);

            File.WriteAllText(destinationPath, transcoded);
        }
Beispiel #11
0
        /// <summary>
        /// Downloads the page at <paramref name="q"/>, extracts its readable content, injects a
        /// <c>&lt;base&gt;</c> element so relative links keep working, optionally mails the result
        /// and returns the HTML.
        /// </summary>
        /// <param name="q">URL of the page to fetch. A null or empty value produces a NotFound result.</param>
        /// <param name="e">Optional recipient; when non-empty the content is handed to <c>SendMailAsync</c>.</param>
        /// <param name="f">When equal to <c>"y"</c> the downloaded HTML is returned as-is instead of the extracted content.</param>
        /// <returns>
        /// An Ok result carrying the HTML, a NotFound result when no URL was supplied, or a
        /// BadRequest result carrying the exception message when anything fails.
        /// </returns>
        public async Task <IActionResult> Get([FromQuery] string q, [FromQuery] string e, [FromQuery] string f)
        {
            if (string.IsNullOrEmpty(q))
            {
                return NotFound();
            }

            try
            {
                // NOTE(review): q comes straight from the query string and is fetched server-side;
                // consider restricting the allowed schemes/hosts.
                string content;
                using (var wc = new WebClient())
                {
                    wc.Encoding = Encoding.UTF8;
                    // Await the download instead of blocking a thread inside an async action.
                    content = await wc.DownloadStringTaskAsync(q);
                }

                var transcoder        = new NReadabilityTranscoder();
                var transcodedContent = transcoder.Transcode(new TranscodingInput(content));

                // Anything other than f == "y" (including null/empty) selects the extracted content.
                if (f != "y")
                {
                    content = transcodedContent.ExtractedContent;
                }

                // Fix relative path error: add <base href> right after the opening <head ...> tag.
                // IndexOf returns -1 when nothing is found and 0 is a valid match position.
                var posHead = content.IndexOf("<head", StringComparison.Ordinal);
                if (posHead >= 0)
                {
                    var closeHead = content.IndexOf('>', posHead);
                    if (closeHead >= 0)
                    {
                        // The URL is caller-controlled, so encode it before placing it in an attribute.
                        var baseTag = string.Format("<base href='{0}' />", WebUtility.HtmlEncode(q));
                        content = content.Insert(closeHead + 1, baseTag);
                    }
                }

                if (!string.IsNullOrEmpty(e))
                {
                    await SendMailAsync(e, transcodedContent.ExtractedTitle, content, q);
                }

                return Ok(content);
            }
            catch (Exception ex)
            {
                return BadRequest(ex.Message);
            }
        }
        /// <summary>
        /// Checks that the web transcoder reports and returns the page title, including
        /// non-ASCII characters, when the fetched document has a <c>title</c> element.
        /// </summary>
        public void Transcode_returns_title_if_it_can_be_extracted()
        {
            // arrange
            // NOTE(review): the non-ASCII tail looks like mis-decoded text; kept exactly as found.
            const string expectedTitle = "Some title ¹ê³ó¿Ÿñæ";
            const string htmlContent   = "<html><head><title>" + expectedTitle + "</title></head><body></body></html>";

            // The stub serves the document above regardless of the requested URL value used below.
            var webTranscoder = new NReadabilityWebTranscoder(
                new NReadabilityTranscoder(),
                new SimpleUrlFetcherStub(htmlContent));

            var input = new WebTranscodingInput("http://dummy.com/");

            // act
            WebTranscodingResult result = webTranscoder.Transcode(input);

            // assert
            Assert.IsTrue(result.TitleExtracted);
            Assert.AreEqual(expectedTitle, result.ExtractedTitle);
        }
Beispiel #13
0
        /// <summary>
        /// Runs <paramref name="content"/> through NReadabilityTranscoder and packs the title,
        /// the <c>og:image</c> URL and the main content HTML into a <c>CleanText</c>.
        /// </summary>
        /// <param name="url">URL the content came from; copied into the result.</param>
        /// <param name="content">Raw HTML to transcode.</param>
        /// <returns>
        /// The extracted data, or a <c>CleanText</c> whose title is <c>"#FAIL#"</c> when no content
        /// could be extracted. If an exception is thrown, its message is stored in <c>Image</c>.
        /// </returns>
        private static CleanText getCleanText(string url, string content)
        {
            var transcoder = new NReadabilityTranscoder();
            try
            {
                TranscodingResult textRes = transcoder.Transcode(new TranscodingInput(content));

                if (!textRes.ContentExtracted)
                {
                    return new CleanText { Title = "#FAIL#", Image = "", Content = "", Url = url, FetchDate = DateTime.Now };
                }

                var title = "";
                if (textRes.TitleExtracted)
                {
                    title = textRes.ExtractedTitle;
                }
                else
                {
                    // FirstOrDefault (not First): a document without <title> must not throw,
                    // otherwise the null check below can never be reached.
                    var titleNode = transcoder.FoundDocument.GetElementsByTagName("title").FirstOrDefault();
                    if (titleNode != null)
                    {
                        title = titleNode.Value;
                    }
                }

                // Same reasoning: a missing og:image meta element simply leaves the image URL empty
                // instead of turning an otherwise successful extraction into "#FAIL#".
                var imgUrl  = "";
                var imgNode = transcoder.FoundDocument
                    .GetElementsByTagName("meta")
                    .FirstOrDefault(e => e.GetAttributeValue("property", "") == "og:image");
                if (imgNode != null)
                {
                    imgUrl = imgNode.GetAttributeValue("content", "");
                }

                var mainText = "";
                if (transcoder.FoundContentElement != null)
                {
                    mainText = transcoder.FoundContentElement.GetInnerHtml();
                }

                return new CleanText { Title = title, Image = imgUrl, Content = mainText, Url = url, FetchDate = DateTime.Now };
            }
            catch (Exception ex)
            {
                // Failures are reported in-band; the message travels in the Image field.
                return new CleanText { Title = "#FAIL#", Image = ex.Message, Content = "", Url = url, FetchDate = DateTime.Now };
            }
        }
 /// <summary>
 ///  Creates a NReadabilityWebTranscoder around a caller-supplied NReadabilityTranscoder,
 ///  delegating to the overload that also takes a fetcher and passing it a new UrlFetcher.
 /// </summary>
 /// <param name="transcoder">The NReadabilityTranscoder to use.</param>
 public NReadabilityWebTranscoder(NReadabilityTranscoder transcoder)
     : this(transcoder, new UrlFetcher())
 {
     // All initialization happens in the chained constructor.
 }
Beispiel #15
0
        /// <summary>
        /// Collects the news entries found under <c>div.result</c> elements of
        /// <paramref name="page"/>, queues each entry's URL as a target request and stores the
        /// list as the "News" result item. When no entries are found, the page content is run
        /// through NReadabilityTranscoder and stored as an "UpdateNews" result item.
        /// </summary>
        /// <param name="page">The page being processed.</param>
        protected override void Handle(Page page)
        {
            // Presumably separates the author name from the timestamp in the "c-author" text.
            const string timeSeparator = "&nbsp;&nbsp;";

            var elements = page.Selectable.SelectList(Selectors.XPath("//div[@class='result']")).Nodes();
            var results  = new List<BaiduNews>();

            // Comma-joined values of the request extras.
            var keyword = page.Request.Extras.Aggregate("", (current, kv) => string.IsNullOrEmpty(current) ? kv.Value : $"{current},{kv.Value}");

            foreach (var element in elements)
            {
                var title  = element.Select(Selectors.XPath("h3[@class='c-title']/a")).GetValue().Replace("<em>", "").Replace("</em>", "");
                var url    = element.Select(Selectors.XPath("h3[@class='c-title']/a/@href")).GetValue();
                var author = element.Select(Selectors.XPath(".//div/p[@class='c-author']/text()")).GetValue();
                var time   = string.Empty;
                try
                {
                    // IndexOf returns -1 when the separator is absent; adding the separator length
                    // unconditionally would slice the author text at an arbitrary offset (or throw).
                    var separatorIndex = author.IndexOf(timeSeparator, StringComparison.Ordinal);
                    if (separatorIndex >= 0)
                    {
                        time = author.Substring(separatorIndex + timeSeparator.Length);
                    }
                }
                catch (Exception e)
                {
                    Console.WriteLine(e);
                    throw;
                }

                var news = new BaiduNews
                {
                    Keyword = keyword,
                    Title   = title,
                    Time    = time,
                    Url     = url
                };
                page.AddTargetRequest(url, increaseDeep: false);

                results.Add(news);
            }
            page.AddResultItem("News", results);

            if (!results.Any())
            {
                var transcoder = new NReadabilityTranscoder();

                // DomSerializationParams is intentionally left at its defaults.
                var input = new TranscodingInput(page.Content);
                var text  = "";
                try
                {
                    var result   = transcoder.Transcode(input);
                    var document = new HtmlDocument {
                        OptionAutoCloseOnEnd = true
                    };
                    document.LoadHtml(result.ExtractedContent);
                    var node = document.DocumentNode.SelectSingleNode("//div/div/div/div");
                    text = node.InnerText.Trim('\r', '\n', ' ');
                }
                catch (Exception e)
                {
                    // Best effort: a failed extraction is logged and the text stays empty.
                    Console.WriteLine(e);
                }

                page.AddResultItem("UpdateNews", new UpdateNews
                {
                    Html = page.Content,
                    Text = text,
                    Url  = page.Url
                });
            }
        }
 /// <summary>
 ///  Creates a NReadabilityWebTranscoder from a caller-supplied NReadabilityTranscoder
 ///  and IUrlFetcher, delegating to the three-argument overload with the default
 ///  page separator builder.
 /// </summary>
 /// <param name="transcoder">The NReadabilityTranscoder to use.</param>
 /// <param name="urlFetcher">The IUrlFetcher instance used to download content.</param>
 public NReadabilityWebTranscoder(NReadabilityTranscoder transcoder, IUrlFetcher urlFetcher)
     : this(transcoder, urlFetcher, _DefaultPageSeparatorBuilder)
 {
     // All initialization happens in the chained constructor.
 }
Beispiel #17
0
        /// <summary>
        /// Transcodes one of the numbered sample inputs through NReadabilityWebTranscoder using a
        /// file-based fetcher stub, writes the output to the "SampleWebOutput" directory and checks
        /// that expected text fragments are present (or absent) in the result.
        /// </summary>
        /// <param name="sampleInputNumber">One-based number of the sample input to run.</param>
        public void TestSampleInputs([Values(1, 2, 3, 4, 5, 6, 7, 8)] int sampleInputNumber)
        {
            const string outputDir = "SampleWebOutput";

            string paddedNumber = sampleInputNumber.ToString().PadLeft(2, '0');

            string[] sampleUrls = _Urls[sampleInputNumber - 1];
            string   firstUrl   = sampleUrls[0];

            var urlFetcher    = new FileBasedUrlFetcherStub(sampleInputNumber, sampleUrls);
            var transcoder    = new NReadabilityTranscoder();
            var webTranscoder = new NReadabilityWebTranscoder(transcoder, urlFetcher);

            bool   extracted;
            string output = webTranscoder.Transcode(firstUrl, out extracted);

            // Keep a copy of the output on disk.
            if (!Directory.Exists(outputDir))
            {
                Directory.CreateDirectory(outputDir);
            }

            string outputPath = Path.Combine(outputDir, string.Format("SampleOutput_{0}.html", paddedNumber));
            File.WriteAllText(outputPath, output, Encoding.UTF8);

            // Small assertion helpers over the transcoded output.
            Action<string>      expectPresent    = fragment => Assert.IsTrue(output.Contains(fragment));
            Action<string>      expectAbsent     = fragment => Assert.IsFalse(output.Contains(fragment));
            Action<string, int> expectMatchCount = (pattern, count) => Assert.That(Regex.Matches(output, pattern).Count, Is.EqualTo(count));

            switch (sampleInputNumber)
            {
            case 1:
                expectPresent(" freedom of movement or expression would constitute a new and unacceptable denial");
                expectPresent("Those expectations were on display in the crowd outside her house on Saturday.");
                expectMatchCount("Myanmar Junta Frees Dissident Daw Aung San Suu Kyi", 4);
                break;

            case 2:
                expectPresent("For Louie and Phil, the conversations did more than keep their minds sharp.");
                expectPresent("It was absolutely dark and absolutely silent, save for the chattering of Phil’s teeth.");
                expectPresent("A serial runaway and artful dodger");
                expectMatchCount(@"Adrift but Unbroken \| Politics", 2);
                break;

            case 3:
                expectPresent("The Chinese system as a whole has great weaknesses as well as great strengths.");
                expectPresent(" This emphasis on limits is what begins pointing us back to coal.");
                expectPresent(". For example, the possibility of dramatic rises in ocean levels, which could affect the habitability");
                expectMatchCount("Dirty Coal, Clean Future - Magazine", 3); // Makes sure the title isn't duplicated
                break;

            case 4: // Test duplicate content on subsequent page
                expectMatchCount("his may seem paradoxical, or backward", 1);
                break;

            case 5:
                // page 1
                expectPresent("The pressure's on, and as you glance back and forth between your rack and the board, you can hardly believe your eyes at the play you can make.");
                expectPresent("How can you take your game to the next level? Let's start by looking at game play.");
                // page 2
                expectPresent("The object of Scrabble is to get the most points by creating words.");
                expectPresent("Now that you know the parts of the game, let's take a look at how to play it.");
                // page 3
                expectPresent("To determine who goes first, put all the tiles into the bag and mix them up.");
                expectPresent("The game continues until one player uses all of his tiles and there aren't any in the pouch, or if there are no more tiles and no one can make a word. Add up the total of your unplayed tiles and deduct it from your score. If you've used all of your tiles, add the total of the unplayed tiles to your score. The winner has the most points.");
                // page 4
                expectPresent("If you play often enough, you'll need to learn how to play the board in order to get the highest score");
                expectPresent("With the game's popularity, it now comes in many variations. Let's take a look at some different ways to play Scrabble.");
                // page 5
                expectPresent("Many people play Scrabble on a traditional flat board with the grid imprinted on it.");
                expectPresent("With its worldwide popularity, it only makes sense that Scrabble comes in languages other than English. ");
                break;

            case 6:
                // page 1
                expectPresent("In the aftermath of his resignation and then his death");
                expectPresent("Curb Your Enthusiasm");
                // page 2
                expectPresent("Jobs also seemed to suspect that he");
                expectPresent("And, sadly, it may remain one forever.");
                break;

            case 7:
                // page 1
                expectPresent("post also betrays some misconceptions regarding our report.");
                expectPresent("After all, none of us can resist the occasional study");
                // "page" 2 (false positive)
                expectAbsent("In expressing this view, Clinton joins many Americans who worry about online misinformation, loss of privacy, and identity theft.");
                break;

            case 8:
                // page 1
                expectPresent("For the last couple of days we've been asking people");
                expectPresent("list your favorite tools for slowing down feeds in the comments");
                // "page" 2 (false positive)
                expectAbsent("signature fake news programs");
                break;

            default:
                throw new NotSupportedException("Unknown sample input number (" + sampleInputNumber + "). Have you added another sample input? If so, then add appropriate asserts here as well.");
            }

            Assert.IsTrue(extracted);
        }
        /// <summary>
        /// Transcodes the first sample input with <c>TranscodeToXml</c> and checks the extracted
        /// title and the content of the element identified by <c>NReadabilityTranscoder.InnerDivId</c>.
        /// </summary>
        public void TranscodeXml()
        {
            // Path.Combine instead of a hard-coded '\' so the sample is found on any platform.
            string content = File.ReadAllText(Path.Combine("SampleInput", "SampleInput_01.html"));
            var transcoder = new NReadabilityTranscoder();
            bool mainContentExtracted;
            string title;
            string nextPage;
            var doc = transcoder.TranscodeToXml(content, "http://media3.washingtonpost.com/", out mainContentExtracted, out title, out nextPage);

            // Only the inner content element is serialized, not the whole document.
            var element = doc.GetElementById(NReadabilityTranscoder.InnerDivId);
            var extractedContent = element.ToString(SaveOptions.None);

            // washingtonpost.com - "Court Puts Off Decision On Indefinite Detention"
            Assert.AreEqual("Court Puts Off Decision On Indefinite Detention", title);
            Assert.IsTrue(extractedContent.Contains("The Supreme Court yesterday vacated a lower"));
            Assert.IsTrue(extractedContent.Contains("The justices did not rule on the merits"));
            Assert.IsTrue(extractedContent.Contains("But the government said the issues were now"));

            // The serialized element must not carry the document wrapper.
            Assert.IsFalse(extractedContent.Contains("<html>"));
            Assert.IsFalse(extractedContent.Contains("<body>"));
        }
 /// <summary>
 ///  Assigns a new NReadabilityTranscoder to the fixture's transcoder field.
 /// </summary>
 public void SetUp()
 {
     this._nReadabilityTranscoder = new NReadabilityTranscoder();
 }
        /// <summary>
        /// Checks that a transformer assigned to <c>ImageSourceTranformer</c> rewrites the <c>src</c>
        /// attribute of an image and that the original value is kept in the <c>origsrc</c> attribute.
        /// </summary>
        public void TestImageSourceTransformer()
        {
            // arrange
            const string imageUrl  = "http://example.com/some_image.jpg";
            const string paragraph = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";
            const string filler    = paragraph + paragraph + paragraph + paragraph + paragraph;

            // Transformer under test: routes every image through a (fictional) resizer URL.
            Func<AttributeTransformationInput, AttributeTransformationResult> resizeImageSource =
                attribute =>
                {
                    var transformation = new AttributeTransformationResult();
                    transformation.TransformedValue           = string.Format("http://imageresizer.com/u={0}", attribute.AttributeValue);
                    transformation.OriginalValueAttributeName = "origsrc";
                    return transformation;
                };

            // Run the transformer once by hand to learn the value the transcoder is expected to emit.
            var probe = new AttributeTransformationInput();
            probe.AttributeValue = imageUrl;
            probe.Element        = null;
            string rewrittenUrl = resizeImageSource(probe).TransformedValue;

            // The image is surrounded by filler paragraphs on both sides.
            string page = "<html><body>" + filler + "<p><img src=\"" + imageUrl + "\" /></p>" + filler + "</body></html>";

            var transcoder = new NReadabilityTranscoder();
            transcoder.ImageSourceTranformer = resizeImageSource;

            var input = new TranscodingInput(page);
            input.Url = "http://immortal.pl/";

            // act
            TranscodingResult result = transcoder.Transcode(input);

            // assert
            string expectedSrcAttribute     = "src=\"" + rewrittenUrl + "\"";
            string expectedOrigSrcAttribute = "origsrc=\"" + imageUrl + "\"";

            Assert.IsTrue(result.ContentExtracted);
            Assert.IsTrue(result.ExtractedContent.Contains(expectedSrcAttribute));
            Assert.IsTrue(result.ExtractedContent.Contains(expectedOrigSrcAttribute));
        }
        /// <summary>
        /// Checks that a transformer assigned to <c>AnchorHrefTranformer</c> rewrites the <c>href</c>
        /// attribute of a link and that the original value is kept in the <c>orighref</c> attribute.
        /// </summary>
        public void TestAnchorHrefTransformer()
        {
            // arrange
            const string articleUrl = "http://example.com/some_article.html";
            const string paragraph  = "<p>Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet. Lorem ipsum dolor et amet.</p>";
            const string filler     = paragraph + paragraph + paragraph + paragraph + paragraph;

            // Transformer under test: routes every link through a (fictional) redirector URL.
            Func<AttributeTransformationInput, AttributeTransformationResult> redirectAnchor =
                attribute =>
                {
                    var transformation = new AttributeTransformationResult();
                    transformation.TransformedValue           = string.Format("http://redirector.com/u={0}", attribute.AttributeValue);
                    transformation.OriginalValueAttributeName = "orighref";
                    return transformation;
                };

            // Run the transformer once by hand to learn the value the transcoder is expected to emit.
            var probe = new AttributeTransformationInput();
            probe.AttributeValue = articleUrl;
            probe.Element        = null;
            string rewrittenUrl = redirectAnchor(probe).TransformedValue;

            // The link is surrounded by filler paragraphs on both sides.
            string page = "<html><body>" + filler + "<p><a href=\"" + articleUrl + "\">Some article</a></p>" + filler + "</body></html>";

            var transcoder = new NReadabilityTranscoder();
            transcoder.AnchorHrefTranformer = redirectAnchor;

            var input = new TranscodingInput(page);
            input.Url = "http://immortal.pl/";

            // act
            TranscodingResult result = transcoder.Transcode(input);

            // assert
            string expectedHrefAttribute     = "href=\"" + rewrittenUrl + "\"";
            string expectedOrigHrefAttribute = "orighref=\"" + articleUrl + "\"";

            Assert.IsTrue(result.ContentExtracted);
            Assert.IsTrue(result.ExtractedContent.Contains(expectedHrefAttribute));
            Assert.IsTrue(result.ExtractedContent.Contains(expectedOrigHrefAttribute));
        }
 /// <summary>
 /// Creates a new <c>NReadabilityTranscoder</c> and stores it in <c>_nReadabilityTranscoder</c>.
 /// </summary>
 public void SetUp()
 {
     var freshTranscoder = new NReadabilityTranscoder();

     _nReadabilityTranscoder = freshTranscoder;
 }
        /// <summary>
        /// Runs the web transcoder over the stubbed sample identified by
        /// <paramref name="sampleInputNumber"/>, saves the transcoded markup under
        /// "SampleWebOutput" and checks text fragments specific to that sample.
        /// </summary>
        /// <param name="sampleInputNumber">
        /// Sample number supplied through the <c>Values</c> attribute; <c>_Urls</c> is indexed with
        /// this value minus one.
        /// </param>
        /// <exception cref="NotSupportedException">
        /// Thrown when no assertions are defined for <paramref name="sampleInputNumber"/>.
        /// </exception>
        public void TestSampleInputs([Values(1, 2, 3, 4, 5, 6, 7, 8)]int sampleInputNumber)
        {
            const string outputDir = "SampleWebOutput";

            string paddedSampleNumber = sampleInputNumber.ToString().PadLeft(2, '0');
            string[] sampleUrls = _Urls[sampleInputNumber - 1];
            string startUrl = sampleUrls[0];

            var fetcherStub = new UrlFetcherStub(sampleInputNumber, sampleUrls);
            var transcoder = new NReadabilityTranscoder();
            var webTranscoder = new NReadabilityWebTranscoder(transcoder, fetcherStub);

            bool contentFound;
            string output = webTranscoder.Transcode(startUrl, out contentFound);

            // Keep a copy of the output on disk so a failing sample can be inspected by hand.
            if (!Directory.Exists(outputDir))
            {
                Directory.CreateDirectory(outputDir);
            }

            string outputPath = Path.Combine(outputDir, string.Format("SampleOutput_{0}.html", paddedSampleNumber));
            File.WriteAllText(outputPath, output, Encoding.UTF8);

            // Small assertion helpers; every check below runs against the same output string.
            Action<string> expectText = fragment => Assert.IsTrue(output.Contains(fragment));
            Action<string> rejectText = fragment => Assert.IsFalse(output.Contains(fragment));
            Action<string, int> expectMatches =
                (pattern, times) => Assert.That(Regex.Matches(output, pattern).Count, Is.EqualTo(times));

            switch (sampleInputNumber)
            {
                case 1:
                    expectText(" freedom of movement or expression would constitute a new and unacceptable denial");
                    expectText("Those expectations were on display in the crowd outside her house on Saturday.");
                    expectMatches("Myanmar Junta Frees Dissident Daw Aung San Suu Kyi", 4);
                    break;

                case 2:
                    expectText("For Louie and Phil, the conversations did more than keep their minds sharp.");
                    expectText("It was absolutely dark and absolutely silent, save for the chattering of Phil’s teeth.");
                    expectText("A serial runaway and artful dodger");
                    expectMatches(@"Adrift but Unbroken \| Politics", 2);
                    break;

                case 3:
                    expectText("The Chinese system as a whole has great weaknesses as well as great strengths.");
                    expectText(" This emphasis on limits is what begins pointing us back to coal.");
                    expectText(". For example, the possibility of dramatic rises in ocean levels, which could affect the habitability");
                    expectMatches("Dirty Coal, Clean Future - Magazine", 3); // Makes sure the title isn't duplicated
                    break;

                case 4: // Test duplicate content on subsequent page
                    expectMatches("his may seem paradoxical, or backward", 1);
                    break;

                case 5:
                    // page 1
                    expectText("The pressure's on, and as you glance back and forth between your rack and the board, you can hardly believe your eyes at the play you can make.");
                    expectText("How can you take your game to the next level? Let's start by looking at game play.");
                    // page 2
                    expectText("The object of Scrabble is to get the most points by creating words.");
                    expectText("Now that you know the parts of the game, let's take a look at how to play it.");
                    // page 3
                    expectText("To determine who goes first, put all the tiles into the bag and mix them up.");
                    expectText("The game continues until one player uses all of his tiles and there aren't any in the pouch, or if there are no more tiles and no one can make a word. Add up the total of your unplayed tiles and deduct it from your score. If you've used all of your tiles, add the total of the unplayed tiles to your score. The winner has the most points.");
                    // page 4
                    expectText("If you play often enough, you'll need to learn how to play the board in order to get the highest score");
                    expectText("With the game's popularity, it now comes in many variations. Let's take a look at some different ways to play Scrabble.");
                    // page 5
                    expectText("Many people play Scrabble on a traditional flat board with the grid imprinted on it.");
                    expectText("With its worldwide popularity, it only makes sense that Scrabble comes in languages other than English. ");
                    break;

                case 6:
                    // page 1
                    expectText("In the aftermath of his resignation and then his death");
                    expectText("Curb Your Enthusiasm");
                    // page 2
                    expectText("Jobs also seemed to suspect that he");
                    expectText("And, sadly, it may remain one forever.");
                    break;

                case 7:
                    // page 1
                    expectText("post also betrays some misconceptions regarding our report.");
                    expectText("After all, none of us can resist the occasional study");
                    // "page" 2 (false positive)
                    rejectText("In expressing this view, Clinton joins many Americans who worry about online misinformation, loss of privacy, and identity theft.");
                    break;

                case 8:
                    // page 1
                    expectText("For the last couple of days we’ve been asking people");
                    expectText("list your favorite tools for slowing down feeds in the comments");
                    // "page" 2 (false positive)
                    rejectText("signature fake news programs");
                    break;

                default:
                    throw new NotSupportedException("Unknown sample input number (" + sampleInputNumber + "). Have you added another sample input? If so, then add appropriate asserts here as well.");
            }

            Assert.IsTrue(contentFound);
        }
        /// <summary>
        /// Transcodes the file-backed sample identified by <paramref name="sampleInputNumber"/>
        /// through <c>NReadabilityWebTranscoder</c>, writes the extracted markup to the
        /// "SampleWebOutput" directory and verifies text fragments specific to that sample.
        /// </summary>
        /// <param name="sampleInputNumber">
        /// Sample number supplied through the <c>Values</c> attribute; used directly as the index
        /// into <c>_Urls</c> and passed to the fetcher stub.
        /// </param>
        /// <exception cref="NotSupportedException">
        /// Thrown when no assertions are defined for <paramref name="sampleInputNumber"/>.
        /// </exception>
        public void TestSampleInputs([Values(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)]int sampleInputNumber)
        {
            const string outputDir = "SampleWebOutput";

            string sampleInputNumberStr = sampleInputNumber.ToString().PadLeft(2, '0');
            string[] urls = _Urls[sampleInputNumber];
            string initialUrl = urls[0];

            var fetcher = new FileBasedUrlFetcherStub(sampleInputNumber, urls);
            var nReadabilityTranscoder = new NReadabilityTranscoder();
            var nReadabilityWebTranscoder = new NReadabilityWebTranscoder(nReadabilityTranscoder, fetcher);

            var webTranscodingInput = new WebTranscodingInput(initialUrl);

            WebTranscodingResult webTranscodingResult = nReadabilityWebTranscoder.Transcode(webTranscodingInput);

            Assert.IsTrue(webTranscodingResult.ContentExtracted);

            if (!Directory.Exists(outputDir))
            {
                Directory.CreateDirectory(outputDir);
            }

            string extractedContent = webTranscodingResult.ExtractedContent;

            // Keep a copy of the output on disk so a failing sample can be inspected by hand.
            File.WriteAllText(
                Path.Combine(outputDir, string.Format("SampleOutput_{0}.html", sampleInputNumberStr)),
                extractedContent,
                Encoding.UTF8);

            // First and last occurrence coincide only when the tag appears at most once.
            Assert.IsTrue(extractedContent.IndexOf("<html") == extractedContent.LastIndexOf("<html"));
            Assert.IsTrue(extractedContent.IndexOf("</html") == extractedContent.LastIndexOf("</html"));

            switch (sampleInputNumber)
            {
                case 1:
                    Assert.IsTrue(extractedContent.Contains(" freedom of movement or expression would constitute a new and unacceptable denial"));
                    Assert.IsTrue(extractedContent.Contains("Those expectations were on display in the crowd outside her house on Saturday."));
                    Assert.That(Regex.Matches(extractedContent, "Myanmar Junta Frees Dissident Daw Aung San Suu Kyi").Count, Is.EqualTo(4));
                    break;

                case 2:
                    Assert.IsTrue(extractedContent.Contains("For Louie and Phil, the conversations did more than keep their minds sharp."));
                    // The apostrophe is the typographic U+2019 character; it must not degrade to U+FFFD.
                    Assert.IsTrue(extractedContent.Contains("It was absolutely dark and absolutely silent, save for the chattering of Phil’s teeth."));
                    Assert.IsTrue(extractedContent.Contains("A serial runaway and artful dodger"));
                    Assert.That(Regex.Matches(extractedContent, @"Adrift but Unbroken \| Politics").Count, Is.EqualTo(2));
                    break;

                case 3:
                    Assert.IsTrue(extractedContent.Contains("The Chinese system as a whole has great weaknesses as well as great strengths."));
                    Assert.IsTrue(extractedContent.Contains(" This emphasis on limits is what begins pointing us back to coal."));
                    Assert.IsTrue(extractedContent.Contains(". For example, the possibility of dramatic rises in ocean levels, which could affect the habitability"));
                    Assert.That(Regex.Matches(extractedContent, "Dirty Coal, Clean Future - Magazine").Count, Is.EqualTo(3)); // Makes sure the title isn't duplicated
                    break;

                case 4:  // Test duplicate content on subsequent page
                    Assert.That(Regex.Matches(extractedContent, "his may seem paradoxical, or backward").Count, Is.EqualTo(1));
                    break;

                case 5:
                    // page 1
                    Assert.IsTrue(extractedContent.Contains("The pressure's on, and as you glance back and forth between your rack and the board, you can hardly believe your eyes at the play you can make."));
                    Assert.IsTrue(extractedContent.Contains("How can you take your game to the next level? Let's start by looking at game play."));
                    // page 2
                    Assert.IsTrue(extractedContent.Contains("The object of Scrabble is to get the most points by creating words."));
                    Assert.IsTrue(extractedContent.Contains("Now that you know the parts of the game, let's take a look at how to play it."));
                    // page 3
                    Assert.IsTrue(extractedContent.Contains("To determine who goes first, put all the tiles into the bag and mix them up."));
                    Assert.IsTrue(extractedContent.Contains("The game continues until one player uses all of his tiles and there aren't any in the pouch, or if there are no more tiles and no one can make a word. Add up the total of your unplayed tiles and deduct it from your score. If you've used all of your tiles, add the total of the unplayed tiles to your score. The winner has the most points."));
                    // page 4
                    Assert.IsTrue(extractedContent.Contains("If you play often enough, you'll need to learn how to play the board in order to get the highest score"));
                    Assert.IsTrue(extractedContent.Contains("With the game's popularity, it now comes in many variations. Let's take a look at some different ways to play Scrabble."));
                    // page 5
                    Assert.IsTrue(extractedContent.Contains("Many people play Scrabble on a traditional flat board with the grid imprinted on it."));
                    Assert.IsTrue(extractedContent.Contains("With its worldwide popularity, it only makes sense that Scrabble comes in languages other than English. "));
                    break;

                case 6:
                    // page 1
                    Assert.IsTrue(extractedContent.Contains("In the aftermath of his resignation and then his death"));
                    Assert.IsTrue(extractedContent.Contains("Curb Your Enthusiasm"));
                    // page 2
                    Assert.IsTrue(extractedContent.Contains("Jobs also seemed to suspect that he"));
                    Assert.IsTrue(extractedContent.Contains("And, sadly, it may remain one forever."));
                    break;

                case 7:
                    // page 1
                    Assert.IsTrue(extractedContent.Contains("post also betrays some misconceptions regarding our report."));
                    Assert.IsTrue(extractedContent.Contains("After all, none of us can resist the occasional study"));
                    // "page" 2 (false positive)
                    Assert.IsFalse(extractedContent.Contains("In expressing this view, Clinton joins many Americans who worry about online misinformation, loss of privacy, and identity theft."));
                    break;

                case 8:
                    // page 1
                    Assert.IsTrue(extractedContent.Contains("For the last couple of days we've been asking people"));
                    Assert.IsTrue(extractedContent.Contains("list your favorite tools for slowing down feeds in the comments"));
                    // "page" 2 (false positive)
                    Assert.IsFalse(extractedContent.Contains("signature fake news programs"));
                    break;

                case 9:
                    // page 1
                    Assert.IsTrue(extractedContent.Contains("The story is narrated by a young girl named Jean Louise"));
                    Assert.IsTrue(extractedContent.Contains("toward adulthood."));
                    // page 2
                    Assert.IsTrue(extractedContent.Contains("September arrives, and Dill leaves Maycomb to return to"));
                    Assert.IsTrue(extractedContent.Contains("educational technique but the law."));
                    break;

                case 10:
                    // page 1
                    Assert.IsTrue(extractedContent.Contains("he fire at the Triangle Waist Company"));
                    Assert.IsTrue(extractedContent.Contains("at the hands of industrial greed."));
                    // page 2
                    Assert.IsTrue(extractedContent.Contains("he Triangle Waist Company was in many ways"));
                    Assert.IsTrue(extractedContent.Contains("unsafe working conditions on their employees."));
                    // page 3 (last)
                    Assert.IsTrue(extractedContent.Contains("mmediately after the fire, Triangle owners Blanck and Harris"));
                    Assert.IsTrue(extractedContent.Contains("and that it was \"second to none in the country.\""));
                    break;

                case 11:
                    // Typographic apostrophe (U+2019) in the expected text.
                    Assert.IsTrue(extractedContent.Contains("More than 20 percent of the world’s oxygen comes from the Amazon Rainforest."));
                    Assert.IsTrue(extractedContent.Contains("practical ways to shrink the size of your step."));
                    break;

                case 12:
                    // Actual tumblr post
                    Assert.IsTrue(extractedContent.Contains("First of all, you should watch this video."));
                    // Next tumblr post, linked from first - should not be included.
                    // The literal needs the real U+2019 apostrophe; with a mangled character this
                    // negative check could never fail.
                    Assert.IsFalse(extractedContent.Contains("I’ll let Neil deGrasse Tyson set this up"));
                    break;

                case 13:
                    Assert.IsTrue(extractedContent.Contains("Back in 2003"));
                    break;

                case 14:
                    Assert.IsFalse(extractedContent.Contains("</body><a"), "Content found after </body>");
                    break;

                case 15:
                    Assert.IsFalse(extractedContent.Contains("</body><header>"), "Content found after </body>");
                    break;

                case 16:
                    {
                        string sample = "It's the first day of school";
                        int bodyStart = extractedContent.IndexOf("<body");
                        int firstPageStart = extractedContent.IndexOf(sample, bodyStart);
                        Assert.IsTrue(firstPageStart > -1);
                        // Searching again past the first hit must find nothing.
                        Assert.AreEqual(-1, extractedContent.IndexOf(sample, firstPageStart + sample.Length), "Article body repeated due to comment paging");
                    }
                    break;

                case 17:
                    {
                        string sample = "everybody should be treated equally";
                        int firstPageStart = extractedContent.IndexOf(sample);
                        Assert.IsTrue(firstPageStart > -1);
                        // Searching again past the first hit must find nothing.
                        Assert.AreEqual(-1, extractedContent.IndexOf(sample, firstPageStart + sample.Length), "Article body repeated due to conditional comment parsing");
                    }
                    break;

                case 18:
                    {
                        Assert.IsTrue(extractedContent.Contains("When Ben Franklin wrote"), "Missing start of text");
                    }
                    break;

                default:
                    throw new NotSupportedException("Unknown sample input number (" + sampleInputNumber + "). Have you added another sample input? If so, then add appropriate asserts here as well.");
            }
        }
Beispiel #25
0
        /// <summary>
        /// Feeds the file-backed sample identified by <paramref name="sampleInputNumber"/> through
        /// <c>NReadabilityWebTranscoder</c>, stores the extracted markup in the "SampleWebOutput"
        /// directory and checks the text fragments expected for that sample.
        /// </summary>
        /// <param name="sampleInputNumber">
        /// Sample number supplied through the <c>Values</c> attribute; used directly as the index
        /// into <c>_Urls</c> and passed to the fetcher stub.
        /// </param>
        /// <exception cref="NotSupportedException">
        /// Thrown when no assertions are defined for <paramref name="sampleInputNumber"/>.
        /// </exception>
        public void TestSampleInputs([Values(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)] int sampleInputNumber)
        {
            const string outputDir = "SampleWebOutput";

            string paddedSampleNumber = sampleInputNumber.ToString().PadLeft(2, '0');

            string[] sampleUrls = _Urls[sampleInputNumber];
            string startUrl = sampleUrls[0];

            var fetcherStub = new FileBasedUrlFetcherStub(sampleInputNumber, sampleUrls);
            var transcoder = new NReadabilityTranscoder();
            var webTranscoder = new NReadabilityWebTranscoder(transcoder, fetcherStub);

            var request = new WebTranscodingInput(startUrl);
            WebTranscodingResult outcome = webTranscoder.Transcode(request);

            Assert.IsTrue(outcome.ContentExtracted);

            if (!Directory.Exists(outputDir))
            {
                Directory.CreateDirectory(outputDir);
            }

            string markup = outcome.ExtractedContent;

            // Keep a copy of the output on disk so a failing sample can be inspected by hand.
            string outputPath = Path.Combine(outputDir, string.Format("SampleOutput_{0}.html", paddedSampleNumber));
            File.WriteAllText(outputPath, markup, Encoding.UTF8);

            // First and last occurrence coincide only when the tag appears at most once.
            int firstHtmlOpen = markup.IndexOf("<html");
            int lastHtmlOpen = markup.LastIndexOf("<html");
            Assert.IsTrue(firstHtmlOpen == lastHtmlOpen);

            int firstHtmlClose = markup.IndexOf("</html");
            int lastHtmlClose = markup.LastIndexOf("</html");
            Assert.IsTrue(firstHtmlClose == lastHtmlClose);

            // Small assertion helpers; every check below runs against the same markup string.
            Action<string> mustContain = fragment => Assert.IsTrue(markup.Contains(fragment));
            Action<string> mustNotContain = fragment => Assert.IsFalse(markup.Contains(fragment));
            Action<string, int> mustMatchTimes =
                (pattern, times) => Assert.That(Regex.Matches(markup, pattern).Count, Is.EqualTo(times));

            switch (sampleInputNumber)
            {
                case 1:
                    mustContain(" freedom of movement or expression would constitute a new and unacceptable denial");
                    mustContain("Those expectations were on display in the crowd outside her house on Saturday.");
                    mustMatchTimes("Myanmar Junta Frees Dissident Daw Aung San Suu Kyi", 4);
                    break;

                case 2:
                    mustContain("For Louie and Phil, the conversations did more than keep their minds sharp.");
                    mustContain("It was absolutely dark and absolutely silent, save for the chattering of Phil’s teeth.");
                    mustContain("A serial runaway and artful dodger");
                    mustMatchTimes(@"Adrift but Unbroken \| Politics", 2);
                    break;

                case 3:
                    mustContain("The Chinese system as a whole has great weaknesses as well as great strengths.");
                    mustContain(" This emphasis on limits is what begins pointing us back to coal.");
                    mustContain(". For example, the possibility of dramatic rises in ocean levels, which could affect the habitability");
                    mustMatchTimes("Dirty Coal, Clean Future - Magazine", 3); // Makes sure the title isn't duplicated
                    break;

                case 4: // Test duplicate content on subsequent page
                    mustMatchTimes("his may seem paradoxical, or backward", 1);
                    break;

                case 5:
                    // page 1
                    mustContain("The pressure's on, and as you glance back and forth between your rack and the board, you can hardly believe your eyes at the play you can make.");
                    mustContain("How can you take your game to the next level? Let's start by looking at game play.");
                    // page 2
                    mustContain("The object of Scrabble is to get the most points by creating words.");
                    mustContain("Now that you know the parts of the game, let's take a look at how to play it.");
                    // page 3
                    mustContain("To determine who goes first, put all the tiles into the bag and mix them up.");
                    mustContain("The game continues until one player uses all of his tiles and there aren't any in the pouch, or if there are no more tiles and no one can make a word. Add up the total of your unplayed tiles and deduct it from your score. If you've used all of your tiles, add the total of the unplayed tiles to your score. The winner has the most points.");
                    // page 4
                    mustContain("If you play often enough, you'll need to learn how to play the board in order to get the highest score");
                    mustContain("With the game's popularity, it now comes in many variations. Let's take a look at some different ways to play Scrabble.");
                    // page 5
                    mustContain("Many people play Scrabble on a traditional flat board with the grid imprinted on it.");
                    mustContain("With its worldwide popularity, it only makes sense that Scrabble comes in languages other than English. ");
                    break;

                case 6:
                    // page 1
                    mustContain("In the aftermath of his resignation and then his death");
                    mustContain("Curb Your Enthusiasm");
                    // page 2
                    mustContain("Jobs also seemed to suspect that he");
                    mustContain("And, sadly, it may remain one forever.");
                    break;

                case 7:
                    // page 1
                    mustContain("post also betrays some misconceptions regarding our report.");
                    mustContain("After all, none of us can resist the occasional study");
                    // "page" 2 (false positive)
                    mustNotContain("In expressing this view, Clinton joins many Americans who worry about online misinformation, loss of privacy, and identity theft.");
                    break;

                case 8:
                    // page 1
                    mustContain("For the last couple of days we've been asking people");
                    mustContain("list your favorite tools for slowing down feeds in the comments");
                    // "page" 2 (false positive)
                    mustNotContain("signature fake news programs");
                    break;

                case 9:
                    // page 1
                    mustContain("The story is narrated by a young girl named Jean Louise");
                    mustContain("toward adulthood.");
                    // page 2
                    mustContain("September arrives, and Dill leaves Maycomb to return to");
                    mustContain("educational technique but the law.");
                    break;

                case 10:
                    // page 1
                    mustContain("he fire at the Triangle Waist Company");
                    mustContain("at the hands of industrial greed.");
                    // page 2
                    mustContain("he Triangle Waist Company was in many ways");
                    mustContain("unsafe working conditions on their employees.");
                    // page 3 (last)
                    mustContain("mmediately after the fire, Triangle owners Blanck and Harris");
                    mustContain("and that it was \"second to none in the country.\"");
                    break;

                case 11:
                    mustContain("More than 20 percent of the world’s oxygen comes from the Amazon Rainforest.");
                    mustContain("practical ways to shrink the size of your step.");
                    break;

                case 12:
                    // Actual tumblr post
                    mustContain("First of all, you should watch this video.");
                    // Next tumblr post, linked from first - should not be included
                    mustNotContain("I’ll let Neil deGrasse Tyson set this up");
                    break;

                case 13:
                    mustContain("Back in 2003");
                    break;

                case 14:
                    Assert.IsFalse(markup.Contains("</body><a"), "Content found after </body>");
                    break;

                case 15:
                    Assert.IsFalse(markup.Contains("</body><header>"), "Content found after </body>");
                    break;

                case 16:
                    string schoolSentence = "It's the first day of school";
                    int bodyOffset = markup.IndexOf("<body");
                    int schoolFirstHit = markup.IndexOf(schoolSentence, bodyOffset);
                    Assert.IsTrue(schoolFirstHit > -1);
                    // Searching again past the first hit must find nothing.
                    int schoolSecondHit = markup.IndexOf(schoolSentence, schoolFirstHit + schoolSentence.Length);
                    Assert.AreEqual(-1, schoolSecondHit, "Article body repeated due to comment paging");
                    break;

                case 17:
                    string equalitySentence = "everybody should be treated equally";
                    int equalityFirstHit = markup.IndexOf(equalitySentence);
                    Assert.IsTrue(equalityFirstHit > -1);
                    // Searching again past the first hit must find nothing.
                    int equalitySecondHit = markup.IndexOf(equalitySentence, equalityFirstHit + equalitySentence.Length);
                    Assert.AreEqual(-1, equalitySecondHit, "Article body repeated due to conditional comment parsing");
                    break;

                case 18:
                    Assert.IsTrue(markup.Contains("When Ben Franklin wrote"), "Missing start of text");
                    break;

                default:
                    throw new NotSupportedException("Unknown sample input number (" + sampleInputNumber + "). Have you added another sample input? If so, then add appropriate asserts here as well.");
            }
        }
        /// <summary>
        /// Checks that the web transcoder reports an extracted title equal to the text of the
        /// <c>&lt;title&gt;</c> element when the fetched page contains one.
        /// </summary>
        public void Transcode_returns_title_if_it_can_be_extracted()
        {
            // arrange
            // NOTE(review): the five trailing characters are U+FFFD replacement characters,
            // presumably non-ASCII letters lost to a bad encoding conversion - confirm against
            // the original file before relying on this test for non-ASCII coverage.
            string titleText = "Some title �����";
            string page = "<html><head><title>" + titleText + "</title></head><body></body></html>";

            var transcoder = new NReadabilityTranscoder();
            var fetcherStub = new SimpleUrlFetcherStub(page);
            var webTranscoder = new NReadabilityWebTranscoder(transcoder, fetcherStub);

            var request = new WebTranscodingInput("http://dummy.com/");

            // act
            WebTranscodingResult outcome = webTranscoder.Transcode(request);

            // assert
            Assert.IsTrue(outcome.TitleExtracted);
            Assert.AreEqual(titleText, outcome.ExtractedTitle);
        }
Beispiel #27
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/>, extracts its main text with
        /// NReadability, writes the text to the job console and stores the raw HTML and
        /// the extracted text in <c>[dbo].[BaiduNews]</c>, then increments the matching
        /// <c>[dbo].[Monitor].[NewsCount]</c>.
        /// </summary>
        /// <remarks>
        /// The method returns without doing anything when the response status is not
        /// 200 OK or when the decoded HTML is empty. Failures during extraction or the
        /// database updates are caught and written to <paramref name="context"/>;
        /// failures of the HTTP request itself are not caught here and propagate.
        /// The method name keeps its historical spelling so existing callers still bind.
        /// </remarks>
        /// <param name="url">Address of the page to fetch; also the key of the row to update.</param>
        /// <param name="context">Job context used for console output.</param>
        /// <returns>A task that completes when the download and updates have finished.</returns>
        public async Task Dowload(string url, PerformContext context)
        {
            using (var client = new HttpClient())
            {
                // Some sites reject requests that do not look like they come from a browser.
                client.DefaultRequestHeaders.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36");

                using (var response = await client.GetAsync(url))
                {
                    if (response.StatusCode != HttpStatusCode.OK)
                    {
                        return;
                    }

                    // Read the whole body in one call. Sizing a buffer from Stream.Length and
                    // issuing a single ReadAsync is unsafe: the content stream may not support
                    // Length, and a single read may return fewer bytes than requested.
                    byte[] bytes = await response.Content.ReadAsByteArrayAsync();

                    // Only two encodings are considered: UTF-8 when the bytes validate as such,
                    // otherwise GBK. GBK needs the code-pages provider to be registered.
                    var isUTF8 = IsTextUTF8(ref bytes);
                    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
                    Encoding encoding = isUTF8 ? Encoding.UTF8 : Encoding.GetEncoding("GBK");

                    var html = encoding.GetString(bytes);
                    if (string.IsNullOrEmpty(html))
                    {
                        return;
                    }

                    try
                    {
                        var transcoder = new NReadabilityTranscoder();
                        var result     = transcoder.Transcode(new TranscodingInput(html));

                        var document = new HtmlDocument {
                            OptionAutoCloseOnEnd = true
                        };
                        document.LoadHtml(result.ExtractedContent);

                        // NOTE(review): this XPath presumably targets the wrapper divs NReadability
                        // emits around the article body - confirm against the transcoder output.
                        var node = document.DocumentNode.SelectSingleNode("//div/div/div/div");
                        if (node == null)
                        {
                            // SelectSingleNode yields null when nothing matches; report it plainly
                            // instead of letting a NullReferenceException reach the catch below.
                            context.WriteLine("未找到正文节点,跳过更新:" + url);
                            return;
                        }

                        var text = node.InnerText.Trim('\r', '\n', ' ', '\t');
                        context.WriteLine("抽取内容为:");
                        context.WriteLine(text);

                        const string cmdText = @"UPDATE [dbo].[BaiduNews] SET [Html]=@Html,[Text]=@Text WHERE [Url]=@Url";

                        await _connection.ExecuteAsync(cmdText, new { Html = html, Text = text, Url = url });

                        await _connection.ExecuteAsync(
                            @"UPDATE a SET a.[NewsCount]=a.[NewsCount]+1 FROM [dbo].[Monitor] a JOIN [dbo].[BaiduNews] b ON a.[Tag]=b.[Keyword] WHERE b.[Url]=@Url",
                            new { Url = url });
                    }
                    catch (Exception e)
                    {
                        // Best effort: a page that cannot be processed is logged and skipped.
                        context.WriteLine(e);
                    }
                }
            }
        }