// GET: Show the contents of the article for reading
        /// <summary>
        /// Transcodes the page at <c>article.url</c> and, when the transcoder's out flag is true,
        /// stores the plain text of the <c>readInner</c> div in <c>article.mainText</c>.
        /// </summary>
        /// <param name="article">Article whose <c>url</c> is transcoded and whose <c>mainText</c> may be filled in.</param>
        /// <returns>The default view for <paramref name="article"/>, or the "Error" view when an exception is thrown.</returns>
        public ActionResult Read(Article article)
        {
            try
            {
                var    transcoder = new NReadabilityWebTranscoder();
                bool   contentExtracted;
                string page = transcoder.Transcode(article.url, out contentExtracted);

                if (contentExtracted)
                {
                    HtmlDocument doc = new HtmlDocument();
                    doc.LoadHtml(page);

                    // SelectSingleNode returns null when nothing matches; in that case leave
                    // mainText as it was instead of failing the whole request.
                    var readInner = doc.DocumentNode.SelectSingleNode("//div[@id='readInner']");
                    if (readInner != null)
                    {
                        article.mainText = readInner.InnerText;
                    }
                }

                return(View(article));
            }
            catch (Exception ex)
            {
                // Log the full exception text (type, message and stack trace), not only the message.
                logger.Error(ex.ToString());
                return(View("Error"));
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Click handler: transcodes a fixed news page URL and puts the returned text into <c>richTextBox1</c>.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            const string pageUrl = "http://news.163.com/17/0522/16/CL294BVU000187VE.html";

            NReadabilityWebTranscoder webTranscoder = new NReadabilityWebTranscoder();

            // The out flag is required by the Transcode signature; it is not inspected afterwards.
            bool   contentExtracted;
            string pageText = webTranscoder.Transcode(pageUrl, out contentExtracted);

            richTextBox1.Text = pageText;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Looks up pages for the source's link, transcodes each one, stores the text of its
        /// <c>readInner</c> div as a new article on the source and saves the changes.
        /// </summary>
        /// <param name="S">Source whose <c>Link</c> is the search string and whose <c>TArticles</c> receives the new articles.</param>
        /// <returns>
        /// The extracted texts of the pages that were processed, or <c>null</c> when the call comes
        /// less than 20 minutes after the previous lookup.
        /// </returns>
        /// <exception cref="ArgumentNullException">No <c>TLogin</c> entry matches the current login.</exception>
        List <string> ParseNReadAndSaveDB(TSource S)
        {
            var link = S.Link;

            // Throttle lookups so we do not get banned (original note says 5 min; the code enforces 20). todo
            if (lastGoogleReq.AddMinutes(20) > DateTime.Now)
            {
                return(null);
            }
            lastGoogleReq = DateTime.Now;
            var urls = NRParser.GetInfoBySearchStr(link);// key/value pairs: Key is used as the page URL, Value as the title


            var RawItems = new List <string>();

            foreach (var url in urls)
            {
                var    transcoder        = new NReadabilityWebTranscoder();
                bool   success           = false;
                string transcodedContent = "";
                try
                {
                    transcodedContent = transcoder.Transcode(url.Key, out success);
                }
                catch (Exception e) { log.Error(url + " не смогли распарсить ", e); }//todo
                if (!success)
                {
                    continue;
                }

                var user = db.TLogin.FirstOrDefault(_ => _.Login == _login);
                if (user == null)
                {
                    throw new ArgumentNullException();
                }
                HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(transcodedContent);

                // SelectSingleNode returns null when the div is absent; skip this page
                // rather than letting a NullReferenceException abort the remaining URLs.
                var readInner = doc.DocumentNode.SelectSingleNode("//div[@id='readInner']");
                if (readInner == null)
                {
                    log.Error(url + " has no 'readInner' div in the transcoded content");
                    continue;
                }
                var bodyNode = readInner.InnerText;
                try
                {
                    var Article = new TArticle()
                    {
                        Data    = bodyNode,
                        OwnLink = url.Key,
                        Title   = url.Value,
                        Date    = DateTime.Now
                    };

                    S.TArticles.Add(Article);
                }
                // Keep the best-effort behaviour, but record what actually went wrong.
                catch (Exception ex) { log.Error("упал при Article = new TArticle()...", ex); }
                db.SaveChanges();
                RawItems.Add(bodyNode);//todo: not finished
            }
            return(RawItems);
        }
        /// <summary>
        /// Transcodes one sample input through a <c>UrlFetcherStub</c>, writes the result to the
        /// <c>SampleWebOutput</c> directory and asserts on fragments expected in the output.
        /// </summary>
        /// <param name="sampleInputNumber">Sample number; its URL set is read from <c>_Urls[sampleInputNumber - 1]</c>.</param>
        public void TestSampleInputs([Values(1, 2, 3, 4)] int sampleInputNumber)
        {
            const string outputDir = "SampleWebOutput";

            string paddedNumber = sampleInputNumber.ToString().PadLeft(2, '0');

            string[]    sampleUrls = _Urls[sampleInputNumber - 1];
            string      startUrl   = sampleUrls[0];
            IUrlFetcher stub       = new UrlFetcherStub(sampleInputNumber, sampleUrls);

            _nReadabilityWebTranscoder = new NReadabilityWebTranscoder(_nReadabilityTranscoder, stub);

            bool   contentFound;
            string html = _nReadabilityWebTranscoder.Transcode(startUrl, out contentFound);

            // Keep a copy of the output on disk for manual inspection.
            bool outputDirMissing = !Directory.Exists(outputDir);
            if (outputDirMissing)
            {
                Directory.CreateDirectory(outputDir);
            }

            string outputFile = Path.Combine(outputDir, "SampleOutput_" + paddedNumber + ".html");
            File.WriteAllText(outputFile, html, Encoding.UTF8);

            if (sampleInputNumber == 1)
            {
                Assert.IsTrue(html.Contains(" freedom of movement or expression would constitute a new and unacceptable denial"));
                Assert.IsTrue(html.Contains("Those expectations were on display in the crowd outside her house on Saturday."));
                Assert.That(Regex.Matches(html, "Myanmar Junta Frees Dissident Daw Aung San Suu Kyi").Count, Is.EqualTo(4));
            }
            else if (sampleInputNumber == 2)
            {
                Assert.IsTrue(html.Contains("For Louie and Phil, the conversations did more than keep their minds sharp."));
                Assert.IsTrue(html.Contains("It was absolutely dark and absolutely silent, save for the chattering of Phil’s teeth."));
                Assert.IsTrue(html.Contains("A serial runaway and artful dodger"));
                Assert.That(Regex.Matches(html, @"Adrift but Unbroken \| Politics").Count, Is.EqualTo(2));
            }
            else if (sampleInputNumber == 3)
            {
                Assert.IsTrue(html.Contains("The Chinese system as a whole has great weaknesses as well as great strengths."));
                Assert.IsTrue(html.Contains(" This emphasis on limits is what begins pointing us back to coal."));
                Assert.IsTrue(html.Contains(". For example, the possibility of dramatic rises in ocean levels, which could affect the habitability"));
                // The title must not be duplicated.
                Assert.That(Regex.Matches(html, "Dirty Coal, Clean Future - Magazine").Count, Is.EqualTo(3));
            }
            else if (sampleInputNumber == 4)
            {
                // Content repeated on a subsequent page must appear only once.
                Assert.That(Regex.Matches(html, "his may seem paradoxical, or backward").Count, Is.EqualTo(1));
            }
            else
            {
                throw new NotSupportedException("Unknown sample input number (" + sampleInputNumber + "). Have you added another sample input? If so, then add appropriate asserts here as well.");
            }

            Assert.IsTrue(contentFound);
        }
        /// <summary>
        /// Runs the web transcoder over one sample input served by a <c>UrlFetcherStub</c>, saves the
        /// output under <c>SampleWebOutput</c> and checks the fragments expected for that sample.
        /// </summary>
        /// <param name="sampleInputNumber">Sample number; its URL set is read from <c>_Urls[sampleInputNumber - 1]</c>.</param>
        public void TestSampleInputs([Values(1,2,3,4)]int sampleInputNumber)
        {
            const string outputDir = "SampleWebOutput";

            string[] sampleUrls = _Urls[sampleInputNumber - 1];
            string firstUrl = sampleUrls[0];
            IUrlFetcher stubFetcher = new UrlFetcherStub(sampleInputNumber, sampleUrls);

            _nReadabilityWebTranscoder = new NReadabilityWebTranscoder(_nReadabilityTranscoder, stubFetcher);

            bool extracted;
            string output = _nReadabilityWebTranscoder.Transcode(firstUrl, out extracted);

            if (!Directory.Exists(outputDir))
            {
                Directory.CreateDirectory(outputDir);
            }

            // File name carries the sample number left-padded to two digits.
            string fileName = string.Format("SampleOutput_{0}.html", sampleInputNumber.ToString().PadLeft(2, '0'));
            File.WriteAllText(Path.Combine(outputDir, fileName), output, Encoding.UTF8);

            switch (sampleInputNumber)
            {
                case 1:
                    Assert.IsTrue(output.Contains(" freedom of movement or expression would constitute a new and unacceptable denial"));
                    Assert.IsTrue(output.Contains("Those expectations were on display in the crowd outside her house on Saturday."));
                    Assert.That(Regex.Matches(output, "Myanmar Junta Frees Dissident Daw Aung San Suu Kyi").Count, Is.EqualTo(4));
                    break;

                case 2:
                    Assert.IsTrue(output.Contains("For Louie and Phil, the conversations did more than keep their minds sharp."));
                    Assert.IsTrue(output.Contains("It was absolutely dark and absolutely silent, save for the chattering of Phil’s teeth."));
                    Assert.IsTrue(output.Contains("A serial runaway and artful dodger"));
                    Assert.That(Regex.Matches(output, @"Adrift but Unbroken \| Politics").Count, Is.EqualTo(2));
                    break;

                case 3:
                    Assert.IsTrue(output.Contains("The Chinese system as a whole has great weaknesses as well as great strengths."));
                    Assert.IsTrue(output.Contains(" This emphasis on limits is what begins pointing us back to coal."));
                    Assert.IsTrue(output.Contains(". For example, the possibility of dramatic rises in ocean levels, which could affect the habitability"));
                    // Guards against the title being duplicated.
                    Assert.That(Regex.Matches(output, "Dirty Coal, Clean Future - Magazine").Count, Is.EqualTo(3));
                    break;

                case 4:
                    // Duplicate content on a subsequent page must show up once only.
                    Assert.That(Regex.Matches(output, "his may seem paradoxical, or backward").Count, Is.EqualTo(1));
                    break;

                default:
                    throw new NotSupportedException("Unknown sample input number (" + sampleInputNumber + "). Have you added another sample input? If so, then add appropriate asserts here as well.");
            }

            Assert.IsTrue(extracted);
        }
        /// <summary>
        /// Checks that transcoding a page with a <c>&lt;title&gt;</c> element reports the title as
        /// extracted and returns it equal to the title placed in the page.
        /// </summary>
        public void Transcode_returns_title_if_it_can_be_extracted()
        {
            // arrange: a minimal page whose only content is the title, served by a stub fetcher
            const string title = "Some title ¹ê³ó¿Ÿñæ";
            const string page  = "<html><head><title>" + title + "</title></head><body></body></html>";

            var webTranscoder = new NReadabilityWebTranscoder(
                new NReadabilityTranscoder(),
                new SimpleUrlFetcherStub(page));

            var input = new WebTranscodingInput("http://dummy.com/");

            // act
            WebTranscodingResult result = webTranscoder.Transcode(input);

            // assert
            Assert.IsTrue(result.TitleExtracted);
            Assert.AreEqual(title, result.ExtractedTitle);
        }
Exemplo n.º 7
0
        /// <summary>
        /// Initializes a new instance of the <see cref="WebPageDataService"/> class and immediately
        /// transcodes the page at the given URL, keeping its extracted content and title.
        /// </summary>
        /// <param name="incomingConnectionURL">URL of the web page to transcode.</param>
        /// <param name="logger">Logger that receives any exception thrown while transcoding.</param>
        public WebPageDataService(string incomingConnectionURL, Logger logger)
        {
            Log           = logger;
            ConnectionURL = incomingConnectionURL;

            // Page scraping is done with NReadability.
            // Idea noted earlier: a REST service for images etc. alongside NReadability for the article,
            // e.g. https://www.mashape.com/pbkwee/html2text + http://scraper.io/

            try
            {
                var transcoder = new NReadabilityWebTranscoder();
                var input      = new WebTranscodingInput(ConnectionURL);
                var result     = transcoder.Transcode(input);

                ExtractedContent = result.ExtractedContent;
                ExtractedTitle   = result.ExtractedTitle;
                ExtractedImage   = "";
            }
            catch (Exception ex)
            {
                // Failures are logged and not rethrown; the Extracted* members are then left unassigned.
                Log.ErrorException("Error", ex);
            }
        }
Exemplo n.º 8
0
        /// <summary>
        /// Transcodes the page at <paramref name="url"/> and returns the lower-cased inner text of
        /// every <c>&lt;p&gt;</c> element in the extracted content.
        /// </summary>
        /// <param name="url">Address of the page to transcode.</param>
        /// <returns>
        /// One entry per paragraph. The array is empty when no content was extracted, when the
        /// content has no paragraphs, or when an exception was thrown (its message is written to
        /// the console).
        /// </returns>
        public static string[] ParseNewsHMTL(string url)
        {
            var           transcoder = new NReadabilityWebTranscoder();
            List <string> result     = new List <string>();

            try
            {
                WebTranscodingInput  input            = new WebTranscodingInput(url);
                WebTranscodingResult transcodeContent = transcoder.Transcode(input);
                string       HTML     = transcodeContent.ExtractedContent;

                // Nothing extracted: return the empty result instead of letting LoadHtml throw.
                if (string.IsNullOrEmpty(HTML))
                {
                    return(result.ToArray());
                }

                HtmlDocument document = new HtmlDocument();
                document.LoadHtml(HTML);

                // SelectNodes returns null (not an empty collection) when no node matches.
                var paragraphs = document.DocumentNode.SelectNodes("//p");
                if (paragraphs != null)
                {
                    foreach (HtmlNode paragraph in paragraphs)
                    {
                        result.Add(paragraph.InnerText.ToLower());
                    }
                }
            } catch (Exception e)
            {
                Console.WriteLine(e.Message);
            }
            return(result.ToArray());
        }
        /// <summary>
        /// Checks that a page containing a <c>&lt;title&gt;</c> element yields a result whose title is
        /// flagged as extracted and equals the title placed in the page.
        /// </summary>
        public void Transcode_returns_title_if_it_can_be_extracted()
        {
            // arrange: title-only page handed to the transcoder through a stub fetcher
            const string title = "Some title �����";
            const string page = "<html><head><title>" + title + "</title></head><body></body></html>";

            var transcoder = new NReadabilityTranscoder();
            var stubFetcher = new SimpleUrlFetcherStub(page);
            var webTranscoder = new NReadabilityWebTranscoder(transcoder, stubFetcher);

            // act
            WebTranscodingResult result = webTranscoder.Transcode(new WebTranscodingInput("http://dummy.com/"));

            // assert
            Assert.IsTrue(result.TitleExtracted);
            Assert.AreEqual(title, result.ExtractedTitle);
        }
        /// <summary>
        /// Transcodes one sample input served by a <c>FileBasedUrlFetcherStub</c>, asserts that content
        /// was extracted, saves the output under <c>SampleWebOutput</c>, checks that the html open and
        /// close tags each occur at one position only, and then runs the per-sample assertions.
        /// </summary>
        /// <param name="sampleInputNumber">Sample number; its URL set is read from <c>_Urls[sampleInputNumber]</c>.</param>
        public void TestSampleInputs([Values(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)]int sampleInputNumber)
        {
            const string outputDir = "SampleWebOutput";

            string[] sampleUrls = _Urls[sampleInputNumber];
            string firstUrl = sampleUrls[0];

            var stubFetcher = new FileBasedUrlFetcherStub(sampleInputNumber, sampleUrls);
            var transcoder = new NReadabilityTranscoder();
            var webTranscoder = new NReadabilityWebTranscoder(transcoder, stubFetcher);

            WebTranscodingResult result = webTranscoder.Transcode(new WebTranscodingInput(firstUrl));

            Assert.IsTrue(result.ContentExtracted);

            if (!Directory.Exists(outputDir))
            {
                Directory.CreateDirectory(outputDir);
            }

            string html = result.ExtractedContent;

            // File name carries the sample number left-padded to two digits.
            string fileName = string.Format("SampleOutput_{0}.html", sampleInputNumber.ToString().PadLeft(2, '0'));
            File.WriteAllText(Path.Combine(outputDir, fileName), html, Encoding.UTF8);

            // First and last occurrence coincide, i.e. the tag does not appear twice.
            Assert.IsTrue(html.IndexOf("<html") == html.LastIndexOf("<html"));
            Assert.IsTrue(html.IndexOf("</html") == html.LastIndexOf("</html"));

            switch (sampleInputNumber)
            {
                case 1:
                    Assert.IsTrue(html.Contains(" freedom of movement or expression would constitute a new and unacceptable denial"));
                    Assert.IsTrue(html.Contains("Those expectations were on display in the crowd outside her house on Saturday."));
                    Assert.That(Regex.Matches(html, "Myanmar Junta Frees Dissident Daw Aung San Suu Kyi").Count, Is.EqualTo(4));
                    break;

                case 2:
                    Assert.IsTrue(html.Contains("For Louie and Phil, the conversations did more than keep their minds sharp."));
                    Assert.IsTrue(html.Contains("It was absolutely dark and absolutely silent, save for the chattering of Phil�s teeth."));
                    Assert.IsTrue(html.Contains("A serial runaway and artful dodger"));
                    Assert.That(Regex.Matches(html, @"Adrift but Unbroken \| Politics").Count, Is.EqualTo(2));
                    break;

                case 3:
                    Assert.IsTrue(html.Contains("The Chinese system as a whole has great weaknesses as well as great strengths."));
                    Assert.IsTrue(html.Contains(" This emphasis on limits is what begins pointing us back to coal."));
                    Assert.IsTrue(html.Contains(". For example, the possibility of dramatic rises in ocean levels, which could affect the habitability"));
                    // Guards against the title being duplicated.
                    Assert.That(Regex.Matches(html, "Dirty Coal, Clean Future - Magazine").Count, Is.EqualTo(3));
                    break;

                case 4:
                    // Duplicate content on a subsequent page must show up once only.
                    Assert.That(Regex.Matches(html, "his may seem paradoxical, or backward").Count, Is.EqualTo(1));
                    break;

                case 5:
                    // page 1
                    Assert.IsTrue(html.Contains("The pressure's on, and as you glance back and forth between your rack and the board, you can hardly believe your eyes at the play you can make."));
                    Assert.IsTrue(html.Contains("How can you take your game to the next level? Let's start by looking at game play."));
                    // page 2
                    Assert.IsTrue(html.Contains("The object of Scrabble is to get the most points by creating words."));
                    Assert.IsTrue(html.Contains("Now that you know the parts of the game, let's take a look at how to play it."));
                    // page 3
                    Assert.IsTrue(html.Contains("To determine who goes first, put all the tiles into the bag and mix them up."));
                    Assert.IsTrue(html.Contains("The game continues until one player uses all of his tiles and there aren't any in the pouch, or if there are no more tiles and no one can make a word. Add up the total of your unplayed tiles and deduct it from your score. If you've used all of your tiles, add the total of the unplayed tiles to your score. The winner has the most points."));
                    // page 4
                    Assert.IsTrue(html.Contains("If you play often enough, you'll need to learn how to play the board in order to get the highest score"));
                    Assert.IsTrue(html.Contains("With the game's popularity, it now comes in many variations. Let's take a look at some different ways to play Scrabble."));
                    // page 5
                    Assert.IsTrue(html.Contains("Many people play Scrabble on a traditional flat board with the grid imprinted on it."));
                    Assert.IsTrue(html.Contains("With its worldwide popularity, it only makes sense that Scrabble comes in languages other than English. "));
                    break;

                case 6:
                    // page 1
                    Assert.IsTrue(html.Contains("In the aftermath of his resignation and then his death"));
                    Assert.IsTrue(html.Contains("Curb Your Enthusiasm"));
                    // page 2
                    Assert.IsTrue(html.Contains("Jobs also seemed to suspect that he"));
                    Assert.IsTrue(html.Contains("And, sadly, it may remain one forever."));
                    break;

                case 7:
                    // page 1
                    Assert.IsTrue(html.Contains("post also betrays some misconceptions regarding our report."));
                    Assert.IsTrue(html.Contains("After all, none of us can resist the occasional study"));
                    // "page" 2 (false positive) must not be pulled in
                    Assert.IsFalse(html.Contains("In expressing this view, Clinton joins many Americans who worry about online misinformation, loss of privacy, and identity theft."));
                    break;

                case 8:
                    // page 1
                    Assert.IsTrue(html.Contains("For the last couple of days we've been asking people"));
                    Assert.IsTrue(html.Contains("list your favorite tools for slowing down feeds in the comments"));
                    // "page" 2 (false positive) must not be pulled in
                    Assert.IsFalse(html.Contains("signature fake news programs"));
                    break;

                case 9:
                    // page 1
                    Assert.IsTrue(html.Contains("The story is narrated by a young girl named Jean Louise"));
                    Assert.IsTrue(html.Contains("toward adulthood."));
                    // page 2
                    Assert.IsTrue(html.Contains("September arrives, and Dill leaves Maycomb to return to"));
                    Assert.IsTrue(html.Contains("educational technique but the law."));
                    break;

                case 10:
                    // page 1
                    Assert.IsTrue(html.Contains("he fire at the Triangle Waist Company"));
                    Assert.IsTrue(html.Contains("at the hands of industrial greed."));
                    // page 2
                    Assert.IsTrue(html.Contains("he Triangle Waist Company was in many ways"));
                    Assert.IsTrue(html.Contains("unsafe working conditions on their employees."));
                    // page 3 (last)
                    Assert.IsTrue(html.Contains("mmediately after the fire, Triangle owners Blanck and Harris"));
                    Assert.IsTrue(html.Contains("and that it was \"second to none in the country.\""));
                    break;

                case 11:
                    Assert.IsTrue(html.Contains("More than 20 percent of the world�s oxygen comes from the Amazon Rainforest."));
                    Assert.IsTrue(html.Contains("practical ways to shrink the size of your step."));
                    break;

                case 12:
                    // The actual tumblr post
                    Assert.IsTrue(html.Contains("First of all, you should watch this video."));
                    // The next tumblr post, linked from the first, should not be included
                    Assert.IsFalse(html.Contains("I�ll let Neil deGrasse Tyson set this up"));
                    break;

                case 13:
                    Assert.IsTrue(html.Contains("Back in 2003"));
                    break;

                case 14:
                    Assert.IsFalse(html.Contains("</body><a"), "Content found after </body>");
                    break;

                case 15:
                    Assert.IsFalse(html.Contains("</body><header>"), "Content found after </body>");
                    break;

                case 16:
                {
                    // The marker must occur once after the body start and never again.
                    string marker = "It's the first day of school";
                    int bodyOffset = html.IndexOf("<body");
                    int firstHit = html.IndexOf(marker, bodyOffset);
                    Assert.IsTrue(firstHit > -1);

                    int secondHit = html.IndexOf(marker, firstHit + marker.Length);
                    Assert.AreEqual(-1, secondHit, "Article body repeated due to comment paging");
                    break;
                }

                case 17:
                {
                    // The marker must occur exactly once in the whole output.
                    string marker = "everybody should be treated equally";
                    int firstHit = html.IndexOf(marker);
                    Assert.IsTrue(firstHit > -1);

                    int secondHit = html.IndexOf(marker, firstHit + marker.Length);
                    Assert.AreEqual(-1, secondHit, "Article body repeated due to conditional comment parsing");
                    break;
                }

                case 18:
                    Assert.IsTrue(html.Contains("When Ben Franklin wrote"), "Missing start of text");
                    break;

                default:
                    throw new NotSupportedException("Unknown sample input number (" + sampleInputNumber + "). Have you added another sample input? If so, then add appropriate asserts here as well.");
            }
        }
Exemplo n.º 11
0
        /// <summary>
        /// Transcodes one sample input served by a <c>FileBasedUrlFetcherStub</c>, saves the output under
        /// <c>SampleWebOutput</c>, runs the per-sample assertions and finally asserts that the main
        /// content was reported as extracted.
        /// </summary>
        /// <param name="sampleInputNumber">Sample number; its URL set is read from <c>_Urls[sampleInputNumber - 1]</c>.</param>
        public void TestSampleInputs([Values(1, 2, 3, 4, 5, 6, 7, 8)] int sampleInputNumber)
        {
            const string outputDir = "SampleWebOutput";

            string[] sampleUrls = _Urls[sampleInputNumber - 1];
            string   firstUrl   = sampleUrls[0];

            var stubFetcher   = new FileBasedUrlFetcherStub(sampleInputNumber, sampleUrls);
            var transcoder    = new NReadabilityTranscoder();
            var webTranscoder = new NReadabilityWebTranscoder(transcoder, stubFetcher);

            bool   extracted;
            string html = webTranscoder.Transcode(firstUrl, out extracted);

            if (!Directory.Exists(outputDir))
            {
                Directory.CreateDirectory(outputDir);
            }

            // File name carries the sample number left-padded to two digits.
            string fileName = string.Format("SampleOutput_{0}.html", sampleInputNumber.ToString().PadLeft(2, '0'));
            File.WriteAllText(Path.Combine(outputDir, fileName), html, Encoding.UTF8);

            switch (sampleInputNumber)
            {
                case 1:
                    Assert.IsTrue(html.Contains(" freedom of movement or expression would constitute a new and unacceptable denial"));
                    Assert.IsTrue(html.Contains("Those expectations were on display in the crowd outside her house on Saturday."));
                    Assert.That(Regex.Matches(html, "Myanmar Junta Frees Dissident Daw Aung San Suu Kyi").Count, Is.EqualTo(4));
                    break;

                case 2:
                    Assert.IsTrue(html.Contains("For Louie and Phil, the conversations did more than keep their minds sharp."));
                    Assert.IsTrue(html.Contains("It was absolutely dark and absolutely silent, save for the chattering of Phil’s teeth."));
                    Assert.IsTrue(html.Contains("A serial runaway and artful dodger"));
                    Assert.That(Regex.Matches(html, @"Adrift but Unbroken \| Politics").Count, Is.EqualTo(2));
                    break;

                case 3:
                    Assert.IsTrue(html.Contains("The Chinese system as a whole has great weaknesses as well as great strengths."));
                    Assert.IsTrue(html.Contains(" This emphasis on limits is what begins pointing us back to coal."));
                    Assert.IsTrue(html.Contains(". For example, the possibility of dramatic rises in ocean levels, which could affect the habitability"));
                    // Guards against the title being duplicated.
                    Assert.That(Regex.Matches(html, "Dirty Coal, Clean Future - Magazine").Count, Is.EqualTo(3));
                    break;

                case 4:
                    // Duplicate content on a subsequent page must show up once only.
                    Assert.That(Regex.Matches(html, "his may seem paradoxical, or backward").Count, Is.EqualTo(1));
                    break;

                case 5:
                    // page 1
                    Assert.IsTrue(html.Contains("The pressure's on, and as you glance back and forth between your rack and the board, you can hardly believe your eyes at the play you can make."));
                    Assert.IsTrue(html.Contains("How can you take your game to the next level? Let's start by looking at game play."));
                    // page 2
                    Assert.IsTrue(html.Contains("The object of Scrabble is to get the most points by creating words."));
                    Assert.IsTrue(html.Contains("Now that you know the parts of the game, let's take a look at how to play it."));
                    // page 3
                    Assert.IsTrue(html.Contains("To determine who goes first, put all the tiles into the bag and mix them up."));
                    Assert.IsTrue(html.Contains("The game continues until one player uses all of his tiles and there aren't any in the pouch, or if there are no more tiles and no one can make a word. Add up the total of your unplayed tiles and deduct it from your score. If you've used all of your tiles, add the total of the unplayed tiles to your score. The winner has the most points."));
                    // page 4
                    Assert.IsTrue(html.Contains("If you play often enough, you'll need to learn how to play the board in order to get the highest score"));
                    Assert.IsTrue(html.Contains("With the game's popularity, it now comes in many variations. Let's take a look at some different ways to play Scrabble."));
                    // page 5
                    Assert.IsTrue(html.Contains("Many people play Scrabble on a traditional flat board with the grid imprinted on it."));
                    Assert.IsTrue(html.Contains("With its worldwide popularity, it only makes sense that Scrabble comes in languages other than English. "));
                    break;

                case 6:
                    // page 1
                    Assert.IsTrue(html.Contains("In the aftermath of his resignation and then his death"));
                    Assert.IsTrue(html.Contains("Curb Your Enthusiasm"));
                    // page 2
                    Assert.IsTrue(html.Contains("Jobs also seemed to suspect that he"));
                    Assert.IsTrue(html.Contains("And, sadly, it may remain one forever."));
                    break;

                case 7:
                    // page 1
                    Assert.IsTrue(html.Contains("post also betrays some misconceptions regarding our report."));
                    Assert.IsTrue(html.Contains("After all, none of us can resist the occasional study"));
                    // "page" 2 (false positive) must not be pulled in
                    Assert.IsFalse(html.Contains("In expressing this view, Clinton joins many Americans who worry about online misinformation, loss of privacy, and identity theft."));
                    break;

                case 8:
                    // page 1
                    Assert.IsTrue(html.Contains("For the last couple of days we've been asking people"));
                    Assert.IsTrue(html.Contains("list your favorite tools for slowing down feeds in the comments"));
                    // "page" 2 (false positive) must not be pulled in
                    Assert.IsFalse(html.Contains("signature fake news programs"));
                    break;

                default:
                    throw new NotSupportedException("Unknown sample input number (" + sampleInputNumber + "). Have you added another sample input? If so, then add appropriate asserts here as well.");
            }

            Assert.IsTrue(extracted);
        }
        /// <summary>
        /// Transcodes one numbered sample input through an <c>NReadabilityWebTranscoder</c> that is
        /// wired to a <c>UrlFetcherStub</c>, writes the produced markup into the "SampleWebOutput"
        /// directory, and then checks the markup for the text fragments expected of that sample.
        /// </summary>
        /// <param name="sampleInputNumber">
        /// One-based number of the sample; <c>sampleInputNumber - 1</c> is used as the index into <c>_Urls</c>.
        /// </param>
        /// <exception cref="NotSupportedException">
        /// Thrown when the sample number has no matching set of assertions in the switch below.
        /// </exception>
        public void TestSampleInputs([Values(1, 2, 3, 4, 5, 6, 7, 8)]int sampleInputNumber)
        {
            const string targetDir = "SampleWebOutput";

            // Two-character, zero-padded form of the sample number; used only in the output file name.
            string paddedNumber = sampleInputNumber.ToString().PadLeft(2, '0');
            string[] sampleUrls = _Urls[sampleInputNumber - 1];
            string firstUrl = sampleUrls[0];

            var urlFetcher = new UrlFetcherStub(sampleInputNumber, sampleUrls);
            var transcoder = new NReadabilityTranscoder();
            var webTranscoder = new NReadabilityWebTranscoder(transcoder, urlFetcher);

            bool contentFound;
            string html = webTranscoder.Transcode(firstUrl, out contentFound);

            if (!Directory.Exists(targetDir))
            {
                Directory.CreateDirectory(targetDir);
            }

            // The output is saved before any assertion runs, so it is on disk even when a check below fails.
            string outputPath = Path.Combine(targetDir, "SampleOutput_" + paddedNumber + ".html");
            File.WriteAllText(outputPath, html, Encoding.UTF8);

            switch (sampleInputNumber)
            {
                case 1:
                    Assert.IsTrue(html.Contains(" freedom of movement or expression would constitute a new and unacceptable denial"));
                    Assert.IsTrue(html.Contains("Those expectations were on display in the crowd outside her house on Saturday."));
                    Assert.That(Regex.Matches(html, "Myanmar Junta Frees Dissident Daw Aung San Suu Kyi").Count, Is.EqualTo(4));
                    break;

                case 2:
                    Assert.IsTrue(html.Contains("For Louie and Phil, the conversations did more than keep their minds sharp."));
                    Assert.IsTrue(html.Contains("It was absolutely dark and absolutely silent, save for the chattering of Phil’s teeth."));
                    Assert.IsTrue(html.Contains("A serial runaway and artful dodger"));
                    Assert.That(Regex.Matches(html, @"Adrift but Unbroken \| Politics").Count, Is.EqualTo(2));
                    break;

                case 3:
                    Assert.IsTrue(html.Contains("The Chinese system as a whole has great weaknesses as well as great strengths."));
                    Assert.IsTrue(html.Contains(" This emphasis on limits is what begins pointing us back to coal."));
                    Assert.IsTrue(html.Contains(". For example, the possibility of dramatic rises in ocean levels, which could affect the habitability"));
                    // The title must occur exactly three times, i.e. it is not duplicated.
                    Assert.That(Regex.Matches(html, "Dirty Coal, Clean Future - Magazine").Count, Is.EqualTo(3));
                    break;

                case 4:
                    // Content repeated on a subsequent page must appear only once in the output.
                    Assert.That(Regex.Matches(html, "his may seem paradoxical, or backward").Count, Is.EqualTo(1));
                    break;

                case 5:
                    // first page
                    Assert.IsTrue(html.Contains("The pressure's on, and as you glance back and forth between your rack and the board, you can hardly believe your eyes at the play you can make."));
                    Assert.IsTrue(html.Contains("How can you take your game to the next level? Let's start by looking at game play."));
                    // second page
                    Assert.IsTrue(html.Contains("The object of Scrabble is to get the most points by creating words."));
                    Assert.IsTrue(html.Contains("Now that you know the parts of the game, let's take a look at how to play it."));
                    // third page
                    Assert.IsTrue(html.Contains("To determine who goes first, put all the tiles into the bag and mix them up."));
                    Assert.IsTrue(html.Contains("The game continues until one player uses all of his tiles and there aren't any in the pouch, or if there are no more tiles and no one can make a word. Add up the total of your unplayed tiles and deduct it from your score. If you've used all of your tiles, add the total of the unplayed tiles to your score. The winner has the most points."));
                    // fourth page
                    Assert.IsTrue(html.Contains("If you play often enough, you'll need to learn how to play the board in order to get the highest score"));
                    Assert.IsTrue(html.Contains("With the game's popularity, it now comes in many variations. Let's take a look at some different ways to play Scrabble."));
                    // fifth page
                    Assert.IsTrue(html.Contains("Many people play Scrabble on a traditional flat board with the grid imprinted on it."));
                    Assert.IsTrue(html.Contains("With its worldwide popularity, it only makes sense that Scrabble comes in languages other than English. "));
                    break;

                case 6:
                    // first page
                    Assert.IsTrue(html.Contains("In the aftermath of his resignation and then his death"));
                    Assert.IsTrue(html.Contains("Curb Your Enthusiasm"));
                    // second page
                    Assert.IsTrue(html.Contains("Jobs also seemed to suspect that he"));
                    Assert.IsTrue(html.Contains("And, sadly, it may remain one forever."));
                    break;

                case 7:
                    // first page
                    Assert.IsTrue(html.Contains("post also betrays some misconceptions regarding our report."));
                    Assert.IsTrue(html.Contains("After all, none of us can resist the occasional study"));
                    // false-positive "second page": its text must be absent
                    Assert.IsFalse(html.Contains("In expressing this view, Clinton joins many Americans who worry about online misinformation, loss of privacy, and identity theft."));
                    break;

                case 8:
                    // first page
                    Assert.IsTrue(html.Contains("For the last couple of days we’ve been asking people"));
                    Assert.IsTrue(html.Contains("list your favorite tools for slowing down feeds in the comments"));
                    // false-positive "second page": its text must be absent
                    Assert.IsFalse(html.Contains("signature fake news programs"));
                    break;

                default:
                    throw new NotSupportedException("Unknown sample input number (" + sampleInputNumber + "). Have you added another sample input? If so, then add appropriate asserts here as well.");
            }

            Assert.IsTrue(contentFound);
        }
Exemplo n.º 13
0
        /// <summary>
        /// Transcodes one numbered sample input through an <c>NReadabilityWebTranscoder</c> that is
        /// wired to a <c>FileBasedUrlFetcherStub</c>, writes the extracted content into the
        /// "SampleWebOutput" directory, and verifies both the overall document shape (a single
        /// <c>&lt;html&gt;</c> open/close tag) and the sample-specific text expectations.
        /// </summary>
        /// <param name="sampleInputNumber">
        /// Number of the sample; it is used directly as the index/key into <c>_Urls</c> and to name the output file.
        /// </param>
        /// <exception cref="NotSupportedException">
        /// Thrown when the sample number has no matching set of assertions in the switch below.
        /// </exception>
        /// <remarks>
        /// All substring searches use <see cref="StringComparison.Ordinal"/>: the expectations are exact
        /// character sequences, and the "search again after the first hit" checks rely on the match
        /// being exactly <c>sample.Length</c> characters long, which only ordinal matching guarantees.
        /// </remarks>
        public void TestSampleInputs([Values(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)] int sampleInputNumber)
        {
            const string outputDir = "SampleWebOutput";

            string sampleInputNumberStr = sampleInputNumber.ToString().PadLeft(2, '0');

            string[] urls       = _Urls[sampleInputNumber];
            string   initialUrl = urls[0];

            var fetcher = new FileBasedUrlFetcherStub(sampleInputNumber, urls);
            var nReadabilityTranscoder    = new NReadabilityTranscoder();
            var nReadabilityWebTranscoder = new NReadabilityWebTranscoder(nReadabilityTranscoder, fetcher);

            var webTranscodingInput = new WebTranscodingInput(initialUrl);

            WebTranscodingResult webTranscodingResult = nReadabilityWebTranscoder.Transcode(webTranscodingInput);

            Assert.IsTrue(webTranscodingResult.ContentExtracted);

            if (!Directory.Exists(outputDir))
            {
                Directory.CreateDirectory(outputDir);
            }

            string extractedContent = webTranscodingResult.ExtractedContent;

            // Saved before the content assertions so the output is on disk when one of them fails.
            File.WriteAllText(
                Path.Combine(outputDir, string.Format("SampleOutput_{0}.html", sampleInputNumberStr)),
                extractedContent,
                Encoding.UTF8);

            // First and last occurrence coincide only when the tag appears at most once.
            Assert.AreEqual(
                extractedContent.IndexOf("<html", StringComparison.Ordinal),
                extractedContent.LastIndexOf("<html", StringComparison.Ordinal),
                "More than one <html> opening tag in the extracted content");
            Assert.AreEqual(
                extractedContent.IndexOf("</html", StringComparison.Ordinal),
                extractedContent.LastIndexOf("</html", StringComparison.Ordinal),
                "More than one </html> closing tag in the extracted content");

            switch (sampleInputNumber)
            {
            case 1:
                Assert.IsTrue(extractedContent.Contains(" freedom of movement or expression would constitute a new and unacceptable denial"));
                Assert.IsTrue(extractedContent.Contains("Those expectations were on display in the crowd outside her house on Saturday."));
                Assert.That(Regex.Matches(extractedContent, "Myanmar Junta Frees Dissident Daw Aung San Suu Kyi").Count, Is.EqualTo(4));
                break;

            case 2:
                Assert.IsTrue(extractedContent.Contains("For Louie and Phil, the conversations did more than keep their minds sharp."));
                Assert.IsTrue(extractedContent.Contains("It was absolutely dark and absolutely silent, save for the chattering of Phil’s teeth."));
                Assert.IsTrue(extractedContent.Contains("A serial runaway and artful dodger"));
                Assert.That(Regex.Matches(extractedContent, @"Adrift but Unbroken \| Politics").Count, Is.EqualTo(2));
                break;

            case 3:
                Assert.IsTrue(extractedContent.Contains("The Chinese system as a whole has great weaknesses as well as great strengths."));
                Assert.IsTrue(extractedContent.Contains(" This emphasis on limits is what begins pointing us back to coal."));
                Assert.IsTrue(extractedContent.Contains(". For example, the possibility of dramatic rises in ocean levels, which could affect the habitability"));
                Assert.That(Regex.Matches(extractedContent, "Dirty Coal, Clean Future - Magazine").Count, Is.EqualTo(3)); // Makes sure the title isn't duplicated
                break;

            case 4: // Test duplicate content on subsequent page
                Assert.That(Regex.Matches(extractedContent, "his may seem paradoxical, or backward").Count, Is.EqualTo(1));
                break;

            case 5:
                // page 1
                Assert.IsTrue(extractedContent.Contains("The pressure's on, and as you glance back and forth between your rack and the board, you can hardly believe your eyes at the play you can make."));
                Assert.IsTrue(extractedContent.Contains("How can you take your game to the next level? Let's start by looking at game play."));
                // page 2
                Assert.IsTrue(extractedContent.Contains("The object of Scrabble is to get the most points by creating words."));
                Assert.IsTrue(extractedContent.Contains("Now that you know the parts of the game, let's take a look at how to play it."));
                // page 3
                Assert.IsTrue(extractedContent.Contains("To determine who goes first, put all the tiles into the bag and mix them up."));
                Assert.IsTrue(extractedContent.Contains("The game continues until one player uses all of his tiles and there aren't any in the pouch, or if there are no more tiles and no one can make a word. Add up the total of your unplayed tiles and deduct it from your score. If you've used all of your tiles, add the total of the unplayed tiles to your score. The winner has the most points."));
                // page 4
                Assert.IsTrue(extractedContent.Contains("If you play often enough, you'll need to learn how to play the board in order to get the highest score"));
                Assert.IsTrue(extractedContent.Contains("With the game's popularity, it now comes in many variations. Let's take a look at some different ways to play Scrabble."));
                // page 5
                Assert.IsTrue(extractedContent.Contains("Many people play Scrabble on a traditional flat board with the grid imprinted on it."));
                Assert.IsTrue(extractedContent.Contains("With its worldwide popularity, it only makes sense that Scrabble comes in languages other than English. "));
                break;

            case 6:
                // page 1
                Assert.IsTrue(extractedContent.Contains("In the aftermath of his resignation and then his death"));
                Assert.IsTrue(extractedContent.Contains("Curb Your Enthusiasm"));
                // page 2
                Assert.IsTrue(extractedContent.Contains("Jobs also seemed to suspect that he"));
                Assert.IsTrue(extractedContent.Contains("And, sadly, it may remain one forever."));
                break;

            case 7:
                // page 1
                Assert.IsTrue(extractedContent.Contains("post also betrays some misconceptions regarding our report."));
                Assert.IsTrue(extractedContent.Contains("After all, none of us can resist the occasional study"));
                // "page" 2 (false positive)
                Assert.IsFalse(extractedContent.Contains("In expressing this view, Clinton joins many Americans who worry about online misinformation, loss of privacy, and identity theft."));
                break;

            case 8:
                // page 1
                Assert.IsTrue(extractedContent.Contains("For the last couple of days we've been asking people"));
                Assert.IsTrue(extractedContent.Contains("list your favorite tools for slowing down feeds in the comments"));
                // "page" 2 (false positive)
                Assert.IsFalse(extractedContent.Contains("signature fake news programs"));
                break;

            case 9:
                // page 1
                Assert.IsTrue(extractedContent.Contains("The story is narrated by a young girl named Jean Louise"));
                Assert.IsTrue(extractedContent.Contains("toward adulthood."));
                // page 2
                Assert.IsTrue(extractedContent.Contains("September arrives, and Dill leaves Maycomb to return to"));
                Assert.IsTrue(extractedContent.Contains("educational technique but the law."));
                break;

            case 10:
                // page 1
                Assert.IsTrue(extractedContent.Contains("he fire at the Triangle Waist Company"));
                Assert.IsTrue(extractedContent.Contains("at the hands of industrial greed."));
                // page 2
                Assert.IsTrue(extractedContent.Contains("he Triangle Waist Company was in many ways"));
                Assert.IsTrue(extractedContent.Contains("unsafe working conditions on their employees."));
                // page 3 (last)
                Assert.IsTrue(extractedContent.Contains("mmediately after the fire, Triangle owners Blanck and Harris"));
                Assert.IsTrue(extractedContent.Contains("and that it was \"second to none in the country.\""));
                break;

            case 11:
                Assert.IsTrue(extractedContent.Contains("More than 20 percent of the world’s oxygen comes from the Amazon Rainforest."));
                Assert.IsTrue(extractedContent.Contains("practical ways to shrink the size of your step."));
                break;

            case 12:
                // Actual tumblr post
                Assert.IsTrue(extractedContent.Contains("First of all, you should watch this video."));
                // Next tumblr post, linked from first - should not be included
                Assert.IsFalse(extractedContent.Contains("I’ll let Neil deGrasse Tyson set this up"));
                break;

            case 13:
                Assert.IsTrue(extractedContent.Contains("Back in 2003"));
                break;

            case 14:
                Assert.IsFalse(extractedContent.Contains("</body><a"), "Content found after </body>");
                break;

            case 15:
                Assert.IsFalse(extractedContent.Contains("</body><header>"), "Content found after </body>");
                break;

            case 16:
            {
                string sample    = "It's the first day of school";
                int    bodyStart = extractedContent.IndexOf("<body", StringComparison.Ordinal);
                // Without this guard a missing <body> would pass -1 as the start index below and
                // surface as an ArgumentOutOfRangeException instead of a readable assertion failure.
                Assert.IsTrue(bodyStart > -1, "Missing <body> element");
                int firstPageStart = extractedContent.IndexOf(sample, bodyStart, StringComparison.Ordinal);
                Assert.IsTrue(firstPageStart > -1);
                // Search again just past the first hit: a second hit means the body was repeated.
                Assert.AreEqual(-1, extractedContent.IndexOf(sample, firstPageStart + sample.Length, StringComparison.Ordinal), "Article body repeated due to comment paging");
            }
            break;

            case 17:
            {
                string sample         = "everybody should be treated equally";
                int    firstPageStart = extractedContent.IndexOf(sample, StringComparison.Ordinal);
                Assert.IsTrue(firstPageStart > -1);
                // Search again just past the first hit: a second hit means the body was repeated.
                Assert.AreEqual(-1, extractedContent.IndexOf(sample, firstPageStart + sample.Length, StringComparison.Ordinal), "Article body repeated due to conditional comment parsing");
            }
            break;

            case 18:
            {
                Assert.IsTrue(extractedContent.Contains("When Ben Franklin wrote"), "Missing start of text");
            }
            break;

            default:
                throw new NotSupportedException("Unknown sample input number (" + sampleInputNumber + "). Have you added another sample input? If so, then add appropriate asserts here as well.");
            }
        }