// Transcodes sample input number sampleInputNumber and verifies the extracted HTML.
        //
        // The URLs for the sample come from _Urls[sampleInputNumber]; the first one is the page handed to the
        // transcoder, and a FileBasedUrlFetcherStub built from the sample number and its URLs is used as the fetcher.
        // The extracted content is written to SampleWebOutput/SampleOutput_NN.html (presumably for manual
        // inspection) and then checked with the assertions specific to that sample.
        //
        // Throws NotSupportedException when the sample number has no assertions in the switch below.
        public void TestSampleInputs([Values(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)]int sampleInputNumber)
        {
            const string outputDir = "SampleWebOutput";

            string[] urls = _Urls[sampleInputNumber];
            string initialUrl = urls[0];

            var fetcher = new FileBasedUrlFetcherStub(sampleInputNumber, urls);
            var webTranscoder = new NReadabilityWebTranscoder(new NReadabilityTranscoder(), fetcher);

            WebTranscodingResult result = webTranscoder.Transcode(new WebTranscodingInput(initialUrl));

            Assert.IsTrue(result.ContentExtracted);

            string extractedContent = result.ExtractedContent;

            // CreateDirectory does nothing when the directory already exists, so no Exists check is needed.
            Directory.CreateDirectory(outputDir);

            File.WriteAllText(
                Path.Combine(outputDir, string.Format("SampleOutput_{0}.html", sampleInputNumber.ToString("D2"))),
                extractedContent,
                Encoding.UTF8);

            // First and last occurrence must coincide, i.e. the output holds at most one opening and one closing html tag.
            Assert.AreEqual(
                extractedContent.IndexOf("<html", StringComparison.Ordinal),
                extractedContent.LastIndexOf("<html", StringComparison.Ordinal),
                "Extracted content contains more than one \"<html\" tag");
            Assert.AreEqual(
                extractedContent.IndexOf("</html", StringComparison.Ordinal),
                extractedContent.LastIndexOf("</html", StringComparison.Ordinal),
                "Extracted content contains more than one \"</html\" tag");

            // Small assertion helpers; the message names the text so a failure shows which check broke.
            Action<string> assertContains = text =>
                Assert.IsTrue(extractedContent.Contains(text), "Expected text not found: " + text);
            Action<string> assertLacks = text =>
                Assert.IsFalse(extractedContent.Contains(text), "Unexpected text found: " + text);
            Func<string, int> countMatches = pattern => Regex.Matches(extractedContent, pattern).Count;

            // NOTE: typographic apostrophes are written as \u2019 escapes below. The literals previously held
            // U+FFFD (a mis-decoded character), which made the affected checks unable to match real text.
            switch (sampleInputNumber)
            {
                case 1:
                    assertContains(" freedom of movement or expression would constitute a new and unacceptable denial");
                    assertContains("Those expectations were on display in the crowd outside her house on Saturday.");
                    Assert.That(countMatches("Myanmar Junta Frees Dissident Daw Aung San Suu Kyi"), Is.EqualTo(4));
                    break;

                case 2:
                    assertContains("For Louie and Phil, the conversations did more than keep their minds sharp.");
                    assertContains("It was absolutely dark and absolutely silent, save for the chattering of Phil\u2019s teeth.");
                    assertContains("A serial runaway and artful dodger");
                    Assert.That(countMatches(@"Adrift but Unbroken \| Politics"), Is.EqualTo(2));
                    break;

                case 3:
                    assertContains("The Chinese system as a whole has great weaknesses as well as great strengths.");
                    assertContains(" This emphasis on limits is what begins pointing us back to coal.");
                    assertContains(". For example, the possibility of dramatic rises in ocean levels, which could affect the habitability");
                    Assert.That(countMatches("Dirty Coal, Clean Future - Magazine"), Is.EqualTo(3)); // Makes sure the title isn't duplicated
                    break;

                case 4:  // Test duplicate content on subsequent page
                    Assert.That(countMatches("his may seem paradoxical, or backward"), Is.EqualTo(1));
                    break;

                case 5:
                    // page 1
                    assertContains("The pressure's on, and as you glance back and forth between your rack and the board, you can hardly believe your eyes at the play you can make.");
                    assertContains("How can you take your game to the next level? Let's start by looking at game play.");
                    // page 2
                    assertContains("The object of Scrabble is to get the most points by creating words.");
                    assertContains("Now that you know the parts of the game, let's take a look at how to play it.");
                    // page 3
                    assertContains("To determine who goes first, put all the tiles into the bag and mix them up.");
                    assertContains("The game continues until one player uses all of his tiles and there aren't any in the pouch, or if there are no more tiles and no one can make a word. Add up the total of your unplayed tiles and deduct it from your score. If you've used all of your tiles, add the total of the unplayed tiles to your score. The winner has the most points.");
                    // page 4
                    assertContains("If you play often enough, you'll need to learn how to play the board in order to get the highest score");
                    assertContains("With the game's popularity, it now comes in many variations. Let's take a look at some different ways to play Scrabble.");
                    // page 5
                    assertContains("Many people play Scrabble on a traditional flat board with the grid imprinted on it.");
                    assertContains("With its worldwide popularity, it only makes sense that Scrabble comes in languages other than English. ");
                    break;

                case 6:
                    // page 1
                    assertContains("In the aftermath of his resignation and then his death");
                    assertContains("Curb Your Enthusiasm");
                    // page 2
                    assertContains("Jobs also seemed to suspect that he");
                    assertContains("And, sadly, it may remain one forever.");
                    break;

                case 7:
                    // page 1
                    assertContains("post also betrays some misconceptions regarding our report.");
                    assertContains("After all, none of us can resist the occasional study");
                    // "page" 2 (false positive)
                    assertLacks("In expressing this view, Clinton joins many Americans who worry about online misinformation, loss of privacy, and identity theft.");
                    break;

                case 8:
                    // page 1
                    assertContains("For the last couple of days we've been asking people");
                    assertContains("list your favorite tools for slowing down feeds in the comments");
                    // "page" 2 (false positive)
                    assertLacks("signature fake news programs");
                    break;

                case 9:
                    // page 1
                    assertContains("The story is narrated by a young girl named Jean Louise");
                    assertContains("toward adulthood.");
                    // page 2
                    assertContains("September arrives, and Dill leaves Maycomb to return to");
                    assertContains("educational technique but the law.");
                    break;

                case 10:
                    // page 1
                    assertContains("he fire at the Triangle Waist Company");
                    assertContains("at the hands of industrial greed.");
                    // page 2
                    assertContains("he Triangle Waist Company was in many ways");
                    assertContains("unsafe working conditions on their employees.");
                    // page 3 (last)
                    assertContains("mmediately after the fire, Triangle owners Blanck and Harris");
                    assertContains("and that it was \"second to none in the country.\"");
                    break;

                case 11:
                    assertContains("More than 20 percent of the world\u2019s oxygen comes from the Amazon Rainforest.");
                    assertContains("practical ways to shrink the size of your step.");
                    break;

                case 12:
                    // Actual tumblr post
                    assertContains("First of all, you should watch this video.");
                    // Next tumblr post, linked from first - should not be included
                    assertLacks("I\u2019ll let Neil deGrasse Tyson set this up");
                    break;

                case 13:
                    assertContains("Back in 2003");
                    break;

                case 14:
                    Assert.IsFalse(extractedContent.Contains("</body><a"), "Content found after </body>");
                    break;

                case 15:
                    Assert.IsFalse(extractedContent.Contains("</body><header>"), "Content found after </body>");
                    break;

                case 16:
                {
                    const string sample = "It's the first day of school";
                    int bodyStart = extractedContent.IndexOf("<body", StringComparison.Ordinal);
                    // Without this guard a missing "<body" would surface as an ArgumentOutOfRangeException
                    // from the IndexOf call below instead of a readable assertion failure.
                    Assert.IsTrue(bodyStart > -1, "No \"<body\" tag found in the extracted content");
                    int firstPageStart = extractedContent.IndexOf(sample, bodyStart, StringComparison.Ordinal);
                    Assert.IsTrue(firstPageStart > -1);
                    // The sample sentence must not occur a second time after its first occurrence.
                    Assert.AreEqual(-1, extractedContent.IndexOf(sample, firstPageStart + sample.Length, StringComparison.Ordinal), "Article body repeated due to comment paging");
                    break;
                }

                case 17:
                {
                    const string sample = "everybody should be treated equally";
                    int firstPageStart = extractedContent.IndexOf(sample, StringComparison.Ordinal);
                    Assert.IsTrue(firstPageStart > -1);
                    // The sample sentence must not occur a second time after its first occurrence.
                    Assert.AreEqual(-1, extractedContent.IndexOf(sample, firstPageStart + sample.Length, StringComparison.Ordinal), "Article body repeated due to conditional comment parsing");
                    break;
                }

                case 18:
                    Assert.IsTrue(extractedContent.Contains("When Ben Franklin wrote"), "Missing start of text");
                    break;

                default:
                    throw new NotSupportedException("Unknown sample input number (" + sampleInputNumber + "). Have you added another sample input? If so, then add appropriate asserts here as well.");
            }
        }
        // Transcodes sample input number sampleInputNumber and verifies the extracted HTML.
        //
        // The URLs for the sample come from _Urls[sampleInputNumber]; the first one is the page handed to the
        // transcoder, and a FileBasedUrlFetcherStub built from the sample number and its URLs is used as the fetcher.
        // The extracted content is written to SampleWebOutput/SampleOutput_NN.html (presumably for manual
        // inspection) and then checked with the assertions specific to that sample.
        //
        // Throws NotSupportedException when the sample number has no assertions in the switch below.
        public void TestSampleInputs([Values(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)]int sampleInputNumber)
        {
            const string outputDir = "SampleWebOutput";

            string[] urls = _Urls[sampleInputNumber];
            string initialUrl = urls[0];

            var fetcher = new FileBasedUrlFetcherStub(sampleInputNumber, urls);
            var webTranscoder = new NReadabilityWebTranscoder(new NReadabilityTranscoder(), fetcher);

            WebTranscodingResult result = webTranscoder.Transcode(new WebTranscodingInput(initialUrl));

            Assert.IsTrue(result.ContentExtracted);

            string extractedContent = result.ExtractedContent;

            // CreateDirectory does nothing when the directory already exists, so no Exists check is needed.
            Directory.CreateDirectory(outputDir);

            File.WriteAllText(
                Path.Combine(outputDir, string.Format("SampleOutput_{0}.html", sampleInputNumber.ToString("D2"))),
                extractedContent,
                Encoding.UTF8);

            // Small assertion helpers; the message names the text so a failure shows which check broke.
            Action<string> assertContains = text =>
                Assert.IsTrue(extractedContent.Contains(text), "Expected text not found: " + text);
            Action<string> assertLacks = text =>
                Assert.IsFalse(extractedContent.Contains(text), "Unexpected text found: " + text);
            Func<string, int> countMatches = pattern => Regex.Matches(extractedContent, pattern).Count;

            // NOTE: typographic apostrophes are written as \u2019 escapes below. The literals previously held
            // U+FFFD (a mis-decoded character), which made the affected checks unable to match real text.
            switch (sampleInputNumber)
            {
                case 1:
                    assertContains(" freedom of movement or expression would constitute a new and unacceptable denial");
                    assertContains("Those expectations were on display in the crowd outside her house on Saturday.");
                    Assert.That(countMatches("Myanmar Junta Frees Dissident Daw Aung San Suu Kyi"), Is.EqualTo(4));
                    break;

                case 2:
                    assertContains("For Louie and Phil, the conversations did more than keep their minds sharp.");
                    assertContains("It was absolutely dark and absolutely silent, save for the chattering of Phil\u2019s teeth.");
                    assertContains("A serial runaway and artful dodger");
                    Assert.That(countMatches(@"Adrift but Unbroken \| Politics"), Is.EqualTo(2));
                    break;

                case 3:
                    assertContains("The Chinese system as a whole has great weaknesses as well as great strengths.");
                    assertContains(" This emphasis on limits is what begins pointing us back to coal.");
                    assertContains(". For example, the possibility of dramatic rises in ocean levels, which could affect the habitability");
                    Assert.That(countMatches("Dirty Coal, Clean Future - Magazine"), Is.EqualTo(3)); // Makes sure the title isn't duplicated
                    break;

                case 4:  // Test duplicate content on subsequent page
                    Assert.That(countMatches("his may seem paradoxical, or backward"), Is.EqualTo(1));
                    break;

                case 5:
                    // page 1
                    assertContains("The pressure's on, and as you glance back and forth between your rack and the board, you can hardly believe your eyes at the play you can make.");
                    assertContains("How can you take your game to the next level? Let's start by looking at game play.");
                    // page 2
                    assertContains("The object of Scrabble is to get the most points by creating words.");
                    assertContains("Now that you know the parts of the game, let's take a look at how to play it.");
                    // page 3
                    assertContains("To determine who goes first, put all the tiles into the bag and mix them up.");
                    assertContains("The game continues until one player uses all of his tiles and there aren't any in the pouch, or if there are no more tiles and no one can make a word. Add up the total of your unplayed tiles and deduct it from your score. If you've used all of your tiles, add the total of the unplayed tiles to your score. The winner has the most points.");
                    // page 4
                    assertContains("If you play often enough, you'll need to learn how to play the board in order to get the highest score");
                    assertContains("With the game's popularity, it now comes in many variations. Let's take a look at some different ways to play Scrabble.");
                    // page 5
                    assertContains("Many people play Scrabble on a traditional flat board with the grid imprinted on it.");
                    assertContains("With its worldwide popularity, it only makes sense that Scrabble comes in languages other than English. ");
                    break;

                case 6:
                    // page 1
                    assertContains("In the aftermath of his resignation and then his death");
                    assertContains("Curb Your Enthusiasm");
                    // page 2
                    assertContains("Jobs also seemed to suspect that he");
                    assertContains("And, sadly, it may remain one forever.");
                    break;

                case 7:
                    // page 1
                    assertContains("post also betrays some misconceptions regarding our report.");
                    assertContains("After all, none of us can resist the occasional study");
                    // "page" 2 (false positive)
                    assertLacks("In expressing this view, Clinton joins many Americans who worry about online misinformation, loss of privacy, and identity theft.");
                    break;

                case 8:
                    // page 1
                    assertContains("For the last couple of days we\u2019ve been asking people");
                    assertContains("list your favorite tools for slowing down feeds in the comments");
                    // "page" 2 (false positive)
                    assertLacks("signature fake news programs");
                    break;

                case 9:
                    // page 1
                    assertContains("The story is narrated by a young girl named Jean Louise");
                    assertContains("toward adulthood.");
                    // page 2
                    assertContains("September arrives, and Dill leaves Maycomb to return to");
                    assertContains("educational technique but the law.");
                    break;

                case 10:
                    // page 1
                    assertContains("Curious about Native Client");
                    assertContains("also known as the GLES2 Blue Book");
                    // page 2
                    assertContains("Most games written specifically for PC");
                    assertContains("The things a good script should do");
                    // page 3
                    assertContains("The NaCl team is working hard on debugging");
                    assertContains("Unfortunately this isn't really documented");
                    break;

                case 11:
                    // page 1
                    assertContains("Sony press conference at Gamescom");
                    assertContains("The guys can actually model inside the game");
                    // page 2
                    assertContains("You actually fold");
                    assertContains("working on the skin shader right now");
                    // page 3
                    assertContains("It was the founding thing");
                    assertContains("opportunities that you just did not have on the PS3");
                    break;

                default:
                    throw new NotSupportedException("Unknown sample input number (" + sampleInputNumber + "). Have you added another sample input? If so, then add appropriate asserts here as well.");
            }
        }
// Example #3
// 0
        // Transcodes sample input number sampleInputNumber and verifies the produced HTML.
        //
        // The URLs for the sample are read from _Urls[sampleInputNumber - 1]; the first one is the page handed
        // to the transcoder, and a FileBasedUrlFetcherStub built from the sample number and its URLs is used as
        // the fetcher. The transcoded content is written to SampleWebOutput/SampleOutput_NN.html and then
        // checked with the assertions specific to that sample. The "main content extracted" flag is asserted
        // last, after the content checks.
        //
        // Throws NotSupportedException when the sample number has no assertions in the switch below.
        public void TestSampleInputs([Values(1, 2, 3, 4, 5, 6, 7, 8)] int sampleInputNumber)
        {
            const string outputDir = "SampleWebOutput";

            string paddedNumber = sampleInputNumber.ToString().PadLeft(2, '0');
            string[] sampleUrls = _Urls[sampleInputNumber - 1];
            string firstUrl = sampleUrls[0];

            var urlFetcher = new FileBasedUrlFetcherStub(sampleInputNumber, sampleUrls);
            var webTranscoder = new NReadabilityWebTranscoder(new NReadabilityTranscoder(), urlFetcher);

            bool contentExtracted;
            string html = webTranscoder.Transcode(firstUrl, out contentExtracted);

            if (!Directory.Exists(outputDir))
            {
                Directory.CreateDirectory(outputDir);
            }

            string outputPath = Path.Combine(outputDir, string.Format("SampleOutput_{0}.html", paddedNumber));
            File.WriteAllText(outputPath, html, Encoding.UTF8);

            // Shorthands for the three kinds of check used by the per-sample assertions.
            Action<string> mustContain = text => Assert.IsTrue(html.Contains(text));
            Action<string> mustNotContain = text => Assert.IsFalse(html.Contains(text));
            Func<string, int> matchCount = pattern => Regex.Matches(html, pattern).Count;

            switch (sampleInputNumber)
            {
                case 1:
                    mustContain(" freedom of movement or expression would constitute a new and unacceptable denial");
                    mustContain("Those expectations were on display in the crowd outside her house on Saturday.");
                    Assert.That(matchCount("Myanmar Junta Frees Dissident Daw Aung San Suu Kyi"), Is.EqualTo(4));
                    break;

                case 2:
                    mustContain("For Louie and Phil, the conversations did more than keep their minds sharp.");
                    mustContain("It was absolutely dark and absolutely silent, save for the chattering of Phil’s teeth.");
                    mustContain("A serial runaway and artful dodger");
                    Assert.That(matchCount(@"Adrift but Unbroken \| Politics"), Is.EqualTo(2));
                    break;

                case 3:
                    mustContain("The Chinese system as a whole has great weaknesses as well as great strengths.");
                    mustContain(" This emphasis on limits is what begins pointing us back to coal.");
                    mustContain(". For example, the possibility of dramatic rises in ocean levels, which could affect the habitability");
                    Assert.That(matchCount("Dirty Coal, Clean Future - Magazine"), Is.EqualTo(3)); // Makes sure the title isn't duplicated
                    break;

                case 4: // Test duplicate content on subsequent page
                    Assert.That(matchCount("his may seem paradoxical, or backward"), Is.EqualTo(1));
                    break;

                case 5:
                    // page 1
                    mustContain("The pressure's on, and as you glance back and forth between your rack and the board, you can hardly believe your eyes at the play you can make.");
                    mustContain("How can you take your game to the next level? Let's start by looking at game play.");
                    // page 2
                    mustContain("The object of Scrabble is to get the most points by creating words.");
                    mustContain("Now that you know the parts of the game, let's take a look at how to play it.");
                    // page 3
                    mustContain("To determine who goes first, put all the tiles into the bag and mix them up.");
                    mustContain("The game continues until one player uses all of his tiles and there aren't any in the pouch, or if there are no more tiles and no one can make a word. Add up the total of your unplayed tiles and deduct it from your score. If you've used all of your tiles, add the total of the unplayed tiles to your score. The winner has the most points.");
                    // page 4
                    mustContain("If you play often enough, you'll need to learn how to play the board in order to get the highest score");
                    mustContain("With the game's popularity, it now comes in many variations. Let's take a look at some different ways to play Scrabble.");
                    // page 5
                    mustContain("Many people play Scrabble on a traditional flat board with the grid imprinted on it.");
                    mustContain("With its worldwide popularity, it only makes sense that Scrabble comes in languages other than English. ");
                    break;

                case 6:
                    // page 1
                    mustContain("In the aftermath of his resignation and then his death");
                    mustContain("Curb Your Enthusiasm");
                    // page 2
                    mustContain("Jobs also seemed to suspect that he");
                    mustContain("And, sadly, it may remain one forever.");
                    break;

                case 7:
                    // page 1
                    mustContain("post also betrays some misconceptions regarding our report.");
                    mustContain("After all, none of us can resist the occasional study");
                    // "page" 2 (false positive)
                    mustNotContain("In expressing this view, Clinton joins many Americans who worry about online misinformation, loss of privacy, and identity theft.");
                    break;

                case 8:
                    // page 1
                    mustContain("For the last couple of days we've been asking people");
                    mustContain("list your favorite tools for slowing down feeds in the comments");
                    // "page" 2 (false positive)
                    mustNotContain("signature fake news programs");
                    break;

                default:
                    throw new NotSupportedException("Unknown sample input number (" + sampleInputNumber + "). Have you added another sample input? If so, then add appropriate asserts here as well.");
            }

            Assert.IsTrue(contentExtracted);
        }
// Example #4
// 0
        public void TestSampleInputs([Values(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)] int sampleInputNumber)
        {
            const string outputDir = "SampleWebOutput";

            string sampleInputNumberStr = sampleInputNumber.ToString().PadLeft(2, '0');

            string[] urls       = _Urls[sampleInputNumber];
            string   initialUrl = urls[0];

            var fetcher = new FileBasedUrlFetcherStub(sampleInputNumber, urls);
            var nReadabilityTranscoder    = new NReadabilityTranscoder();
            var nReadabilityWebTranscoder = new NReadabilityWebTranscoder(nReadabilityTranscoder, fetcher);

            var webTranscodingInput = new WebTranscodingInput(initialUrl);

            WebTranscodingResult webTranscodingResult = nReadabilityWebTranscoder.Transcode(webTranscodingInput);

            Assert.IsTrue(webTranscodingResult.ContentExtracted);

            if (!Directory.Exists(outputDir))
            {
                Directory.CreateDirectory(outputDir);
            }

            string extractedContent = webTranscodingResult.ExtractedContent;

            File.WriteAllText(
                Path.Combine(outputDir, string.Format("SampleOutput_{0}.html", sampleInputNumberStr)),
                extractedContent,
                Encoding.UTF8);

            Assert.IsTrue(extractedContent.IndexOf("<html") == extractedContent.LastIndexOf("<html"));
            Assert.IsTrue(extractedContent.IndexOf("</html") == extractedContent.LastIndexOf("</html"));

            switch (sampleInputNumber)
            {
            case 1:
                Assert.IsTrue(extractedContent.Contains(" freedom of movement or expression would constitute a new and unacceptable denial"));
                Assert.IsTrue(extractedContent.Contains("Those expectations were on display in the crowd outside her house on Saturday."));
                Assert.That(Regex.Matches(extractedContent, "Myanmar Junta Frees Dissident Daw Aung San Suu Kyi").Count, Is.EqualTo(4));
                break;

            case 2:
                Assert.IsTrue(extractedContent.Contains("For Louie and Phil, the conversations did more than keep their minds sharp."));
                Assert.IsTrue(extractedContent.Contains("It was absolutely dark and absolutely silent, save for the chattering of Phil’s teeth."));
                Assert.IsTrue(extractedContent.Contains("A serial runaway and artful dodger"));
                Assert.That(Regex.Matches(extractedContent, @"Adrift but Unbroken \| Politics").Count, Is.EqualTo(2));
                break;

            case 3:
                Assert.IsTrue(extractedContent.Contains("The Chinese system as a whole has great weaknesses as well as great strengths."));
                Assert.IsTrue(extractedContent.Contains(" This emphasis on limits is what begins pointing us back to coal."));
                Assert.IsTrue(extractedContent.Contains(". For example, the possibility of dramatic rises in ocean levels, which could affect the habitability"));
                Assert.That(Regex.Matches(extractedContent, "Dirty Coal, Clean Future - Magazine").Count, Is.EqualTo(3)); // Makes sure the title isn't duplicated
                break;

            case 4: // Test duplicate content on subsequent page
                Assert.That(Regex.Matches(extractedContent, "his may seem paradoxical, or backward").Count, Is.EqualTo(1));
                break;

            case 5:
                // page 1
                Assert.IsTrue(extractedContent.Contains("The pressure's on, and as you glance back and forth between your rack and the board, you can hardly believe your eyes at the play you can make."));
                Assert.IsTrue(extractedContent.Contains("How can you take your game to the next level? Let's start by looking at game play."));
                // page 2
                Assert.IsTrue(extractedContent.Contains("The object of Scrabble is to get the most points by creating words."));
                Assert.IsTrue(extractedContent.Contains("Now that you know the parts of the game, let's take a look at how to play it."));
                // page 3
                Assert.IsTrue(extractedContent.Contains("To determine who goes first, put all the tiles into the bag and mix them up."));
                Assert.IsTrue(extractedContent.Contains("The game continues until one player uses all of his tiles and there aren't any in the pouch, or if there are no more tiles and no one can make a word. Add up the total of your unplayed tiles and deduct it from your score. If you've used all of your tiles, add the total of the unplayed tiles to your score. The winner has the most points."));
                // page 4
                Assert.IsTrue(extractedContent.Contains("If you play often enough, you'll need to learn how to play the board in order to get the highest score"));
                Assert.IsTrue(extractedContent.Contains("With the game's popularity, it now comes in many variations. Let's take a look at some different ways to play Scrabble."));
                // page 5
                Assert.IsTrue(extractedContent.Contains("Many people play Scrabble on a traditional flat board with the grid imprinted on it."));
                Assert.IsTrue(extractedContent.Contains("With its worldwide popularity, it only makes sense that Scrabble comes in languages other than English. "));
                break;

            case 6:
                // page 1
                Assert.IsTrue(extractedContent.Contains("In the aftermath of his resignation and then his death"));
                Assert.IsTrue(extractedContent.Contains("Curb Your Enthusiasm"));
                // page 2
                Assert.IsTrue(extractedContent.Contains("Jobs also seemed to suspect that he"));
                Assert.IsTrue(extractedContent.Contains("And, sadly, it may remain one forever."));
                break;

            case 7:
                // page 1
                Assert.IsTrue(extractedContent.Contains("post also betrays some misconceptions regarding our report."));
                Assert.IsTrue(extractedContent.Contains("After all, none of us can resist the occasional study"));
                // "page" 2 (false positive)
                Assert.IsFalse(extractedContent.Contains("In expressing this view, Clinton joins many Americans who worry about online misinformation, loss of privacy, and identity theft."));
                break;

            case 8:
                // page 1
                Assert.IsTrue(extractedContent.Contains("For the last couple of days we've been asking people"));
                Assert.IsTrue(extractedContent.Contains("list your favorite tools for slowing down feeds in the comments"));
                // "page" 2 (false positive)
                Assert.IsFalse(extractedContent.Contains("signature fake news programs"));
                break;

            case 9:
                // page 1
                Assert.IsTrue(extractedContent.Contains("The story is narrated by a young girl named Jean Louise"));
                Assert.IsTrue(extractedContent.Contains("toward adulthood."));
                // page 2
                Assert.IsTrue(extractedContent.Contains("September arrives, and Dill leaves Maycomb to return to"));
                Assert.IsTrue(extractedContent.Contains("educational technique but the law."));
                break;

            case 10:
                // page 1
                Assert.IsTrue(extractedContent.Contains("he fire at the Triangle Waist Company"));
                Assert.IsTrue(extractedContent.Contains("at the hands of industrial greed."));
                // page 2
                Assert.IsTrue(extractedContent.Contains("he Triangle Waist Company was in many ways"));
                Assert.IsTrue(extractedContent.Contains("unsafe working conditions on their employees."));
                // page 3 (last)
                Assert.IsTrue(extractedContent.Contains("mmediately after the fire, Triangle owners Blanck and Harris"));
                Assert.IsTrue(extractedContent.Contains("and that it was \"second to none in the country.\""));
                break;

            case 11:
                Assert.IsTrue(extractedContent.Contains("More than 20 percent of the world’s oxygen comes from the Amazon Rainforest."));
                Assert.IsTrue(extractedContent.Contains("practical ways to shrink the size of your step."));
                break;

            case 12:
                // Actual tumblr post
                Assert.IsTrue(extractedContent.Contains("First of all, you should watch this video."));
                // Next tumblr post, linked from first - should not be included
                Assert.IsFalse(extractedContent.Contains("I’ll let Neil deGrasse Tyson set this up"));
                break;

            case 13:
                Assert.IsTrue(extractedContent.Contains("Back in 2003"));
                break;

            case 14:
                Assert.IsFalse(extractedContent.Contains("</body><a"), "Content found after </body>");
                break;

            case 15:
                Assert.IsFalse(extractedContent.Contains("</body><header>"), "Content found after </body>");
                break;

            case 16:
            {
                string sample         = "It's the first day of school";
                int    bodyStart      = extractedContent.IndexOf("<body");
                int    firstPageStart = extractedContent.IndexOf(sample, bodyStart);
                Assert.IsTrue(firstPageStart > -1);
                Assert.AreEqual(-1, extractedContent.IndexOf(sample, firstPageStart + sample.Length), "Article body repeated due to comment paging");
            }
            break;

            case 17:
            {
                string sample         = "everybody should be treated equally";
                int    firstPageStart = extractedContent.IndexOf(sample);
                Assert.IsTrue(firstPageStart > -1);
                Assert.AreEqual(-1, extractedContent.IndexOf(sample, firstPageStart + sample.Length), "Article body repeated due to conditional comment parsing");
            }
            break;

            case 18:
            {
                Assert.IsTrue(extractedContent.Contains("When Ben Franklin wrote"), "Missing start of text");
            }
            break;

            default:
                throw new NotSupportedException("Unknown sample input number (" + sampleInputNumber + "). Have you added another sample input? If so, then add appropriate asserts here as well.");
            }
        }