Esempio n. 1
0
        /// <summary>
        /// Runs the extract-and-clean pipeline over a fixed range of single-page PDFs:
        /// extracts the text of each page, applies <c>PageFormatter.FixCommonOCRErrors</c>,
        /// verifies the result, and writes the cleaned text to the output directory.
        /// </summary>
        /// <remarks>
        /// Input and output locations are hard-coded absolute paths, so this test only
        /// works on a machine where the per-page PDFs already exist under the source
        /// directory. The output directory is created if it is missing.
        /// </remarks>
        public void TestExtractTextAndWash()
        {
            // Inclusive page range to process.
            const int firstPage = 737;
            const int lastPage  = 970;

            // Loop-invariant locations, hoisted out of the per-page loop.
            const string sourcePath   = @"c:\code\PdfTranslator\Solution1\PageExtractor.Tests\bin\Debug\Extracted\";
            const string outDirectory = @"c:\code\PdfTranslator\Solution1\Output\Latin\Raw\";

            // File.WriteAllText does not create missing directories; this call is a
            // no-op when the directory already exists.
            Directory.CreateDirectory(outDirectory);

            for (int page = firstPage; page <= lastPage; page++)
            {
                // Input file names carry the bare page number, while output names are
                // zero-padded to at least three digits.
                var pdfFilename = "Page" + page + "_Iosephi_Scaligeri_Opus_de_emendatione_te.pdf";
                var outFilename = "Page" + page.ToString("00#") + "_cleanedText_Iosephi_Scaligeri_Opus_de_emendatione_te.txt";

                // A fresh extractor per page, as before: whether TextExtractor keeps
                // state between calls is not visible from this test.
                var extractor    = new TextExtractor();
                var textContents = extractor.Extract(Path.Combine(sourcePath, pdfFilename));

                Assert.IsNotNull(textContents, "Text extraction returned null for page " + page + " (" + pdfFilename + ").");

                var formatter = new PageFormatter()
                {
                    OriginalText = textContents.Text
                };

                var correctedText = formatter.FixCommonOCRErrors();

                // NOTE(review): " fed " is presumably the OCR misreading of a long-s
                // "sed" — confirm against PageFormatter's correction rules.
                Assert.IsFalse(correctedText.Contains(" fed "), "Corrected text for page " + page + " still contains \" fed \".");

                File.WriteAllText(Path.Combine(outDirectory, outFilename), correctedText);
            }
        }