Esempio n. 1
0
        /// <summary>
        /// OCRs every page of a multi-page TIFF in parallel (one Tesseract engine per page)
        /// and then writes the recognized pages, in their original order, to a PDF.
        /// </summary>
        /// <param name="pFilePath">Path of the multi-page TIFF to process.</param>
        private void ProcessByPage(string pFilePath)
        {
            string fileNameWithoutExtension = Path.GetFileNameWithoutExtension(pFilePath);
            // Output path for the OCR'd PDF, given without an extension.
            string caminhoPdf = Path.Combine(txbDirSaida.Text, fileNameWithoutExtension);
            // Input name handed to the engine; identical for every page, so compute it once.
            string inputName = Path.GetFileNameWithoutExtension(fileNameWithoutExtension);

            using (IResultRenderer render = ResultRenderer.CreatePdfRenderer(caminhoPdf, _tessDataPath)) // PDF path and path to the PDF font data
            using (PixArray pages = PixArray.LoadMultiPageTiffFromFile(pFilePath))                       // Loads all pages of the TIFF
            using (render.BeginDocument(fileNameWithoutExtension))                                       // Starts the PDF document
            {
                // Guards the collections below: they are filled from several tasks at once and
                // neither List<T> nor SortedList<TKey, TValue> supports concurrent writers.
                object gate = new object();
                List <Task>            lstTasksRunning   = new List <Task>();
                List <TesseractEngine> lstEngines        = new List <TesseractEngine>();
                SortedList <int, Page> lstProcessedPages = new SortedList <int, Page>();

                int currentPage = 0;
                foreach (Pix page in pages)
                {
                    // Per-iteration copies so each task captures its own index and image.
                    int pageIndex = currentPage;
                    Pix pageImage = page;
                    lstTasksRunning.Add(Task.Factory.StartNew(() =>
                    {
                        // Tessdata folder and OCR language. The engine is NOT disposed here:
                        // the Page it produces is consumed by render.AddPage after all tasks
                        // finish, so the engine has to outlive the task.
                        // NOTE(review): assumes a Page is unusable once its engine is disposed — confirm against the wrapper.
                        TesseractEngine tesseract = new TesseractEngine(_tessDataPath, "por");
                        lock (gate)
                        {
                            lstEngines.Add(tesseract);
                        }

                        Page pagina = tesseract.Process(pageImage, inputName);
                        lock (gate)
                        {
                            lstProcessedPages.Add(pageIndex, pagina);
                        }
                    }));
                    currentPage++;
                }

                try
                {
                    // Blocks until every task has completed; rethrows task failures.
                    Task.WaitAll(lstTasksRunning.ToArray());

                    // SortedList enumerates by key, which restores the original page order.
                    foreach (var pageKeyValue in lstProcessedPages)
                    {
                        render.AddPage(pageKeyValue.Value);
                    }
                }
                finally
                {
                    // All tasks are finished at this point (WaitAll only throws after they complete),
                    // so the collections can be read without the lock. Pages go first, then engines.
                    foreach (Page processedPage in lstProcessedPages.Values)
                    {
                        processedPage.Dispose();
                    }
                    foreach (TesseractEngine engine in lstEngines)
                    {
                        engine.Dispose();
                    }
                }
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Creates the result renderer selected by <c>OutputFormat</c> ("text", "hocr" or "pdf")
        /// and processes <paramref name="filename"/> with it. Files ending in ".tif" are routed
        /// to <c>ProcessTiffFile</c>; everything else goes to the renderer-based <c>ProcessFile</c> overload.
        /// </summary>
        /// <param name="filename">Path of the input image file.</param>
        public override void ProcessFile(string filename)
        {
            List <IResultRenderer> resultRenderers = new List <IResultRenderer>();

            // Any other OutputFormat value adds no renderer at all.
            switch (OutputFormat)
            {
            case "text":
                resultRenderers.Add(ResultRenderer.CreateTextRenderer(OutputFile));
                break;

            case "hocr":
                resultRenderers.Add(ResultRenderer.CreateHOcrRenderer(OutputFile));
                break;

            case "pdf":
                // Path.Combine instead of a hard-coded "\\" so the tessdata path is also
                // valid on platforms whose directory separator is not a backslash.
                resultRenderers.Add(ResultRenderer.CreatePdfRenderer(OutputFile, System.IO.Path.Combine(Datapath, "tessdata")));
                break;
            }

            using (IResultRenderer renderer = new AggregateResultRenderer(resultRenderers))
            {
                // Ordinal, case-insensitive match: file extensions are not linguistic text,
                // and "SCAN.TIF" must take the same route as "scan.tif".
                // NOTE(review): ".tiff" still goes to the generic branch, as before — confirm that is intended.
                if (filename.EndsWith(".tif", System.StringComparison.OrdinalIgnoreCase))
                {
                    ProcessTiffFile(renderer, filename);
                }
                else
                {
                    ProcessFile(renderer, filename);
                }
            }
        }
 /// <summary>
 /// Runs the given TIF file through the tesseract engine and renders the result
 /// as a searchable PDF.
 /// </summary>
 /// <param name="inputFilePath">Path of the TIF file to read.</param>
 /// <param name="outputFilePath">Path passed to the PDF renderer as the searchable PDF output.</param>
 public static void Convert(string inputFilePath, string outputFilePath)
 {
     var pdfRenderer = ResultRenderer.CreatePdfRenderer(outputFilePath, TesseractData);
     try
     {
         ProcessImageFile(pdfRenderer, inputFilePath);
         Console.WriteLine("Conversion completed for file: {0}", inputFilePath);
     }
     finally
     {
         // Same cleanup the using statement performed.
         if (pdfRenderer != null)
         {
             pdfRenderer.Dispose();
         }
     }
 }
Esempio n. 4
0
File: Ocr.cs Progetto: yrrebpsar/ocr
        /// <summary>
        /// OCRs the page images of a PDF and renders them into "&lt;name&gt;.ocr.pdf" in the
        /// destination directory. Processing stops at the first page that reports text,
        /// and the partially written output is then deleted.
        /// </summary>
        /// <param name="fullPath">Path of the source PDF.</param>
        /// <param name="force">When true, rescans even if the output file already exists.</param>
        public void Scan(string fullPath, bool force = false)
        {
            var file    = System.IO.Path.GetFileNameWithoutExtension(fullPath);
            var dstPath = _destination ?? System.IO.Path.GetDirectoryName(fullPath);
            var dstFile = System.IO.Path.Combine(dstPath, $"{file}.ocr");

            // Nothing to do when the output already exists and no rescan was requested.
            if (File.Exists(dstFile + ".pdf") && !force)
            {
                return;
            }

            Console.WriteLine($"Scanning {file}");
            using (var pdfReader = new PdfReader(fullPath))
            {
                var  parser       = new PdfReaderContentParser(pdfReader);
                var  extractor    = new ImageExtractor(dstPath);
                var  dllDir       = AppDomain.CurrentDomain.BaseDirectory;
                bool containsText = false;

                // NOTE(review): the engine reads tessdata next to the binaries while the renderer
                // uses "./tessdata" relative to the working directory — confirm both are intended.
                using (var engine = new TesseractEngine($"{dllDir}/tessdata", "deu", EngineMode.Default))
                using (var pdf = ResultRenderer.CreatePdfRenderer(dstFile, @"./tessdata"))
                using (pdf.BeginDocument(file)) // dispose the document handle instead of dropping it
                {
                    for (int i = 1; i <= pdfReader.NumberOfPages; i++)
                    {
                        extractor.Rotation = pdfReader.GetPageRotation(i);
                        parser.ProcessContent(i, extractor);
                        var tempFile = extractor.TempFile;
                        try
                        {
                            containsText |= extractor.ContainsText;
                            if (containsText)
                            {
                                break;      // Don't process files that contain text.
                            }
                            using (var img = Pix.LoadFromFile(tempFile))
                            {
                                Console.WriteLine($"Scanning page {i}");
                                using (var page = engine.Process(img, $"page-{i}"))
                                {
                                    pdf.AddPage(page);
                                }
                            }
                        }
                        finally
                        {
                            // Remove the extracted image on every exit path, including the
                            // contains-text break and OCR failures, which previously left it behind.
                            // NOTE(review): assumes TempFile may be null/empty when the page yielded no image.
                            if (!string.IsNullOrEmpty(tempFile))
                            {
                                File.Delete(tempFile);
                            }
                        }
                    }
                }

                // Don't duplicate files that contain text.
                if (containsText)
                {
                    Console.WriteLine($"Skipping {file}, as it contains text.");
                    File.Delete(dstFile + ".pdf");
                }
            }
        }
        /// <summary>
        /// Renders a multi-page TIFF through the PDF renderer and asserts that the
        /// corresponding ".pdf" file exists afterwards.
        /// </summary>
        public void CanRenderMultiplePageDocumentToPdfFile()
        {
            var outputBasePath = TestResultRunFile(@"ResultRenderers\PDF\multi-page");

            using (var pdfRenderer = ResultRenderer.CreatePdfRenderer(outputBasePath, DataPath))
            {
                ProcessMultipageTiff(pdfRenderer, TestFilePath("processing/multi-page.tif"));
            }

            // Checked only after the renderer has been disposed.
            string expectedOutputFilename = Path.ChangeExtension(outputBasePath, "pdf");
            bool   pdfWasCreated          = File.Exists(expectedOutputFilename);

            Assert.That(pdfWasCreated, $"Expected a PDF file \"{expectedOutputFilename}\" to have been created; but none was found.");
        }
        /// <summary>
        /// Renders a single test image through the PDF renderer and asserts that the
        /// corresponding ".pdf" file exists afterwards.
        /// </summary>
        public void CanRenderResultsIntoPdfFile()
        {
            var outputBasePath = TestResultRunFile(@"ResultRenderers\PDF\phototest");

            using (var pdfRenderer = ResultRenderer.CreatePdfRenderer(outputBasePath, DataPath))
            {
                ProcessFile(pdfRenderer, TestFilePath("Ocr/phototest.tif"));
            }

            // Checked only after the renderer has been disposed.
            string expectedOutputFilename = Path.ChangeExtension(outputBasePath, "pdf");
            bool   pdfWasCreated          = File.Exists(expectedOutputFilename);

            Assert.That(pdfWasCreated, $"Expected a PDF file \"{expectedOutputFilename}\" to have been created; but none was found.");
        }
        /// <summary>
        /// Renders a single test image through the Box renderer and asserts that the
        /// corresponding ".box" file exists afterwards.
        /// </summary>
        public void CanRenderResultsIntoBoxFile()
        {
            var outputBasePath = TestResultRunFile(@"ResultRenderers\Box\phototest");

            using (var boxRenderer = ResultRenderer.CreateBoxRenderer(outputBasePath))
            {
                ProcessFile(boxRenderer, TestFilePath("Ocr/phototest.tif"));
            }

            // Checked only after the renderer has been disposed.
            string expectedOutputFilename = Path.ChangeExtension(outputBasePath, "box");
            bool   boxWasCreated          = File.Exists(expectedOutputFilename);

            Assert.IsTrue(boxWasCreated, $"Expected a Box file \"{expectedOutputFilename}\" to have been created; but none was found.");
        }
        /// <summary>
        /// Renders a multi-page TIFF through an aggregate of a PDF renderer and a text renderer
        /// and asserts that both the ".pdf" and the ".txt" output files exist afterwards.
        /// </summary>
        public void CanRenderMultiplePageDocumentIntoMultipleResultRenderers()
        {
            var resultPath = TestResultRunFile(@"ResultRenderers\Aggregate\multi-page");

            using (var renderer = new AggregateResultRenderer(ResultRenderer.CreatePdfRenderer(resultPath, DataPath), ResultRenderer.CreateTextRenderer(resultPath))) {
                var examplePixPath = TestFilePath("processing/multi-page.tif");
                ProcessMultipageTiff(renderer, examplePixPath);
            }

            var expectedPdfOutputFilename = Path.ChangeExtension(resultPath, "pdf");

            // Failure messages corrected from "non was found" to "none was found".
            Assert.That(File.Exists(expectedPdfOutputFilename), $"Expected a PDF file \"{expectedPdfOutputFilename}\" to have been created; but none was found.");

            var expectedTxtOutputFilename = Path.ChangeExtension(resultPath, "txt");

            Assert.That(File.Exists(expectedTxtOutputFilename), $"Expected a Text file \"{expectedTxtOutputFilename}\" to have been created; but none was found.");
        }
Esempio n. 9
0
 /// <summary>
 /// OCRs a multi-page TIFF page by page with a single Tesseract engine and adds each
 /// recognized page to a PDF written under the output directory.
 /// </summary>
 /// <param name="pFilePath">Path of the multi-page TIFF to process.</param>
 private void ProcessByFile(string pFilePath)
 {
     // Folder with the OCR config/language files, and the OCR language.
     using (var ocrEngine = new TesseractEngine(_tessDataPath, "por"))
     {
         string baseName = Path.GetFileNameWithoutExtension(pFilePath);

         // Where the OCR'd PDF is saved; the path is given without an extension.
         string pdfPathNoExtension = Path.Combine(txbDirSaida.Text, baseName);

         // PDF path plus the path to the PDF font data.
         using (IResultRenderer pdfRenderer = ResultRenderer.CreatePdfRenderer(pdfPathNoExtension, _tessDataPath))
         {
             // Loads all pages of the TIFF.
             using (PixArray tiffPages = PixArray.LoadMultiPageTiffFromFile(pFilePath))
             {
                 // Starts the PDF document.
                 using (pdfRenderer.BeginDocument(baseName))
                 {
                     foreach (Pix tiffPage in tiffPages)
                     {
                         using (tiffPage)
                         {
                             string inputName = Path.GetFileNameWithoutExtension(baseName);

                             // Runs OCR on the page; the result is added to the PDF and then released.
                             using (Page ocrPage = ocrEngine.Process(tiffPage, inputName))
                             {
                                 pdfRenderer.AddPage(ocrPage);
                             }
                         }
                     }
                 }
             }
         }
     }
 }
        /// <summary>
        /// Renders a single test image through renderers for HOCR, PDF and TEXT at once and
        /// asserts that the ".pdf", ".hocr" and ".txt" output files exist afterwards.
        /// </summary>
        public void CanRenderResultsIntoMultipleOutputFormats()
        {
            var outputBasePath = TestResultRunFile(@"ResultRenderers\PDF\phototest");

            var requestedFormats = new List <RenderedFormat>();
            requestedFormats.Add(RenderedFormat.HOCR);
            requestedFormats.Add(RenderedFormat.PDF);
            requestedFormats.Add(RenderedFormat.TEXT);

            using (var multiRenderer = ResultRenderer.CreateRenderers(outputBasePath, DataPath, requestedFormats))
            {
                ProcessFile(multiRenderer, TestFilePath("Ocr/phototest.tif"));
            }

            // One existence check per requested format, in the order PDF, HOCR, TEXT.
            string expectedOutputFilename = Path.ChangeExtension(outputBasePath, "pdf");
            bool   fileWasCreated         = File.Exists(expectedOutputFilename);
            Assert.That(fileWasCreated, $"Expected a PDF file \"{expectedOutputFilename}\" to have been created; but none was found.");

            expectedOutputFilename = Path.ChangeExtension(outputBasePath, "hocr");
            fileWasCreated         = File.Exists(expectedOutputFilename);
            Assert.That(fileWasCreated, $"Expected a HOCR file \"{expectedOutputFilename}\" to have been created; but none was found.");

            expectedOutputFilename = Path.ChangeExtension(outputBasePath, "txt");
            fileWasCreated         = File.Exists(expectedOutputFilename);
            Assert.That(fileWasCreated, $"Expected a TEXT file \"{expectedOutputFilename}\" to have been created; but none was found.");
        }
Esempio n. 11
0
 /// <summary>
 /// Builds the scene objects: configuration, materials, characters, the two banks,
 /// the boat with its movement, one priest and one evil renderer per configured
 /// person, and the result plane.
 /// </summary>
 public MainSceneController()
 {
     configs    = new Configs();
     materials  = new Materials(configs);
     characters = new Characters(configs);

     // The bank renderers are constructed for their side effects only; no reference is kept.
     new BankRenderer(configs.LeftBankPos, materials, configs);
     new BankRenderer(configs.RightBankPos, materials, configs);

     boatMove = new LinearMove2D(configs.BoatMoveSpeed);
     boat     = new BoatRenderer(configs.RightBoatPos, materials, boatMove.Clone(), this, configs);

     int peopleCount = configs.PeopleNum;
     priests  = new PriestRenderer[peopleCount];
     evils    = new EvilRenderer[peopleCount];
     charMove = new ParabolicMove2D(configs.CharMoveSummit, configs.CharMoveSpeed);

     // Priests take the right-side character slots starting at index 2.
     for (int p = 0; p < peopleCount; p++)
     {
         int slot = p + 2;
         priests[p] = new PriestRenderer(p, configs.RightCharPos[slot], materials, charMove.Clone(), this, configs);
     }

     // Evils take the slots directly after the priests' slots.
     for (int e = 0; e < peopleCount; e++)
     {
         int slot = e + 2 + peopleCount;
         evils[e] = new EvilRenderer(e, configs.RightCharPos[slot], materials, charMove.Clone(), this, configs);
     }

     result = new ResultRenderer(configs.ResultPlanePos, configs.ResultPlaneRotation, configs.ResultPlaneScale, materials);
 }
Esempio n. 12
0
        /// <summary>
        /// Creates the result renderer selected by <c>OutputFormat</c> ("text", "hocr" or "pdf")
        /// and runs <c>ProcessImageFile</c> on <paramref name="filename"/> with it.
        /// </summary>
        /// <param name="filename">Path of the input image file.</param>
        public override void ProcessFile(string filename)
        {
            var renderers = new List <IResultRenderer>();
            string format = OutputFormat;

            // Any other format value leaves the renderer list empty.
            if (format == "text")
            {
                renderers.Add(ResultRenderer.CreateTextRenderer(OutputFile));
            }
            else if (format == "hocr")
            {
                renderers.Add(ResultRenderer.CreateHOcrRenderer(OutputFile));
            }
            else if (format == "pdf")
            {
                renderers.Add(ResultRenderer.CreatePdfRenderer(OutputFile, Datapath, false));
            }

            using (IResultRenderer aggregate = new AggregateResultRenderer(renderers))
            {
                ProcessImageFile(aggregate, filename);
            }
        }
Esempio n. 13
0
        /// <summary>
        /// Demo entry point: OCRs every file found in the "images" directory and appends each
        /// result as a page of a single PDF rendered to "./output". Errors are traced and
        /// printed to the console; the program then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        public static void Main(string[] args)
        {
            try
            {
                using (IResultRenderer renderer = ResultRenderer.CreatePdfRenderer(@"./output", @"./tessdata"))
                using (renderer.BeginDocument("PDF Test"))
                using (TesseractEngine engine = new TesseractEngine(@"./tessdata", "eng", EngineMode.TesseractAndCube))
                {
                    var list = Directory.GetFiles(@"images");

                    // Directory.GetFiles does not guarantee any ordering; sort so that the
                    // page order of the generated PDF is the same on every run.
                    Array.Sort(list, StringComparer.OrdinalIgnoreCase);

                    foreach (var item in list)
                    {
                        using (var tifFile = new Bitmap(item))
                        using (var page = engine.Process(tifFile, "test"))
                        {
                            renderer.AddPage(page);
                        }
                    }
                }
            }
            catch (Exception e)
            {
                // Top-level handler of a console demo: report the failure instead of crashing.
                Trace.TraceError(e.ToString());
                Console.WriteLine("Unexpected Error: " + e.Message);
                Console.WriteLine("Details: ");
                Console.WriteLine(e.ToString());
            }
            Console.Write("Press any key to continue . . . ");
            Console.ReadKey(true);
        }