/// <summary>
        /// Click handler: renders the requested page of <c>C:\temp\{document number}.pdf</c>,
        /// shows the rendered page and runs the region locator over it.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event arguments (not used).</param>
        void ObjButtonGO_Click(object sender, RoutedEventArgs e)
        {
            // The numbers come straight from user-editable controls, so parse defensively:
            // an empty or non-numeric entry must not throw out of a UI event handler.
            int pdf_number;
            int page_number;
            if (!Int32.TryParse(ObjTextDoc.Text, out pdf_number) || !Int32.TryParse(ObjTextPage.Text, out page_number))
            {
                Logging.Warn("Cannot render page: document number '{0}' and page number '{1}' must both be integers", ObjTextDoc.Text, ObjTextPage.Text);
                return;
            }

            string pdf_filename = String.Format(@"C:\temp\{0}.pdf", pdf_number);

            Logging.Info("+Rendering page");
            // NOTE: the stream is deliberately not disposed here. GDI+ requires the stream
            // handed to the Bitmap constructor to stay open for the lifetime of that Bitmap.
            MemoryStream ms           = MuPDFRenderer.RenderPDFPage(pdf_filename, page_number, 200, null, ProcessPriorityClass.Normal);
            BitmapSource bitmap_image = BitmapImageTools.LoadFromBytes(ms.ToArray());
            Bitmap       bitmap       = new Bitmap(ms);

            Logging.Info("-Rendering page");

            // Display the rendered page.
            this.Image = bitmap_image;

            Logging.Info("+Finding regions");
            this.region_locator = new PDFRegionLocator(bitmap);
            Logging.Info("-Finding regions");

            Recalc();
        }
        /// <summary>
        /// Renders one page of a PDF at 200 DPI, splits the page image into regions using
        /// <c>PDFRegionLocator</c> and runs Tesseract OCR over each resulting rectangle.
        /// </summary>
        /// <param name="pdf_filename">Path of the PDF file to render.</param>
        /// <param name="page_number">Number of the page to render and OCR.</param>
        /// <returns>The words recognised in all processed rectangles of the page.</returns>
        public static WordList DoOCR(string pdf_filename, int page_number)
        {
            Logging.Info("+Rendering page {1} for PDF file {0}", pdf_filename, page_number);
            // The stream must outlive the bitmap that is decoded from it, hence the nesting order.
            using (MemoryStream ms = new MemoryStream(SoraxPDFRenderer.GetPageByDPIAsImage(pdf_filename, pdf_user_password, page_number, 200)))
            using (Bitmap bitmap = (Bitmap)Image.FromStream(ms))
            {
                Logging.Info("-Rendering page #{0}", page_number);

                Logging.Info("Startup directory is {0}", Environment.CurrentDirectory);
                Logging.Info("Language is '{0}'", language);

                using (Tesseract ocr = new Tesseract())
                {
                    ocr.Init(null, language, false);

                    Logging.Info("+Doing OCR");

                    const int MIN_WIDTH = 0;

                    // Build a list of all the rectangles to process
                    PDFRegionLocator        pdf_region_locator = new PDFRegionLocator(bitmap);
                    List <Rectangle>        rectangles         = new List <Rectangle>();
                    Rectangle               last_rectangle     = new Rectangle();
                    PDFRegionLocator.Region last_region        = default(PDFRegionLocator.Region);
                    bool                    is_first_region    = true;
                    foreach (PDFRegionLocator.Region region in pdf_region_locator.regions)
                    {
                        // Seed 'last_region' from the first element instead of indexing regions[0]
                        // up front, so an empty region list simply yields no rectangles.
                        if (is_first_region)
                        {
                            last_region     = region;
                            is_first_region = false;
                        }

                        int  rect_height     = region.y - last_region.y;
                        bool alarming_height = (rect_height <= 0);
                        int  clamped_height  = Math.Max(MIN_WIDTH, rect_height);

                        // Every region boundary produces the rectangle(s) spanning from the previous
                        // boundary to this one; the state of the *previous* region selects the layout.
                        Rectangle[] candidates;
                        if (last_region.state == PDFRegionLocator.SegmentState.BLANKS)
                        {
                            // Two half-width rectangles: LHS and RHS. Both must be queued;
                            // previously the LHS was overwritten by the RHS and never processed.
                            candidates = new Rectangle[]
                            {
                                new Rectangle(0, last_region.y, bitmap.Width / 2, clamped_height),
                                new Rectangle(bitmap.Width / 2, last_region.y, bitmap.Width / 2, clamped_height),
                            };
                        }
                        else if (last_region.state == PDFRegionLocator.SegmentState.PIXELS)
                        {
                            // Full column
                            candidates = new Rectangle[]
                            {
                                new Rectangle(0, last_region.y, bitmap.Width, clamped_height),
                            };
                        }
                        else
                        {
                            // Unhandled state: an empty rectangle, which is reported and skipped below.
                            candidates = new Rectangle[] { new Rectangle() };
                        }

                        foreach (Rectangle rectangle in candidates)
                        {
                            if (alarming_height || rectangle.Height <= 0)
                            {
                                Logging.Warn("Calculated region height is negative or zero: {0} :: Calculated region {1} <-- CURRENT:{2} - LAST:{3}", rect_height, rectangle, region, last_region);

                                // skip rectangle
                            }
                            else if (rectangles.Count > 0 && last_rectangle.X == rectangle.X && last_rectangle.Y == rectangle.Y)
                            {
                                // Only merge when there really is a previously queued rectangle; otherwise a
                                // first rectangle at (0,0) would match the default 'last_rectangle' and be lost.
                                Logging.Warn("Overlapping subsequent rectangles will be merged :: CURRENT:{0} - LAST:{1}", rectangle, last_rectangle);
                                last_rectangle.Width  = Math.Max(last_rectangle.Width, rectangle.Width);
                                last_rectangle.Height = Math.Max(last_rectangle.Height, rectangle.Height);
                                // Rectangle is a value type: write the merged copy back into the list,
                                // otherwise the merge would only affect the local variable.
                                rectangles[rectangles.Count - 1] = last_rectangle;
                                Logging.Warn("--> Updated 'last' rectangle:{0}", last_rectangle);
                            }
                            else
                            {
                                rectangles.Add(rectangle);
                                last_rectangle = rectangle;
                            }
                        }

                        last_region = region;
                    }

                    // DEBUG CODE: Draw in the region rectangles
                    //
                    // When we run in NOKILL mode, we "know" we're running in a debugger or stand-alone environment
                    // intended for testing this code. Hence we should dump the regions image as part of the process.
                    if (no_kill)
                    {
                        string bitmap_diag_path = pdf_filename + @"." + page_number + @"-ocr.png";

                        Logging.Info("Dumping regions-augmented page {0} PNG image to file {1}", page_number, bitmap_diag_path);
                        // Graphics wraps a native GDI+ handle; release it as soon as drawing is done.
                        using (Graphics g = Graphics.FromImage(bitmap))
                        {
                            foreach (Rectangle rectangle in rectangles)
                            {
                                // The outline colour flags degenerate rectangles.
                                if (rectangle.Width <= MIN_WIDTH && rectangle.Height > MIN_WIDTH)
                                {
                                    DrawRectangleOutline(g, Pens.Purple, rectangle);
                                }
                                else if (rectangle.Width > MIN_WIDTH && rectangle.Height <= MIN_WIDTH)
                                {
                                    DrawRectangleOutline(g, Pens.PowderBlue, rectangle);
                                }
                                else if (rectangle.Width <= MIN_WIDTH && rectangle.Height <= MIN_WIDTH)
                                {
                                    DrawRectangleOutline(g, Pens.Red, rectangle);
                                }
                                else
                                {
                                    DrawRectangleOutline(g, Pens.LawnGreen, rectangle);
                                }
                            }
                        }

                        bitmap.Save(bitmap_diag_path, ImageFormat.Png);
                    }

                    // Do the OCR on each of the rectangles
                    WordList word_list = new WordList();
                    foreach (Rectangle rectangle in rectangles)
                    {
                        if (0 == rectangle.Width || 0 == rectangle.Height)
                        {
                            Logging.Info("Skipping zero extent rectangle {0}", rectangle.ToString());
                            continue;
                        }

                        Logging.Info("Doing OCR for region {0} on bitmap WxH: {1}x{2}", rectangle.ToString(), bitmap.Width, bitmap.Height);
                        List <Word> result = ocr.DoOCR(bitmap, rectangle);
                        Logging.Info("Got {0} words", result.Count);
                        word_list.AddRange(ConvertToWordList(result, rectangle, bitmap));
                    }

                    Logging.Info("-Doing OCR");


                    Logging.Info("Found {0} words ({1} @ #{2})", word_list.Count, pdf_filename, page_number);

#if false
                    Logging.Info("+Reordering words for columns");
                    WordList word_list_ordered = ColumnWordOrderer.ReorderWords(word_list);
                    Logging.Info("-Reordering words for columns");
                    word_list_ordered.WriteToFile(ocr_output_filename);
#endif

                    return(word_list);
                }
            }
        }
Exemple #3
0
        /// <summary>
        /// Renders one page of a PDF at 200 DPI, splits the page image into regions using
        /// <c>PDFRegionLocator</c> and runs Tesseract OCR over each resulting rectangle.
        /// </summary>
        /// <param name="pdf_filename">Path of the PDF file to render.</param>
        /// <param name="page_number">Number of the page to render and OCR.</param>
        /// <returns>The words recognised in all processed rectangles of the page.</returns>
        public static WordList DoOCR(string pdf_filename, int page_number)
        {
            Logging.Info("+Rendering page");
            // NOTE(review): the user password is passed for both password arguments - confirm this is intended.
            SoraxPDFRenderer renderer = new SoraxPDFRenderer(pdf_filename, pdf_user_password, pdf_user_password);

            // The stream must outlive the bitmap that is decoded from it, hence the nesting order.
            using (MemoryStream ms = new MemoryStream(renderer.GetPageByDPIAsImage(page_number, 200)))
            using (Bitmap bitmap = (Bitmap)Image.FromStream(ms))
            {
                Logging.Info("-Rendering page");

                Logging.Info("Startup directory is {0}", Environment.CurrentDirectory);
                Logging.Info("Language is '{0}'", language);

                using (Tesseract ocr = new Tesseract())
                {
                    ocr.Init(null, language, false);

                    Logging.Info("+Doing OCR");

                    // Build a list of all the rectangles to process
                    PDFRegionLocator pdf_region_locator = new PDFRegionLocator(bitmap);

                    List <Rectangle>        rectangles      = new List <Rectangle>();
                    PDFRegionLocator.Region last_region     = default(PDFRegionLocator.Region);
                    bool                    is_first_region = true;

                    foreach (PDFRegionLocator.Region region in pdf_region_locator.regions)
                    {
                        // Seed 'last_region' from the first element instead of indexing regions[0]
                        // up front, so an empty region list simply yields no rectangles.
                        if (is_first_region)
                        {
                            last_region     = region;
                            is_first_region = false;
                        }

                        // Each rectangle spans from the previous region boundary to this one;
                        // the state of the *previous* region selects the layout.
                        int rect_height = region.y - last_region.y;

                        if (last_region.state == PDFRegionLocator.SegmentState.BLANKS)
                        {
                            // LHS
                            rectangles.Add(new Rectangle(0, last_region.y, bitmap.Width / 2, rect_height));
                            // RHS
                            rectangles.Add(new Rectangle(bitmap.Width / 2, last_region.y, bitmap.Width / 2, rect_height));
                        }
                        else if (last_region.state == PDFRegionLocator.SegmentState.PIXELS)
                        {
                            // Full column
                            rectangles.Add(new Rectangle(0, last_region.y, bitmap.Width, rect_height));
                        }

                        last_region = region;
                    }

                    // Do the OCR on each of the rectangles
                    WordList word_list = new WordList();

                    foreach (Rectangle rectangle in rectangles)
                    {
                        // Skip degenerate rectangles. Negative extents are possible when region.y is
                        // smaller than the previous region's y, and must not reach the OCR engine either.
                        if (rectangle.Width <= 0 || rectangle.Height <= 0)
                        {
                            Logging.Info("Skipping zero extent rectangle {0}", rectangle.ToString());
                            continue;
                        }

                        Logging.Info("Doing OCR for region {0}", rectangle.ToString());
                        List <Word> result = ocr.DoOCR(bitmap, rectangle);
                        Logging.Info("Got {0} words", result.Count);
                        word_list.AddRange(ConvertToWordList(result, rectangle, bitmap));
                    }

                    Logging.Info("-Doing OCR");


                    Logging.Info("Found {0} words", word_list.Count);

                    return(word_list);
                }
            }
        }