/// <summary>
/// Click handler for the "GO" button: reads a document number and a page number from the
/// <c>ObjTextDoc</c> / <c>ObjTextPage</c> text boxes, renders that page of
/// <c>C:\temp\{document number}.pdf</c> through <c>MuPDFRenderer</c>, shows the rendered page,
/// builds a <c>PDFRegionLocator</c> for it and finally calls <c>Recalc()</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
void ObjButtonGO_Click(object sender, RoutedEventArgs e)
{
    // Validate the user input up front: a typo in either text box must not throw a
    // FormatException/OverflowException out of a UI event handler.
    int pdf_number;
    if (!int.TryParse(ObjTextDoc.Text, out pdf_number))
    {
        Logging.Warn("Cannot render: '{0}' is not a valid document number", ObjTextDoc.Text);
        return;
    }

    int page_number;
    if (!int.TryParse(ObjTextPage.Text, out page_number))
    {
        Logging.Warn("Cannot render: '{0}' is not a valid page number", ObjTextPage.Text);
        return;
    }

    string pdf_filename = String.Format(@"C:\temp\{0}.pdf", pdf_number);

    Logging.Info("+Rendering page");
    // NOTE(review): the literal 200 is presumably the render resolution in DPI -- confirm against MuPDFRenderer.
    MemoryStream ms = MuPDFRenderer.RenderPDFPage(pdf_filename, page_number, 200, null, ProcessPriorityClass.Normal);
    BitmapSource bitmap_image = BitmapImageTools.LoadFromBytes(ms.ToArray());
    // The stream is intentionally NOT disposed here: a Bitmap constructed from a stream needs
    // that stream to stay open for as long as the Bitmap is in use, and the bitmap is handed
    // to the region locator which is kept in a field.
    Bitmap bitmap = new Bitmap(ms);
    Logging.Info("-Rendering page");

    this.Image = bitmap_image;

    Logging.Info("+Finding regions");
    this.region_locator = new PDFRegionLocator(bitmap);
    Logging.Info("-Finding regions");

    Recalc();
}
/// <summary>
/// Renders one page of a PDF to a bitmap, splits the page into horizontal regions using
/// <c>PDFRegionLocator</c>, and runs Tesseract OCR over every resulting rectangle.
/// </summary>
/// <param name="pdf_filename">Path of the PDF file to render.</param>
/// <param name="page_number">Page to render; passed unchanged to the renderer.</param>
/// <returns>All words recognised in the processed rectangles of the page.</returns>
public static WordList DoOCR(string pdf_filename, int page_number)
{
    Logging.Info("+Rendering page {1} for PDF file {0}", pdf_filename, page_number);

    // An Image created from a stream needs that stream to stay open for the image's lifetime,
    // so the bitmap lives in the inner 'using' and is disposed before the stream.
    using (MemoryStream ms = new MemoryStream(SoraxPDFRenderer.GetPageByDPIAsImage(pdf_filename, pdf_user_password, page_number, 200)))
    using (Bitmap bitmap = (Bitmap)Image.FromStream(ms))
    {
        Logging.Info("-Rendering page #{0}", page_number);

        Logging.Info("Startup directory is {0}", Environment.CurrentDirectory);
        Logging.Info("Language is '{0}'", language);

        using (Tesseract ocr = new Tesseract())
        {
            ocr.Init(null, language, false);

            Logging.Info("+Doing OCR");

            // Lower bound for rectangle extents (used for heights below and for the
            // width/height classification in the diagnostics drawing).
            const int MIN_WIDTH = 0;

            // Build a list of all the rectangles to process
            PDFRegionLocator pdf_region_locator = new PDFRegionLocator(bitmap);
            PDFRegionLocator.Region last_region = pdf_region_locator.regions[0];
            List<Rectangle> rectangles = new List<Rectangle>();
            int half_width = bitmap.Width / 2;

            foreach (PDFRegionLocator.Region region in pdf_region_locator.regions)
            {
                // Each rectangle spans from the previous region's y to the current region's y.
                int rect_height = region.y - last_region.y;
                int clamped_height = Math.Max(MIN_WIDTH, rect_height);

                Rectangle[] candidates;
                if (last_region.state == PDFRegionLocator.SegmentState.BLANKS)
                {
                    // Two half-width rectangles. Both must be kept: previously the LHS rectangle
                    // was overwritten by the RHS one before it was ever used.
                    candidates = new Rectangle[]
                    {
                        new Rectangle(0, last_region.y, half_width, clamped_height),          // LHS
                        new Rectangle(half_width, last_region.y, half_width, clamped_height)  // RHS
                    };
                }
                else if (last_region.state == PDFRegionLocator.SegmentState.PIXELS)
                {
                    // Full column
                    candidates = new Rectangle[] { new Rectangle(0, last_region.y, bitmap.Width, clamped_height) };
                }
                else
                {
                    // Any other state yields an empty rectangle, which is reported and skipped below.
                    candidates = new Rectangle[] { new Rectangle() };
                }

                Rectangle reported = candidates[candidates.Length - 1];
                if (rect_height <= 0 || reported.Height <= 0)
                {
                    Logging.Warn("Calculated region height is negative or zero: {0} :: Calculated region {1} <-- CURRENT:{2} - LAST:{3}", rect_height, reported, region, last_region);
                    // skip rectangle
                }
                else
                {
                    foreach (Rectangle candidate in candidates)
                    {
                        int last_index = rectangles.Count - 1;
                        if (last_index >= 0 && rectangles[last_index].X == candidate.X && rectangles[last_index].Y == candidate.Y)
                        {
                            // Rectangle is a value type: the merged result has to be written back
                            // into the list, otherwise only a local copy is updated.
                            Rectangle merged = rectangles[last_index];
                            Logging.Warn("Overlapping subsequent rectangles will be merged :: CURRENT:{0} - LAST:{1}", candidate, merged);
                            merged.Width = Math.Max(merged.Width, candidate.Width);
                            merged.Height = Math.Max(merged.Height, candidate.Height);
                            rectangles[last_index] = merged;
                            Logging.Warn("--> Updated 'last' rectangle:{0}", merged);
                        }
                        else
                        {
                            // Comparing only against rectangles that were actually added also keeps
                            // a first rectangle located at (0,0) from being mistaken for an overlap
                            // with an uninitialised "last" rectangle and dropped.
                            rectangles.Add(candidate);
                        }
                    }
                }

                last_region = region;
            }

            // DEBUG CODE: Draw in the region rectangles
            //
            // When we run in NOKILL mode, we "know" we're running in a debugger or stand-alone environment
            // intended for testing this code. Hence we should dump the regions image as part of the process.
            if (no_kill)
            {
                string bitmap_diag_path = pdf_filename + @"." + page_number + @"-ocr.png";

                Logging.Info("Dumping regions-augmented page {0} PNG image to file {1}", page_number, bitmap_diag_path);

                // Draw on a copy so the outlines do not end up in the image that is OCR'd below.
                using (Bitmap diag_bitmap = new Bitmap(bitmap))
                using (Graphics g = Graphics.FromImage(diag_bitmap))
                {
                    foreach (Rectangle rectangle in rectangles)
                    {
                        if (rectangle.Width <= MIN_WIDTH && rectangle.Height > MIN_WIDTH)
                        {
                            DrawRectangleOutline(g, Pens.Purple, rectangle);
                        }
                        else if (rectangle.Width > MIN_WIDTH && rectangle.Height <= MIN_WIDTH)
                        {
                            DrawRectangleOutline(g, Pens.PowderBlue, rectangle);
                        }
                        else if (rectangle.Width <= MIN_WIDTH && rectangle.Height <= MIN_WIDTH)
                        {
                            DrawRectangleOutline(g, Pens.Red, rectangle);
                        }
                        else
                        {
                            DrawRectangleOutline(g, Pens.LawnGreen, rectangle);
                        }
                    }

                    diag_bitmap.Save(bitmap_diag_path, ImageFormat.Png);
                }
            }

            // Do the OCR on each of the rectangles
            WordList word_list = new WordList();
            foreach (Rectangle rectangle in rectangles)
            {
                if (0 == rectangle.Width || 0 == rectangle.Height)
                {
                    Logging.Info("Skipping zero extent rectangle {0}", rectangle.ToString());
                    continue;
                }

                Logging.Info("Doing OCR for region {0} on bitmap WxH: {1}x{2}", rectangle.ToString(), bitmap.Width, bitmap.Height);
                List<Word> result = ocr.DoOCR(bitmap, rectangle);
                Logging.Info("Got {0} words", result.Count);
                word_list.AddRange(ConvertToWordList(result, rectangle, bitmap));
            }

            Logging.Info("-Doing OCR");

            Logging.Info("Found {0} words ({1} @ #{2})", word_list.Count, pdf_filename, page_number);

#if false
            Logging.Info("+Reordering words for columns");
            WordList word_list_ordered = ColumnWordOrderer.ReorderWords(word_list);
            Logging.Info("-Reordering words for columns");
            word_list_ordered.WriteToFile(ocr_output_filename);
#endif

            return (word_list);
        }
    }
}
/// <summary>
/// Renders one page of a PDF to a bitmap, splits the page into horizontal regions using
/// <c>PDFRegionLocator</c>, and runs Tesseract OCR over every resulting rectangle.
/// </summary>
/// <param name="pdf_filename">Path of the PDF file to render.</param>
/// <param name="page_number">Page to render; passed unchanged to the renderer.</param>
/// <returns>All words recognised in the processed rectangles of the page.</returns>
public static WordList DoOCR(string pdf_filename, int page_number)
{
    Logging.Info("+Rendering page");
    // NOTE(review): the same password field is passed for both password arguments, as before.
    SoraxPDFRenderer renderer = new SoraxPDFRenderer(pdf_filename, pdf_user_password, pdf_user_password);

    // An Image created from a stream needs that stream to stay open for the image's lifetime,
    // so the bitmap lives in the inner 'using' and is disposed before the stream.
    using (MemoryStream ms = new MemoryStream(renderer.GetPageByDPIAsImage(page_number, 200)))
    using (Bitmap bitmap = (Bitmap)Image.FromStream(ms))
    {
        Logging.Info("-Rendering page");

        Logging.Info("Startup directory is {0}", Environment.CurrentDirectory);
        Logging.Info("Language is '{0}'", language);

        using (Tesseract ocr = new Tesseract())
        {
            ocr.Init(null, language, false);

            Logging.Info("+Doing OCR");

            // Build a list of all the rectangles to process
            PDFRegionLocator pdf_region_locator = new PDFRegionLocator(bitmap);
            PDFRegionLocator.Region last_region = pdf_region_locator.regions[0];
            List<Rectangle> rectangles = new List<Rectangle>();
            foreach (PDFRegionLocator.Region region in pdf_region_locator.regions)
            {
                // Each rectangle spans from the previous region's y to the current region's y.
                int rect_height = region.y - last_region.y;

                if (last_region.state == PDFRegionLocator.SegmentState.BLANKS)
                {
                    // LHS
                    rectangles.Add(new Rectangle(0, last_region.y, bitmap.Width / 2, rect_height));

                    // RHS
                    rectangles.Add(new Rectangle(bitmap.Width / 2, last_region.y, bitmap.Width / 2, rect_height));
                }
                else if (last_region.state == PDFRegionLocator.SegmentState.PIXELS)
                {
                    // Full column
                    rectangles.Add(new Rectangle(0, last_region.y, bitmap.Width, rect_height));
                }

                last_region = region;
            }

            // DEBUG CODE: Draw in the region rectangles
            //{
            //    Graphics g = Graphics.FromImage(bitmap);
            //    foreach (Rectangle rectangle in rectangles)
            //    {
            //        g.DrawRectangle(Pens.Black, rectangle);
            //    }
            //    bitmap.Save(@"C:\temp\aaaaaa.png", ImageFormat.Png);
            //}

            // Do the OCR on each of the rectangles
            WordList word_list = new WordList();
            foreach (Rectangle rectangle in rectangles)
            {
                // Skip rectangles with no area. This also covers negative heights, which arise
                // when a region's y is smaller than its predecessor's; those used to be handed
                // to the OCR engine.
                if (rectangle.Width <= 0 || rectangle.Height <= 0)
                {
                    Logging.Info("Skipping zero extent rectangle {0}", rectangle.ToString());
                    continue;
                }

                Logging.Info("Doing OCR for region {0}", rectangle.ToString());
                List<Word> result = ocr.DoOCR(bitmap, rectangle);
                Logging.Info("Got {0} words", result.Count);
                word_list.AddRange(ConvertToWordList(result, rectangle, bitmap));
            }

            Logging.Info("-Doing OCR");

            Logging.Info("Found {0} words", word_list.Count);

            //Logging.Info("+Reordering words for columns");
            //WordList word_list_ordered = ColumnWordOrderer.ReorderWords(word_list);
            //Logging.Info("-Reordering words for columns");
            //word_list_ordered.WriteToFile(ocr_output_filename);

            return (word_list);
        }
    }
}