/// <summary>
/// Stand-in OCR entry point: no page is rendered or recognised here. The returned list
/// is filled with whatever the parameterless <c>ConvertToWordList()</c> helper yields.
/// </summary>
/// <param name="pdf_filename">PDF file name; only used in the log messages.</param>
/// <param name="page_number">Page number; only used in the log messages.</param>
/// <returns>A new <c>WordList</c> holding the words produced by <c>ConvertToWordList()</c>.</returns>
public static WordList DoOCR(string pdf_filename, int page_number)
{
    Logging.Info("+Rendering page {1} for PDF file {0}", pdf_filename, page_number);

    // Collect the substitute words into a fresh list that the caller owns.
    var faked_words = ConvertToWordList();
    WordList result = new WordList();
    result.AddRange(faked_words);

    Logging.Info("-Doing OCR");
    Logging.Info("Faked {0} words ({1} @ #{2})", result.Count, pdf_filename, page_number);

#if false
    Logging.Info("+Reordering words for columns");
    WordList word_list_ordered = ColumnWordOrderer.ReorderWords(result);
    Logging.Info("-Reordering words for columns");
    word_list_ordered.WriteToFile(ocr_output_filename);
#endif

    return result;
}
/// <summary>
/// Builds the flat list of word strings for a whole document from its per-page OCR results.
/// </summary>
/// <remarks>
/// Every page is queried via <c>GetOCRText</c> exactly once; the result is kept on the
/// page's <c>PageDetail</c> so that the availability check and the later processing see
/// the same data. If any page has no OCR result the method returns
/// <c>OCR_NOT_COMPLETE</c> (after having queried all pages). Otherwise each page is passed
/// through <c>ColumnWordOrderer.ReorderWords</c>, the word texts are concatenated in page
/// order, and words ending in '-' are joined with the word that follows them.
/// </remarks>
/// <param name="pdf_renderer">Renderer giving access to the page count and per-page OCR text.</param>
/// <returns>
/// <c>SUCCESS</c> with the extracted words, <c>OCR_NOT_COMPLETE</c> when at least one page
/// has no OCR text, or <c>EXCEPTION</c> wrapping any exception thrown during extraction.
/// </returns>
public static ExtractionResult ExtractText(PDFRenderer pdf_renderer)
{
    Logging.Info("Doing text extraction for {0}", pdf_renderer.ToString());

    try
    {
        int page_count = pdf_renderer.PageCount;

        // Pages are numbered from 1.
        PageDetail[] page_details = new PageDetail[page_count];
        for (int i = 0; i < page_count; ++i)
        {
            page_details[i] = new PageDetail(i + 1);
        }

        // Fetch the OCR text once per page and remember it. Re-fetching later would double
        // the work and could observe a different (possibly null) result than the one checked here.
        bool all_pages_already_with_ocr = true;
        foreach (PageDetail page_detail in page_details)
        {
            page_detail.word_list = pdf_renderer.GetOCRText(page_detail.page);
            if (null != page_detail.word_list)
            {
                Logging.Debug特("Page {0} has OCR available ({1})", page_detail.page, pdf_renderer.DocumentFingerprint);
            }
            else
            {
                Logging.Debug特("Page {0} has not had OCR done ({1})", page_detail.page, pdf_renderer.DocumentFingerprint);
                all_pages_already_with_ocr = false;
            }
        }

        if (!all_pages_already_with_ocr)
        {
            Logging.Info("Not all pages are ready with OCR");
            return new ExtractionResult(ExtractionResult.ResultType.OCR_NOT_COMPLETE);
        }

        // Order the words on each page in a manner that makes sense of multiple columns,
        // and collect their text in page order.
        List<string> words = new List<string>();
        foreach (PageDetail page_detail in page_details)
        {
            WordList words_ordered_for_page = ColumnWordOrderer.ReorderWords(page_detail.word_list);
            foreach (Word word in words_ordered_for_page)
            {
                words.Add(word.Text);
            }
        }

        // Kill some of the line-wrapping hyphenation: a word ending in '-' absorbs its successor.
        // Walking backwards keeps the indices valid while removing, and lets chains collapse.
        // Ordinal comparison: this is a plain character test, not a linguistic one.
        for (int i = words.Count - 2; i >= 0; --i)
        {
            if (words[i].EndsWith("-", StringComparison.Ordinal))
            {
                words[i] = words[i].Substring(0, words[i].Length - 1) + words[i + 1];
                words.RemoveAt(i + 1);
            }
        }

        // Return the words
        Logging.Info("Successfully extracted {0} words: {1}", words.Count, ArrayFormatter.listElements(words));
        return new ExtractionResult(ExtractionResult.ResultType.SUCCESS, words);
    }
    catch (Exception ex)
    {
        Logging.Warn(ex, "There was an exception while extracting coherent text");
        return new ExtractionResult(ExtractionResult.ResultType.EXCEPTION, ex);
    }
}
/// <summary>
/// Renders one PDF page to a bitmap, splits it into rectangles based on the regions
/// reported by <c>PDFRegionLocator</c>, and runs Tesseract OCR over each rectangle.
/// </summary>
/// <remarks>
/// A span between two consecutive regions becomes either two half-width rectangles
/// (when the earlier region's state is <c>BLANKS</c>) or one full-width rectangle
/// (when it is <c>PIXELS</c>). Spans with non-positive height are logged and skipped.
/// A rectangle whose origin matches one that is already queued is merged into it
/// (maximum of widths and heights) instead of being queued twice.
/// </remarks>
/// <param name="pdf_filename">PDF file to render; also used to name the diagnostic PNG.</param>
/// <param name="page_number">Page to render; passed straight through to the renderer.</param>
/// <returns>The words recognised across all rectangles of the page.</returns>
public static WordList DoOCR(string pdf_filename, int page_number)
{
    Logging.Info("+Rendering page {1} for PDF file {0}", pdf_filename, page_number);

    // Image.FromStream requires the stream to stay open for the lifetime of the image, so the
    // bitmap is declared after the stream and is therefore disposed before it.
    using (MemoryStream ms = new MemoryStream(SoraxPDFRenderer.GetPageByDPIAsImage(pdf_filename, pdf_user_password, page_number, 200)))
    using (Bitmap bitmap = (Bitmap)Image.FromStream(ms))
    {
        Logging.Info("-Rendering page #{0}", page_number);

        Logging.Info("Startup directory is {0}", Environment.CurrentDirectory);
        Logging.Info("Language is '{0}'", language);

        using (Tesseract ocr = new Tesseract())
        {
            ocr.Init(null, language, false);

            Logging.Info("+Doing OCR");

            const int MIN_WIDTH = 0;

            // Build a list of all the rectangles to process.
            // NOTE(review): regions[0] assumes the locator always reports at least one region - confirm.
            PDFRegionLocator pdf_region_locator = new PDFRegionLocator(bitmap);
            PDFRegionLocator.Region last_region = pdf_region_locator.regions[0];
            List<Rectangle> rectangles = new List<Rectangle>();
            List<Rectangle> candidates = new List<Rectangle>(2);

            foreach (PDFRegionLocator.Region region in pdf_region_locator.regions)
            {
                // The rectangle spans from the previous region's y to this region's y.
                // (On the first iteration both are regions[0], so the height is 0 and the span is skipped.)
                int rect_height = region.y - last_region.y;
                int clamped_height = Math.Max(MIN_WIDTH, rect_height);

                candidates.Clear();
                if (last_region.state == PDFRegionLocator.SegmentState.BLANKS)
                {
                    // Two columns: both halves must be queued, LHS first.
                    candidates.Add(new Rectangle(0, last_region.y, bitmap.Width / 2, clamped_height));
                    candidates.Add(new Rectangle(bitmap.Width / 2, last_region.y, bitmap.Width / 2, clamped_height));
                }
                else if (last_region.state == PDFRegionLocator.SegmentState.PIXELS)
                {
                    // Full column
                    candidates.Add(new Rectangle(0, last_region.y, bitmap.Width, clamped_height));
                }

                if (rect_height <= 0 || candidates.Count == 0)
                {
                    // Nothing usable for this span (bad height, or a state that yields no rectangle): skip it.
                    Rectangle reported = (candidates.Count > 0) ? candidates[candidates.Count - 1] : new Rectangle();
                    Logging.Warn("Calculated region height is negative or zero: {0} :: Calculated region {1} <-- CURRENT:{2} - LAST:{3}", rect_height, reported, region, last_region);
                }
                else
                {
                    foreach (Rectangle candidate in candidates)
                    {
                        // Rectangle is a struct, so a merge has to be written back into the list;
                        // only rectangles that are actually queued are considered for merging.
                        int existing_index = rectangles.FindIndex(r => r.X == candidate.X && r.Y == candidate.Y);
                        if (existing_index >= 0)
                        {
                            Rectangle merged = rectangles[existing_index];
                            Logging.Warn("Overlapping subsequent rectangles will be merged :: CURRENT:{0} - LAST:{1}", candidate, merged);
                            merged.Width = Math.Max(merged.Width, candidate.Width);
                            merged.Height = Math.Max(merged.Height, candidate.Height);
                            rectangles[existing_index] = merged;
                            Logging.Warn("--> Updated 'last' rectangle:{0}", merged);
                        }
                        else
                        {
                            rectangles.Add(candidate);
                        }
                    }
                }

                last_region = region;
            }

            // DEBUG CODE: Draw in the region rectangles
            //
            // When we run in NOKILL mode, we "know" we're running in a debugger or stand-alone environment
            // intended for testing this code. Hence we should dump the regions image as part of the process.
            //
            // NOTE(review): the outlines are drawn onto the same bitmap that is handed to the OCR
            // engine below - presumably acceptable for diagnostics only; confirm.
            if (no_kill)
            {
                string bitmap_diag_path = pdf_filename + @"." + page_number + @"-ocr.png";

                Logging.Info("Dumping regions-augmented page {0} PNG image to file {1}", page_number, bitmap_diag_path);
                using (Graphics g = Graphics.FromImage(bitmap))
                {
                    // Colour encodes which extents are degenerate (<= MIN_WIDTH).
                    foreach (Rectangle rectangle in rectangles)
                    {
                        if (rectangle.Width <= MIN_WIDTH && rectangle.Height > MIN_WIDTH)
                        {
                            DrawRectangleOutline(g, Pens.Purple, rectangle);
                        }
                        else if (rectangle.Width > MIN_WIDTH && rectangle.Height <= MIN_WIDTH)
                        {
                            DrawRectangleOutline(g, Pens.PowderBlue, rectangle);
                        }
                        else if (rectangle.Width <= MIN_WIDTH && rectangle.Height <= MIN_WIDTH)
                        {
                            DrawRectangleOutline(g, Pens.Red, rectangle);
                        }
                        else
                        {
                            DrawRectangleOutline(g, Pens.LawnGreen, rectangle);
                        }
                    }
                }

                bitmap.Save(bitmap_diag_path, ImageFormat.Png);
            }

            // Do the OCR on each of the rectangles
            WordList word_list = new WordList();
            foreach (Rectangle rectangle in rectangles)
            {
                if (0 == rectangle.Width || 0 == rectangle.Height)
                {
                    Logging.Info("Skipping zero extent rectangle {0}", rectangle.ToString());
                    continue;
                }

                Logging.Info("Doing OCR for region {0} on bitmap WxH: {1}x{2}", rectangle.ToString(), bitmap.Width, bitmap.Height);
                List<Word> result = ocr.DoOCR(bitmap, rectangle);
                Logging.Info("Got {0} words", result.Count);
                word_list.AddRange(ConvertToWordList(result, rectangle, bitmap));
            }

            Logging.Info("-Doing OCR");

            Logging.Info("Found {0} words ({1} @ #{2})", word_list.Count, pdf_filename, page_number);

#if false
            Logging.Info("+Reordering words for columns");
            WordList word_list_ordered = ColumnWordOrderer.ReorderWords(word_list);
            Logging.Info("-Reordering words for columns");
            word_list_ordered.WriteToFile(ocr_output_filename);
#endif

            return word_list;
        }
    }
}