// Stand-in variant of the OCR entry point: it renders nothing and runs no OCR engine.
// The returned list is filled from the parameterless ConvertToWordList() (defined
// elsewhere) and the result is logged as "Faked".
//
// pdf_filename : only used in log messages here.
// page_number  : only used in log messages here.
// Returns      : a new WordList holding whatever ConvertToWordList() produced.
public static WordList DoOCR(string pdf_filename, int page_number)
{
    Logging.Info("+Rendering page {1} for PDF file {0}", pdf_filename, page_number);

    // Collect the (faked) words into a fresh list.
    WordList fake_words = new WordList();
    var converted = ConvertToWordList();
    fake_words.AddRange(converted);

    Logging.Info("-Doing OCR");

    int fake_count = fake_words.Count;
    Logging.Info("Faked {0} words ({1} @ #{2})", fake_count, pdf_filename, page_number);

#if false
    Logging.Info("+Reordering words for columns");
    WordList word_list_ordered = ColumnWordOrderer.ReorderWords(fake_words);
    Logging.Info("-Reordering words for columns");
    word_list_ordered.WriteToFile(ocr_output_filename);
#endif

    return fake_words;
}
// Searches one page of a PDF document for the given search terms.
//
// The searchable text of a page is built, in this order, from:
//   1. the document comments (only when page == 1),
//   2. the text of every annotation whose Page equals 'page',
//   3. the OCR/running text returned by pdf_document.GetOCRText(page) (may be null).
//
// pdf_document : the document to search.
// page         : the page number to search.
// terms        : the raw search string; null is treated as "".
// match        : predicate called as match(lower_cased_page_word, lower_cased_keyword_word).
//                It is assumed to be side-effect free: the first word of a phrase is
//                tested once, not twice as before.
// Returns      : one PDFSearchResult per (word position, keyword) hit, each carrying the
//                matched words and a short context sentence. Never null.
public static List<PDFSearchResult> SearchPage(PDFDocument pdf_document, int page, string terms, MatchDelegate match)
{
    // Tidy up the keywords
    string[] keywords = GenerateIndividualKeywords(terms ?? "");

    List<PDFSearchResult> search_results = new List<PDFSearchResult>();
    WordList words = new WordList();

    char[] SPLITTER_WHITESPACE = new char[] { ' ', '\n', '\r', '\t' };

    // Add the comments (they are searched as part of page 1)
    if (1 == page && !String.IsNullOrEmpty(pdf_document.Comments))
    {
        foreach (string split in pdf_document.Comments.Split(SPLITTER_WHITESPACE, StringSplitOptions.RemoveEmptyEntries))
        {
            words.Add(new Word { Text = split });
        }
    }

    // Add the annotations that live on this page
    foreach (var pdf_annotation in pdf_document.GetAnnotations())
    {
        if (page == pdf_annotation.Page && !String.IsNullOrEmpty(pdf_annotation.Text))
        {
            foreach (string split in pdf_annotation.Text.Split(SPLITTER_WHITESPACE, StringSplitOptions.RemoveEmptyEntries))
            {
                words.Add(new Word { Text = split });
            }
        }
    }

    // Add the PDF running text
    {
        WordList words_pdf = pdf_document.GetOCRText(page);
        if (null != words_pdf)
        {
            words.AddRange(words_pdf);
        }
    }

    // Nothing to look for
    if (null == keywords)
    {
        return search_results;
    }

    // Split each keyword into the individual words of its phrase
    string[][] split_keywords = new string[keywords.Length][];
    for (int i = 0; i < keywords.Length; ++i)
    {
        split_keywords[i] = StringTools.Split_NotInDelims(keywords[i].ToLower(), '"', '"', " ").ToArray();
    }

    // Lower-case every page word exactly once, rather than once per keyword and
    // once more per follow-word comparison.
    string[] words_lower = new string[words.Count];
    for (int w = 0; w < words.Count; ++w)
    {
        words_lower[w] = words[w].Text.ToLower();
    }

    const int MIN_CONTEXT_SIZE = 3;
    const int MAX_CONTEXT_SIZE = 10;

    for (int w = 0; w < words.Count; ++w)
    {
        for (int i = 0; i < split_keywords.Length; ++i)
        {
            string[] split_keyword = split_keywords[i];

            // Ignore spurious empty keyword sets
            if (1 > split_keyword.Length)
            {
                continue;
            }

            // Don't process keywords whose first word is too short
            if (2 > split_keyword[0].Length)
            {
                continue;
            }

            // Process the first word - if it doesn't match we are done here
            if (!match(words_lower[w], split_keyword[0]))
            {
                continue;
            }

            // A phrase that would run past the end of the page text cannot match
            if (w + split_keyword.Length > words.Count)
            {
                continue;
            }

            // Check the remaining words of the phrase (index 0 was matched above)
            bool follows_match = true;
            for (int j = 1; j < split_keyword.Length; ++j)
            {
                if (!match(words_lower[w + j], split_keyword[j]))
                {
                    follows_match = false;
                    break;
                }
            }

            // If the remaining words don't match, bail
            if (!follows_match)
            {
                continue;
            }

            // If we get here, the word (and any follow words) match
            PDFSearchResult search_result = new PDFSearchResult();
            search_results.Add(search_result);

            // Store the page
            search_result.keywords = keywords;
            search_result.page = page;

            // Get the words associated with this result
            search_result.keyword_index = i;
            search_result.words = new Word[split_keyword.Length];
            for (int j = 0; j < split_keyword.Length; ++j)
            {
                search_result.words[j] = words[w + j];
            }

            // Create the context sentence
            bool ellipsis_start = false;
            bool ellipsis_end = false;

            // Walk backwards from the hit
            int w_start = w;
            while (w_start > 0)
            {
                // Stop at a preceding sentence, once we have a minimum of context
                if (ContainsASentenceTerminator(words[w_start - 1].Text) && w - w_start >= MIN_CONTEXT_SIZE)
                {
                    break;
                }

                // Stop if we are going too far
                if (w - w_start > MAX_CONTEXT_SIZE)
                {
                    ellipsis_start = true;
                    break;
                }

                --w_start;
            }

            // Walk forwards from the hit
            int w_end = w;
            while (w_end < words.Count)
            {
                // Stop at the end of a sentence, once we have a minimum of context
                if (ContainsASentenceTerminator(words[w_end].Text) && w_end - w >= MIN_CONTEXT_SIZE)
                {
                    break;
                }

                // Stop if we are going too far
                if (w_end - w > MAX_CONTEXT_SIZE)
                {
                    ellipsis_end = true;
                    break;
                }

                // Never step past the last word: w_end is used as an inclusive index below
                if (w_end + 1 == words.Count)
                {
                    break;
                }

                ++w_end;
            }

            StringBuilder sb = new StringBuilder();
            sb.AppendFormat("p{0}: ", page);
            if (ellipsis_start)
            {
                sb.Append("...");
            }
            for (int w_current = w_start; w_current <= w_end; ++w_current)
            {
                sb.Append(words[w_current].Text);
                sb.Append(" ");
            }
            if (ellipsis_end)
            {
                sb.Append("...");
            }
            search_result.context_sentence = sb.ToString();
        }
    }

    return search_results;
}
// Renders one page of a PDF at 200 DPI, slices the page image into rectangles based on
// the regions reported by PDFRegionLocator, runs Tesseract over every rectangle and
// returns all recognised words.
//
// Region handling:
//   - a region whose predecessor is in state BLANKS yields TWO rectangles (left half and
//     right half of the page);
//   - a region whose predecessor is in state PIXELS yields ONE full-width rectangle;
//   - regions with a non-positive height are logged and skipped;
//   - a rectangle starting at the same X/Y as the previously accepted one is merged into
//     it (max of widths / heights) instead of being added a second time.
//
// Fixes relative to the previous revision:
//   - the left-half rectangle was overwritten by the right-half one and never OCRed;
//   - the merge only updated a local copy of the Rectangle struct, so the merged size
//     never reached the list (and a first rectangle at 0,0 could vanish altogether);
//   - the page Bitmap and the debug Graphics object are now disposed.
//
// pdf_filename : path of the PDF to render.
// page_number  : page to render and OCR.
// Returns      : the words found on the page.
// NOTE(review): the bitmap is disposed on return; this assumes ConvertToWordList() does
// not keep a reference to it - confirm.
public static WordList DoOCR(string pdf_filename, int page_number)
{
    Logging.Info("+Rendering page {1} for PDF file {0}", pdf_filename, page_number);

    // Image.FromStream requires the stream to stay open for the lifetime of the image,
    // so the stream is the outermost resource.
    using (MemoryStream ms = new MemoryStream(SoraxPDFRenderer.GetPageByDPIAsImage(pdf_filename, pdf_user_password, page_number, 200)))
    using (Bitmap bitmap = (Bitmap)Image.FromStream(ms))
    {
        Logging.Info("-Rendering page #{0}", page_number);

        Logging.Info("Startup directory is {0}", Environment.CurrentDirectory);
        Logging.Info("Language is '{0}'", language);

        using (Tesseract ocr = new Tesseract())
        {
            ocr.Init(null, language, false);

            Logging.Info("+Doing OCR");

            const int MIN_WIDTH = 0;

            // Build a list of all the rectangles to process
            PDFRegionLocator pdf_region_locator = new PDFRegionLocator(bitmap);
            PDFRegionLocator.Region last_region = pdf_region_locator.regions[0];
            List<Rectangle> rectangles = new List<Rectangle>();
            List<Rectangle> candidates = new List<Rectangle>(2);

            foreach (PDFRegionLocator.Region region in pdf_region_locator.regions)
            {
                int rect_height = region.y - last_region.y;
                int clamped_height = Math.Max(MIN_WIDTH, rect_height);

                candidates.Clear();
                if (last_region.state == PDFRegionLocator.SegmentState.BLANKS)
                {
                    // LHS
                    candidates.Add(new Rectangle(0, last_region.y, bitmap.Width / 2, clamped_height));
                    // RHS
                    candidates.Add(new Rectangle(bitmap.Width / 2, last_region.y, bitmap.Width / 2, clamped_height));
                }
                else if (last_region.state == PDFRegionLocator.SegmentState.PIXELS)
                {
                    // Full column
                    candidates.Add(new Rectangle(0, last_region.y, bitmap.Width, clamped_height));
                }

                if (rect_height <= 0 || 0 == candidates.Count)
                {
                    // Nothing usable for this region: report it and skip.
                    Rectangle reported = (candidates.Count > 0) ? candidates[candidates.Count - 1] : new Rectangle();
                    Logging.Warn("Calculated region height is negative or zero: {0} :: Calculated region {1} <-- CURRENT:{2} - LAST:{3}", rect_height, reported, region, last_region);
                }
                else
                {
                    foreach (Rectangle candidate in candidates)
                    {
                        int last_index = rectangles.Count - 1;
                        if (last_index >= 0 && rectangles[last_index].X == candidate.X && rectangles[last_index].Y == candidate.Y)
                        {
                            // Rectangle is a value type: modify a copy, then store it back
                            // so the merge is actually visible in the list.
                            Rectangle merged = rectangles[last_index];
                            Logging.Warn("Overlapping subsequent rectangles will be merged :: CURRENT:{0} - LAST:{1}", candidate, merged);
                            merged.Width = Math.Max(merged.Width, candidate.Width);
                            merged.Height = Math.Max(merged.Height, candidate.Height);
                            rectangles[last_index] = merged;
                            Logging.Warn("--> Updated 'last' rectangle:{0}", merged);
                        }
                        else
                        {
                            rectangles.Add(candidate);
                        }
                    }
                }

                last_region = region;
            }

            // DEBUG CODE: Draw in the region rectangles
            //
            // When we run in NOKILL mode, we "know" we're running in a debugger or stand-alone environment
            // intended for testing this code. Hence we should dump the regions image as part of the process.
            if (no_kill)
            {
                string bitmap_diag_path = pdf_filename + @"." + page_number + @"-ocr.png";

                Logging.Info("Dumping regions-augmented page {0} PNG image to file {1}", page_number, bitmap_diag_path);
                using (Graphics g = Graphics.FromImage(bitmap))
                {
                    foreach (Rectangle rectangle in rectangles)
                    {
                        bool narrow = (rectangle.Width <= MIN_WIDTH);
                        bool flat = (rectangle.Height <= MIN_WIDTH);

                        if (narrow && !flat)
                        {
                            DrawRectangleOutline(g, Pens.Purple, rectangle);
                        }
                        else if (!narrow && flat)
                        {
                            DrawRectangleOutline(g, Pens.PowderBlue, rectangle);
                        }
                        else if (narrow && flat)
                        {
                            DrawRectangleOutline(g, Pens.Red, rectangle);
                        }
                        else
                        {
                            DrawRectangleOutline(g, Pens.LawnGreen, rectangle);
                        }
                    }
                }

                bitmap.Save(bitmap_diag_path, ImageFormat.Png);
            }

            // Do the OCR on each of the rectangles
            WordList word_list = new WordList();
            foreach (Rectangle rectangle in rectangles)
            {
                if (0 == rectangle.Width || 0 == rectangle.Height)
                {
                    Logging.Info("Skipping zero extent rectangle {0}", rectangle.ToString());
                    continue;
                }

                Logging.Info("Doing OCR for region {0} on bitmap WxH: {1}x{2}", rectangle.ToString(), bitmap.Width, bitmap.Height);
                List<Word> result = ocr.DoOCR(bitmap, rectangle);
                Logging.Info("Got {0} words", result.Count);
                word_list.AddRange(ConvertToWordList(result, rectangle, bitmap));
            }

            Logging.Info("-Doing OCR");

            Logging.Info("Found {0} words ({1} @ #{2})", word_list.Count, pdf_filename, page_number);

#if false
            Logging.Info("+Reordering words for columns");
            WordList word_list_ordered = ColumnWordOrderer.ReorderWords(word_list);
            Logging.Info("-Reordering words for columns");
            word_list_ordered.WriteToFile(ocr_output_filename);
#endif

            return word_list;
        }
    }
}
// Renders one page of a PDF at 200 DPI, slices the page image into rectangles based on
// the regions reported by PDFRegionLocator, runs Tesseract over every rectangle and
// returns all recognised words.
//
// A region whose predecessor is in state BLANKS contributes a left-half and a right-half
// rectangle; a region whose predecessor is in state PIXELS contributes one full-width
// rectangle. Rectangles without a positive width and height are skipped.
//
// Changes relative to the previous revision:
//   - the image stream, the page Bitmap and the Tesseract engine are now disposed;
//   - the dead "if (false) { }" branch is gone;
//   - rectangles with a negative extent (region.y < last_region.y) are skipped as well,
//     instead of being handed to the OCR engine.
//
// pdf_filename : path of the PDF to render.
// page_number  : page to render and OCR.
// Returns      : the words found on the page.
// NOTE(review): the bitmap is disposed on return; this assumes ConvertToWordList() does
// not keep a reference to it - confirm.
public static WordList DoOCR(string pdf_filename, int page_number)
{
    Logging.Info("+Rendering page");
    SoraxPDFRenderer renderer = new SoraxPDFRenderer(pdf_filename, pdf_user_password, pdf_user_password);

    // Image.FromStream requires the stream to stay open for the lifetime of the image,
    // so the stream is the outermost resource.
    using (MemoryStream ms = new MemoryStream(renderer.GetPageByDPIAsImage(page_number, 200)))
    using (Bitmap bitmap = (Bitmap)Image.FromStream(ms))
    {
        Logging.Info("-Rendering page");

        Logging.Info("Startup directory is {0}", Environment.CurrentDirectory);
        Logging.Info("Language is '{0}'", language);

        using (Tesseract ocr = new Tesseract())
        {
            ocr.Init(null, language, false);

            Logging.Info("+Doing OCR");

            // Build a list of all the rectangles to process
            PDFRegionLocator pdf_region_locator = new PDFRegionLocator(bitmap);
            PDFRegionLocator.Region last_region = pdf_region_locator.regions[0];
            List<Rectangle> rectangles = new List<Rectangle>();
            foreach (PDFRegionLocator.Region region in pdf_region_locator.regions)
            {
                int region_height = region.y - last_region.y;

                if (last_region.state == PDFRegionLocator.SegmentState.BLANKS)
                {
                    // LHS
                    rectangles.Add(new Rectangle(0, last_region.y, bitmap.Width / 2, region_height));
                    // RHS
                    rectangles.Add(new Rectangle(bitmap.Width / 2, last_region.y, bitmap.Width / 2, region_height));
                }
                else if (last_region.state == PDFRegionLocator.SegmentState.PIXELS)
                {
                    // Full column
                    rectangles.Add(new Rectangle(0, last_region.y, bitmap.Width, region_height));
                }

                last_region = region;
            }

            // DEBUG CODE: Draw in the region rectangles
            //{
            //    Graphics g = Graphics.FromImage(bitmap);
            //    foreach (Rectangle rectangle in rectangles)
            //    {
            //        g.DrawRectangle(Pens.Black, rectangle);
            //    }
            //    bitmap.Save(@"C:\temp\aaaaaa.png", ImageFormat.Png);
            //}

            // Do the OCR on each of the rectangles
            WordList word_list = new WordList();
            foreach (Rectangle rectangle in rectangles)
            {
                // Zero AND negative extents carry no pixels to recognise.
                if (rectangle.Width <= 0 || rectangle.Height <= 0)
                {
                    Logging.Info("Skipping zero extent rectangle {0}", rectangle.ToString());
                    continue;
                }

                Logging.Info("Doing OCR for region {0}", rectangle.ToString());
                List<Word> result = ocr.DoOCR(bitmap, rectangle);
                Logging.Info("Got {0} words", result.Count);
                word_list.AddRange(ConvertToWordList(result, rectangle, bitmap));
            }

            Logging.Info("-Doing OCR");

            Logging.Info("Found {0} words", word_list.Count);

            //Logging.Info("+Reordering words for columns");
            //WordList word_list_ordered = ColumnWordOrderer.ReorderWords(word_list);
            //Logging.Info("-Reordering words for columns");
            //word_list_ordered.WriteToFile(ocr_output_filename);

            return word_list;
        }
    }
}