Exemplo n.º 1
0
        /// <summary>
        /// Stand-in for the OCR step: no page is rendered and no OCR engine is run here.
        /// The returned list is filled from the parameterless <c>ConvertToWordList()</c>,
        /// and the usual progress messages are written to the log.
        /// </summary>
        /// <param name="pdf_filename">PDF file name; in this method it is only used in log messages.</param>
        /// <param name="page_number">Page number; in this method it is only used in log messages.</param>
        /// <returns>A new <c>WordList</c> holding the faked words.</returns>
        public static WordList DoOCR(string pdf_filename, int page_number)
        {
            Logging.Info("+Rendering page {1} for PDF file {0}", pdf_filename, page_number);

            // Populate the result directly from the faked word source.
            WordList faked_words = new WordList();
            faked_words.AddRange(ConvertToWordList());

            Logging.Info("-Doing OCR");
            Logging.Info("Faked {0} words ({1} @ #{2})", faked_words.Count, pdf_filename, page_number);

#if false
            Logging.Info("+Reordering words for columns");
            WordList word_list_ordered = ColumnWordOrderer.ReorderWords(faked_words);
            Logging.Info("-Reordering words for columns");
            word_list_ordered.WriteToFile(ocr_output_filename);
#endif

            return faked_words;
        }
        /// <summary>
        /// Builds the flat list of word strings for a whole document from the per-page OCR
        /// word lists supplied by <paramref name="pdf_renderer"/>.
        /// </summary>
        /// <param name="pdf_renderer">Provides the page count and the OCR word list of each page.</param>
        /// <returns>
        /// <c>OCR_NOT_COMPLETE</c> when at least one page has no OCR word list yet;
        /// <c>SUCCESS</c> carrying the extracted words otherwise;
        /// <c>EXCEPTION</c> carrying the exception when anything inside the extraction throws.
        /// </returns>
        public static ExtractionResult ExtractText(PDFRenderer pdf_renderer)
        {
            Logging.Info("Doing text extraction for {0}", pdf_renderer.ToString());

            try
            {
                int          page_count   = pdf_renderer.PageCount;
                PageDetail[] page_details = new PageDetail[page_count];

                // Pages are numbered from 1.
                for (int i = 0; i < page_count; ++i)
                {
                    page_details[i] = new PageDetail(i + 1);
                }

                // Ask for the OCR text of every page exactly once and keep the answer.
                // Every page is still visited (no early exit) so each page gets its log line,
                // and the stored list is the one that was null-checked: a second query could
                // in principle answer differently and hand a null list to the word orderer.
                bool all_pages_already_with_ocr = true;
                foreach (PageDetail page_detail in page_details)
                {
                    WordList word_list = pdf_renderer.GetOCRText(page_detail.page);
                    page_detail.word_list = word_list;

                    if (null != word_list)
                    {
                        Logging.Debug特("Page {0} has OCR available ({1})", page_detail.page, pdf_renderer.DocumentFingerprint);
                    }
                    else
                    {
                        Logging.Debug特("Page {0} has not had OCR done ({1})", page_detail.page, pdf_renderer.DocumentFingerprint);
                        all_pages_already_with_ocr = false;
                    }
                }

                if (!all_pages_already_with_ocr)
                {
                    Logging.Info("Not all pages are ready with OCR");
                    return(new ExtractionResult(ExtractionResult.ResultType.OCR_NOT_COMPLETE));
                }

                // Order the words on each page in a manner that makes sense of multiple columns,
                // and collect their text in document order.
                List <string> words = new List <string>();
                foreach (PageDetail page_detail in page_details)
                {
                    WordList words_ordered_for_page = ColumnWordOrderer.ReorderWords(page_detail.word_list);
                    foreach (Word word in words_ordered_for_page)
                    {
                        words.Add(word.Text);
                    }
                }

                // Kill some of the line-wrapping hyphenation: a word ending in '-' is glued to its successor.
                // Walking backwards means a chain such as "a-", "b-", "c" collapses into "abc".
                // The very last word is never touched because it has no successor.
                for (int i = words.Count - 2; i >= 0; --i)
                {
                    string current = words[i];

                    // Ordinal comparison: only a literal trailing '-' counts, so the character
                    // removed below is guaranteed to be that hyphen.
                    if (current.EndsWith("-", StringComparison.Ordinal))
                    {
                        words[i] = current.Substring(0, current.Length - 1) + words[i + 1];
                        words.RemoveAt(i + 1);
                    }
                }

                // Return the words
                Logging.Info("Successfully extracted {0} words: {1}", words.Count, ArrayFormatter.listElements(words));
                return(new ExtractionResult(ExtractionResult.ResultType.SUCCESS, words));
            }
            catch (Exception ex)
            {
                // Deliberate catch-all: failures are reported to the caller through the result object.
                Logging.Warn(ex, "There was an exception while extracting coherent text");
                return(new ExtractionResult(ExtractionResult.ResultType.EXCEPTION, ex));
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Renders one page of a PDF to a bitmap (200 is passed as the DPI argument), splits the page
        /// into horizontal bands using <c>PDFRegionLocator</c>, runs Tesseract on the rectangle(s)
        /// derived from each band and returns all recognised words.
        /// </summary>
        /// <param name="pdf_filename">Path of the PDF file to render.</param>
        /// <param name="page_number">Number of the page to render and OCR.</param>
        /// <returns>The words found across all processed rectangles of the page.</returns>
        public static WordList DoOCR(string pdf_filename, int page_number)
        {
            Logging.Info("+Rendering page {1} for PDF file {0}", pdf_filename, page_number);
            using (MemoryStream ms = new MemoryStream(SoraxPDFRenderer.GetPageByDPIAsImage(pdf_filename, pdf_user_password, page_number, 200)))
            // The bitmap depends on the stream staying open, so it is disposed first (inner using)
            // and never outlives the stream.
            using (Bitmap bitmap = (Bitmap)Image.FromStream(ms))
            {
                Logging.Info("-Rendering page #{0}", page_number);

                Logging.Info("Startup directory is {0}", Environment.CurrentDirectory);
                Logging.Info("Language is '{0}'", language);

                using (Tesseract ocr = new Tesseract())
                {
                    ocr.Init(null, language, false);

                    Logging.Info("+Doing OCR");

                    // Lower bound applied to rectangle heights (and used as the threshold in the debug colouring below).
                    const int MIN_WIDTH = 0;

                    // Build a list of all the rectangles to process.
                    // Each region marks where a band ends; the band itself starts at the previous region's y
                    // and its layout is decided by the previous region's state.
                    PDFRegionLocator        pdf_region_locator = new PDFRegionLocator(bitmap);
                    PDFRegionLocator.Region last_region        = pdf_region_locator.regions[0];
                    List <Rectangle>        rectangles         = new List <Rectangle>();
                    List <Rectangle>        candidates         = new List <Rectangle>(2);
                    Rectangle last_rectangle = new Rectangle();
                    foreach (PDFRegionLocator.Region region in pdf_region_locator.regions)
                    {
                        int  rect_height     = region.y - last_region.y;
                        bool alarming_height = (rect_height <= 0);
                        int  band_height     = Math.Max(MIN_WIDTH, rect_height);

                        candidates.Clear();
                        if (last_region.state == PDFRegionLocator.SegmentState.BLANKS)
                        {
                            // Two half-width rectangles: BOTH must be queued.
                            // LHS
                            candidates.Add(new Rectangle(0, last_region.y, bitmap.Width / 2, band_height));
                            // RHS
                            candidates.Add(new Rectangle(bitmap.Width / 2, last_region.y, bitmap.Width / 2, band_height));
                        }
                        else if (last_region.state == PDFRegionLocator.SegmentState.PIXELS)
                        {
                            // Full column
                            candidates.Add(new Rectangle(0, last_region.y, bitmap.Width, band_height));
                        }
                        else
                        {
                            // Any other state yields an empty rectangle, which is reported and skipped below.
                            candidates.Add(new Rectangle());
                        }

                        foreach (Rectangle rectangle in candidates)
                        {
                            if (alarming_height || rectangle.Height <= 0)
                            {
                                Logging.Warn("Calculated region height is negative or zero: {0} :: Calculated region {1} <-- CURRENT:{2} - LAST:{3}", rect_height, rectangle, region, last_region);

                                // skip rectangle
                            }
                            else if (rectangles.Count > 0 && last_rectangle.X == rectangle.X && last_rectangle.Y == rectangle.Y)
                            {
                                // Only merge with a rectangle that has actually been queued: without the
                                // Count check a first rectangle at (0,0) would match the default-initialised
                                // last_rectangle and be dropped.
                                Logging.Warn("Overlapping subsequent rectangles will be merged :: CURRENT:{0} - LAST:{1}", rectangle, last_rectangle);
                                last_rectangle.Width  = Math.Max(last_rectangle.Width, rectangle.Width);
                                last_rectangle.Height = Math.Max(last_rectangle.Height, rectangle.Height);

                                // Rectangle is a value type: write the merged copy back into the list,
                                // otherwise the queued entry keeps its old size.
                                rectangles[rectangles.Count - 1] = last_rectangle;
                                Logging.Warn("--> Updated 'last' rectangle:{0}", last_rectangle);
                            }
                            else
                            {
                                rectangles.Add(rectangle);
                                last_rectangle = rectangle;
                            }
                        }

                        last_region = region;
                    }

                    // DEBUG CODE: Draw in the region rectangles
                    //
                    // When we run in NOKILL mode, we "know" we're running in a debugger or stand-alone environment
                    // intended for testing this code. Hence we should dump the regions image as part of the process.
                    //
                    // NOTE(review): the outlines are drawn onto the same bitmap that is OCR'd below -- confirm
                    // this is acceptable for NOKILL runs.
                    if (no_kill)
                    {
                        string bitmap_diag_path = pdf_filename + @"." + page_number + @"-ocr.png";

                        Logging.Info("Dumping regions-augmented page {0} PNG image to file {1}", page_number, bitmap_diag_path);
                        using (Graphics g = Graphics.FromImage(bitmap))
                        {
                            // Colour encodes which extents are degenerate: purple = no width, powder blue = no height,
                            // red = neither, lawn green = a regular rectangle.
                            foreach (Rectangle rectangle in rectangles)
                            {
                                if (rectangle.Width <= MIN_WIDTH && rectangle.Height > MIN_WIDTH)
                                {
                                    DrawRectangleOutline(g, Pens.Purple, rectangle);
                                }
                                else if (rectangle.Width > MIN_WIDTH && rectangle.Height <= MIN_WIDTH)
                                {
                                    DrawRectangleOutline(g, Pens.PowderBlue, rectangle);
                                }
                                else if (rectangle.Width <= MIN_WIDTH && rectangle.Height <= MIN_WIDTH)
                                {
                                    DrawRectangleOutline(g, Pens.Red, rectangle);
                                }
                                else
                                {
                                    DrawRectangleOutline(g, Pens.LawnGreen, rectangle);
                                }
                            }
                        }

                        bitmap.Save(bitmap_diag_path, ImageFormat.Png);
                    }

                    // Do the OCR on each of the rectangles
                    WordList word_list = new WordList();
                    foreach (Rectangle rectangle in rectangles)
                    {
                        if (0 == rectangle.Width || 0 == rectangle.Height)
                        {
                            Logging.Info("Skipping zero extent rectangle {0}", rectangle.ToString());
                            continue;
                        }

                        Logging.Info("Doing OCR for region {0} on bitmap WxH: {1}x{2}", rectangle.ToString(), bitmap.Width, bitmap.Height);
                        List <Word> result = ocr.DoOCR(bitmap, rectangle);
                        Logging.Info("Got {0} words", result.Count);
                        word_list.AddRange(ConvertToWordList(result, rectangle, bitmap));
                    }

                    Logging.Info("-Doing OCR");


                    Logging.Info("Found {0} words ({1} @ #{2})", word_list.Count, pdf_filename, page_number);

#if false
                    Logging.Info("+Reordering words for columns");
                    WordList word_list_ordered = ColumnWordOrderer.ReorderWords(word_list);
                    Logging.Info("-Reordering words for columns");
                    word_list_ordered.WriteToFile(ocr_output_filename);
#endif

                    return(word_list);
                }
            }
        }