Ejemplo n.º 1
0
        /// <summary>
        /// Stand-in for the real OCR entry point: no page is rendered and no OCR engine is invoked.
        /// The returned list is filled from the parameterless <c>ConvertToWordList()</c> helper and
        /// is reported in the log as "faked" words.
        /// </summary>
        /// <param name="pdf_filename">PDF file name; only used in log messages here.</param>
        /// <param name="page_number">Page number; only used in log messages here.</param>
        /// <returns>A new <c>WordList</c> holding whatever <c>ConvertToWordList()</c> produced.</returns>
        public static WordList DoOCR(string pdf_filename, int page_number)
        {
            Logging.Info("+Rendering page {1} for PDF file {0}", pdf_filename, page_number);

            // Collect the substitute words into a fresh list
            WordList faked_words = new WordList();
            faked_words.AddRange(ConvertToWordList());

            Logging.Info("-Doing OCR");
            Logging.Info("Faked {0} words ({1} @ #{2})", faked_words.Count, pdf_filename, page_number);

#if false
            // Disabled: column-aware reordering of the words and writing them to file
            Logging.Info("+Reordering words for columns");
            WordList word_list_ordered = ColumnWordOrderer.ReorderWords(faked_words);
            Logging.Info("-Reordering words for columns");
            word_list_ordered.WriteToFile(ocr_output_filename);
#endif

            return faked_words;
        }
Ejemplo n.º 2
0
        // Characters on which free text (comments, annotation text) is split into words.
        private static readonly char[] SEARCH_PAGE_SPLITTER_WHITESPACE = new char[] { ' ', '\n', '\r', '\t' };

        /// <summary>
        /// Searches a single page of a PDF document for the given search terms.
        /// The searched words are, in this order: the document comments (page 1 only), the text of
        /// every annotation located on the page, and the OCR text of the page.
        /// </summary>
        /// <param name="pdf_document">Document to search.</param>
        /// <param name="page">Page number to search; the document comments are only included when this is 1.</param>
        /// <param name="terms">Raw search terms; <c>null</c> is treated as the empty string.</param>
        /// <param name="match">
        /// Predicate called as <c>match(lower_cased_word, lower_cased_keyword_part)</c>.
        /// It is expected to be side-effect free: the first word of a phrase is tested only once.
        /// </param>
        /// <returns>
        /// One <c>PDFSearchResult</c> per (word position, keyword) hit, in word order; empty when nothing matches.
        /// </returns>
        public static List <PDFSearchResult> SearchPage(PDFDocument pdf_document, int page, string terms, MatchDelegate match)
        {
            // Tidy up the keywords
            if (null == terms)
            {
                terms = "";
            }

            string[] keywords = GenerateIndividualKeywords(terms);

            List <PDFSearchResult> search_results = new List <PDFSearchResult>();
            WordList words = new WordList();

            // Add the comments (they are considered part of the first page only)
            if (1 == page)
            {
                SearchPage_AddSplitText(words, pdf_document.Comments);
            }

            // Add the annotations that live on this page
            foreach (var pdf_annotation in pdf_document.GetAnnotations())
            {
                if (page == pdf_annotation.Page)
                {
                    SearchPage_AddSplitText(words, pdf_annotation.Text);
                }
            }

            // Add the PDF running text
            WordList words_pdf = pdf_document.GetOCRText(page);
            if (null != words_pdf)
            {
                words.AddRange(words_pdf);
            }

            // Nothing to look for
            if (null == keywords)
            {
                return(search_results);
            }

            // Split each keyword into its (lower-cased) parts; double-quoted sections are kept together
            string[][] split_keywords = new string[keywords.Length][];
            for (int i = 0; i < keywords.Length; ++i)
            {
                split_keywords[i] = StringTools.Split_NotInDelims(keywords[i].ToLower(), '"', '"', " ").ToArray();
            }

            for (int w = 0; w < words.Count; ++w)
            {
                string first_word_lower = words[w].Text.ToLower();

                for (int i = 0; i < split_keywords.Length; ++i)
                {
                    string[] split_keyword = split_keywords[i];

                    // Ignore spurious empty keyword sets
                    if (1 > split_keyword.Length)
                    {
                        continue;
                    }

                    // Don't process keywords whose first part is too short
                    if (2 > split_keyword[0].Length)
                    {
                        continue;
                    }

                    // Process the first word - if it doesn't match we are done here
                    if (!match(first_word_lower, split_keyword[0]))
                    {
                        continue;
                    }

                    // Multi-part keywords also need every following word to match
                    if (!SearchPage_FollowWordsMatch(words, w, split_keyword, match))
                    {
                        continue;
                    }

                    // If we get here, the word (and any follow words) match
                    PDFSearchResult search_result = new PDFSearchResult();
                    search_results.Add(search_result);

                    search_result.keywords      = keywords;
                    search_result.page          = page;
                    search_result.keyword_index = i;

                    // Record the words that make up this hit
                    search_result.words = new Word[split_keyword.Length];
                    for (int j = 0; j < split_keyword.Length; ++j)
                    {
                        search_result.words[j] = words[w + j];
                    }

                    search_result.context_sentence = SearchPage_BuildContextSentence(words, w, page);
                }
            }

            return(search_results);
        }

        // Splits text on whitespace and appends each non-empty piece to words; null/empty text adds nothing.
        private static void SearchPage_AddSplitText(WordList words, string text)
        {
            if (String.IsNullOrEmpty(text))
            {
                return;
            }

            foreach (string split in text.Split(SEARCH_PAGE_SPLITTER_WHITESPACE, StringSplitOptions.RemoveEmptyEntries))
            {
                words.Add(new Word {
                    Text = split
                });
            }
        }

        // Checks parts 1..n-1 of split_keyword against the words following position w (part 0 was matched by the caller).
        private static bool SearchPage_FollowWordsMatch(WordList words, int w, string[] split_keyword, MatchDelegate match)
        {
            // The whole phrase has to fit into the remaining words
            if (w + split_keyword.Length > words.Count)
            {
                return false;
            }

            for (int j = 1; j < split_keyword.Length; ++j)
            {
                if (!match(words[w + j].Text.ToLower(), split_keyword[j]))
                {
                    return false;
                }
            }

            return true;
        }

        // Builds the "p<page>: ...words... ..." snippet around the hit at word index w.
        private static string SearchPage_BuildContextSentence(WordList words, int w, int page)
        {
            const int MIN_CONTEXT_SIZE = 3;
            const int MAX_CONTEXT_SIZE = 10;

            // Walk backwards: stop at a sentence end once we have enough context, or when we have gone too far
            bool ellipsis_start = false;
            int  w_start        = w;
            while (w_start > 0)
            {
                if (ContainsASentenceTerminator(words[w_start - 1].Text) && w - w_start >= MIN_CONTEXT_SIZE)
                {
                    break;
                }

                if (w - w_start > MAX_CONTEXT_SIZE)
                {
                    ellipsis_start = true;
                    break;
                }

                --w_start;
            }

            // Walk forwards with the same rules, never stepping past the last word
            bool ellipsis_end = false;
            int  w_end        = w;
            while (w_end < words.Count)
            {
                if (ContainsASentenceTerminator(words[w_end].Text) && w_end - w >= MIN_CONTEXT_SIZE)
                {
                    break;
                }

                if (w_end - w > MAX_CONTEXT_SIZE)
                {
                    ellipsis_end = true;
                    break;
                }

                if (w_end + 1 == words.Count)
                {
                    break;
                }

                ++w_end;
            }

            StringBuilder sb = new StringBuilder();
            sb.AppendFormat("p{0}: ", page);
            if (ellipsis_start)
            {
                sb.Append("...");
            }
            for (int w_current = w_start; w_current <= w_end; ++w_current)
            {
                sb.Append(words[w_current].Text);
                sb.Append(" ");
            }
            if (ellipsis_end)
            {
                sb.Append("...");
            }

            return sb.ToString();
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Renders one page of a PDF at 200 DPI, partitions the page image into horizontal regions
        /// (split into a left and a right half where the region state is BLANKS, full width where
        /// it is PIXELS) and runs Tesseract OCR on every resulting rectangle.
        /// </summary>
        /// <param name="pdf_filename">Path of the PDF file to render.</param>
        /// <param name="page_number">Page to render and OCR.</param>
        /// <returns>All words recognised on the page, in rectangle order.</returns>
        public static WordList DoOCR(string pdf_filename, int page_number)
        {
            Logging.Info("+Rendering page {1} for PDF file {0}", pdf_filename, page_number);

            // The stream has to stay open for as long as the bitmap created from it is in use,
            // hence the bitmap is disposed first (inner using) and the stream last.
            using (MemoryStream ms = new MemoryStream(SoraxPDFRenderer.GetPageByDPIAsImage(pdf_filename, pdf_user_password, page_number, 200)))
            using (Bitmap bitmap = (Bitmap)Image.FromStream(ms))
            {
                Logging.Info("-Rendering page #{0}", page_number);

                Logging.Info("Startup directory is {0}", Environment.CurrentDirectory);
                Logging.Info("Language is '{0}'", language);

                using (Tesseract ocr = new Tesseract())
                {
                    ocr.Init(null, language, false);

                    Logging.Info("+Doing OCR");

                    const int MIN_WIDTH = 0;

                    // Build a list of all the rectangles to process.
                    // Each rectangle spans from the previous region's y to the current region's y.
                    // NOTE(review): assumes the locator always yields at least one region - confirm.
                    PDFRegionLocator        pdf_region_locator = new PDFRegionLocator(bitmap);
                    PDFRegionLocator.Region last_region        = pdf_region_locator.regions[0];
                    List <Rectangle>        rectangles         = new List <Rectangle>();
                    foreach (PDFRegionLocator.Region region in pdf_region_locator.regions)
                    {
                        int rect_height    = region.y - last_region.y;
                        int clamped_height = Math.Max(MIN_WIDTH, rect_height);

                        if (last_region.state == PDFRegionLocator.SegmentState.BLANKS)
                        {
                            // Both halves are queued: LHS first, then RHS
                            int half_width = bitmap.Width / 2;
                            DoOCR_AddOrMergeRectangle(rectangles, new Rectangle(0, last_region.y, half_width, clamped_height), rect_height, region, last_region);
                            DoOCR_AddOrMergeRectangle(rectangles, new Rectangle(half_width, last_region.y, half_width, clamped_height), rect_height, region, last_region);
                        }
                        else if (last_region.state == PDFRegionLocator.SegmentState.PIXELS)
                        {
                            // Full column
                            DoOCR_AddOrMergeRectangle(rectangles, new Rectangle(0, last_region.y, bitmap.Width, clamped_height), rect_height, region, last_region);
                        }
                        else
                        {
                            // Any other state produces an empty rectangle, which is reported and skipped
                            DoOCR_AddOrMergeRectangle(rectangles, new Rectangle(), rect_height, region, last_region);
                        }

                        last_region = region;
                    }

                    // DEBUG CODE: Draw in the region rectangles
                    //
                    // When we run in NOKILL mode, we "know" we're running in a debugger or stand-alone environment
                    // intended for testing this code. Hence we should dump the regions image as part of the process.
                    //
                    // NOTE(review): the outlines are drawn onto the same bitmap that is handed to the OCR engine below.
                    if (no_kill)
                    {
                        string bitmap_diag_path = pdf_filename + @"." + page_number + @"-ocr.png";

                        Logging.Info("Dumping regions-augmented page {0} PNG image to file {1}", page_number, bitmap_diag_path);
                        using (Graphics g = Graphics.FromImage(bitmap))
                        {
                            foreach (Rectangle rectangle in rectangles)
                            {
                                if (rectangle.Width <= MIN_WIDTH && rectangle.Height > MIN_WIDTH)
                                {
                                    DrawRectangleOutline(g, Pens.Purple, rectangle);
                                }
                                else if (rectangle.Width > MIN_WIDTH && rectangle.Height <= MIN_WIDTH)
                                {
                                    DrawRectangleOutline(g, Pens.PowderBlue, rectangle);
                                }
                                else if (rectangle.Width <= MIN_WIDTH && rectangle.Height <= MIN_WIDTH)
                                {
                                    DrawRectangleOutline(g, Pens.Red, rectangle);
                                }
                                else
                                {
                                    DrawRectangleOutline(g, Pens.LawnGreen, rectangle);
                                }
                            }
                        }

                        bitmap.Save(bitmap_diag_path, ImageFormat.Png);
                    }

                    // Do the OCR on each of the rectangles
                    WordList word_list = new WordList();
                    foreach (Rectangle rectangle in rectangles)
                    {
                        if (0 == rectangle.Width || 0 == rectangle.Height)
                        {
                            Logging.Info("Skipping zero extent rectangle {0}", rectangle.ToString());
                            continue;
                        }

                        Logging.Info("Doing OCR for region {0} on bitmap WxH: {1}x{2}", rectangle.ToString(), bitmap.Width, bitmap.Height);
                        List <Word> result = ocr.DoOCR(bitmap, rectangle);
                        Logging.Info("Got {0} words", result.Count);
                        word_list.AddRange(ConvertToWordList(result, rectangle, bitmap));
                    }

                    Logging.Info("-Doing OCR");

                    Logging.Info("Found {0} words ({1} @ #{2})", word_list.Count, pdf_filename, page_number);

#if false
                    Logging.Info("+Reordering words for columns");
                    WordList word_list_ordered = ColumnWordOrderer.ReorderWords(word_list);
                    Logging.Info("-Reordering words for columns");
                    word_list_ordered.WriteToFile(ocr_output_filename);
#endif

                    return(word_list);
                }
            }
        }

        // Queues one candidate OCR rectangle: rejects non-positive heights, grows the previously queued
        // rectangle when the candidate starts at the same top-left corner, and appends it otherwise.
        private static void DoOCR_AddOrMergeRectangle(List <Rectangle> rectangles, Rectangle rectangle, int rect_height, PDFRegionLocator.Region region, PDFRegionLocator.Region last_region)
        {
            if (rect_height <= 0 || rectangle.Height <= 0)
            {
                Logging.Warn("Calculated region height is negative or zero: {0} :: Calculated region {1} <-- CURRENT:{2} - LAST:{3}", rect_height, rectangle, region, last_region);

                // skip rectangle
                return;
            }

            int last_index = rectangles.Count - 1;
            if (last_index >= 0 && rectangles[last_index].X == rectangle.X && rectangles[last_index].Y == rectangle.Y)
            {
                // Rectangle is a value type: modify a copy and store it back, otherwise the list entry never changes
                Rectangle last_rectangle = rectangles[last_index];
                Logging.Warn("Overlapping subsequent rectangles will be merged :: CURRENT:{0} - LAST:{1}", rectangle, last_rectangle);
                last_rectangle.Width  = Math.Max(last_rectangle.Width, rectangle.Width);
                last_rectangle.Height = Math.Max(last_rectangle.Height, rectangle.Height);
                rectangles[last_index] = last_rectangle;
                Logging.Warn("--> Updated 'last' rectangle:{0}", last_rectangle);
                return;
            }

            rectangles.Add(rectangle);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Renders one page of a PDF at 200 DPI, partitions the page image into horizontal regions
        /// (left and right halves where the region state is BLANKS, full width where it is PIXELS)
        /// and runs Tesseract OCR on every rectangle with a positive extent.
        /// </summary>
        /// <param name="pdf_filename">Path of the PDF file to render.</param>
        /// <param name="page_number">Page to render and OCR.</param>
        /// <returns>All words recognised on the page, in rectangle order.</returns>
        public static WordList DoOCR(string pdf_filename, int page_number)
        {
            Logging.Info("+Rendering page");

            // NOTE(review): pdf_user_password is passed for both password arguments - confirm this is intended.
            SoraxPDFRenderer renderer = new SoraxPDFRenderer(pdf_filename, pdf_user_password, pdf_user_password);

            // The stream has to stay open for as long as the bitmap created from it is in use,
            // hence the bitmap is disposed first (inner using) and the stream last.
            using (MemoryStream ms = new MemoryStream(renderer.GetPageByDPIAsImage(page_number, 200)))
            using (Bitmap bitmap = (Bitmap)Image.FromStream(ms))
            {
                Logging.Info("-Rendering page");

                Logging.Info("Startup directory is {0}", Environment.CurrentDirectory);
                Logging.Info("Language is '{0}'", language);

                using (Tesseract ocr = new Tesseract())
                {
                    ocr.Init(null, language, false);

                    Logging.Info("+Doing OCR");

                    // Build a list of all the rectangles to process.
                    // Each rectangle spans from the previous region's y to the current region's y.
                    // NOTE(review): assumes the locator always yields at least one region - confirm.
                    PDFRegionLocator        pdf_region_locator = new PDFRegionLocator(bitmap);
                    PDFRegionLocator.Region last_region        = pdf_region_locator.regions[0];
                    List <Rectangle>        rectangles         = new List <Rectangle>();

                    foreach (PDFRegionLocator.Region region in pdf_region_locator.regions)
                    {
                        int rect_height = region.y - last_region.y;

                        if (last_region.state == PDFRegionLocator.SegmentState.BLANKS)
                        {
                            // LHS, then RHS
                            int half_width = bitmap.Width / 2;
                            rectangles.Add(new Rectangle(0, last_region.y, half_width, rect_height));
                            rectangles.Add(new Rectangle(half_width, last_region.y, half_width, rect_height));
                        }
                        else if (last_region.state == PDFRegionLocator.SegmentState.PIXELS)
                        {
                            // Full column
                            rectangles.Add(new Rectangle(0, last_region.y, bitmap.Width, rect_height));
                        }

                        last_region = region;
                    }

                    // Do the OCR on each of the rectangles
                    WordList word_list = new WordList();

                    foreach (Rectangle rectangle in rectangles)
                    {
                        // The first iteration above always yields a zero height (region == last_region);
                        // negative extents are rejected as well so they never reach the OCR engine.
                        if (rectangle.Width <= 0 || rectangle.Height <= 0)
                        {
                            Logging.Info("Skipping zero extent rectangle {0}", rectangle.ToString());
                            continue;
                        }

                        Logging.Info("Doing OCR for region {0}", rectangle.ToString());
                        List <Word> result = ocr.DoOCR(bitmap, rectangle);
                        Logging.Info("Got {0} words", result.Count);
                        word_list.AddRange(ConvertToWordList(result, rectangle, bitmap));
                    }

                    Logging.Info("-Doing OCR");

                    Logging.Info("Found {0} words", word_list.Count);

                    return(word_list);
                }
            }
        }