Beispiel #1
0
        /// <summary>
        /// Reads a PDF from <paramref name="pdfStream"/> and builds one <c>PDFPage</c> per page,
        /// carrying the page number, the images found on the page, the page text and
        /// (optionally) text recognized from the page images.
        /// </summary>
        /// <param name="pdfStream">Stream containing the PDF. It is rewound to the start when it is seekable.</param>
        /// <param name="log">Trace writer receiving progress and error messages.</param>
        /// <param name="ocrImages">
        /// When true, pages with fewer than 10 images are passed to <c>VisionAPIHelper.OCRPage</c>.
        /// </param>
        /// <returns>The pages in document order; a page whose processing failed is still returned.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="pdfStream"/> is null.</exception>
        public static List<PDFPage> GetPDFPages(Stream pdfStream, TraceWriter log, bool ocrImages = false)
        {
            if (pdfStream == null)
            {
                throw new ArgumentNullException("pdfStream");
            }

            var result = new List<PDFPage>();

            // Ensure that we are at the start; a non-seekable stream is read from where it is.
            if (pdfStream.CanSeek)
            {
                pdfStream.Position = 0;
            }

            // Note: PdfReader Dispose closes the stream...
            using (PdfReader reader = new PdfReader(pdfStream))
            {
                var numberOfPages = reader.NumberOfPages;
                var parser = new PdfReaderContentParser(reader);

                for (var i = 1; i <= numberOfPages; i++)
                {
                    var page = new PDFPage {
                        Number = i
                    };

                    // The listener is scoped to this page so that a failure while creating it
                    // can never leave the previous page's images attached to this page.
                    ImageRenderListener listener = null;
                    try
                    {
                        listener = new ImageRenderListener(log);
                        parser.ProcessContent(i, listener);
                    }
                    catch (Exception ex)
                    {
                        // Best effort: images collected before the failure are still used below.
                        log.Error(string.Format("Page {0} Image Processing Exception", i), ex);
                    }

                    if (listener != null && listener.Images.Count > 0)
                    {
                        log.Info(string.Format("Found {0} images on page {1}.", listener.Images.Count, i));
                        page.ExtractedImages = listener.Images;
                        if (ocrImages)
                        {
                            if (listener.Images.Count < 10)
                            {
                                log.Info("Calling Vision API to OCR Page Images");
                                VisionAPIHelper.OCRPage(page, log);
                            }
                            else
                            {
                                log.Info("Too many Page Images for Vision API");
                            }
                        }
                    }

                    try
                    {
                        page.PageText = PdfTextExtractor.GetTextFromPage(reader, i, new SimpleTextExtractionStrategy());
                    }
                    catch (System.ArgumentException ex)
                    {
                        // The page is still returned, just without text.
                        log.Error(string.Format("Page {0} Text Processing Exception", i), ex);
                    }

                    result.Add(page);
                }
            }
            return result;
        }
        /// <summary>
        /// Runs text recognition over the images extracted from a PDF page and appends each
        /// result, trimmed and terminated by "\r\n", to <c>thisPage.OCRText</c>.
        /// </summary>
        /// <remarks>
        /// Images that are not bitmaps, or are smaller than 40 x 40, are skipped. Images are
        /// converted to grayscale unless they are already 1bpp/8bpp indexed. An image larger
        /// than 3200 x 3200 is center-cropped when both sides are below 3500, otherwise it is
        /// scaled down. Grayscale failures, unsupported pixel formats and encoding/recognition
        /// failures are logged and processing continues with the next image.
        /// </remarks>
        /// <param name="thisPage">Page whose <c>ExtractedImages</c> are recognized; a null image list is a no-op.</param>
        /// <param name="log">Trace writer receiving progress and warning messages.</param>
        /// <exception cref="ArgumentNullException"><paramref name="thisPage"/> is null.</exception>
        public static void OCRPage(PDFPage thisPage, TraceWriter log)
        {
            const int minDimension = 40;            // smallest side accepted for OCR
            const int maxDimension = 3200;          // largest side accepted for OCR
            const int maxCroppableDimension = 3500; // below this we crop margins instead of scaling

            if (thisPage == null)
            {
                throw new ArgumentNullException("thisPage");
            }
            if (thisPage.ExtractedImages == null)
            {
                return;
            }

            foreach (var img in thisPage.ExtractedImages)
            {
                // Image must be Bitmap
                if (!(img is Bitmap))
                {
                    log.Info("Found image that was not Bitmap - ignoring");
                    continue;
                }
                // Image must be at least 40 x 40
                if (img.Width < minDimension || img.Height < minDimension)
                {
                    log.Info("Image is too small - ignoring");
                    continue;
                }

                // 'source' stays owned by the page; 'bmp' is the working image and is only
                // disposed when it is an intermediate copy created below.
                var source = (Bitmap)img;
                var bmp = source;
                try
                {
                    // Convert this image to grayscale; if that fails just use it as-is.
                    if (source.PixelFormat != PixelFormat.Format1bppIndexed &&
                        source.PixelFormat != PixelFormat.Format8bppIndexed)
                    {
                        try
                        {
                            bmp = Grayscale.CommonAlgorithms.BT709.Apply(source);
                        }
                        catch (Exception ex)
                        {
                            log.Warning(string.Format("Page {0} grayscale conversion failed - using image as-is: {1}", thisPage.Number, ex.Message));
                        }
                    }

                    // Image can't be larger than 3200 x 3200
                    if (bmp.Width > maxDimension || bmp.Height > maxDimension)
                    {
                        log.Info(string.Format("Image is too big {0} x {1} - dealing with that...", bmp.Width, bmp.Height));

                        Bitmap fitted;
                        try
                        {
                            if (bmp.Width < maxCroppableDimension && bmp.Height < maxCroppableDimension)
                            {
                                // Lets crop it - hopefully there are margins
                                log.Info("Cropping the image");

                                // Trim the excess equally from both sides of each oversized
                                // axis so the result is exactly maxDimension along that axis.
                                var rect = new System.Drawing.Rectangle(0, 0, bmp.Width, bmp.Height);
                                if (bmp.Width > maxDimension)
                                {
                                    rect.X     = (bmp.Width - maxDimension) / 2;
                                    rect.Width = maxDimension;
                                }
                                if (bmp.Height > maxDimension)
                                {
                                    rect.Y      = (bmp.Height - maxDimension) / 2;
                                    rect.Height = maxDimension;
                                }

                                Crop cropFilter = new Crop(rect);
                                fitted = cropFilter.Apply(bmp);
                            }
                            else
                            {
                                // Scale the image down, keeping the aspect ratio, so that the
                                // longer side becomes maxDimension.
                                log.Info("Resizing the image");
                                var scaleFactor = Math.Max(bmp.Width, bmp.Height) / (float)maxDimension;

                                // Clamp so rounding can never yield a side above the limit or of zero.
                                var newWidth  = Math.Max(1, Math.Min(maxDimension, Convert.ToInt32(bmp.Width / scaleFactor)));
                                var newHeight = Math.Max(1, Math.Min(maxDimension, Convert.ToInt32(bmp.Height / scaleFactor)));

                                ResizeBicubic resizeFilter = new ResizeBicubic(newWidth, newHeight);
                                fitted = resizeFilter.Apply(bmp);
                            }
                        }
                        catch (AForge.Imaging.UnsupportedImageFormatException)
                        {
                            // Skip only this image; the rest of the page is still processed.
                            log.Warning(string.Format("Page {0} image has a pixel format that cannot be cropped or resized - ignoring", thisPage.Number));
                            continue;
                        }

                        // Release the intermediate grayscale copy, never the page's own image.
                        if (!object.ReferenceEquals(bmp, source) && !object.ReferenceEquals(bmp, fitted))
                        {
                            bmp.Dispose();
                        }
                        bmp = fitted;
                    }

                    using (var ms = new MemoryStream())
                    {
                        try
                        {
                            bmp.Save(ms, System.Drawing.Imaging.ImageFormat.Jpeg);
                            ms.Position = 0;

                            // Block on the async call from a thread-pool task. GetResult()
                            // rethrows the original exception instead of an AggregateException,
                            // so the warning below carries the real failure message.
                            var ocrResult = Task.Run<string>(async() =>
                            {
                                var result = await UploadAndRecognizeImageTextAsync(ms, "en");
                                return result;
                            }).GetAwaiter().GetResult();

                            if (ocrResult != null)
                            {
                                thisPage.OCRText += ocrResult.Trim() + "\r\n";
                            }
                        }
                        catch (Exception ex)
                        {
                            log.Warning(string.Format("Page {0} OCR Exception: {1}", thisPage.Number, ex.Message));
                        }
                    }
                }
                finally
                {
                    // Dispose whatever working copy we ended up with (grayscale, cropped or resized).
                    if (!object.ReferenceEquals(bmp, source))
                    {
                        bmp.Dispose();
                    }
                }
            }
        }