/// <summary>
/// Reads a PDF from <paramref name="pdfStream"/> and returns one <see cref="PDFPage"/> per page,
/// populated with the images found on the page, the page text and (optionally) OCR text.
/// </summary>
/// <param name="pdfStream">
/// Stream containing the PDF. It is rewound to position 0 before reading.
/// Per the original author's note, disposing the PdfReader also closes this stream.
/// </param>
/// <param name="log">Trace writer that receives progress and error messages.</param>
/// <param name="ocrImages">
/// When true, pages that have at least one and fewer than 10 images are passed to
/// VisionAPIHelper.OCRPage.
/// </param>
/// <returns>The pages in document order; page numbers start at 1.</returns>
public static List<PDFPage> GetPDFPages(Stream pdfStream, TraceWriter log, bool ocrImages = false)
{
    var result = new List<PDFPage>();

    // Ensure that we are at the start
    pdfStream.Position = 0;

    // Note: PdfReader Dispose closes the stream...
    using (PdfReader reader = new PdfReader(pdfStream))
    {
        var numberOfPages = reader.NumberOfPages;
        var parser = new PdfReaderContentParser(reader);

        for (var i = 1; i <= numberOfPages; i++)
        {
            var page = new PDFPage { Number = i };

            // A fresh listener per page, declared inside the loop so that a failure while
            // creating it can never leave us inspecting the previous page's images (or null).
            ImageRenderListener listener = null;
            try
            {
                listener = new ImageRenderListener(log);
                parser.ProcessContent(i, listener);
            }
            catch (Exception ex)
            {
                // Best effort: an image-processing failure must not stop text extraction.
                log.Error(string.Format("Page {0} Image Processing Exception", i), ex);
            }

            if (listener != null && listener.Images.Count > 0)
            {
                log.Info(string.Format("Found {0} images on page {1}.", listener.Images.Count, i));
                page.ExtractedImages = listener.Images;

                if (ocrImages)
                {
                    if (listener.Images.Count < 10)
                    {
                        log.Info("Calling Vision API to OCR Page Images");
                        VisionAPIHelper.OCRPage(page, log);
                    }
                    else
                    {
                        log.Info("Too many Page Images for Vision API");
                    }
                }
            }

            try
            {
                page.PageText = PdfTextExtractor.GetTextFromPage(reader, i, new SimpleTextExtractionStrategy());
            }
            catch (System.ArgumentException ex)
            {
                // PageText is left unset for this page; the page is still returned.
                log.Error(string.Format("Page {0} Text Processing Exception", i), ex);
            }

            result.Add(page);
        }
    }

    return result;
}
/// <summary>
/// Sends each usable image on <paramref name="thisPage"/> to the text-recognition service and
/// appends the recognized text (one line per image) to <c>thisPage.OCRText</c>.
/// </summary>
/// <remarks>
/// Images that are not bitmaps or are smaller than 40 x 40 are skipped. Non-indexed images are
/// converted to grayscale where possible. Images with a side longer than 3200 pixels are
/// center-cropped when both sides are below 3500, otherwise scaled down proportionally.
/// A failure on one image is logged and does not prevent the remaining images being processed.
/// </remarks>
/// <param name="thisPage">Page whose <c>ExtractedImages</c> are processed; its <c>OCRText</c> is appended to.</param>
/// <param name="log">Trace writer that receives progress and warning messages.</param>
/// <exception cref="ApplicationException">The computed down-scaled size still exceeds 3200 pixels.</exception>
public static void OCRPage(PDFPage thisPage, TraceWriter log)
{
    const int minDimension = 40;     // Image must be at least 40 x 40
    const int maxDimension = 3200;   // Image can't be larger than 3200 x 3200
    const int cropThreshold = 3500;  // Below this we crop (hoping for margins) instead of scaling

    if (thisPage.ExtractedImages == null)
    {
        return; // Nothing to OCR
    }

    foreach (var img in thisPage.ExtractedImages)
    {
        // Image must be Bitmap
        var source = img as Bitmap;
        if (source == null)
        {
            log.Info("Found image that was not Bitmap - ignoring");
            continue;
        }

        if (source.Width < minDimension || source.Height < minDimension)
        {
            log.Info("Image is too small - ignoring");
            continue;
        }

        // 'bmp' is the working image. Whenever it is not 'source' it is a temporary bitmap that
        // this method created and must dispose; 'source' itself still belongs to the page.
        var bmp = source;
        try
        {
            // Convert to grayscale unless the image is already indexed.
            if (source.PixelFormat != PixelFormat.Format1bppIndexed && source.PixelFormat != PixelFormat.Format8bppIndexed)
            {
                try
                {
                    bmp = Grayscale.CommonAlgorithms.BT709.Apply(source);
                }
                catch (Exception ex)
                {
                    // In case Grayscale fails - just use it as-is
                    log.Info(string.Format("Grayscale conversion failed - using image as-is: {0}", ex.Message));
                }
            }

            if (bmp.Width > maxDimension || bmp.Height > maxDimension)
            {
                log.Info(string.Format("Image is too big {0} x {1} - dealing with that...", bmp.Width, bmp.Height));

                try
                {
                    Bitmap reduced;
                    if (bmp.Width < cropThreshold && bmp.Height < cropThreshold)
                    {
                        // Lets crop it - hopefully there are margins
                        log.Info("Cropping the image");

                        // Centered crop: trim the excess equally from both sides of each
                        // oversized axis so the result is exactly maxDimension on that axis.
                        var rect = new System.Drawing.Rectangle(0, 0, bmp.Width, bmp.Height);
                        if (bmp.Width > maxDimension)
                        {
                            rect.X = (bmp.Width - maxDimension) / 2;
                            rect.Width = maxDimension;
                        }
                        if (bmp.Height > maxDimension)
                        {
                            rect.Y = (bmp.Height - maxDimension) / 2;
                            rect.Height = maxDimension;
                        }

                        reduced = new Crop(rect).Apply(bmp);
                    }
                    else
                    {
                        // Scale the image down
                        log.Info("Resizing the image");

                        // Scale by the longer side so that it lands on maxDimension and the
                        // aspect ratio is preserved.
                        var scaleFactor = Math.Max(bmp.Width, bmp.Height) / (float)maxDimension;
                        var newWidth = Math.Max(1, Convert.ToInt32(bmp.Width / scaleFactor));
                        var newHeight = Math.Max(1, Convert.ToInt32(bmp.Height / scaleFactor));

                        if (newWidth > maxDimension || newHeight > maxDimension)
                        {
                            throw new ApplicationException("DOH! Miscalculated Scale");
                        }

                        reduced = new ResizeBicubic(newWidth, newHeight).Apply(bmp);
                    }

                    if (!object.ReferenceEquals(bmp, source))
                    {
                        bmp.Dispose(); // Release the intermediate grayscale copy
                    }
                    bmp = reduced;
                }
                catch (AForge.Imaging.UnsupportedImageFormatException)
                {
                    // Skip just this image; the remaining images on the page are still processed.
                    log.Info("Image format is not supported for cropping/resizing - ignoring");
                    continue;
                }
            }

            using (var ms = new MemoryStream())
            {
                bmp.Save(ms, System.Drawing.Imaging.ImageFormat.Jpeg);
                ms.Position = 0;

                try
                {
                    // This method is synchronous, so the async upload is run on the thread pool
                    // and blocked on. GetAwaiter().GetResult() rethrows the original exception
                    // (rather than an AggregateException) so the warning carries the real message.
                    var ocrResult = Task.Run<string>(async () => await UploadAndRecognizeImageTextAsync(ms, "en"))
                                        .GetAwaiter()
                                        .GetResult();
                    thisPage.OCRText += ocrResult.Trim() + "\r\n";
                }
                catch (Exception ex)
                {
                    log.Warning(string.Format("Page {0} OCR Exception: {1}", thisPage.Number, ex.Message));
                }
            }
        }
        finally
        {
            if (bmp != null && !object.ReferenceEquals(bmp, source))
            {
                bmp.Dispose();
            }
        }
    }
}