// Extracts all images (of types that iTextSharp knows how to decode) from a PDF file
        /// <summary>
        /// Walks every page of the PDF held in <paramref name="myBlob"/> and gathers the images
        /// that the render listener reports for each page.
        /// </summary>
        /// <param name="myBlob">Stream containing the PDF document; it is handed to <c>PdfReader</c>.</param>
        /// <param name="filename">Not referenced by this method.</param>
        /// <param name="log">Receives a verbose entry for each page that has images and an error entry on failure.</param>
        /// <returns>
        /// The images gathered so far. Any exception is logged (message only) and swallowed, so the
        /// list may be empty or partial when processing failed part-way through the document.
        /// </returns>
        public static List <Image> ExtractImages(Stream myBlob, string filename, TraceWriter log)
        {
            var collected = new List <Image>();

            try
            {
                using (var pdf = new PdfReader(myBlob))
                {
                    var contentParser = new PdfReaderContentParser(pdf);
                    var imageListener = new ImageRenderListener(log);

                    // PDF page numbers are 1-based.
                    for (var pageNumber = 1; pageNumber <= pdf.NumberOfPages; ++pageNumber)
                    {
                        contentParser.ProcessContent(pageNumber, imageListener);

                        if (imageListener.Images.Count == 0)
                        {
                            continue;
                        }

                        log.Verbose($"Found {imageListener.Images.Count} images on page {pageNumber}.");
                        collected.AddRange(imageListener.Images);

                        // The same listener is reused for the next page, so empty it
                        // once its images have been copied into the result.
                        imageListener.Images.Clear();
                    }
                }
            }
            catch (Exception failure)
            {
                log.Error($"Error: {failure.Message}");
            }

            return collected;
        }
        /// <summary>
        /// Collects the images reported for each page of an in-memory PDF, keyed by a generated name.
        /// </summary>
        /// <param name="fileBytes">Raw bytes of the PDF document, passed to <c>PdfReader</c>.</param>
        /// <returns>
        /// A dictionary whose keys have the form <c>Page_NNNN_Image_MMMM</c> followed by the value the
        /// listener paired with the image (presumably a file extension - confirm against
        /// <c>ImageRenderListener</c>). Page and per-page image numbers are 1-based and zero-padded
        /// to four digits.
        /// </returns>
        private static Dictionary <string, System.Drawing.Image> ExtractImages(byte[] fileBytes)
        {
            var namedImages = new Dictionary <string, System.Drawing.Image>();

            using (var pdf = new PdfReader(fileBytes))
            {
                var contentParser = new PdfReaderContentParser(pdf);

                for (var pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
                {
                    // A new listener per page, so each page's images are numbered from 1.
                    var pageListener = new ImageRenderListener();
                    contentParser.ProcessContent(pageNumber, pageListener);

                    if (pageListener.Images.Count == 0)
                    {
                        continue;
                    }

                    var imageNumber = 0;
                    foreach (var entry in pageListener.Images)
                    {
                        imageNumber++;

                        // entry.Key is the image, entry.Value is appended to the generated name.
                        var imageName = string.Format("Page_{0}_Image_{1}{2}", pageNumber.ToString("D4"), imageNumber.ToString("D4"), entry.Value);
                        namedImages.Add(imageName, entry.Key);
                    }
                }
            }

            return namedImages;
        }
Exemple #3
0
        /// <summary>
        /// Builds a <c>PDFPage</c> for every page of the PDF in <paramref name="pdfStream"/>, filling in
        /// the page number, the images found on the page and the page text.
        /// </summary>
        /// <param name="pdfStream">
        /// Stream containing the PDF document. It is rewound to the start when it is seekable.
        /// </param>
        /// <param name="log">Receives informational entries and per-page processing errors.</param>
        /// <param name="ocrImages">
        /// When true, pages that have at least one image but fewer than ten are passed to
        /// <c>VisionAPIHelper.OCRPage</c>.
        /// </param>
        /// <returns>One entry per page, in page order.</returns>
        /// <remarks>
        /// Image-processing exceptions and <see cref="System.ArgumentException"/> from text extraction
        /// are logged per page and do not stop the remaining pages from being processed.
        /// </remarks>
        public static List <PDFPage> GetPDFPages(Stream pdfStream, TraceWriter log, bool ocrImages = false)
        {
            // Pages with this many images or more are not sent for OCR.
            const int OcrImageCountLimit = 10;

            var result = new List <PDFPage>();

            // Ensure that we are at the start. Setting Position on a non-seekable stream throws
            // NotSupportedException, so only rewind when the stream supports it.
            if (pdfStream.CanSeek)
            {
                pdfStream.Position = 0;
            }

            // Note: PdfReader Dispose closes the stream...
            using (PdfReader reader = new PdfReader(pdfStream))
            {
                var numberOfPages = reader.NumberOfPages;
                var parser        = new PdfReaderContentParser(reader);

                for (var i = 1; i <= numberOfPages; i++)
                {
                    var page = new PDFPage {
                        Number = i
                    };

                    // Created outside the try so the listener inspected below always belongs to
                    // this page (never null, never the previous page's instance).
                    var listener = new ImageRenderListener(log);
                    try
                    {
                        parser.ProcessContent(i, listener);
                    }
                    catch (Exception ex)
                    {
                        log.Error(string.Format("Page {0} Image Processing Exception", i), ex);
                    }

                    // Images gathered before a failure are still attached to the page.
                    if (listener.Images.Count > 0)
                    {
                        log.Info(string.Format("Found {0} images on page {1}.", listener.Images.Count, i));
                        page.ExtractedImages = listener.Images;
                        if (ocrImages)
                        {
                            if (listener.Images.Count < OcrImageCountLimit)
                            {
                                log.Info("Calling Vision API to OCR Page Images");
                                VisionAPIHelper.OCRPage(page, log);
                            }
                            else
                            {
                                log.Info("Too many Page Images for Vision API");
                            }
                        }
                    }

                    try
                    {
                        page.PageText = PdfTextExtractor.GetTextFromPage(reader, i, new SimpleTextExtractionStrategy());
                    }
                    catch (System.ArgumentException ex)
                    {
                        log.Error(string.Format("Page {0} Text Processing Exception", i), ex);
                    }

                    result.Add(page);
                }
            }

            return result;
        }
Exemple #4
0
        /// <summary>
        /// Feeds the content of every page of the PDF at <paramref name="filePath"/> through a single
        /// render listener and returns each buffered image together with its file type.
        /// </summary>
        /// <param name="filePath">Value passed to the <c>PdfReader</c> constructor.</param>
        /// <returns>
        /// An array of (image, type) pairs. The array is fully built before the reader is disposed,
        /// so enumerating the result does not touch the reader.
        /// </returns>
        public static IEnumerable <(Image image, string type)> GetImages(string filePath)
        {
            using (var pdf = new PdfReader(filePath))
            {
                var contentParser = new PdfReaderContentParser(pdf);
                var collector     = new ImageRenderListener();

                // PDF page numbers are 1-based; the one listener accumulates across all pages.
                var pageNumber = 1;
                while (pageNumber <= pdf.NumberOfPages)
                {
                    contentParser.ProcessContent(pageNumber, collector);
                    pageNumber++;
                }

                var imagesWithTypes = from buffered in collector.Buffer
                                      select (buffered.GetDrawingImage(), buffered.GetFileType());

                // Materialize now, while the reader is still open.
                return imagesWithTypes.ToArray();
            }
        }
        /// <summary>
        /// Collects metadata about a PDF: version, page count, file size and selected entries of the
        /// document info dictionary, plus optional per-page image and text statistics.
        /// </summary>
        /// <param name="pdfStream">
        /// Stream containing the PDF document. When it is seekable, an empty stream is reported as
        /// "Zero length file" and the stream is rewound before reading.
        /// </param>
        /// <param name="filePath">Stored as <c>FilePath</c>; its file-name part is stored as <c>FileName</c>.</param>
        /// <param name="deepinfo">
        /// When true, every page is also processed for images and text, and the per-page results and
        /// their totals are added to the returned object.
        /// </param>
        /// <returns>
        /// The populated <c>PDFFileInfo</c>. Failures while reading the PDF are recorded in its
        /// <c>ErrorMessages</c> rather than thrown.
        /// </returns>
        public static PDFFileInfo GetPDFMetaData(Stream pdfStream, string filePath, bool deepinfo)
        {
            var fileInfo = new PDFFileInfo();

            fileInfo.FilePath = filePath;
            fileInfo.FileName = System.IO.Path.GetFileName(filePath);

            // Length and Position throw NotSupportedException on non-seekable streams, so the
            // empty-file check and the rewind are only attempted when seeking is supported.
            if (pdfStream.CanSeek)
            {
                if (pdfStream.Length == 0)
                {
                    fileInfo.ErrorMessages.Add("Zero length file");
                    return(fileInfo);
                }

                pdfStream.Position = 0; // Ensure that we are at the start
            }

            try
            {
                // Note: PdfReader Dispose closes the stream...
                using (PdfReader reader = new PdfReader(pdfStream))
                {
                    fileInfo.PDFVersion = reader.PdfVersion;
                    fileInfo.PageCount  = reader.NumberOfPages;
                    fileInfo.FileSize   = reader.FileLength;

                    // Read the info dictionary once and reuse it for every lookup below.
                    var info = reader.Info;
                    if (info != null)
                    {
                        string entry;

                        // Both dates share one try: a CreationDate that fails to decode also
                        // skips ModDate.
                        try
                        {
                            if (info.TryGetValue("CreationDate", out entry))
                            {
                                fileInfo.CreationDate = PdfDate.Decode(entry);
                            }
                            if (info.TryGetValue("ModDate", out entry))
                            {
                                fileInfo.ModDate = PdfDate.Decode(entry);
                            }
                        }
                        catch (Exception ex)
                        {
                            // Commas are stripped from the message - presumably because the error
                            // list ends up in comma-separated output; confirm against the consumer.
                            fileInfo.ErrorMessages.Add($"PdfDate Decode {ex.Message.Replace(',',' ')}");
                        }
                        if (info.TryGetValue("Creator", out entry))
                        {
                            fileInfo.Creator = entry;
                        }
                        if (info.TryGetValue("Producer", out entry))
                        {
                            fileInfo.Producer = entry;
                        }
                    }

                    if (deepinfo)
                    {
                        var parser = new PdfReaderContentParser(reader);

                        for (var i = 1; i <= reader.NumberOfPages; i++)
                        {
                            var page = new PDFPageInfo();

                            // Created outside the try so the listener read below always belongs
                            // to this page (never null, never the previous page's instance).
                            var listener = new ImageRenderListener(fileInfo.ErrorMessages);
                            try
                            {
                                parser.ProcessContent(i, listener);
                            }
                            catch (Exception ex)
                            {
                                fileInfo.ErrorMessages.Add($"Page {i} Image Processing Exception: {ex.Message.Replace(',', ' ')}");
                            }

                            try
                            {
                                var pageText = PdfTextExtractor.GetTextFromPage(reader, i, new SimpleTextExtractionStrategy());
                                page.TextCharacters += pageText.Length;
                            }
                            catch (System.ArgumentException ex)
                            {
                                fileInfo.ErrorMessages.Add($"Page {i} Text Extraction Exception {ex.Message.Replace(',', ' ')}");
                            }

                            page.PageNum    = i;
                            page.ImageCount = listener.Images.Count;
                            page.Images     = listener.Images;

                            // Number the page's images from 1 and total their byte sizes.
                            for (int j = 0; j < page.ImageCount; j++)
                            {
                                page.Images[j].ImageNum = j + 1;
                                page.ImageBytes        += page.Images[j].ImageBytes;
                            }

                            fileInfo.Pages.Add(page);
                            fileInfo.ImageCount     += page.ImageCount;
                            fileInfo.ImageBytes     += page.ImageBytes;
                            fileInfo.TextCharacters += page.TextCharacters;
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                fileInfo.ErrorMessages.Add(ex.Message);
            }

            return(fileInfo);
        }