예제 #1
0
        /// BT = Beginning of a text object operator
        /// ET = End of a text object operator
        /// Td move to the start of next line
        ///  5 Ts = superscript
        /// -5 Ts = subscript
        ///

        // Extracts the image XObjects referenced from each page's resource dictionary and
        // re-encodes every one that meets the minimum size in the requested format.
        //
        // inFileName    - path of the PDF to read.
        // imageFormat   - format each image is saved in before being returned.
        // minimumHeight - images with a smaller pixel height are skipped.
        // minumumWidth  - images with a smaller pixel width are skipped.
        //
        // Returns one encoded byte array per accepted image, in page order.
        public List <byte[]> ExtractImages(string inFileName, ImageFormat imageFormat, int minimumHeight, int minumumWidth)
        {
            List <byte[]> extractedImages = new List <byte[]>();

            PdfReader pdfReader = new PdfReader(inFileName);

            try
            {
                for (int pageNumber = 1; pageNumber <= pdfReader.NumberOfPages; pageNumber++)
                {
                    PdfDictionary page = pdfReader.GetPageN(pageNumber);
                    PdfDictionary res  = PdfReader.GetPdfObject(page.Get(PdfName.RESOURCES)) as PdfDictionary;
                    PdfDictionary xobj = res == null
                        ? null
                        : PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)) as PdfDictionary;

                    // A page without resources or without XObjects simply has no images.
                    if (xobj == null)
                    {
                        continue;
                    }

                    foreach (PdfName name in xobj.Keys)
                    {
                        PdfObject obj = xobj.Get(name);

                        if (obj == null || !obj.IsIndirect())
                        {
                            continue;
                        }

                        PdfDictionary tg = PdfReader.GetPdfObject(obj) as PdfDictionary;

                        // Only image XObjects carry /Width and /Height; other XObjects are skipped.
                        if (tg == null || !PdfName.IMAGE.Equals(PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE))))
                        {
                            continue;
                        }

                        PdfObject widthObj  = PdfReader.GetPdfObject(tg.Get(PdfName.WIDTH));
                        PdfObject heightObj = PdfReader.GetPdfObject(tg.Get(PdfName.HEIGHT));

                        if (widthObj == null || heightObj == null)
                        {
                            continue;
                        }

                        // Parse with the invariant culture so the result does not depend on the
                        // decimal separator of the current thread culture.
                        float width  = float.Parse(widthObj.ToString(), System.Globalization.CultureInfo.InvariantCulture);
                        float height = float.Parse(heightObj.ToString(), System.Globalization.CultureInfo.InvariantCulture);

                        ImageRenderInfo imgRI          = ImageRenderInfo.CreateForXObject(new Matrix(width, height), (PRIndirectReference)obj, tg);
                        PdfImageObject  pdfImageObject = imgRI.GetImage();

                        // Decode exactly once; the decoded image is disposed when the using block ends.
                        using (Image image = pdfImageObject.GetDrawingImage())
                        {
                            if (image != null && image.Height >= minimumHeight && image.Width >= minumumWidth)
                            {
                                using (MemoryStream ms = new MemoryStream())
                                {
                                    image.Save(ms, imageFormat);

                                    extractedImages.Add(ms.ToArray());
                                }
                            }
                        }
                    }
                }
            }
            finally
            {
                // Release the reader even when a page or image fails to decode.
                pdfReader.Close();
            }

            return(extractedImages);
        }
예제 #2
0
        /// <summary>
        /// Builds a short description of a PDF object: "unknown" for null, image
        /// dimensions and file type for a decodable image stream, a dump of the stream
        /// dictionary entries for any other stream, and an empty string otherwise.
        /// </summary>
        /// <param name="obj">The PDF object to describe; may be null.</param>
        /// <returns>The description text.</returns>
        private string GetStreamType(PdfObject obj)
        {
            if (obj == null)
            {
                return("unknown");
            }

            if (!obj.IsStream())
            {
                return(string.Empty);
            }

            var stream = (PRStream)obj;
            try
            {
                var pdfImage = new PdfImageObject(stream);

                // Test the decoded image (not the wrapper, which is never null) and
                // dispose it once its dimensions have been read.
                using (var drawingImage = pdfImage.GetDrawingImage())
                {
                    if (drawingImage != null)
                    {
                        return($"{drawingImage.Width}x{drawingImage.Height} {pdfImage.GetFileType()} Image");
                    }
                }
            }
            catch (Exception)
            {
                // Not decodable as an image: fall through to the dictionary dump below.
            }

            var sb = new StringBuilder();
            foreach (var item in stream.Keys)
            {
                var streamKeyValue = stream.Get(item);
                sb.Append(item + ":" + streamKeyValue + "  ");
            }
            return(sb.ToString());
        }
예제 #3
0
        /// <summary>
        /// Gets all the images in the given document by scanning every entry of the
        /// cross-reference table for streams whose subtype is Image.
        /// </summary>
        /// <param name="file">Path of the PDF file to read.</param>
        /// <returns>The decoded images; the caller is responsible for disposing them.</returns>
        private List <Image> ExtractImages(string file)
        {
            var          randomAccess = new RandomAccessFileOrArray(file);
            var          reader       = new PdfReader(randomAccess, null);
            List <Image> imgList      = new List <Image>();

            try
            {
                for (int i = 0; i <= reader.XrefSize - 1; i++)
                {
                    var pdfObject = reader.GetPdfObject(i);

                    if ((pdfObject != null) && pdfObject.IsStream())
                    {
                        var       PDFStremObj = (PdfStream)pdfObject;
                        PdfObject subtype     = PDFStremObj.Get(PdfName.SUBTYPE);

                        if ((subtype != null) && subtype.ToString() == PdfName.IMAGE.ToString())
                        {
                            PdfImageObject PdfImageObj = new PdfImageObject((PRStream)PDFStremObj);

                            Image ImgPDF = PdfImageObj.GetDrawingImage();

                            imgList.Add(ImgPDF);
                        }
                    }
                }
            }
            finally
            {
                // Close the reader even if an image fails to decode.
                reader.Close();
            }

            return(imgList);
        }
        /// <summary>
        /// Decodes the rendered image when it declares a filter, records it in
        /// <c>Images</c> together with a file extension derived from that filter, and
        /// writes it to a "temp" file under the application base directory.
        /// </summary>
        /// <param name="renderInfo">Render information for the image being processed.</param>
        public void RenderImage(ImageRenderInfo renderInfo)
        {
            PdfImageObject pdfImage   = renderInfo.GetImage();
            PdfName        filterName = (PdfName)pdfImage.Get(PdfName.FILTER);

            // Images without a filter entry are ignored.
            if (filterName == null)
            {
                return;
            }

            System.Drawing.Image decoded = pdfImage.GetDrawingImage();

            // Map the filter to an extension suffix; unrecognised filters leave it empty.
            string suffix = string.Empty;

            if (filterName == PdfName.DCTDECODE)
            {
                suffix = PdfImageObject.ImageBytesType.JPG.FileExtension;
            }
            else if (filterName == PdfName.JPXDECODE)
            {
                suffix = PdfImageObject.ImageBytesType.JP2.FileExtension;
            }
            else if (filterName == PdfName.FLATEDECODE)
            {
                suffix = PdfImageObject.ImageBytesType.PNG.FileExtension;
            }
            else if (filterName == PdfName.LZWDECODE)
            {
                suffix = PdfImageObject.ImageBytesType.CCITT.FileExtension;
            }

            string extension = "." + suffix;

            this.Images.Add(decoded, extension);

            string baseDirectory = System.IO.Path.GetFullPath(System.AppDomain.CurrentDomain.BaseDirectory);
            decoded.Save(baseDirectory + "temp" + extension, decoded.RawFormat);
        }
예제 #5
0
        /* ----------------------------------------------------------------- */
        ///
        /// RenderImage
        ///
        /// <summary>
        /// Occurs when the specified image is rendered. Images whose filter
        /// entry is a single name are decoded and stored; when the image has
        /// a soft mask, the mask is decoded too and passed to Restore
        /// together with the image.
        /// </summary>
        ///
        /// <param name="info">Render information of the image.</param>
        ///
        /* ----------------------------------------------------------------- */
        public void RenderImage(ImageRenderInfo info)
        {
            var pdfImage = info.GetImage();

            if (pdfImage.Get(PdfName.FILTER) is PdfName)
            {
                var bitmap = pdfImage.GetDrawingImage();

                if (bitmap != null)
                {
                    var softMask = pdfImage.GetDictionary().GetDirectObject(PdfName.SMASK);

                    if (softMask != null)
                    {
                        // Decode the mask and let Restore combine it with the image;
                        // the undecorated image is stored when Restore yields null.
                        var maskImage = new PdfImageObject(softMask as PRStream).GetDrawingImage();
                        var combined  = Restore(bitmap as Bitmap, maskImage as Bitmap);

                        _inner.Add(combined ?? bitmap);
                    }
                    else
                    {
                        _inner.Add(bitmap);
                    }
                }
            }
        }
예제 #6
0
        /// <summary>
        /// Decodes the rendered image when it declares a filter and records it in
        /// <c>Images</c> together with a file extension derived from that filter.
        /// </summary>
        /// <param name="renderInfo">Render information for the image being processed.</param>
        public void RenderImage(ImageRenderInfo renderInfo)
        {
            PdfImageObject pdfImage   = renderInfo.GetImage();
            PdfName        filterName = (PdfName)pdfImage.Get(PdfName.FILTER);

            // Images without a filter entry are ignored.
            if (filterName == null)
            {
                return;
            }

            System.Drawing.Image decoded = pdfImage.GetDrawingImage();

            // Unrecognised filters leave the suffix empty, giving a bare "." extension.
            string suffix = string.Empty;

            if (filterName == PdfName.DCTDECODE)
            {
                suffix = PdfImageObject.ImageBytesType.JPG.FileExtension;
            }
            else if (filterName == PdfName.JPXDECODE)
            {
                suffix = PdfImageObject.ImageBytesType.JP2.FileExtension;
            }
            else if (filterName == PdfName.FLATEDECODE)
            {
                suffix = PdfImageObject.ImageBytesType.PNG.FileExtension;
            }
            else if (filterName == PdfName.LZWDECODE)
            {
                suffix = PdfImageObject.ImageBytesType.CCITT.FileExtension;
            }

            this.Images.Add(decoded, "." + suffix);
        }
예제 #7
0
            /// <summary>
            /// Decodes the rendered image when its byte payload is larger than 1600 bytes
            /// and adds it to <c>Images</c>. Failures while reading or decoding are logged
            /// and the image is skipped.
            /// </summary>
            /// <param name="renderInfo">Render information for the image being processed.</param>
            public void RenderImage(ImageRenderInfo renderInfo)
            {
                Image decoded = null;

                try
                {
                    PdfImageObject pdfImage  = renderInfo.GetImage();
                    int            byteCount = pdfImage.GetImageAsBytes().Length;

                    // Smallest image we can OCR is 40 x 40
                    if (byteCount > 1600)
                    {
                        decoded = pdfImage.GetDrawingImage();
                    }
                }
                catch (Exception ex)
                {
                    _log.Error("Exception in GetImage or GetDrawingImage: {0}", ex);
                }

                if (decoded == null)
                {
                    return;
                }

                this.Images.Add(decoded);
            }
예제 #8
0
        /// <summary>
        /// Records pixel size, pixel format and the resolution of the rendered image,
        /// computed from elements 0 and 4 of the image transformation matrix. An
        /// <c>ImageInfo</c> entry is added even when decoding fails, in which case it
        /// keeps its default values.
        /// </summary>
        /// <param name="renderInfo">Render information for the image being processed.</param>
        public void RenderImage(ImageRenderInfo renderInfo)
        {
            PdfImageObject pdfImage = renderInfo.GetImage();
            ImageInfo      details  = new ImageInfo();
            Matrix         ctm      = renderInfo.GetImageCTM();

            try
            {
                Image bitmap = pdfImage.GetDrawingImage();

                // Matrix elements 0 and 4 are treated as the drawn size in points.
                // 72 points = 1 inch.
                double widthInches  = (double)ctm[0] / 72;
                double heightInches = (double)ctm[4] / 72;

                double horizontalDpi = bitmap.Width / widthInches;
                double verticalDpi   = bitmap.Height / heightInches;

                details.hDPI        = Math.Round(horizontalDpi);
                details.vDPI        = Math.Round(verticalDpi);
                details.width       = bitmap.Width;
                details.height      = bitmap.Height;
                details.pixelFormat = bitmap.PixelFormat;
            }
            catch (Exception e)
            {
                // The image could not be decoded with GetDrawingImage();
                // log and carry on so the rest of the document is still parsed.
                log.Warn(e.Message, e);
            }

            ImagesInfo.Add(details);
        }
예제 #9
0
        /// <summary>
        ///  Extracts the images of a PDF file by scanning every entry of the
        ///  cross-reference table for streams whose subtype is Image.
        /// </summary>
        /// <param name="pdfPath">Specify PDF Source Path</param>
        /// <returns>The decoded images; the caller is responsible for disposing them.</returns>
        public static List <Image> ExtractImages(string pdfPath)
        {
            var images    = new List <Image>();
            var rafObj    = new RandomAccessFileOrArray(pdfPath);
            var pdfReader = new PdfReader(rafObj, null);

            try
            {
                for (int i = 0; i < pdfReader.XrefSize; i++)
                {
                    var pdfObject = pdfReader.GetPdfObject(i);

                    if ((pdfObject != null) && pdfObject.IsStream())
                    {
                        var pdfStream = (PdfStream)pdfObject;
                        var subtype   = pdfStream.Get(PdfName.SUBTYPE);

                        if ((subtype != null) && subtype.ToString() == PdfName.IMAGE.ToString())
                        {
                            var pdfImageObj = new PdfImageObject((PRStream)pdfStream);
                            var image       = pdfImageObj.GetDrawingImage();
                            images.Add(image);
                        }
                    }
                }
            }
            finally
            {
                // Close the reader even if an image fails to decode.
                pdfReader.Close();
            }

            return(images);
        }
예제 #10
0
        /// <summary>
        /// Decodes the rendered image and records it in <c>Images</c> together with a
        /// file extension derived from the image's filter. Nothing is recorded when the
        /// render info yields no image. Exceptions raised while decoding propagate to
        /// the caller with their original stack trace.
        /// </summary>
        /// <param name="renderInfo">Render information for the image being processed.</param>
        public void RenderImage(ImageRenderInfo renderInfo)
        {
            PdfImageObject image = renderInfo.GetImage();

            if (image == null)
            {
                return;
            }

            // A safe cast is used instead of a hard cast: when the filter entry is missing
            // or is not a single name, filter is null and the extension stays a bare ".".
            PdfName filter = image.Get(PdfName.FILTER) as PdfName;

            /* Rather than struggle with the image stream and try to figure out how to handle
             * BitMapData scan lines in various formats, use the
             * PdfImageObject.GetDrawingImage() method, which does the work for us. */
            System.Drawing.Image drawingImage = image.GetDrawingImage();

            string extension = ".";

            if (filter == PdfName.DCTDECODE)
            {
                extension += PdfImageObject.ImageBytesType.JPG.FileExtension;
            }
            else if (filter == PdfName.JPXDECODE)
            {
                extension += PdfImageObject.ImageBytesType.JP2.FileExtension;
            }
            else if (filter == PdfName.FLATEDECODE)
            {
                extension += PdfImageObject.ImageBytesType.PNG.FileExtension;
            }
            else if (filter == PdfName.LZWDECODE)
            {
                extension += PdfImageObject.ImageBytesType.CCITT.FileExtension;
            }

            this.Images.Add(drawingImage, extension);
        }
예제 #11
0
    /// <summary>
    /// Adds the decoded image to <c>Images</c>, or writes a console message naming
    /// the object number when the render info yields no image.
    /// </summary>
    /// <param name="renderInfo">Render information for the image being processed.</param>
    public void RenderImage(ImageRenderInfo renderInfo)
    {
        PdfImageObject pdfImage = renderInfo.GetImage();

        if (pdfImage != null)
        {
            Images.Add(pdfImage.GetDrawingImage());
            return;
        }

        Console.WriteLine("Image {0} could not be read.", renderInfo.GetRef().Number);
    }
예제 #12
0
 /// <summary>
 /// Tries to decode the stream as an image and show it in <c>picImage</c>,
 /// reporting the stream length in <c>tsMessage</c>.
 /// </summary>
 /// <param name="stream">The PDF stream to decode.</param>
 /// <returns>true when the image was decoded and displayed; false on any exception.</returns>
 private bool TryToReadImage(PRStream stream)
 {
     try
     {
         var candidate = new PdfImageObject(stream);

         // Clear the picture first so a failed decode leaves the box empty.
         picImage.Image = null;
         picImage.Image = candidate.GetDrawingImage();
         tsMessage.Text = "Image Size = " + stream.Length;
     }
     catch (Exception)
     {
         return(false);
     }

     return(true);
 }
예제 #13
0
        ////////////////////////////////////////////////////////////////////////////////////////////////////

        /// <summary>
        /// Extracts the images of a PDF file by scanning every entry of the
        /// cross-reference table for streams whose subtype is Image. Images that fail
        /// to decode are skipped.
        /// </summary>
        /// <param name="PDFSourcePath">Path of the PDF file to read.</param>
        /// <returns>The decoded images; the caller is responsible for disposing them.</returns>
        /// <exception cref="Exception">
        /// Thrown when the document cannot be opened or scanned; the original exception
        /// is available as the inner exception.
        /// </exception>
        public static List <System.Drawing.Image> ExtractImages(string PDFSourcePath)
        {
            List <System.Drawing.Image> imgList = new List <System.Drawing.Image>();

            PdfReader PDFReaderObj = null;

            try
            {
                RandomAccessFileOrArray RAFObj = new RandomAccessFileOrArray(PDFSourcePath);
                PDFReaderObj = new PdfReader(RAFObj, null);

                for (int i = 0; i <= PDFReaderObj.XrefSize - 1; i++)
                {
                    PdfObject PDFObj = PDFReaderObj.GetPdfObject(i);

                    if ((PDFObj == null) || !PDFObj.IsStream())
                    {
                        continue;
                    }

                    PdfStream PDFStremObj = (PdfStream)PDFObj;
                    iTextSharp.text.pdf.PdfObject subtype = PDFStremObj.Get(PdfName.SUBTYPE);

                    if ((subtype == null) || subtype.ToString() != PdfName.IMAGE.ToString())
                    {
                        continue;
                    }

                    try
                    {
                        PdfImageObject PdfImageObj = new PdfImageObject((PRStream)PDFStremObj);

                        System.Drawing.Image ImgPDF = PdfImageObj.GetDrawingImage();

                        imgList.Add(ImgPDF);
                    }
                    catch (Exception)
                    {
                        // Best effort: an image that cannot be decoded is skipped so the
                        // remaining images are still extracted.
                    }
                }
            }
            catch (Exception ex)
            {
                // Keep the same exception type and message callers already see, but
                // preserve the original exception (and its stack trace) as InnerException.
                throw new Exception(ex.Message, ex);
            }
            finally
            {
                // Release the reader on both the success and the failure path.
                if (PDFReaderObj != null)
                {
                    PDFReaderObj.Close();
                }
            }

            return(imgList);
        }
예제 #14
0
        /// <summary>
        /// Saves the rendered image into the output folder under a name built from the
        /// image name, current page and image counter, rotating it first when the page
        /// rotation is 90, 180 or 270. File types other than jpg, png and gif are named
        /// with a "png" extension. Failures are written to the debug output, and the
        /// image counter is advanced in every case.
        /// </summary>
        /// <param name="renderInfo">Render information for the image being processed.</param>
        public void RenderImage(ImageRenderInfo renderInfo)
        {
            PdfImageObject pdfimage = renderInfo.GetImage();

            // Read as in the original flow; the value does not influence the file name.
            string bytesTypeExtension = pdfimage.GetImageBytesType().FileExtension;

            string fileType = pdfimage.GetFileType();

            bool isKnownType = "jpg".Equals(fileType, StringComparison.OrdinalIgnoreCase) ||
                               "png".Equals(fileType, StringComparison.OrdinalIgnoreCase) ||
                               "gif".Equals(fileType, StringComparison.OrdinalIgnoreCase);

            if (!isKnownType)
            {
                fileType = "png";
            }

            String baseName   = String.Format("{0}_{1:000}_{2}.{3}", _imgname, _currentPage, _imgNo, fileType);
            String targetPath = _outputFolder + "\\" + baseName;

            try
            {
                using (Image bitmap = pdfimage.GetDrawingImage())
                {
                    if (bitmap != null)
                    {
                        switch (_pageRotation)
                        {
                        case 270:
                            bitmap.RotateFlip(RotateFlipType.Rotate270FlipNone);
                            break;

                        case 90:
                            bitmap.RotateFlip(RotateFlipType.Rotate90FlipNone);
                            break;

                        case 180:
                            bitmap.RotateFlip(RotateFlipType.Rotate180FlipNone);
                            break;
                        }

                        bitmap.Save(targetPath);
                    }
                }
            }
            catch (Exception ee)
            {
                System.Diagnostics.Debug.WriteLine(ee.StackTrace);
            }

            _imgNo++;
        }
예제 #15
0
        /// <summary>
        /// Decodes the image and pairs it with a file extension chosen from the filter,
        /// tracing which kind was detected. Unknown filters yield a bare "." extension.
        /// </summary>
        /// <param name="filter">The filter name of the image; may be null.</param>
        /// <param name="pdfImageObject">The image to decode.</param>
        /// <returns>A tuple of the extension (with leading dot) and the decoded image.</returns>
        private static Tuple <string, System.Drawing.Image> GetImage(PdfName filter, PdfImageObject pdfImageObject)
        {
            System.Drawing.Image decoded = pdfImageObject.GetDrawingImage();

            string suffix = string.Empty;
            string traceMessage;

            if (Equals(filter, PdfName.DCTDECODE))
            {
                traceMessage = "JPG image detected";
                suffix       = PdfImageObject.ImageBytesType.JPG.FileExtension;
            }
            else if (Equals(filter, PdfName.JBIG2DECODE))
            {
                traceMessage = "JBIG2 extension detected";
                suffix       = PdfImageObject.ImageBytesType.JBIG2.FileExtension;
            }
            else if (Equals(filter, PdfName.JPXDECODE))
            {
                traceMessage = "JP2 extension detected";
                suffix       = PdfImageObject.ImageBytesType.JP2.FileExtension;
            }
            else if (Equals(filter, PdfName.FLATEDECODE))
            {
                traceMessage = "PNG image detected";
                suffix       = PdfImageObject.ImageBytesType.PNG.FileExtension;
            }
            else if (Equals(filter, PdfName.LZWDECODE))
            {
                traceMessage = "LZWDECODE extension detected";
                suffix       = PdfImageObject.ImageBytesType.CCITT.FileExtension;
            }
            else if (Equals(filter, PdfName.CCITTFAXDECODE))
            {
                traceMessage = "CCITTFAXDECODE extension detected";
                suffix       = PdfImageObject.ImageBytesType.CCITT.FileExtension;
            }
            else
            {
                // Unknown filters are reported on the debug output as well as the trace.
                traceMessage = "Unknown type: " + filter;
                Debug.WriteLine(traceMessage);
            }

            Trace.TraceInformation(traceMessage);

            return(new Tuple <string, System.Drawing.Image>("." + suffix, decoded));
        }
예제 #16
0
        /// <summary>
        /// Scans every entry of the cross-reference table of an in-memory PDF and returns
        /// the bytes of each image stream that decodes to at least 50x50 pixels.
        /// </summary>
        /// <param name="sourcePdf">The PDF document content.</param>
        /// <param name="log">Receives the message of any exception raised while scanning.</param>
        /// <returns>
        /// One stream per accepted image. When an exception interrupts the scan, the
        /// images collected up to that point are returned.
        /// </returns>
        public static List <Stream> ExtractImagesFromPDF(byte[] sourcePdf, TraceWriter log)
        {
            List <Stream> imgList = new List <Stream>();
            PdfReader     reader  = new PdfReader(sourcePdf);

            try
            {
                int n = reader.XrefSize;

                for (int i = 0; i < n; i++)
                {
                    PdfObject pdfObject = reader.GetPdfObject(i);
                    if (pdfObject == null || !pdfObject.IsStream())
                    {
                        continue;
                    }

                    PRStream  prStream = (PRStream)pdfObject;
                    PdfObject type     = prStream.Get(PdfName.SUBTYPE);

                    if (type == null || !type.ToString().Equals(PdfName.IMAGE.ToString()))
                    {
                        continue;
                    }

                    PdfImageObject pdfImgObject = new PdfImageObject(prStream);

                    // The decoded image is only needed for its dimensions, so dispose it
                    // right away instead of leaking one image per stream.
                    using (var image = pdfImgObject.GetDrawingImage())
                    {
                        // only add images larger than 50x50 for OCR processing
                        if (image != null && image.Height >= 50 && image.Width >= 50)
                        {
                            byte[]       imgdata   = pdfImgObject.GetImageAsBytes();
                            MemoryStream memStream = new MemoryStream(imgdata);
                            imgList.Add(memStream);
                        }
                    }
                }
            }
            catch (Exception e)
            {
                log.Error(e.Message);
            }
            finally
            {
                // Release the reader whether or not the scan completed.
                reader.Close();
            }

            return(imgList);
        }
        /// <summary>
        /// Decodes the rendered image, encodes it as TIFF into a temporary in-memory
        /// stream, and saves a bitmap copy of it to <c>imgPath</c>.
        /// </summary>
        /// <param name="renderInfo">Render information for the image being processed.</param>
        private void RenderImage(ImageRenderInfo renderInfo)
        {
            PdfImageObject image = renderInfo.GetImage();

            using (Dotnet dotnetImg = image.GetDrawingImage())
            {
                if (dotnetImg != null)
                {
                    using (MemoryStream ms = new MemoryStream())
                    {
                        // NOTE(review): the TIFF bytes written here are never read back in
                        // this method; kept because removing the call would change which
                        // failures surface. Confirm whether it is still needed.
                        dotnetImg.Save(ms, ImageFormat.Tiff);

                        // Dispose the copy once it has been written to disk.
                        using (Bitmap d = new Bitmap(dotnetImg))
                        {
                            d.Save(imgPath);
                        }
                    }
                }
            }
        }
        /// <summary>
        /// Decodes the rendered image when it declares a filter and adds it to
        /// <c>_images</c>. Images that fail to decode are silently skipped.
        /// </summary>
        /// <param name="renderInfo">Render information for the image being processed.</param>
        public void RenderImage(ImageRenderInfo renderInfo)
        {
            PdfImageObject pdfImage   = renderInfo.GetImage();
            PdfName        filterName = (PdfName)pdfImage.Get(PdfName.FILTER);

            if (filterName == null)
            {
                return;
            }

            try
            {
                Image decoded = pdfImage.GetDrawingImage();
                _images.Add(decoded);
            }
            catch (Exception)
            {
                // Decoding failures are intentionally ignored.
            }
        }
            /// <summary>
            /// Collects descriptive information about the rendered image (byte size, file
            /// type, filter text and, unless the filter is JBIG2Decode or JPXDecode, the
            /// decoded width, height, pixel format and vertical resolution) and adds it
            /// to <c>Images</c>.
            /// </summary>
            /// <param name="info">Render information for the image being processed.</param>
            public void RenderImage(ImageRenderInfo info)
            {
                PdfImageObject image    = info.GetImage();
                var            fileType = image.GetFileType();
                var            imgBytes = image.GetImageAsBytes();
                var            imgInfo  = "Unknown";
                var            filter   = image.Get(PdfName.FILTER);

                if (filter != null)
                {
                    imgInfo = filter.ToString().Replace(',', ' ');
                }

                // -1 / Undefined mark values that could not be read from a decoded image.
                int         imgWidth      = -1;
                int         imgHeight     = -1;
                int         imgResolution = -1;
                PixelFormat imgFormat     = PixelFormat.Undefined;

                if (imgInfo != "/JBIG2Decode" && imgInfo != "/JPXDecode")
                {
                    // using guarantees disposal even if reading a property throws.
                    using (var img = image.GetDrawingImage())
                    {
                        if (img != null)
                        {
                            imgWidth      = img.Width;
                            imgHeight     = img.Height;
                            imgFormat     = img.PixelFormat;
                            imgResolution = Convert.ToInt32(img.VerticalResolution);
                        }
                    }
                }

                Images.Add(new PDFImageInfo()
                {
                    ImageBytes  = imgBytes.Length, ImageFormat = imgFormat.ToString(),
                    ImageHeight = imgHeight, ImageWidth = imgWidth, ImageResolution = imgResolution,
                    ImageInfo   = imgInfo, ImageType = fileType
                });
            }
예제 #20
0
 /// <summary>
 /// Decodes the image, rotates it when the page rotation is 90, 180 or 270,
 /// and saves it to the given path. Nothing is written when decoding yields null.
 /// </summary>
 /// <param name="pdfimage">The PDF image to decode.</param>
 /// <param name="path">Destination file path.</param>
 /// <param name="pageRotation">Page rotation in degrees.</param>
 private void ExtractImageOLD(PdfImageObject pdfimage, string path, int pageRotation)
 {
     using (Image bitmap = pdfimage.GetDrawingImage())
     {
         if (bitmap == null)
         {
             return;
         }

         switch (pageRotation)
         {
         case 270:
             bitmap.RotateFlip(RotateFlipType.Rotate270FlipNone);
             break;

         case 90:
             bitmap.RotateFlip(RotateFlipType.Rotate90FlipNone);
             break;

         case 180:
             bitmap.RotateFlip(RotateFlipType.Rotate180FlipNone);
             break;
         }

         bitmap.Save(path);
     }
 }
예제 #21
0
        /// <summary>
        /// Click handler: for every selected list item that wraps a PDF stream, saves the
        /// decoded image under the "output" folder next to <c>fileName</c>, replaces the
        /// stream with a blank image, and then writes the reader to the output.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event arguments (not used).</param>
        private void btnRemoveObject_Click(object sender, EventArgs e)
        {
            foreach (var item in lstSelectedImages.SelectedItems)
            {
                // Only items stored as string/object pairs are processed.
                if (item is KeyValuePair <string, object> )
                {
                    var thisItem = (KeyValuePair <string, object>)item;
                    var obj      = thisItem.Value as PdfObject;
                    if (obj != null && obj.IsStream())
                    {
                        var    stream = (PRStream)obj;
                        byte[] b;
                        // Try the decoded stream bytes first and fall back to the raw bytes.
                        try
                        {
                            b = PdfReader.GetStreamBytes(stream);
                        }
                        catch (Exception ex1)
                        {
                            b = PdfReader.GetStreamBytesRaw(stream);
                        }

                        // NOTE(review): bytes is not read anywhere else in this method.
                        var bytes = b;
                        try
                        {
                            // Show the image, save a timestamp-named copy, then swap the
                            // stream for a blank image.
                            var pdfImage = new PdfImageObject(stream);
                            picImage.Image = pdfImage.GetDrawingImage();
                            picImage.Image.Save(Path.GetDirectoryName(fileName) + "\\output\\" + DateTime.Now.Ticks.ToString() + "." + pdfImage.GetFileType());
                            PdfImage image = new PdfImage(MakeBlankImg(), "", null);
                            ReplaceStream(stream, image);
                        }
                        catch (Exception ex)
                        {
                            // Failures are reported in the status text instead of being thrown.
                            tsMessage.Text = ex.Message;
                        }
                    }
                }
                // Runs once per selected item, including items that were skipped above.
                SaveReaderToOutput();
            }
        }
예제 #22
0
        /// <summary>
        /// Collects the images referenced from the XObject resources of a dictionary,
        /// recursing into Form and Group XObjects.
        /// See https://stackoverflow.com/questions/802269/extract-images-using-itextsharp
        /// </summary>
        /// <param name="dict">A page or XObject dictionary that may hold resources.</param>
        /// <param name="doc">The reader the dictionary belongs to.</param>
        /// <returns>
        /// The decoded images; empty when the dictionary has no resources or no XObjects.
        /// </returns>
        internal static IList <Image> GetImagesFromPdfDict(PdfDictionary dict, PdfReader doc)
        {
            var images = new List <Image>();

            // Safe casts: a dictionary without /Resources or /XObject yields no images
            // instead of a NullReferenceException.
            var res = PdfReader.GetPdfObject(dict.Get(PdfName.RESOURCES)) as PdfDictionary;
            if (res == null)
            {
                return(images);
            }

            var xobj = PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)) as PdfDictionary;
            if (xobj == null)
            {
                return(images);
            }

            foreach (var name in xobj.Keys)
            {
                var obj = xobj.Get(name);
                if (obj == null || !obj.IsIndirect())
                {
                    continue;
                }

                var tg = PdfReader.GetPdfObject(obj) as PdfDictionary;
                if (tg == null)
                {
                    continue;
                }

                var subtype = PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE)) as PdfName;
                if (PdfName.IMAGE.Equals(subtype))
                {
                    var xrefIdx = ((PRIndirectReference)obj).Number;
                    var pdfObj  = doc.GetPdfObject(xrefIdx);
                    var str     = (PdfStream)(pdfObj);

                    var pdfImage = new PdfImageObject((PRStream)str);
                    var img      = pdfImage.GetDrawingImage();

                    images.Add(img);
                }
                else if (PdfName.FORM.Equals(subtype) || PdfName.GROUP.Equals(subtype))
                {
                    // Forms and groups can carry their own resources with nested images.
                    images.AddRange(GetImagesFromPdfDict(tg, doc));
                }
            }

            return(images);
        }
예제 #23
0
        /// <summary>
        /// Saves every image reported by <c>FindPageImages</c> for each page of the
        /// source PDF into the target directory, named "Image {page} - {index}" with the
        /// image's file type as extension. The index restarts at 1 on every page.
        /// </summary>
        /// <param name="source">Path of the PDF file to read.</param>
        /// <param name="target">Directory the image files are written to.</param>
        private static void FindImages(string source, string target)
        {
            using (var pdf = new PdfReader(source))
            {
                for (int pageNumber = 1, imageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++, imageNumber = 1)
                {
                    FindPageImages(pdf.GetPageN(pageNumber), obj =>
                    {
                        if (obj == null)
                        {
                            return;
                        }

                        var resolved = pdf.GetPdfObject(((PRIndirectReference)obj).Number);
                        var isStream = resolved != null && resolved.IsStream();
                        if (!isStream)
                        {
                            return;
                        }

                        var stream  = (PdfStream)resolved;
                        var subtype = stream.Get(PdfName.SUBTYPE);

                        // Only streams whose subtype is Image are written out.
                        if (subtype != null && subtype.Equals(PdfName.IMAGE))
                        {
                            var imageObj = new PdfImageObject((PRStream)stream);

                            using (var image = imageObj.GetDrawingImage())
                            {
                                var fileName = $"Image {pageNumber} - {imageNumber++}.{imageObj.GetFileType()}";
                                image.Save(Path.Combine(target, fileName));
                            }
                        }
                    });
                }
            }
        }
예제 #24
0
        /// <summary>
        /// Decodes the rendered image with <c>PdfImageObject.GetDrawingImage()</c> and
        /// records it in <c>Images</c> with the PNG file extension.
        /// </summary>
        /// <param name="renderInfo">Render information for the image being processed.</param>
        public void RenderImage(ImageRenderInfo renderInfo)
        {
            PdfImageObject pdfImage = renderInfo.GetImage();

            /* GetDrawingImage() handles the decoding of the image stream, so there is no
             * need to deal with scan lines and pixel formats by hand here. */
            System.Drawing.Image decoded = pdfImage.GetDrawingImage();

            // Every image is registered under the PNG extension regardless of its filter.
            string pngExtension = PdfImageObject.ImageBytesType.PNG.FileExtension;

            this.Images.Add(decoded, pngExtension);
        }
예제 #25
0
        /// <summary>
        /// Processes a PDF: JPEG (DCTDecode) image XObjects that <c>Globals.ResizeImage</c>
        /// changes in size are re-encoded as JPEG and replaced in the document.
        /// </summary>
        /// <param name="fileStream">Stream containing the source PDF; it is read but not closed.</param>
        /// <returns>
        /// A stream holding the rewritten PDF, or <c>null</c> when the document was not
        /// opened with full permissions.
        /// </returns>
        private static MemoryStream PDFCompress2(Stream fileStream)
        {
            // The BinaryReader is intentionally not disposed: disposing it would close the caller's stream.
            BinaryReader br  = new BinaryReader(fileStream);
            byte[]       byt = br.ReadBytes((int)fileStream.Length);

            PdfReader pdf = new PdfReader(byt);
            try
            {
                if (!pdf.IsOpenedWithFullPermissions)
                {
                    return null;
                }

                MemoryStream ms     = new MemoryStream();
                PdfStamper   stp    = new PdfStamper(pdf, ms);
                PdfWriter    writer = stp.Writer;

                int page_count = pdf.NumberOfPages;
                for (int i = 1; i <= page_count; i++)
                {
                    PdfDictionary pg  = pdf.GetPageN(i);
                    PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
                    if (res == null)
                    {
                        // Page without a resource dictionary: nothing to compress.
                        continue;
                    }

                    PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
                    if (xobj == null)
                    {
                        continue;
                    }

                    foreach (PdfName name in xobj.Keys)
                    {
                        PdfObject obj = xobj.Get(name);
                        if (!obj.IsIndirect())
                        {
                            continue;
                        }

                        // Sometimes this becomes null when the same image is handled more than once.
                        PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
                        if (tg == null)
                        {
                            continue;
                        }

                        PdfName type = (PdfName)PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE));
                        if (!PdfName.IMAGE.Equals(type))
                        {
                            continue;
                        }

                        PdfObject filterObj = tg.Get(PdfName.FILTER);
                        string    filter    = filterObj == null ? string.Empty : filterObj.ToString();

                        // The filter is sometimes "[/DCTDecode]" and sometimes "/DCTDecode".
                        if (!filter.Contains("/DCTDecode"))
                        {
                            continue;
                        }

                        PRIndirectReference imageRef = (PRIndirectReference)obj;
                        PRStream            str      = (PRStream)pdf.GetPdfObject(imageRef.Number);
                        PdfImageObject      pdfImage = new PdfImageObject(str);

                        using (System.Drawing.Image imgOriginal = pdfImage.GetDrawingImage())
                        using (System.Drawing.Image img2 = Globals.ResizeImage(imgOriginal))
                        {
                            // Only replace the image when resizing actually changed its dimensions.
                            if (img2.Width != imgOriginal.Width || img2.Height != imgOriginal.Height)
                            {
                                using (var stream = new System.IO.MemoryStream())
                                {
                                    img2.Save(stream, ImageFormat.Jpeg);
                                    stream.Position = 0;
                                    PdfReader.KillIndirect(obj);
                                    iTextSharp.text.Image img = iTextSharp.text.Image.GetInstance(stream);

                                    writer.AddDirectImageSimple(img, imageRef);
                                }
                            }
                        }

                        // Preserved from the original: scanning of this page's XObjects stops
                        // after the first JPEG image has been handled.
                        break;
                    }
                }

                stp.Writer.CloseStream = false; // keep the MemoryStream usable for the caller
                stp.FormFlattening     = true;
                stp.Close();
                return ms;
            }
            finally
            {
                // Close the reader on every path, including the restricted-permissions
                // return and any exception raised while processing.
                pdf.Close();
            }
        }
예제 #26
0
// ---------------------------------------------------------------------------

        /**
         * Rewrites the given PDF bytes: every stream tagged with the
         * ITXT_SpecialId marker is replaced by a JPEG copy scaled by FACTOR.
         * Returns the bytes of the altered document.
         */
        public byte[] ManipulatePdf(byte[] pdf)
        {
            PdfName markerKey   = new PdfName("ITXT_SpecialId");
            PdfName markerValue = new PdfName("123456789");

            PdfReader reader    = new PdfReader(pdf);
            int       xrefCount = reader.XrefSize;

            // Walk the cross-reference table looking for the tagged image streams.
            for (int index = 0; index < xrefCount; index++)
            {
                PdfObject candidate = reader.GetPdfObject(index);
                bool      isStream  = candidate != null && candidate.IsStream();
                if (!isStream)
                {
                    continue;
                }

                PRStream target = (PRStream)candidate;
                if (!markerValue.Equals(target.Get(markerKey)))
                {
                    continue;
                }

                PdfImageObject source = new PdfImageObject(target);
                using (System.Drawing.Image original = source.GetDrawingImage())
                {
                    if (original == null)
                    {
                        continue;
                    }

                    int    scaledWidth  = (int)(original.Width * FACTOR);
                    int    scaledHeight = (int)(original.Height * FACTOR);
                    byte[] jpegBytes;

                    // Draw the original onto a smaller bitmap and encode that as JPEG.
                    using (System.Drawing.Image thumb = new Bitmap(scaledWidth, scaledHeight))
                    using (Graphics canvas = Graphics.FromImage(thumb))
                    using (MemoryStream encoded = new MemoryStream())
                    {
                        canvas.DrawImage(original, 0, 0, scaledWidth, scaledHeight);
                        thumb.Save(encoded, ImageFormat.Jpeg);
                        jpegBytes = encoded.ToArray();
                    }

                    // Replace the stream's content and describe the new image in its dictionary.
                    target.Clear();
                    target.SetData(jpegBytes, false, PRStream.NO_COMPRESSION);
                    target.Put(PdfName.TYPE, PdfName.XOBJECT);
                    target.Put(PdfName.SUBTYPE, PdfName.IMAGE);
                    target.Put(markerKey, markerValue);
                    target.Put(PdfName.FILTER, PdfName.DCTDECODE);
                    target.Put(PdfName.WIDTH, new PdfNumber(scaledWidth));
                    target.Put(PdfName.HEIGHT, new PdfNumber(scaledHeight));
                    target.Put(PdfName.BITSPERCOMPONENT, new PdfNumber(8));
                    target.Put(PdfName.COLORSPACE, PdfName.DEVICERGB);
                }
            }

            // Save the altered PDF: disposing the stamper writes the document to the stream.
            using (MemoryStream output = new MemoryStream())
            {
                using (PdfStamper stamper = new PdfStamper(reader, output))
                {
                }

                return output.ToArray();
            }
        }
예제 #27
0
        /// <summary>
        /// Decodes the image described by <paramref name="renderInfo"/>, records it in
        /// <c>Images</c>, writes it to <c>C:\Images\{pages}\im{N}.JPG</c> and registers it in
        /// <c>Program.invoice.images1</c> when no entry with that file name exists yet.
        /// </summary>
        /// <param name="renderInfo">Render information for the image being processed.</param>
        public void RenderImage(ImageRenderInfo renderInfo)
        {
            PdfImageObject image = renderInfo.GetImage();
            if (image == null)
            {
                return;
            }

            /* Rather than struggle with the image stream and try to figure out how to handle
             * BitMapData scan lines in various formats (like virtually every sample I've found
             * online), use the PdfImageObject.GetDrawingImage() method, which does the work for us. */
            Image drawingImage = image.GetDrawingImage();

            /* PdfImageObject does not know how to decode every image to a System.Drawing.Image;
             * skip those. The image dictionary entries (width, colour space, ...) are
             * deliberately not read here: they were unused and a missing entry made the
             * method throw before any image was saved. */
            if (drawingImage == null)
            {
                return;
            }

            string extension = PdfImageObject.ImageBytesType.JPG.FileExtension;

            try
            {
                var pages = Program.NumberOfPagesPdf(Program.FilePhth);
                this.Images.Add(drawingImage, extension);

                string filename = @"C:\Images\" + pages + "\\";
                PdfImageExtractor.increment++;
                if (!System.IO.Directory.Exists(filename))
                {
                    System.IO.Directory.CreateDirectory(filename);
                }

                if (PdfImageExtractor.increment <= pages)
                {
                    string fullName2 = filename + "im" + PdfImageExtractor.increment + ".JPG";

                    using (MemoryStream stream = new MemoryStream())
                    {
                        drawingImage.Save(stream, drawingImage.RawFormat);

                        // The file carries a .JPG extension, so encode it as JPEG
                        // (it was previously written as GIF under the .JPG name).
                        drawingImage.Save(fullName2, ImageFormat.Jpeg);

                        if (!Program.invoice.images1.Any(m => m.ImageName == fullName2))
                        {
                            // Save the image to the model.
                            Program.invoice.images1.Add(new Models.Images()
                            {
                                ImageName = fullName2,
                                ImageData = stream.ToArray()
                            });
                        }
                    }
                }
            }
            catch (IOException e)
            {
                Console.WriteLine(e);
            }
        }
        /// <summary>
        /// Decodes the image described by <paramref name="renderInfo"/> and appends it to
        /// <c>images</c> together with its position and size taken from the image CTM and a
        /// file extension derived from the stream filter. Images without a usable filter
        /// name are skipped.
        /// </summary>
        /// <param name="renderInfo">Render information for the image being processed.</param>
        public void RenderImage(ImageRenderInfo renderInfo)
        {
            PdfImageObject image = renderInfo.GetImage();

            /* /Filter is either a single name or an array of names ("/DCTDecode" vs
             * "[/DCTDecode]"); a direct cast to PdfName throws on the array form.
             * For an array, the last entry is the filter applied last when decoding,
             * i.e. the one that describes the image format. */
            PdfObject filterObj   = PdfReader.GetPdfObject(image.Get(PdfName.FILTER));
            PdfArray  filterArray = filterObj as PdfArray;
            PdfName   filter      = filterArray != null
                ? (filterArray.Size > 0 ? filterArray.GetAsName(filterArray.Size - 1) : null)
                : filterObj as PdfName;

            /* It appears to be safe to assume that when filter == null, PdfImageObject
             * does not know how to decode the image to a System.Drawing.Image.
             * When this has been seen to happen, width, height and bits per component
             * all equal zero as well. */
            if (filter == null)
            {
                return;
            }

            Matrix matrix = renderInfo.GetImageCTM();

            /* Rather than struggle with the image stream and try to figure out how to handle
             * BitMapData scan lines in various formats (like virtually every sample I've found
             * online), use the PdfImageObject.GetDrawingImage() method, which does the work for us. */
            System.Drawing.Image drawingImage = image.GetDrawingImage();

            string extension = ".";
            float  x         = matrix[Matrix.I31];
            float  y         = matrix[Matrix.I32];
            float  w         = matrix[Matrix.I11];
            float  h         = matrix[Matrix.I22];

            // Compare names by value; '==' would only compare object references.
            if (PdfName.DCTDECODE.Equals(filter))
            {
                extension += PdfImageObject.ImageBytesType.JPG.FileExtension;
            }
            else if (PdfName.JPXDECODE.Equals(filter))
            {
                extension += PdfImageObject.ImageBytesType.JP2.FileExtension;
            }
            else if (PdfName.FLATEDECODE.Equals(filter))
            {
                extension += PdfImageObject.ImageBytesType.PNG.FileExtension;
            }
            else if (PdfName.LZWDECODE.Equals(filter))
            {
                extension += PdfImageObject.ImageBytesType.CCITT.FileExtension;
            }

            images.Add(new PdfImage()
            {
                X            = x,
                Y            = y,
                Width        = w,
                Height       = h,
                DrawingImage = drawingImage,
                Extension    = extension
            });
        }
예제 #29
0
        /// <summary>
        /// Re-encodes every image stream of the document as a JPEG of the given quality and
        /// rewrites the stream dictionary accordingly (DCTDecode, 8 bits per component,
        /// DeviceRGB). Images that cannot be decoded or re-encoded are left untouched.
        /// </summary>
        /// <param name="reader">Reader whose image streams are modified in place.</param>
        /// <param name="quality">Value passed to the JPEG encoder's Quality parameter.</param>
        public static void ReduceResolution(PdfReader reader, long quality)
        {
            int n = reader.XrefSize;

            for (int i = 0; i < n; i++)
            {
                PdfObject obj = reader.GetPdfObject(i);
                if (obj == null || !obj.IsStream())
                {
                    continue;
                }

                PdfDictionary dict    = (PdfDictionary)PdfReader.GetPdfObject(obj);
                PdfName       subType = (PdfName)PdfReader.GetPdfObject(
                    dict.Get(PdfName.SUBTYPE)
                    );
                if (!PdfName.IMAGE.Equals(subType))
                {
                    continue;
                }

                PRStream stream = (PRStream)obj;
                try
                {
                    // Look the JPEG encoder up by format id instead of relying on its
                    // position in the encoder list.
                    System.Drawing.Imaging.ImageCodecInfo codec = null;
                    foreach (System.Drawing.Imaging.ImageCodecInfo candidate in
                             System.Drawing.Imaging.ImageCodecInfo.GetImageEncoders())
                    {
                        if (candidate.FormatID == System.Drawing.Imaging.ImageFormat.Jpeg.Guid)
                        {
                            codec = candidate;
                            break;
                        }
                    }

                    if (codec == null)
                    {
                        // No JPEG encoder available: leave the image as it is.
                        continue;
                    }

                    PdfImageObject image = new PdfImageObject(stream);

                    using (System.Drawing.Image img = image.GetDrawingImage())
                    {
                        if (img == null)
                        {
                            continue;
                        }

                        int width  = img.Width;
                        int height = img.Height;

                        using (System.Drawing.Bitmap dotnetImg = new System.Drawing.Bitmap(img))
                        using (System.Drawing.Imaging.EncoderParameters eParams =
                                   new System.Drawing.Imaging.EncoderParameters(1))
                        using (MemoryStream msImg = new MemoryStream())
                        {
                            // set parameters for image quality
                            eParams.Param[0] =
                                new System.Drawing.Imaging.EncoderParameter(
                                    System.Drawing.Imaging.Encoder.Quality, quality
                                    );

                            dotnetImg.Save(msImg, codec, eParams);

                            // The bytes are already JPEG-compressed, so store them as-is
                            // (compress == false) and declare the DCTDecode filter.
                            stream.SetData(
                                msImg.ToArray(), false, PRStream.BEST_COMPRESSION
                                );
                            stream.Put(PdfName.TYPE, PdfName.XOBJECT);
                            stream.Put(PdfName.SUBTYPE, PdfName.IMAGE);
                            stream.Put(PdfName.FILTER, PdfName.DCTDECODE);
                            stream.Put(PdfName.WIDTH, new PdfNumber(width));
                            stream.Put(PdfName.HEIGHT, new PdfNumber(height));
                            stream.Put(PdfName.BITSPERCOMPONENT, new PdfNumber(8));
                            stream.Put(PdfName.COLORSPACE, PdfName.DEVICERGB);
                        }
                    }
                }
                catch
                {
                    // Deliberate best effort: iText[Sharp] can't handle all image types,
                    // so an image that fails to decode or encode is simply left unchanged.
                }
                finally
                {
                    // may or may not help
                    reader.RemoveUnusedObjects();
                }
            }
        }
예제 #30
0
파일: City.cs 프로젝트: cykb518hu/Scraper
        /// <summary>
        /// Runs OCR on the pages of a PDF that have no entry in <c>doc.DocBodyDic</c> yet.
        /// For each such page, every indirect image XObject of at least 100x100 is saved as
        /// <c>{localDirectory}\{pdf name}\Page_{n}.jpg</c> and passed to <c>RetryText</c>;
        /// non-empty text is stored in <c>doc.DocBodyDic</c> under the page number.
        /// A failure on one page is reported on the console and does not stop the others.
        /// </summary>
        /// <param name="rotate">Not used by this method.</param>
        /// <param name="docPath">Path of the PDF file to process.</param>
        /// <param name="doc">Document whose <c>DocBodyDic</c> is read and filled.</param>
        public void OCRPdf(bool rotate, string docPath, ref Documents doc)
        {
            PdfReader pdfReader = new PdfReader(docPath);
            try
            {
                int totalPage = pdfReader.NumberOfPages;

                Console.WriteLine("Pdf file {0} contains {1} pages...", docPath, totalPage);
                List <int> pageNos = new List <int>();

                for (int i = 1; i <= totalPage; i++)
                {
                    if (!doc.DocBodyDic.ContainsKey(i))
                    {
                        pageNos.Add(i);
                    }
                }

                foreach (int pageNumber in pageNos)
                {
                    try
                    {
                        Console.WriteLine("Working on page {0}...", pageNumber);

                        // Reuse the reader opened above instead of opening (and leaking) one per page.
                        PdfDictionary pg  = pdfReader.GetPageN(pageNumber);
                        PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
                        if (res == null)
                        {
                            continue;
                        }

                        PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
                        if (xobj == null)
                        {
                            continue;
                        }

                        foreach (PdfName name in xobj.Keys)
                        {
                            PdfObject obj = xobj.Get(name);
                            if (!obj.IsIndirect())
                            {
                                continue;
                            }

                            PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
                            if (tg == null)
                            {
                                continue;
                            }

                            PdfObject widthObj  = tg.Get(PdfName.WIDTH);
                            PdfObject heightObj = tg.Get(PdfName.HEIGHT);
                            if (widthObj == null || heightObj == null)
                            {
                                // XObject without image dimensions: nothing to OCR here.
                                continue;
                            }

                            // PDF numbers always use '.' as decimal separator, so parse them
                            // independently of the current culture.
                            float widthValue = float.Parse(
                                widthObj.ToString(),
                                System.Globalization.CultureInfo.InvariantCulture);

                            float heightValue;
                            bool  isDigit = float.TryParse(
                                heightObj.ToString(),
                                System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                                System.Globalization.CultureInfo.InvariantCulture,
                                out heightValue);
                            heightValue = isDigit ? heightValue : widthValue;

                            if (heightValue < 100 || widthValue < 100)
                            {
                                continue;
                            }

                            ImageRenderInfo imgRI = ImageRenderInfo.CreateForXObject(new Matrix(widthValue, heightValue), (PRIndirectReference)obj, tg);
                            PdfImageObject  image = imgRI.GetImage();

                            string ocrFolder     = string.Format("{0}\\{1}", this.localDirectory, Path.GetFileNameWithoutExtension(docPath));
                            string imageFileName = string.Format("{0}\\{1}\\Page_{2}.jpg", localDirectory, Path.GetFileNameWithoutExtension(docPath), pageNumber);

                            using (Image dotnetImg = image.GetDrawingImage())
                            {
                                if (dotnetImg == null)
                                {
                                    // The image could not be decoded; try the next XObject.
                                    continue;
                                }

                                if (!Directory.Exists(ocrFolder))
                                {
                                    Directory.CreateDirectory(ocrFolder);
                                }

                                // Encode explicitly as JPEG so the content matches the .jpg name.
                                dotnetImg.Save(imageFileName, ImageFormat.Jpeg);
                            }

                            //string text = RunOCRCommand(imageFileName);
                            string text = RetryText(imageFileName);

                            if ((!doc.DocBodyDic.ContainsKey(pageNumber)) && (!string.IsNullOrEmpty(text)))
                            {
                                doc.DocBodyDic.Add(pageNumber, text);
                            }
                            else
                            {
                                Console.ForegroundColor = ConsoleColor.Yellow;
                                Console.WriteLine("Page {0} could read...", pageNumber);
                                Console.ResetColor();
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        // Best effort per page: report the failure and carry on with the next page.
                        Console.WriteLine("Failed to process page {0} of {1}: {2}", pageNumber, docPath, ex.Message);
                    }
                }
            }
            finally
            {
                pdfReader.Close();
            }
        }