Example #1
0
        /// <summary>
        /// Runs <c>MyImageRenderListener</c> over every page of a hard-coded PDF and saves each
        /// image it collected as a PNG file in a hard-coded output folder.
        /// </summary>
        /// <param name="args">Command-line arguments; not read.</param>
        private static void Main(string[] args)
        {
            var pdfPath    = "E:\\Embryo_Omics\\pdfExtract\\Binder1.pdf";
            var outputPath = "E:\\Embryo_Omics\\SingleSection_OldGrant\\OutTest\\Pngs";

            MyImageRenderListener listener = new MyImageRenderListener();

            // The reader is only needed while parsing, so dispose it as soon as the pages are processed.
            using (PdfReader pdf = new PdfReader(pdfPath))
            {
                PdfReaderContentParser parser = new PdfReaderContentParser(pdf);

                // PDF page numbers start at 1.
                for (int i = 1; i <= pdf.NumberOfPages; i++)
                {
                    parser.ProcessContent(i, listener);
                }
            }

            // Make sure the target folder exists before saving; CreateDirectory is a no-op when it already does.
            Directory.CreateDirectory(outputPath);

            var imgCount = listener.Images.Count;

            for (int i = 0; i < imgCount; i++)
            {
                var image = listener.Images[i];
                var name  = listener.ImageNames[i];

                // Decode the raw bytes with GDI+ and re-encode them as PNG.
                using (MemoryStream ms = new MemoryStream(image))
                {
                    using (System.Drawing.Image img = System.Drawing.Image.FromStream(ms))
                    {
                        img.Save(String.Format("{0}\\{1}.png", outputPath, name), ImageFormat.Png);
                    }
                }
            }
        }
Example #2
0
        /// <summary>
        /// Opens a PDF and passes this object as the render listener for every page, then
        /// returns the list held in <c>_imagesSizes</c>.
        /// </summary>
        /// <param name="pdfPath">Path of the PDF file to open.</param>
        /// <returns>The <c>_imagesSizes</c> list, reversed in place before it is returned.</returns>
        /// <exception cref="ApplicationException">The PDF reports itself as encrypted.</exception>
        public IList <ImageData> ExtractImagesSizes(string pdfPath)
        {
            // Reset all per-run state. Without resetting the page counter, a second call on the
            // same instance would start past the last page of the previous document.
            _imagesSizes = new List <ImageData>();
            _currentPage = 1;

            using (var pdfReader = new PdfReader(pdfPath))
            {
                if (pdfReader.IsEncrypted())
                {
                    throw new ApplicationException(pdfPath + " is encrypted.");
                }

                var pdfParser = new PdfReaderContentParser(pdfReader);

                // The page counter is a field (not a local) so it stays visible to this
                // object's listener callbacks while a page is being processed.
                while (_currentPage <= pdfReader.NumberOfPages)
                {
                    pdfParser.ProcessContent(_currentPage, this);

                    _currentPage++;
                }
            }

            // we extracted them in reverse order previously
            _imagesSizes.Reverse();
            return(_imagesSizes);
        }
Example #3
0
        /// <summary>
        /// Reads the text of every page of the PDF at <c>path</c>, hands the combined text to
        /// <c>tworker.WorkText</c>, and also runs a <c>PDFImageCollection</c> listener over each page.
        /// </summary>
        /// <returns>
        /// <c>true</c> when the whole document was processed; <c>false</c> when any exception
        /// occurred (its message is written to <see cref="Trace"/>).
        /// </returns>
        private bool GetText()
        {
            try
            {
                // Opening the reader happens inside the try so that a missing or unreadable
                // file is reported through the return value like every other failure.
                using (PdfReader reader = new PdfReader(path))
                {
                    StringBuilder text            = new StringBuilder();
                    var           imageCollection = new PDFImageCollection();
                    var           pdfParser       = new PdfReaderContentParser(reader);

                    for (int i = 1; i <= reader.NumberOfPages; i++)
                    {
                        text.Append(PdfTextExtractor.GetTextFromPage(reader, i));
                        // NOTE(review): imageCollection is not read afterwards in this method;
                        // presumably the listener has side effects of its own — confirm.
                        pdfParser.ProcessContent(i, imageCollection);
                    }

                    tworker.WorkText(text);

                    return(true);
                }
            }
            catch (Exception e)
            {
                Trace.WriteLine(e.Message);
                return(false);
            }
        }
Example #4
0
        /// <summary>
        /// Extracts the text of a single PDF page and splits it into words.
        /// </summary>
        /// <param name="path_to_pdf">Path of the PDF file to open.</param>
        /// <param name="page_no">Number of the page to read.</param>
        /// <returns>
        /// The page text split on '\n' and then on ' '. Empty entries produced by the
        /// splits are kept.
        /// </returns>
        public string[] ExtractTextFromPDF(string path_to_pdf, int page_no)
        {
            using (PdfReader reader = new PdfReader(path_to_pdf))
            {
                var contentParser = new PdfReaderContentParser(reader);
                SimpleTextExtractionStrategy pageStrategy =
                    contentParser.ProcessContent(page_no, new SimpleTextExtractionStrategy());

                var collected = new List <string>();

                foreach (string textLine in pageStrategy.GetResultantText().Split('\n'))
                {
                    // Each line is echoed to the console before it is split.
                    Console.WriteLine("Line: " + textLine);
                    collected.AddRange(textLine.Split(' '));
                }

                return collected.ToArray();
            }
        }
        /// <summary>
        /// Attaches two region-filtered text strategies to one <c>MultiFilteredRenderListener</c>,
        /// parses page 1 of test.pdf through a <c>GlyphRenderListener</c>, and checks the text
        /// each strategy captured.
        /// </summary>
        virtual public void TestWithMultiFilteredRenderListener()
        {
            PdfReader pdfReader           = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "test.pdf");
            PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader);

            MultiFilteredRenderListener listener = new MultiFilteredRenderListener();

            // Both regions share the same two y coordinates and differ only in their x range.
            // NOTE(review): 841.9 is presumably the page height in points — confirm.
            float bandY1 = 841.9f - 151;
            float bandY2 = 841.9f - 163;

            ITextExtractionStrategy region1Listener = listener.AttachRenderListener(
                new LocationTextExtractionStrategy(),
                new RegionTextRenderFilter(new Rectangle(122f, bandY1, 144f, bandY2)));

            ITextExtractionStrategy region2Listener = listener.AttachRenderListener(
                new LocationTextExtractionStrategy(),
                new RegionTextRenderFilter(new Rectangle(156f, bandY1, 169f, bandY2)));

            parser.ProcessContent(1, new GlyphRenderListener(listener));

            Assert.AreEqual("Your", region1Listener.GetResultantText());
            Assert.AreEqual("dju", region2Listener.GetResultantText());
        }
Example #6
0
        /// <summary>
        /// Runs <c>MyImageRenderListener</c> over every page of the PDF at <c>filePath</c> and
        /// decodes each collected byte array into a <see cref="Bitmap"/>.
        /// </summary>
        /// <returns>One image per entry in the listener's <c>Images</c> collection. The caller owns (and should dispose) them.</returns>
        public IList <System.Drawing.Image> GetImage()
        {
            var imageList     = new List <System.Drawing.Image>();
            var imageListener = new MyImageRenderListener();

            // The reader is only needed while parsing; dispose it deterministically.
            using (var pdfReader = new PdfReader(filePath))
            {
                var contentParser = new PdfReaderContentParser(pdfReader);
                for (int pageNumber = 1; pageNumber <= pdfReader.NumberOfPages; pageNumber++)
                {
                    contentParser.ProcessContent(pageNumber, imageListener);
                }
            }

            for (int i = 0; i < imageListener.Images.Count; i++)
            {
                // A Bitmap created from a stream needs that stream to stay open for the
                // Bitmap's whole lifetime, so the MemoryStream is deliberately NOT disposed
                // here. It wraps a managed byte array only, so nothing unmanaged leaks.
                MemoryStream ms = new MemoryStream(imageListener.Images[i]);
                imageList.Add(new Bitmap(ms));
            }

            // No catch-and-rethrow: exceptions propagate unchanged with their original
            // stack trace (the former "throw ex;" reset it).
            return(imageList);
        }
        // Collects the images an ImageRenderListener gathers from every page of a PDF stream.
        // Any exception is logged through `log` and whatever was collected so far is returned.
        // The `filename` parameter is not read by this method.
        public static List <Image> ExtractImages(Stream myBlob, string filename, TraceWriter log)
        {
            var extracted = new List <Image>();

            try
            {
                using (var pdf = new PdfReader(myBlob))
                {
                    var contentParser  = new PdfReaderContentParser(pdf);
                    var renderListener = new ImageRenderListener(log);

                    for (var pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
                    {
                        contentParser.ProcessContent(pageNumber, renderListener);

                        if (renderListener.Images.Count <= 0)
                        {
                            continue;
                        }

                        log.Verbose($"Found {renderListener.Images.Count} images on page {pageNumber}.");

                        // The single listener is shared by all pages, so move its images out
                        // and empty it before the next page is parsed.
                        extracted.AddRange(renderListener.Images);
                        renderListener.Images.Clear();
                    }
                }
            }
            catch (Exception failure)
            {
                log.Error($"Error: {failure.Message}");
            }

            return extracted;
        }
        /// <summary>
        /// Extracts the images of a PDF file by passing every page to an <c>ImageExtractor</c>.
        /// An encrypted PDF is skipped silently: no page is processed.
        /// </summary>
        /// <param name="pdfPath">Full path and file name of the PDF file.</param>
        /// <param name="outputFilePrefix">
        /// Base name of the exported files. When null, the PDF's file name without extension is used.
        /// </param>
        /// <param name="outputPath">
        /// Destination folder handed to the extractor. When null or empty, the PDF's own folder is used.
        /// </param>
        /// <param name="overwriteExistingFiles">
        /// Passed through to the extractor: true to overwrite existing image files, false to skip them.
        /// </param>
        /// <returns>
        /// The extractor's <c>ImageCount</c> after all pages have been processed.
        /// </returns>
        public static int ExtractImagesFromFile(
            string pdfPath,
            string outputFilePrefix,
            string outputPath,
            bool overwriteExistingFiles)
        {
            // Fill in defaults derived from the PDF's own location.
            if (outputFilePrefix == null)
            {
                outputFilePrefix = System.IO.Path.GetFileNameWithoutExtension(pdfPath);
            }

            if (String.IsNullOrEmpty(outputPath))
            {
                outputPath = System.IO.Path.GetDirectoryName(pdfPath);
            }

            var extractor = new ImageExtractor(outputFilePrefix, outputPath, overwriteExistingFiles);

            using (var pdfReader = new PdfReader(pdfPath))
            {
                //Skip encrypted PDFs for now
                if (!pdfReader.IsEncrypted())
                {
                    var pdfParser = new PdfReaderContentParser(pdfReader);

                    // The extractor carries the page counter itself, so it is advanced on
                    // the extractor rather than in a local variable.
                    for (; extractor.CurrentPage <= pdfReader.NumberOfPages; extractor.CurrentPage++)
                    {
                        pdfParser.ProcessContent(extractor.CurrentPage, extractor);
                    }
                }
            }

            return extractor.ImageCount;
        }
Example #9
0
        /// <summary>
        /// Collects the images an <c>ImageRenderListener</c> gathers from every page of a PDF file,
        /// showing a message box with the image count for each page that has images.
        /// </summary>
        /// <param name="filename">Path of the PDF file to read.</param>
        /// <returns>
        /// The images keyed by a name built from the file name, the 4-digit page number, the
        /// 4-digit per-page image index and the value paired with each image by the listener.
        /// </returns>
        public static Dictionary <string, System.Drawing.Image> ExtractImages(string filename)
        {
            var result = new Dictionary <string, System.Drawing.Image>();

            using (var pdf = new PdfReader(filename))
            {
                var contentParser = new PdfReaderContentParser(pdf);

                for (var pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
                {
                    // A fresh listener per page keeps each page's images separate.
                    var pageListener = new ImageRenderListener();
                    contentParser.ProcessContent(pageNumber, pageListener);

                    if (pageListener.Images.Count <= 0)
                    {
                        continue;
                    }

                    MessageBox.Show($"Found {pageListener.Images.Count} images on page {pageNumber}.");

                    var imageNumber = 1;
                    foreach (var entry in pageListener.Images)
                    {
                        var key = $"{System.IO.Path.GetFileNameWithoutExtension(filename)} _Page_{pageNumber.ToString("D4")} _Image_{imageNumber.ToString("D4")} {entry.Value}";
                        result.Add(key, entry.Key);
                        imageNumber++;
                    }
                }
            }

            return result;
        }
Example #10
0
        /// <summary>
        /// Collects the images an <c>ImageRenderListener</c> gathers from every page of a PDF file,
        /// writing the image count of each page that has images to the console.
        /// </summary>
        /// <param name="filename">Path of the PDF file to read.</param>
        /// <returns>
        /// The images keyed as "{file}_Page_{page}_Image_{index}{value}", where page and index are
        /// 4-digit numbers and value is whatever the listener paired with the image.
        /// </returns>
        public static Dictionary<string, System.Drawing.Image> ExtractImages(string filename)
        {
            var result = new Dictionary<string, System.Drawing.Image>();

            using (var pdf = new PdfReader(filename))
            {
                var contentParser = new PdfReaderContentParser(pdf);

                for (var pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
                {
                    // One listener per page, so Images only ever holds the current page's images.
                    var pageListener = new ImageRenderListener();
                    contentParser.ProcessContent(pageNumber, pageListener);

                    if (pageListener.Images.Count <= 0)
                    {
                        continue;
                    }

                    Console.WriteLine("Found {0} images on page {1}.", pageListener.Images.Count, pageNumber);

                    var imageNumber = 1;
                    foreach (var entry in pageListener.Images)
                    {
                        var key = string.Format(
                            "{0}_Page_{1}_Image_{2}{3}",
                            System.IO.Path.GetFileNameWithoutExtension(filename),
                            pageNumber.ToString("D4"),
                            imageNumber.ToString("D4"),
                            entry.Value);

                        result.Add(key, entry.Key);
                        imageNumber++;
                    }
                }
            }

            return result;
        }
Example #11
0
        /// <summary>
        /// Runs a new <c>ImageExtractor</c> as render listener over every page of an in-memory PDF,
        /// then decodes each entry of <c>ImagesAsByteArray</c> with FreeImage and appends the
        /// resulting bitmap to <c>Images</c>.
        /// </summary>
        /// <param name="input">The PDF file content.</param>
        /// <returns>The <c>Images</c> collection. Entries FreeImage cannot load are skipped.</returns>
        public static IEnumerable <Image> ExtractImages(byte[] input)
        {
            var instance = new ImageExtractor();

            using (var reader = new PdfReader(input))
            {
                var parser = new PdfReaderContentParser(reader);
                while (instance._currentPage <= reader.NumberOfPages)
                {
                    parser.ProcessContent(instance._currentPage, instance);
                    instance._currentPage++;
                }
            }

            // NOTE(review): ImagesAsByteArray and Images are referenced without an instance from
            // a static method, so they appear to be shared across calls — confirm whether
            // results from earlier calls are meant to accumulate.
            foreach (var item in ImagesAsByteArray)
            {
                FIBITMAP dib;
                using (var ms = new MemoryStream(item))
                {
                    dib = FreeImage.LoadFromStream(ms);
                }

                if (dib.IsNull)
                {
                    continue;
                }

                try
                {
                    var bmp = FreeImage.GetBitmap(dib);
                    Images.Add(bmp);
                }
                finally
                {
                    // Always release the FreeImage bitmap, even when the conversion above throws.
                    FreeImage.UnloadEx(ref dib);
                }
            }

            return(Images);
        }
Example #12
0
        /// <summary>
        /// Runs <c>MyImageRenderListener</c> over every page of <c>data.DocumentToProcess</c>,
        /// writes each collected image to disk and records the written paths in
        /// <c>data.OutputDocuments</c>.
        /// </summary>
        /// <param name="data">
        /// Work item: supplies the PDF path and receives the output paths and the workflow state.
        /// The state is set to fail first and to success only when the method completes, so an
        /// exception leaves it at fail.
        /// </param>
        public override void Process(PdfFileParserData data)
        {
            data.WFState.Value = WFState.WFStateFail;

            MyImageRenderListener listener = new MyImageRenderListener();

            // The reader is only needed while parsing; dispose it before writing files.
            using (PdfReader reader = new PdfReader(data.DocumentToProcess))
            {
                PdfReaderContentParser parser = new PdfReaderContentParser(reader);

                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    parser.ProcessContent(i, listener);
                }
            }

            string sourceDir = Path.GetDirectoryName(data.DocumentToProcess);

            for (int i = 0; i < listener.Images.Count; ++i)
            {
                // The target directory is asked for again for every image, as before.
                // NOTE(review): depending on WFUtilities.GetNextDirectoryNumber this may put each
                // image in its own directory — confirm that is intended.
                string filedir = string.Format("{0}\\{1}", sourceDir, WFUtilities.GetNextDirectoryNumber(sourceDir));

                // CreateDirectory does nothing when the directory already exists and throws
                // when it cannot be created, so no separate Exists checks are needed.
                Directory.CreateDirectory(filedir);

                string outputFile = string.Format("{0}\\{1}", filedir, listener.ImageNames[i]);
                File.WriteAllBytes(outputFile, listener.Images[i]);
                data.OutputDocuments.Add(outputFile);
            }

            data.WFState.Value = KRSrcWorkflow.WFState.WFStateSuccess;
        }
Example #13
0
 /// <summary>
 /// Parses the last page of inlineImages01.pdf with a <c>LocationTextExtractionStrategy</c>.
 /// There are no assertions: the test passes as long as parsing does not throw.
 /// </summary>
 public void TestInlineImageWithUnsupportedDecodeFilter()
 {
     PdfReader pdf = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "inlineImages01.pdf");
     var contentParser = new PdfReaderContentParser(pdf);
     int lastPage = pdf.NumberOfPages;

     contentParser.ProcessContent(lastPage, new LocationTextExtractionStrategy());
 }
Example #14
0
        /// <summary>
        /// Collects the <c>ImageInfo</c> entries an <c>ImageRenderListener</c> gathers from every
        /// page of a PDF file and stamps each entry with the number of the page it came from.
        /// </summary>
        /// <param name="fileName">Full path to the PDF file.</param>
        /// <returns>All collected entries, in page order.</returns>
        public static List <ImageInfo> ExtractImagesInfo(string fileName)
        {
            // Static flag on PdfReader; it stays set after this method returns.
            // NOTE(review): presumably lets iTextSharp read owner-password-protected files — confirm.
            PdfReader.unethicalreading = true;

            var collected = new List <ImageInfo>();

            using (var pdf = new PdfReader(fileName))
            {
                var contentParser = new PdfReaderContentParser(pdf);

                for (var pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
                {
                    // A new listener per page, so ImagesInfo holds only this page's entries.
                    var pageListener = new ImageRenderListener();
                    contentParser.ProcessContent(pageNumber, pageListener);

                    if (pageListener.ImagesInfo.Count <= 0)
                    {
                        continue;
                    }

                    foreach (ImageInfo info in pageListener.ImagesInfo)
                    {
                        info.pageNum = pageNumber;
                        collected.Add(info);
                    }
                }
            }

            return collected;
        }
Example #15
0
        /// <summary>
        /// Walks every page of a PDF file with an <c>ImageRenderListener</c> and gathers the images
        /// it reports. The per-page image count is printed to the console for pages with images.
        /// </summary>
        /// <param name="filename">Path of the PDF file to read.</param>
        /// <returns>
        /// Images keyed by file name, 4-digit page number, 4-digit per-page image index and the
        /// value the listener associates with each image.
        /// </returns>
        public static Dictionary <string, System.Drawing.Image> ExtractImages(string filename)
        {
            var byName = new Dictionary <string, System.Drawing.Image>();

            using (var document = new PdfReader(filename))
            {
                var contentParser = new PdfReaderContentParser(document);
                var pageNo        = 1;

                while (pageNo <= document.NumberOfPages)
                {
                    // Each page gets its own listener instance.
                    var found = new ImageRenderListener();
                    contentParser.ProcessContent(pageNo, found);

                    if (found.Images.Count > 0)
                    {
                        Console.WriteLine("Found {0} images on page {1}.", found.Images.Count, pageNo);

                        var position = 0;
                        foreach (var item in found.Images)
                        {
                            position++;
                            var pagePart  = pageNo.ToString("D4");
                            var imagePart = position.ToString("D4");
                            var baseName  = System.IO.Path.GetFileNameWithoutExtension(filename);

                            byName.Add(string.Format("{0}_Page_{1}_Image_{2}{3}", baseName, pagePart, imagePart, item.Value), item.Key);
                        }
                    }

                    pageNo++;
                }
            }

            return byName;
        }
Example #16
0
        /// <summary>
        /// Passes this object as the render listener for every page of a PDF while reporting
        /// progress on a progress bar.
        /// </summary>
        /// <param name="pdfPath">Path of the PDF file; its name without extension is stored in <c>_imgname</c>.</param>
        /// <param name="outputFolder">Stored in <c>_outputFolder</c> for use by the listener callbacks.</param>
        /// <param name="prBar">Progress bar that is shown while pages are processed and hidden afterwards.</param>
        /// <returns><c>true</c> when all pages were processed.</returns>
        /// <exception cref="Exception">
        /// Any failure, including an encrypted PDF, is wrapped in a new exception with the
        /// original as its inner exception.
        /// </exception>
        public bool ExtractImages(string pdfPath, string outputFolder, ProgressBar prBar)
        {
            _outputFolder = outputFolder;
            _imgname      = System.IO.Path.GetFileNameWithoutExtension(pdfPath);
            // Start from the first page on every call; otherwise a second call on the same
            // instance would begin past the last page of the previous document.
            _currentPage  = 1;
            try
            {
                using (PdfReader pdfReader = new PdfReader(pdfPath))
                {
                    if (pdfReader.IsEncrypted())
                    {
                        throw new ApplicationException(pdfPath + " is encrypted.");
                    }

                    PdfReaderContentParser pdfParser = new PdfReaderContentParser(pdfReader);

                    prBar.Maximum = pdfReader.NumberOfPages;
                    prBar.Minimum = 1;
                    prBar.Visible = true;

                    while (_currentPage <= pdfReader.NumberOfPages)
                    {
                        // Per-page state read by the listener callbacks.
                        _imgNo      = 1;
                        prBar.Value = _currentPage;
                        // Let the UI repaint the progress bar between pages.
                        Application.DoEvents();
                        _pageRotation = pdfReader.GetPageRotation(_currentPage);
                        pdfParser.ProcessContent(_currentPage, this);
                        _currentPage++;
                    }
                }
                return(true);
            }
            catch (Exception ee)
            {
                throw new Exception("PDF ni resimge aylandurushtiki xataliq", ee);
            }
            finally
            {
                // Hide the bar on failure as well as on success. The null check keeps a null
                // prBar from replacing the wrapped exception with a second one thrown here.
                if (prBar != null)
                {
                    prBar.Visible = false;
                }
            }
        }
Example #17
0
// ---------------------------------------------------------------------------
        /// <summary>
        /// Builds a zip archive containing the PREFACE file plus a stamped copy (RESULT) in which
        /// a rectangle is stroked on every page using the bounds found by a <c>TextMarginFinder</c>,
        /// and saves the archive to the given stream.
        /// </summary>
        /// <param name="stream">Destination stream for the zip archive.</param>
        public void Write(Stream stream)
        {
            using (ZipFile zip = new ZipFile()) {
                zip.AddFile(PREFACE, "");
                // Dispose the reader once the stamped copy has been produced.
                using (PdfReader reader = new PdfReader(PREFACE))
                using (MemoryStream ms = new MemoryStream()) {
                    PdfReaderContentParser parser = new PdfReaderContentParser(reader);
                    using (PdfStamper stamper = new PdfStamper(reader, ms)) {
                        TextMarginFinder finder;
                        for (int i = 1; i <= reader.NumberOfPages; i++)
                        {
                            finder = parser.ProcessContent(i, new TextMarginFinder());
                            PdfContentByte cb = stamper.GetOverContent(i);
                            cb.Rectangle(
                                finder.GetLlx(), finder.GetLly(),
                                finder.GetWidth(), finder.GetHeight()
                                );
                            cb.Stroke();
                        }
                    }
                    // Read the bytes only after the stamper has been disposed above.
                    zip.AddEntry(RESULT, ms.ToArray());
                }
                zip.Save(stream);
            }
        }
 /// <summary>
 /// Parses one page of a PDF file and returns the items gathered by a
 /// <c>MyRenderListenerSimple</c> render listener.
 /// </summary>
 /// <param name="reader">a PdfReader</param>
 /// <param name="page">the page number of the page that needs to be parsed</param>
 /// <param name="header_height">the height of the top margin; not read by this method</param>
 /// <returns>the listener's <c>Items</c> list</returns>
 public List<MyItem> GetContentItems(PdfReader reader, int page, float header_height)
 {
     var contentParser = new PdfReaderContentParser(reader);
     var itemListener  = new MyRenderListenerSimple();

     contentParser.ProcessContent(page, itemListener);

     return itemListener.Items;
 }
        /// <summary>
        /// Collects the images an <c>ImageRenderListener</c> gathers from every page of an
        /// in-memory PDF.
        /// </summary>
        /// <param name="fileBytes">The PDF file content.</param>
        /// <returns>
        /// Images keyed as "Page_{page}_Image_{index}{value}", where page and index are 4-digit
        /// numbers and value is whatever the listener paired with the image.
        /// </returns>
        private static Dictionary <string, System.Drawing.Image> ExtractImages(byte[] fileBytes)
        {
            var byName = new Dictionary <string, System.Drawing.Image>();

            using (var pdf = new PdfReader(fileBytes))
            {
                var contentParser = new PdfReaderContentParser(pdf);

                for (var pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
                {
                    var pageListener = new ImageRenderListener();
                    contentParser.ProcessContent(pageNumber, pageListener);

                    if (pageListener.Images.Count <= 0)
                    {
                        continue;
                    }

                    // The image index restarts at 1 on every page.
                    var imageNumber = 1;
                    foreach (var entry in pageListener.Images)
                    {
                        var key = string.Format(
                            "Page_{0}_Image_{1}{2}",
                            pageNumber.ToString("D4"),
                            imageNumber.ToString("D4"),
                            entry.Value);

                        byName.Add(key, entry.Key);
                        imageNumber++;
                    }
                }
            }

            return byName;
        }
Example #20
0
        /// <summary>
        /// Parses one PDF page with a <c>TableExtractionStrategy</c> and draws the lines and
        /// points it reports, in black on a white bitmap the size of the page.
        /// </summary>
        /// <param name="page">Number of the page to parse.</param>
        /// <param name="pathToPdf">Path of the PDF file to open.</param>
        /// <returns>The rendered bitmap; the caller owns it.</returns>
        public Bitmap DrawContours(int page, string pathToPdf)
        {
            using (var pdf = new PdfReader(pathToPdf))
            {
                var tableStrategy = new TableExtractionStrategy();
                var contentParser = new PdfReaderContentParser(pdf);
                contentParser.ProcessContent(page, tableStrategy);

                var pageSize = pdf.GetPageSize(page);
                var canvas   = new Bitmap((int)pageSize.Width, (int)pageSize.Height);
                var height   = canvas.Height;

                using (var graphics = Graphics.FromImage(canvas))
                {
                    graphics.Clear(Color.White);

                    // Every y coordinate is subtracted from the bitmap height before drawing.
                    foreach (var line in tableStrategy.GetAllInLines())
                    {
                        graphics.DrawLine(
                            Pens.Black,
                            line.GetStartPoint()[0],
                            height - line.GetStartPoint()[1],
                            line.GetEndPoint()[0],
                            height - line.GetEndPoint()[1]);
                    }

                    // Each point becomes a 4x4 ellipse whose box is offset by 2 in x and y.
                    foreach (var point in tableStrategy.GetAllInPoints())
                    {
                        graphics.DrawEllipse(Pens.Black, point.X - 2, height - point.Y - 2, 4, 4);
                    }
                }

                return canvas;
            }
        }
Example #21
0
        /// <summary>
        /// Copies a PDF while stroking a thin red rectangle around every rectangle reported by
        /// the render listener on each page.
        /// </summary>
        /// <param name="input">Path of the source PDF.</param>
        /// <param name="output">Path of the PDF to create; an existing file is overwritten.</param>
        /// <param name="singleCharacters">
        /// True to use <c>MyCharacterRenderListener</c>, false to use <c>MyRenderListener</c>.
        /// </param>
        private void ParseAndHighlight(String input, String output, bool singleCharacters)
        {
            // using blocks release all three resources even when parsing or stamping throws.
            // They are disposed in reverse order: stamper, then stream, then reader.
            using (PdfReader reader = new PdfReader(input))
            using (FileStream fos = new FileStream(output, FileMode.Create))
            using (PdfStamper stamper = new PdfStamper(reader, fos))
            {
                PdfReaderContentParser parser           = new PdfReaderContentParser(reader);
                MyRenderListener       myRenderListener = singleCharacters ? new MyCharacterRenderListener() : new MyRenderListener();

                for (int pageNum = 1; pageNum <= reader.NumberOfPages; pageNum++)
                {
                    // NOTE(review): the same listener instance is reused for every page —
                    // confirm GetRectangles() does not also return earlier pages' rectangles.
                    List <Rectangle> rectangles = parser.ProcessContent(pageNum, myRenderListener).GetRectangles();
                    PdfContentByte   canvas     = stamper.GetOverContent(pageNum);
                    canvas.SetLineWidth(0.5f);
                    canvas.SetColorStroke(BaseColor.RED);
                    foreach (Rectangle rectangle in rectangles)
                    {
                        canvas.Rectangle(rectangle.Left, rectangle.Bottom, rectangle.Width, rectangle.Height);
                        canvas.Stroke();
                    }
                }
            }
        }
        /// <summary>
        /// Runs an <c>ImageRenderListener</c> over the first <paramref name="quantidadePaginas"/>
        /// pages of a PDF and passes every collected image to <c>ConvertJpgToPng</c>.
        /// </summary>
        /// <param name="reader">Open reader for the PDF. It is disposed by this method, on success and on failure.</param>
        /// <param name="diretorioSaida">Output directory handed to <c>ConvertJpgToPng</c>.</param>
        /// <param name="quantidadePaginas">Number of pages to process, starting at page 1.</param>
        /// <returns>The values returned by <c>ConvertJpgToPng</c>, one per collected image.</returns>
        public static IEnumerable <string> ConvertPDFToImage(PdfReader reader, string diretorioSaida, int quantidadePaginas)
        {
            try
            {
                PdfReaderContentParser parser          = new PdfReaderContentParser(reader);
                ImageRenderListener    listener        = new ImageRenderListener();
                List <string>          listaDeCaminhos = new List <string>();

                for (int i = 1; i <= quantidadePaginas; i++)
                {
                    parser.ProcessContent(i, listener);
                }

                for (int i = 0; i < listener.Images.Count; ++i)
                {
                    listaDeCaminhos.Add(ConvertJpgToPng(i, listener, diretorioSaida));
                }
                return(listaDeCaminhos);
            }
            finally
            {
                // Disposing the reader is the only cleanup needed. GC.SuppressFinalize is meant
                // to be called by a type's own Dispose implementation and frees nothing when
                // called from outside, so those calls were removed.
                reader.Dispose();
            }
        }
Example #23
0
        /// <summary>
        /// Extracts the text of every page of a PDF file.
        /// </summary>
        /// <param name="fileName">The full path to the pdf file.</param>
        /// <param name="success">
        /// Set to <c>true</c> when all pages were read, <c>false</c> when any exception occurred.
        /// </param>
        /// <returns>
        /// The text of all pages concatenated in page order, or an empty string on failure.
        /// </returns>
        internal static String ExtractText(String fileName, out bool success)
        {
            PdfReader reader = null;
            success = false;

            try
            {
                reader = new PdfReader(fileName);
                PdfReaderContentParser parser = new PdfReaderContentParser(reader);

                // A builder avoids re-copying the accumulated text for every page.
                var result = new System.Text.StringBuilder();

                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    SimpleTextExtractionStrategy strategy = parser.ProcessContent(page, new SimpleTextExtractionStrategy());
                    result.Append(strategy.GetResultantText());
                }

                success = true;
                return result.ToString();
            }
            catch (Exception)
            {
                // Deliberate best-effort: failure is reported through the out parameter
                // rather than by throwing.
                return String.Empty;
            }
            finally
            {
                if (reader != null)
                {
                    reader.Close();
                }
            }
        }
Example #24
0
        /// <summary>
        /// Extracts the images of a PDF file by passing every page to an <c>ImageExtractor</c>.
        /// </summary>
        /// <param name="pdfPath">Full path and file name of the PDF file.</param>
        /// <param name="outputFilePrefix">Base name of the exported files. When null, the PDF's file name without extension is used.</param>
        /// <param name="outputFolder">Destination folder handed to the extractor. When null or empty, the PDF's own folder is used.</param>
        /// <param name="overwriteExistingFiles">Passed through to the extractor: true to overwrite existing image files, false to skip them.</param>
        /// <returns>The extractor's <c>_imageCount</c> after all pages have been processed.</returns>
        /// <exception cref="ApplicationException">The PDF reports itself as encrypted.</exception>
        public static int ExtractImagesFromFile(string pdfPath, string outputFilePrefix, string outputFolder, bool overwriteExistingFiles)
        {
            // Fill in defaults derived from the PDF's own location.
            if (outputFilePrefix == null)
            {
                outputFilePrefix = System.IO.Path.GetFileNameWithoutExtension(pdfPath);
            }

            if (String.IsNullOrEmpty(outputFolder))
            {
                outputFolder = System.IO.Path.GetDirectoryName(pdfPath);
            }

            var extractor = new ImageExtractor(outputFilePrefix, outputFolder, overwriteExistingFiles);

            using (var document = new PdfReader(pdfPath)) {
                if (document.IsEncrypted())
                {
                    throw new ApplicationException(pdfPath + " is encrypted.");
                }

                var contentParser = new PdfReaderContentParser(document);

                // The extractor is its own listener and carries the page counter itself.
                for (; extractor._currentPage <= document.NumberOfPages; extractor._currentPage++)
                {
                    contentParser.ProcessContent(extractor._currentPage, extractor);
                }
            }

            return extractor._imageCount;
        }
Example #25
0
        //Todo create one public method that will choose which method to use.  Try Itext, and if fails fall back to pdfium

        /// <summary>
        /// Collects the images an <c>ImageRenderListener</c> gathers from every page of a PDF file
        /// and re-encodes each one as JPEG at quality 100.
        /// </summary>
        /// <param name="filepath">Path of the PDF file to read.</param>
        /// <returns>One JPEG byte array per collected image, in page order.</returns>
        public List <byte[]> CovertPdfToImages(string filepath)
        {
            List <byte[]> images = new List <byte[]>();

            using (var reader = new PdfReader(filepath))
            {
                var parser = new PdfReaderContentParser(reader);

                for (var i = 1; i <= reader.NumberOfPages; i++)
                {
                    // A fresh listener per page, so Images holds only this page's images.
                    var listener = new ImageRenderListener();
                    parser.ProcessContent(i, listener);

                    foreach (var pair in listener.Images)
                    {
                        // Dispose the encoded data and the scratch stream for every image,
                        // including when saving throws. The encoded data was previously
                        // never disposed.
                        using (var data = pair.Key.Encode(SKEncodedImageFormat.Jpeg, 100))
                        using (var ms = new MemoryStream())
                        {
                            data.SaveTo(ms);
                            images.Add(ms.ToArray());
                        }
                    }
                }
            }
            return(images);
        }
        /// <summary>
        /// Checks that two region-filtered strategies attached to a single
        /// <c>MultiFilteredRenderListener</c> each receive only the text inside their own
        /// rectangle when page 1 of test.pdf is parsed through a <c>GlyphRenderListener</c>.
        /// </summary>
        virtual public void TestWithMultiFilteredRenderListener() {
            PdfReader pdfReader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "test.pdf");
            PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader);

            MultiFilteredRenderListener listener = new MultiFilteredRenderListener();

            // The two rectangles share their y coordinates and differ only in x.
            Rectangle firstRegion = new Rectangle(122f, 841.9f - 151, 144f, 841.9f - 163);
            ITextExtractionStrategy region1Listener = listener.AttachRenderListener(
                new LocationTextExtractionStrategy(), new RegionTextRenderFilter(firstRegion));

            Rectangle secondRegion = new Rectangle(156f, 841.9f - 151, 169f, 841.9f - 163);
            ITextExtractionStrategy region2Listener = listener.AttachRenderListener(
                new LocationTextExtractionStrategy(), new RegionTextRenderFilter(secondRegion));

            parser.ProcessContent(1, new GlyphRenderListener(listener));

            Assert.AreEqual("Your", region1Listener.GetResultantText());
            Assert.AreEqual("dju", region2Listener.GetResultantText());
        }
Example #27
0
        /// <summary>
        /// Builds one <c>PDFPage</c> per page of a PDF stream, filling in the images an
        /// <c>ImageRenderListener</c> collected and the page text, and optionally sending the
        /// page to <c>VisionAPIHelper.OCRPage</c>.
        /// </summary>
        /// <param name="pdfStream">Seekable stream holding the PDF; it is rewound to position 0 first.</param>
        /// <param name="log">Receives progress messages and per-page errors.</param>
        /// <param name="ocrImages">When true, pages with 1 to 9 images are passed to the OCR helper.</param>
        /// <returns>
        /// The pages in document order. A page whose image or text extraction failed is still
        /// included; the failure is only logged.
        /// </returns>
        public static List <PDFPage> GetPDFPages(Stream pdfStream, TraceWriter log, bool ocrImages = false)
        {
            var pages = new List <PDFPage>();

            // Ensure that we are at the start
            pdfStream.Position = 0;

            // Note: PdfReader Dispose closes the stream...
            using (PdfReader pdf = new PdfReader(pdfStream))
            {
                var pageCount     = pdf.NumberOfPages;
                var contentParser = new PdfReaderContentParser(pdf);

                // Declared outside the loop and assigned as part of the ProcessContent call.
                ImageRenderListener pageListener = null;

                for (var pageNumber = 1; pageNumber <= pageCount; pageNumber++)
                {
                    var current = new PDFPage();
                    current.Number = pageNumber;

                    try
                    {
                        contentParser.ProcessContent(pageNumber, (pageListener = new ImageRenderListener(log)));
                    }
                    catch (Exception imageFailure)
                    {
                        log.Error(string.Format("Page {0} Image Processing Exception", pageNumber), imageFailure);
                    }

                    if (pageListener.Images.Count > 0)
                    {
                        log.Info(string.Format("Found {0} images on page {1}.", pageListener.Images.Count, pageNumber));
                        current.ExtractedImages = pageListener.Images;

                        // OCR is only attempted for pages with fewer than 10 images.
                        if (ocrImages && pageListener.Images.Count < 10)
                        {
                            log.Info("Calling Vision API to OCR Page Images");
                            VisionAPIHelper.OCRPage(current, log);
                        }
                        else if (ocrImages)
                        {
                            log.Info("Too many Page Images for Vision API");
                        }
                    }

                    try
                    {
                        current.PageText = PdfTextExtractor.GetTextFromPage(pdf, pageNumber, new SimpleTextExtractionStrategy());
                    }
                    catch (System.ArgumentException textFailure)
                    {
                        // Only ArgumentException is handled here; anything else propagates.
                        log.Error(string.Format("Page {0} Text Processing Exception", pageNumber), textFailure);
                    }

                    pages.Add(current);
                }
            }

            return pages;
        }
Example #28
0
        /// <summary>
        /// Runs a <see cref="PDFImageListener"/> over every page of a PDF, updating the
        /// shared progress value after each page, and reports page and image counts.
        /// </summary>
        /// <param name="filename">Path of the PDF to read.</param>
        /// <param name="dirForExtractions">Directory handed to the listener and to the image joiner.</param>
        /// <param name="divider">Total progress amount spread evenly over the pages.</param>
        /// <param name="checkResult">When true, compares the image count against the page count.</param>
        /// <param name="joinImages">When true and the counts differ, merges images with <see cref="ImageJoiner"/>.</param>
        /// <returns>Page count plus image counts before and after the optional merge.</returns>
        public static PdfExtractionResult PDF_ExportImage(string filename, string dirForExtractions, int divider, bool checkResult, bool joinImages)
        {
            var details = new PdfExtractionResult();

            DataAccess.Instance.g_curProgress = 0;
            evnt_UpdateCurBar();

            // Ask itextsharp to extract image
            var pdfReader = new PdfReader(filename);
            try
            {
                var pdfParser = new PdfReaderContentParser(pdfReader);
                var pdfListener = new PDFImageListener(dirForExtractions);

                int pageCount = pdfReader.NumberOfPages;
                double curOneStep = (double)divider / pageCount;

                details.Pages = pageCount;

                for (int i = 1; i <= pageCount; i++)
                {
                    pdfListener.PageIndex = i;
                    // itextsharp send response to listener
                    pdfParser.ProcessContent(i, pdfListener);

                    DataAccess.Instance.g_curProgress += curOneStep;
                    evnt_UpdateCurBar();
                }

                Dictionary<PageImageIndex, Image> imagesList = pdfListener.ImagesList;
                details.ImagesBeforeMerge = pdfListener.ImagesList.Count;
                details.ImagesAfterMerge = details.ImagesBeforeMerge;

                if (checkResult && pageCount != details.ImagesBeforeMerge)
                {
                    if (joinImages)
                    {
                        ImageJoiner cp = new ImageJoiner();
                        imagesList = cp.Merge(pdfListener.ImagesList, dirForExtractions);
                    }

                    details.ImagesAfterMerge = imagesList.Count;

                    // A remaining mismatch between pages and images is tolerated:
                    // the caller can detect it from the counts in the result.
                }
            }
            finally
            {
                // Close the reader even when parsing or merging throws.
                pdfReader.Close();
            }

            // Images are written to disk by the listener itself (memory constraints).
            return details;
        }
 /// <summary>
 /// Runs the content parser over one page of a PDF and returns the items
 /// collected by a <see cref="MyRenderListener"/> (TextItem and ImageItem objects).
 /// </summary>
 /// <param name="reader">a PdfReader</param>
 /// <param name="page">the number of the page to parse</param>
 /// <param name="header_height">the height of the top margin; subtracted from the page's top coordinate before it is passed to the listener</param>
 /// <returns>the list of items gathered by the listener</returns>
 public List<MyItem> GetContentItems(PdfReader reader, int page, float header_height)
 {
     float headerBoundary = reader.GetPageSize(page).Top - header_height;
     MyRenderListener collector = new MyRenderListener(headerBoundary);

     new PdfReaderContentParser(reader).ProcessContent(page, collector);
     return collector.Items;
 }
Example #30
0
        /// <summary>
        /// Parses one page with an <see cref="ImageCheckListener"/> and returns
        /// the listener's <c>Images</c> dictionary.
        /// </summary>
        /// <param name="reader">The reader for the PDF to inspect.</param>
        /// <param name="pageNumber">The page to parse.</param>
        /// <returns>The dictionary populated by the listener while parsing the page.</returns>
        public static Dictionary <string, int> PageContainsImages(PdfReader reader, int pageNumber)
        {
            var imageCheck = new ImageCheckListener();

            new PdfReaderContentParser(reader).ProcessContent(pageNumber, imageCheck);
            return imageCheck.Images;
        }
Example #31
0
        /// <summary>
        /// Parses a page of a PDF file with a <see cref="MyRenderListenerSimple"/>
        /// and returns the items the listener collected.
        /// </summary>
        /// <param name="reader">a PdfReader</param>
        /// <param name="page">the page number of the page that needs to be parsed</param>
        /// <param name="header_height">the height of the top margin; not used by this implementation</param>
        /// <returns>the list of items gathered by the listener</returns>
        public List <MyItem> GetContentItems(PdfReader reader, int page, float header_height)
        {
            MyRenderListenerSimple simpleListener = new MyRenderListenerSimple();

            new PdfReaderContentParser(reader).ProcessContent(page, simpleListener);
            return simpleListener.Items;
        }
        public virtual void TestCharacterRenderInfos() {
            // Builds a one-off PDF containing "ABCD" on a LETTER page rotated twice and
            // feeds its first page through a CharacterPositionRenderListener.
            byte[] pdfBytes = CreateSimplePdf(PageSize.LETTER.Rotate().Rotate(), "ABCD");

            PdfReaderContentParser contentParser = new PdfReaderContentParser(new PdfReader(pdfBytes));
            CharacterPositionRenderListener positionListener = new CharacterPositionRenderListener();
            contentParser.ProcessContent(FIRST_PAGE, positionListener);
        }
Example #33
0
        /// <summary>
        /// Parses a single PDF page and returns the TextItem and ImageItem
        /// objects collected by a <see cref="MyRenderListener"/>.
        /// </summary>
        /// <param name="reader">a PdfReader</param>
        /// <param name="page">the page number of the page that needs to be parsed</param>
        /// <param name="header_height">the height of the top margin, measured down from the top of the page</param>
        /// <returns>a list of TextItem and ImageItem objects</returns>
        public List <MyItem> GetContentItems(PdfReader reader, int page, float header_height)
        {
            PdfReaderContentParser contentParser = new PdfReaderContentParser(reader);

            // The listener receives the page's top coordinate reduced by the header height.
            Rectangle bounds = reader.GetPageSize(page);
            float headerLimit = bounds.Top - header_height;
            MyRenderListener itemListener = new MyRenderListener(headerLimit);

            contentParser.ProcessContent(page, itemListener);

            List <MyItem> items = itemListener.Items;
            return items;
        }
Example #34
0
        /// <summary>
        /// Extracts the images from every page of a PDF stream.
        /// </summary>
        /// <param name="pdf">The PDF stream.</param>
        /// <param name="password">The password used to protect the document; null or empty when the document is not protected.</param>
        /// <returns>Returns a dictionary of images where the key is a suggested name in the format
        /// <c>Page_{page}_Image_{index}{suffix}</c>: page number and image index are zero-padded to
        /// four digits, and the suffix is the string the listener stored with the image
        /// (presumably the file extension; confirm against ImageRenderListener).</returns>
        public Dictionary <string, System.Drawing.Image> ExtractImages(Stream pdf, string password = "")
        {
            iTextSharp.text.pdf.PdfReader pdfReader = null;
            var images = new Dictionary <string, System.Drawing.Image>();

            try
            {
                // Open with or without a password.
                if (String.IsNullOrEmpty(password))
                {
                    pdfReader = new iTextSharp.text.pdf.PdfReader(pdf);
                }
                else
                {
                    byte[] pass = Encoding.Default.GetBytes(password);
                    pdfReader = new iTextSharp.text.pdf.PdfReader(pdf, pass);
                }

                // Create the pdf parser.
                var parser = new PdfReaderContentParser(pdfReader);

                for (var i = 1; i <= pdfReader.NumberOfPages; i++)
                {
                    // A fresh listener per page keeps the image index page-relative.
                    var listener = new ImageRenderListener();
                    parser.ProcessContent(i, listener);
                    var index = 1;

                    foreach (var pair in listener.Images)
                    {
                        // Format items are zero-based and must not contain whitespace before
                        // the index; the previous "{ 1} { 2} { 3}" threw FormatException.
                        images.Add(string.Format("Page_{0}_Image_{1}{2}", i.ToString("D4"), index.ToString("D4"), pair.Value), pair.Key);
                        index++;
                    }
                }
            }
            finally
            {
                if (pdfReader != null)
                {
                    pdfReader.Close();
                }
            }

            // Return the images.
            return(images);
        }
        public virtual void TestCharacterRenderInfos()
        {
            // Generates a small PDF with the text "ABCD" on a LETTER page rotated twice,
            // then runs page 1 through a CharacterPositionRenderListener.
            byte[] generatedPdf = CreateSimplePdf(PageSize.LETTER.Rotate().Rotate(), "ABCD");
            PdfReader generatedReader = new PdfReader(generatedPdf);

            new PdfReaderContentParser(generatedReader)
                .ProcessContent(1, new CharacterPositionRenderListener());
        }
Example #36
0
 /// <summary>Reports whether the given page of a PDF file contains any image.</summary>
 /// <param name="filename">Path of the PDF file to open.</param>
 /// <param name="pageNumber">The page to inspect.</param>
 /// <returns>True if the page contains at least one image; false otherwise.</returns>
 public static bool PageContainsImages(string filename, int pageNumber)
 {
     using (var pdf = new PdfReader(filename))
     {
         var imageCollector = new ImageRenderListener();
         new PdfReaderContentParser(pdf).ProcessContent(pageNumber, imageCollector);
         return imageCollector.Images.Count != 0;
     }
 }
Example #37
0
 /// <summary>Tells whether a page of the given PDF file holds at least one image.</summary>
 /// <param name="filename">Path of the PDF file to open.</param>
 /// <param name="pageNumber">The page to inspect.</param>
 /// <returns>True if the page contains at least one image; false otherwise.</returns>
 public static bool PageContainsImages(string filename, int pageNumber)
 {
     bool hasImages;

     using (var pdfReader = new PdfReader(filename))
     {
         var contentParser = new PdfReaderContentParser(pdfReader);
         var imageListener = new ImageRenderListener();

         contentParser.ProcessContent(pageNumber, imageListener);
         hasImages = imageListener.Images.Count > 0;
     }

     return hasImages;
 }
        virtual public void Test2() {
            // Extracts the text inside one rectangle on page 1 of Sample.pdf,
            // going glyph by glyph, and checks the result.
            PdfReader sampleReader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "Sample.pdf");
            PdfReaderContentParser contentParser = new PdfReaderContentParser(sampleReader);

            Rectangle region = new Rectangle(111, 855, 136, 867);
            FilteredTextRenderListener regionListener = new FilteredTextRenderListener(
                new LocationTextExtractionStrategy(), new RegionTextRenderFilter(region));
            GlyphTextRenderListener glyphListener = new GlyphTextRenderListener(regionListener);

            String extractedText = contentParser.ProcessContent(1, glyphListener).GetResultantText();

            Assert.AreEqual("Your ", extractedText);
        }
Example #39
0
// --------------------------------------------------------------------------- 
    /// <summary>
    /// Writes a zip archive to <paramref name="stream"/> containing the PREFACE PDF and a
    /// RESULT entry with the text of every page, extracted with a
    /// <see cref="LocationTextExtractionStrategy"/>.
    /// </summary>
    /// <param name="stream">The stream that receives the zip archive.</param>
    public void Write(Stream stream) {
      using (ZipFile zip = new ZipFile()) {
        zip.AddFile(PREFACE, "");
        StringBuilder sb = new StringBuilder();
        PdfReader reader = new PdfReader(PREFACE);
        try {
          PdfReaderContentParser parser = new PdfReaderContentParser(reader);
          ITextExtractionStrategy strategy;
          for (int i = 1; i <= reader.NumberOfPages; i++) {
            strategy = parser.ProcessContent(i, new LocationTextExtractionStrategy());
            sb.AppendLine(strategy.GetResultantText());
          }
        }
        finally {
          // Release the reader even if extraction fails part-way through.
          reader.Close();
        }
        zip.AddEntry(RESULT, sb.ToString());
        zip.Save(stream);
      }
    }
        virtual public void Test1() {
            // Extracts the text inside one rectangle on page 1 of test.pdf,
            // going glyph by glyph, and checks the result.
            PdfReader testReader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "test.pdf");
            PdfReaderContentParser contentParser = new PdfReaderContentParser(testReader);

            // The vertical coordinates are written as offsets from 842.
            float left = 203;
            float right = 224;
            float firstY = 842 - 44;
            float secondY = 842 - 93;
            Rectangle region = new Rectangle(left, firstY, right, secondY);

            FilteredTextRenderListener regionListener = new FilteredTextRenderListener(
                new LocationTextExtractionStrategy(), new RegionTextRenderFilter(region));
            GlyphTextRenderListener glyphListener = new GlyphTextRenderListener(regionListener);

            String extractedText = contentParser.ProcessContent(1, glyphListener).GetResultantText();
            Assert.AreEqual("1234\nt5678", extractedText);
        }
Example #41
0
// ===========================================================================
    /// <summary>
    /// Creates the ImageTypes PDF, stores it in a zip archive, then adds every image
    /// collected by a <see cref="MyImageRenderListener"/> from that PDF as a further
    /// entry and saves the archive to <paramref name="stream"/>.
    /// </summary>
    /// <param name="stream">The stream that receives the zip archive.</param>
    public void Write(Stream stream) {
      ImageTypes imageTypes = new ImageTypes();
      using (ZipFile archive = new ZipFile()) {
        byte[] pdfBytes = imageTypes.CreatePdf();
        archive.AddEntry(Utility.ResultFileName(imageTypes.ToString() + ".pdf"), pdfBytes);

        PdfReader pdfReader = new PdfReader(pdfBytes);
        PdfReaderContentParser contentParser = new PdfReaderContentParser(pdfReader);

        // One listener accumulates the images of all pages.
        MyImageRenderListener imageListener = new MyImageRenderListener();
        int page = 1;
        while (page <= pdfReader.NumberOfPages) {
          contentParser.ProcessContent(page, imageListener);
          page++;
        }

        // Image names and image data are parallel lists on the listener.
        for (int n = 0; n < imageListener.MyImages.Count; n++) {
          archive.AddEntry(imageListener.ImageNames[n], imageListener.MyImages[n]);
        }

        archive.Save(stream);
      }
    }
        /// <summary>
        /// Copies <paramref name="input"/> to <paramref name="output"/> and strokes a thin red
        /// rectangle over each rectangle reported by the render listener for every page.
        /// </summary>
        /// <param name="input">Path of the PDF to read.</param>
        /// <param name="output">Path of the PDF to create (overwritten if it exists).</param>
        /// <param name="singleCharacters">When true a <see cref="MyCharacterRenderListener"/> is used, otherwise a <see cref="MyRenderListener"/>.</param>
        private void ParseAndHighlight(String input, String output, bool singleCharacters) {
            PdfReader reader = new PdfReader(input);
            try {
                using (FileStream fos = new FileStream(output, FileMode.Create)) {
                    PdfStamper stamper = new PdfStamper(reader, fos);

                    PdfReaderContentParser parser = new PdfReaderContentParser(reader);
                    MyRenderListener myRenderListener = singleCharacters ? new MyCharacterRenderListener() : new MyRenderListener();
                    for (int pageNum = 1; pageNum <= reader.NumberOfPages; pageNum++) {
                        List<Rectangle> rectangles = parser.ProcessContent(pageNum, myRenderListener).GetRectangles();
                        PdfContentByte canvas = stamper.GetOverContent(pageNum);
                        canvas.SetLineWidth(0.5f);
                        canvas.SetColorStroke(BaseColor.RED);
                        foreach (Rectangle rectangle in rectangles) {
                            canvas.Rectangle(rectangle.Left, rectangle.Bottom, rectangle.Width, rectangle.Height);
                            canvas.Stroke();
                        }
                    }

                    // Finalise the document only on success; on failure the stream and the
                    // reader are still released below, without masking the original exception.
                    stamper.Close();
                }
            }
            finally {
                reader.Close();
            }
        }
Example #43
0
// --------------------------------------------------------------------------- 
    /// <summary>
    /// Writes a zip archive to <paramref name="stream"/> containing the PREFACE PDF and a
    /// RESULT PDF in which, on every page, the rectangle reported by a
    /// <see cref="TextMarginFinder"/> is stroked.
    /// </summary>
    /// <param name="stream">The stream that receives the zip archive.</param>
    public void Write(Stream stream) {
      using (ZipFile zip = new ZipFile()) {
        zip.AddFile(PREFACE, "");
        PdfReader reader = new PdfReader(PREFACE);
        try {
          PdfReaderContentParser parser = new PdfReaderContentParser(reader);
          using (MemoryStream ms = new MemoryStream()) {
            using (PdfStamper stamper = new PdfStamper(reader, ms)) {
              TextMarginFinder finder;
              for (int i = 1; i <= reader.NumberOfPages; i++) {
                finder = parser.ProcessContent(i, new TextMarginFinder());
                PdfContentByte cb = stamper.GetOverContent(i);
                cb.Rectangle(
                  finder.GetLlx(), finder.GetLly(),
                  finder.GetWidth(), finder.GetHeight()
                );
                cb.Stroke();
              }
            }
            // The stamper must be disposed before the bytes are read back.
            zip.AddEntry(RESULT, ms.ToArray());
          }
        }
        finally {
          // Release the reader even if stamping fails part-way through.
          reader.Close();
        }
        zip.Save(stream);
      }
    }
 public void TestInlineImageWithUnsupportedDecodeFilter() {
     // Parses the last page of inlineImages01.pdf with a location-based text strategy;
     // the test has no assertions, so it passes as long as parsing does not throw.
     PdfReader pdfReader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "inlineImages01.pdf");
     PdfReaderContentParser contentParser = new PdfReaderContentParser(pdfReader);
     int lastPage = pdfReader.NumberOfPages;
     contentParser.ProcessContent(lastPage, new LocationTextExtractionStrategy());
 }
Example #45
0
        /// <summary>Extracts all images (of types that iTextSharp knows how to decode) 
        /// from a specified page of a PDF file.</summary>
        /// <param name="filename">Path of the PDF file to open.</param>
        /// <param name="pageNumber">The page to extract images from.</param>
        /// <returns>Returns a generic <see cref="Dictionary&lt;string, System.Drawing.Image&gt;"/>, 
        /// where the key is a suggested file name, in the format: PDF filename without extension, 
        /// page number and image index in the page.</returns>
        public static Dictionary<string, System.Drawing.Image> ExtractImages(string filename, int pageNumber)
        {
            Dictionary<string, System.Drawing.Image> images = new Dictionary<string, System.Drawing.Image>();
            PdfReader reader = new PdfReader(filename);
            try
            {
                PdfReaderContentParser parser = new PdfReaderContentParser(reader);
                ImageRenderListener listener = new ImageRenderListener();

                parser.ProcessContent(pageNumber, listener);
                int index = 1;

                if (listener.Images.Count > 0)
                {
                    Console.WriteLine("Found {0} images on page {1}.", listener.Images.Count, pageNumber);

                    foreach (KeyValuePair<System.Drawing.Image, string> pair in listener.Images)
                    {
                        images.Add(string.Format("{0}_Page_{1}_Image_{2}{3}",
                            System.IO.Path.GetFileNameWithoutExtension(filename), pageNumber.ToString("D4"), index.ToString("D4"), pair.Value), pair.Key);
                        index++;
                    }
                }
            }
            finally
            {
                // The images are already held by the listener, so the reader can be
                // released here, including when parsing throws.
                reader.Close();
            }
            return images;
        }
Example #46
0
        /// <summary>
        /// Runs a <see cref="MyImageRenderListener"/> over every page of the given PDF and
        /// reports progress, as a percentage of pages processed, to the book progress bar.
        /// </summary>
        /// <param name="inputFileObject">The path of the PDF file, passed as an object and read as a string.</param>
        private void ExtractPDFImages(object inputFileObject)
        {
            string inputFile = inputFileObject as string;

            PdfReader reader = new PdfReader(inputFile);
            try
            {
                PdfReaderContentParser parser = new PdfReaderContentParser(reader);
                MyImageRenderListener listener = new MyImageRenderListener(outputFolderLabel.Text, this.statusListView, this);

                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    parser.ProcessContent(i, listener);

                    // Progress is marshalled through Invoke on the progress bar control.
                    int progress = (int)Math.Ceiling((float)i / (float)reader.NumberOfPages * 100f);
                    progressBarBook.Invoke(new SetInt(SetBookProgress), progress);
                }
            }
            finally
            {
                // Release the reader even when a page fails to parse.
                reader.Close();
            }
        }
Example #47
0
 /// <summary>
 /// Extracts the text of one page by running the given extraction strategy
 /// through a <see cref="PdfReaderContentParser"/>.
 /// </summary>
 /// <param name="reader">the reader to extract text from</param>
 /// <param name="pageNumber">the page to extract text from</param>
 /// <param name="strategy">the strategy to use for extracting text</param>
 /// <returns>the text reported by the strategy after the page has been processed</returns>
 /// <remarks>Since 5.0.2.</remarks>
 public static String GetTextFromPage(PdfReader reader, int pageNumber, ITextExtractionStrategy strategy)
 {
     ITextExtractionStrategy processed =
         new PdfReaderContentParser(reader).ProcessContent(pageNumber, strategy);
     return processed.GetResultantText();
 }
        /// <summary>
        /// Collects the images of every page of the loaded PDF and raises
        /// <c>NewImageRead</c> once for each image added.
        /// </summary>
        /// <returns>
        /// A dictionary built from the entries of each page listener's <c>Imagenes</c>
        /// collection, or null when no PDF is loaded (<c>miPDF</c> is null).
        /// </returns>
        public Dictionary<System.Drawing.Image, string> Extraer_Imagenes()
        {
            if (miPDF == null)
            {
                return null;
            }

            Dictionary<System.Drawing.Image, string> listaImagenes = new Dictionary<System.Drawing.Image, string>();
            PdfReaderContentParser chekeadorPDF = new PdfReaderContentParser(miPDF.PdfLeido);
            int totalPaginas = miPDF.PdfLeido.NumberOfPages;

            // The page counter is an int: a byte counter wraps from 255 to 0, so the loop
            // condition could never become false for documents with 255 or more pages.
            for (int pagina = 1; pagina <= totalPaginas; pagina++)
            {
                ImageRenderListener validadorImagenes = new ImageRenderListener();
                chekeadorPDF.ProcessContent(pagina, validadorImagenes);

                foreach (var imagenSeleccionada in validadorImagenes.Imagenes)
                {
                    listaImagenes.Add(imagenSeleccionada.Key, imagenSeleccionada.Value);

                    // Raise the "new image read" event.
                    // NOTE(review): the cast keeps the byte argument the original code passed;
                    // if GetImagesPdfEventArgs accepts an int, pass `pagina` directly so page
                    // numbers above 255 are reported correctly.
                    GetImagesPdfEventArgs NewImageEvent = new GetImagesPdfEventArgs((byte)pagina);
                    NewImageRead(this, NewImageEvent);
                }
            }

            return listaImagenes;
        }
Example #49
0
        /// <summary>
        /// Loads a file into the editor, choosing the loader from the text after the last
        /// '.' in the file name (compared case-insensitively): docx and pdf are converted
        /// to text, rtf is loaded as rich text, doc is rejected with a message box, and
        /// anything else is loaded as plain text. The text and punkt menu items are then
        /// disabled and unchecked.
        /// </summary>
        /// <param name="fileName">Path of the file to open.</param>
        private void openFile(string fileName)
        {
            string[] fileSplit = fileName.Split('.');
            string extension = fileSplit[fileSplit.Length - 1].ToLowerInvariant();

            switch (extension)
            {
                case "docx":
                    DocxToText dtt = new DocxToText(fileName);
                    richTextBoxEditor.Text = dtt.ExtractText();
                    break;

                case "doc":
                    MessageBox.Show("Tyvärr stödjer inte programmet det gamla wordformatet (.doc). Prova med att spara om det till det nya formatet (.docx), eller som en textfil", "Fel filformat", MessageBoxButtons.OK, MessageBoxIcon.Stop);
                    break;

                case "rtf":
                    richTextBoxEditor.LoadFile(fileName, RichTextBoxStreamType.RichText);
                    break;

                case "pdf":
                    richTextBoxEditor.Clear();

                    PdfReader pdfread = new PdfReader(fileName);
                    try
                    {
                        PdfReaderContentParser pdfparser = new PdfReaderContentParser(pdfread);

                        // Build the text once instead of re-assigning the control's Text per page.
                        System.Text.StringBuilder pdfText = new System.Text.StringBuilder();
                        for (int i = 1; i <= pdfread.NumberOfPages; i++)
                        {
                            ITextExtractionStrategy strategy = pdfparser.ProcessContent(i, new SimpleTextExtractionStrategy());
                            pdfText.Append(strategy.GetResultantText());
                        }
                        richTextBoxEditor.Text = pdfText.ToString();
                    }
                    finally
                    {
                        // Release the reader even when extraction fails.
                        pdfread.Close();
                    }
                    break;

                default:
                    // "txt" and every unrecognised extension are loaded as plain text.
                    richTextBoxEditor.LoadFile(fileName, RichTextBoxStreamType.PlainText);
                    break;
            }

            textToolStripMenuItem.Enabled = false;
            punktToolStripMenuItem.Enabled = false;
            textToolStripMenuItem.Checked = false;
            punktToolStripMenuItem.Checked = false;
        }