Exemple #1
0
        /// <summary>
        /// Extracts the text of the PDF at <c>PdfFile</c>, decrypting it with <c>PdfPassword</c>
        /// when the document is encrypted.
        /// </summary>
        /// <returns>The text returned by <c>PDFTextStripper.getText</c> for the whole document.</returns>
        public string PdfToText()
        {
            PDFParser parser = new PDFParser(new BufferedInputStream(new FileInputStream(PdfFile)));

            parser.parse();
            PDDocument originialPdfDoc = parser.getPDDocument();

            try
            {
                if (originialPdfDoc.isEncrypted())
                {
                    originialPdfDoc.openProtection(new StandardDecryptionMaterial(PdfPassword));
                }

                PDFTextStripper stripper = new PDFTextStripper();

                // Exceptions propagate unchanged; no catch-and-rethrow, which would reset the stack trace.
                return stripper.getText(originialPdfDoc);
            }
            finally
            {
                // Release the document whether or not extraction succeeded.
                originialPdfDoc.close();
            }
        }
Exemple #2
0
        /// <summary>
        /// Extracts the text of a range of pages from a PDF document.
        /// </summary>
        /// <param name="file">Path of the PDF document.</param>
        /// <param name="startPage">First page to extract, passed to <c>PDFTextStripper.setStartPage</c>.</param>
        /// <param name="endPage">Last page to extract, passed to <c>PDFTextStripper.setEndPage</c>.</param>
        /// <returns>The extracted text, or an empty string when the document could not be read.</returns>
        public static string ExtractTXT(String file, int startPage, int endPage)
        {
            String content = string.Empty;

            try
            {
                PDDocument document = PDDocument.load(file);
                try
                {
                    PDFTextStripper stripper = new PDFTextStripper();

                    // Emit the text sorted by position on the page.
                    stripper.setSortByPosition(true);
                    stripper.setStartPage(startPage);
                    stripper.setEndPage(endPage);

                    content = stripper.getText(document);
                }
                finally
                {
                    // Release the document even when extraction fails.
                    document.close();
                }
            }
            catch (java.io.IOException ex)
            {
                // java.io.FileNotFoundException derives from java.io.IOException, so this handler covers both.
                // Best effort: report the failure instead of hiding it, then return what was extracted so far.
                System.Diagnostics.Trace.WriteLine("ExtractTXT failed for '" + file + "': " + ex);
            }
            return(content);
        }
        /// <summary>
        /// Copies the metadata of a PDF document into a new <c>PDFInfo</c>.
        /// </summary>
        /// <param name="document">Document whose information entries and page count are read.</param>
        /// <returns>
        /// A <c>PDFInfo</c> filled from the document; the two date fields are only assigned when the
        /// document provides the corresponding date.
        /// </returns>
        public static PDFInfo GetPDFDoucmentInformation(PDDocument document)
        {
            PDFInfo summary = new PDFInfo();
            PDDocumentInformation metadata = document.getDocumentInformation();

            summary.Author = metadata.getAuthor();

            var createdOn = metadata.getCreationDate();
            if (createdOn != null)
            {
                DateTime created = Utilities.Utils.GetDateFromJava(createdOn);
                summary.CreationDate = string.Format("{0} {1}", created.ToLongDateString(), created.ToLongTimeString());
            }

            summary.Creator  = metadata.getCreator();
            summary.Keywords = metadata.getKeywords();

            var modifiedOn = metadata.getModificationDate();
            if (modifiedOn != null)
            {
                DateTime modified = Utilities.Utils.GetDateFromJava(modifiedOn);
                summary.ModificationDate = string.Format("{0} {1}", modified.ToLongDateString(), modified.ToLongTimeString());
            }

            summary.Producer      = metadata.getProducer();
            summary.Subject       = metadata.getSubject();
            summary.Title         = metadata.getTitle();
            summary.Trapped       = metadata.getTrapped();
            summary.NumberOfPages = document.getNumberOfPages();

            return summary;
        }
Exemple #4
0
        /// <summary>
        /// Opens the PDF at <c>PdfFile</c> (decrypting it with <c>PdfPassword</c> when encrypted) and sets
        /// the AcroForm field called "Name" to "name" when such a field exists.
        /// </summary>
        /// <remarks>The modified document is not saved by this method.</remarks>
        /// <returns>Always an empty string.</returns>
        public string PdfFields()
        {
            PDFParser parser = new PDFParser(new BufferedInputStream(new FileInputStream(PdfFile)));

            parser.parse();
            PDDocument originialPdfDoc = parser.getPDDocument();

            try
            {
                if (originialPdfDoc.isEncrypted())
                {
                    originialPdfDoc.openProtection(new StandardDecryptionMaterial(PdfPassword));
                }

                PDDocumentCatalog docCatalog = originialPdfDoc.getDocumentCatalog();
                PDAcroForm        acroForm   = docCatalog.getAcroForm();

                // A document without an interactive form has no AcroForm; there is nothing to fill in then.
                PDField field = acroForm != null ? acroForm.getField("Name") : null;
                if (field != null)
                {
                    field.setValue("name");
                }
            }
            finally
            {
                // Release the document whether or not the field could be set.
                originialPdfDoc.close();
            }

            return String.Empty;
        }
Exemple #5
0
        /// <summary>
        /// Writes <paramref name="text"/> along the page diagonal as a semi-transparent red watermark.
        /// Ported from the Apache PDFBox example AddWatermarkText.java:
        /// https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/util/AddWatermarkText.java?revision=1873147&amp;view=markup
        /// </summary>
        /// <param name="doc">Document that owns the page.</param>
        /// <param name="page">Page the watermark is appended to.</param>
        /// <param name="font">Font used to measure and draw the text.</param>
        /// <param name="text">Watermark text.</param>
        static void addWatermarkText(PDDocument doc, PDPage page, PDFont font, string text)
        {
            using (PDPageContentStream content
                       = new PDPageContentStream(doc, page, PDPageContentStream.AppendMode.APPEND, true, true))
            {
                // Arbitrary size that works for short text.
                float fontSize = 100;

                var   mediaBox   = page.getMediaBox();
                float pageWidth  = mediaBox.getWidth();
                float pageHeight = mediaBox.getHeight();

                float textWidth = font.getStringWidth(text) / 1000 * fontSize;
                float diagonal  = (float)System.Math.Sqrt(pageWidth * pageWidth + pageHeight * pageHeight);
                float rotation  = (float)System.Math.Atan2(pageHeight, pageWidth);

                // Centre the text along the diagonal ("horizontal" in the rotated coordinate system).
                float offsetX = (diagonal - textWidth) / 2;
                // Dividing by 4 was found by trial and error; it lowers the text slightly.
                float offsetY = -fontSize / 4;

                content.transform(Matrix.getRotateInstance(rotation, 0, 0));
                content.setFont(font, fontSize);
                // content.setRenderingMode(RenderingMode.STROKE) // for a "hollow" effect

                PDExtendedGraphicsState graphicsState = new PDExtendedGraphicsState();
                graphicsState.setNonStrokingAlphaConstant(new Float(0.2f));
                graphicsState.setStrokingAlphaConstant(new Float(0.2f));
                graphicsState.setBlendMode(BlendMode.MULTIPLY);
                graphicsState.setLineWidth(new Float(3f));
                content.setGraphicsStateParameters(graphicsState);

                content.setNonStrokingColor(Color.red);
                content.setStrokingColor(Color.red);

                content.beginText();
                content.newLineAtOffset(offsetX, offsetY);
                content.showText(text);
                content.endText();
            }
        }
Exemple #6
0
        /// <summary>
        /// Extracts the text of a PDF file.
        /// </summary>
        /// <param name="PDFFilePath">Path of the PDF file to read.</param>
        /// <returns>The text returned by <c>PDFTextStripper.getText</c> for the whole document.</returns>
        public static String PDFText(String PDFFilePath)
        {
            PDDocument doc = PDDocument.load(PDFFilePath);

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                return stripper.getText(doc);
            }
            finally
            {
                // Release the document whether or not extraction succeeded.
                doc.close();
            }
        }
Exemple #7
0
        /// <summary>
        /// Command-line entry point: stamps a text watermark on every page of a PDF and saves the result.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c>: original PDF file name; <c>args[1]</c>: watermark text;
        /// <c>args[2]</c> (optional): output PDF file name. Without it the result is saved in the directory
        /// of the original file under the original file name prefixed with <c>watermarked_</c>.
        /// </param>
        static void Main(string[] args)
        {
            if (args == null || args.Length < 2)
            {
                System.Console.WriteLine("Usage: " + AppDomain.CurrentDomain.FriendlyName + " <original PDF filename> <watermark text> [new PDF filename]" + Environment.NewLine +
                                         "  For example: " + AppDomain.CurrentDomain.FriendlyName + " myDoc.pdf \"This is a Draft\"");
                return;
            }

            string origName     = args[0];
            string watermarkTxt = args[1];

            if (!System.IO.File.Exists(origName))
            {
                System.Console.WriteLine("Error: cannot find the original PDF file(" + origName + "). Please correct the filename or the path and try again.");
                return;
            }

            // Honour the optional third argument advertised in the usage text. Otherwise prefix only the
            // file name, so an input given with a directory still yields a valid output path.
            string newName = args.Length > 2
                             ? args[2]
                             : System.IO.Path.Combine(System.IO.Path.GetDirectoryName(origName) ?? string.Empty,
                                                      "watermarked_" + System.IO.Path.GetFileName(origName));

            // NOTE: PDDocument.load() takes a java.io.File here, not a System.IO.File.
            PDDocument origDoc = PDDocument.load(new java.io.File(origName));

            try
            {
                PDPageTree allPages = origDoc.getPages();
                PDFont     font     = PDType1Font.HELVETICA_BOLD;

                for (int i = 0, len = allPages.getCount(); i < len; ++i)
                {
                    PDPage pg = (PDPage)allPages.get(i);

                    // Use the text supplied on the command line rather than a fixed string.
                    addWatermarkText(origDoc, pg, font, watermarkTxt);
                }

                origDoc.save(newName);
            }
            finally
            {
                // Release the document even when stamping or saving fails.
                origDoc.close();
            }
        }
        /// <summary>
        /// Extracts the text of every page of a PDF file.
        /// </summary>
        /// <param name="pdfFileName">Path of the PDF file to read.</param>
        /// <returns>A dictionary mapping each page number (starting at 1) to the text extracted for that page.</returns>
        /// <exception cref="FileNotFoundException">The file does not exist.</exception>
        public static Dictionary <int, string> Extract(string pdfFileName)
        {
            if (!File.Exists(pdfFileName))
            {
                // Name the missing file instead of reporting the literal parameter name.
                throw new FileNotFoundException("PDF file not found: " + pdfFileName, pdfFileName);
            }

            var        result      = new Dictionary <int, string>();
            PDDocument pdfDocument = PDDocument.load(pdfFileName);

            try
            {
                var pdfStripper = new PDFTextStripper();

                pdfStripper.setPageSeparator(Environment.NewLine + Environment.NewLine);

                int pageCount = pdfDocument.getNumberOfPages();
                for (int i = 1; i <= pageCount; i++)
                {
                    // Restrict the stripper to a single page so each entry holds exactly one page.
                    pdfStripper.setStartPage(i);
                    pdfStripper.setEndPage(i);

                    result.Add(i, GetText(pdfStripper, pdfDocument));
                }
            }
            finally
            {
                // Release the document even when extraction fails part-way through.
                pdfDocument.close();
            }

            return(result);
        }
Exemple #9
0
 /// <summary>
 /// Loads the given PDF (with the password when one is supplied), checks all of its pages and records
 /// the outcome in <c>IsValid</c>, <c>IsPasswordProtected</c> and <c>ErrorMessage</c>.
 /// </summary>
 /// <param name="fileName">Path of the PDF file to validate.</param>
 /// <param name="password">Password used to open the document; null or empty to open it without one.</param>
 public PDFValidator(string fileName, string password)
 {
     this.IsValid = true;

     try
     {
         if (!System.IO.File.Exists(fileName))
         {
             // A file that does not exist cannot be a valid PDF, so report it as invalid.
             Console.WriteLine("The PDF file does not Exist.");
             this.ErrorMessage = "The PDF file does not Exist.";
             this.IsValid      = false;
             return;
         }

         java.io.File pdfFile    = new java.io.File(fileName);
         PDDocument   pDDocument = string.IsNullOrEmpty(password)
                                   ? PDDocument.load(pdfFile)
                                   : PDDocument.load(pdfFile, password);

         try
         {
             if (pDDocument.isEncrypted())
             {
                 this.IsPasswordProtected = true;
             }
             this.CheckAllPages(pDDocument);
         }
         finally
         {
             // Release the document even when the page check fails.
             pDDocument.close();
         }
     }
     catch (InvalidPasswordException)
     {
         // The document could not be opened with the supplied password.
         this.IsPasswordProtected = true;
         this.IsValid             = false;
     }
     catch (Exception exception)
     {
         this.ErrorMessage = string.Format("PDF analysis failed With exception {0}", exception.Message);
         this.IsValid      = false;
     }
 }
Exemple #10
0
        /// <summary>
        /// Reads the text content of an uploaded PDF.
        /// </summary>
        /// <param name="file">The uploaded PDF file.</param>
        /// <returns>The text extracted from the uploaded document.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="file"/> is null.</exception>
        public string Read(IFormFile file)
        {
            if (file == null)
            {
                throw new System.ArgumentNullException("file");
            }

            // Read the upload itself rather than a fixed path on disk. The bytes are buffered in memory
            // so they can be handed to PDFBox as a java.io.InputStream.
            byte[] pdfBytes;
            using (var buffer = new System.IO.MemoryStream())
            {
                file.CopyTo(buffer);
                pdfBytes = buffer.ToArray();
            }

            PDDocument doc = PDDocument.load(new java.io.ByteArrayInputStream(pdfBytes));

            try
            {
                PDFTextStripper pdfStripper = new PDFTextStripper();
                return pdfStripper.getText(doc);
            }
            finally
            {
                // Release the document whether or not extraction succeeded.
                doc.close();
            }
        }
Exemple #11
0
        /// <summary>
        /// Reads author, title, subject and keywords from a PDF file's document information.
        /// </summary>
        /// <param name="fileName">Path of the PDF file.</param>
        /// <returns>
        /// An <c>Info</c> holding the values that could be read; the fields are left untouched when the
        /// document has no information entry or cannot be read.
        /// </returns>
        public static Info ReadDocInfo(string fileName)
        {
            Info result = new Info();

            try
            {
                PDDocument pDoc = PDDocument.load(fileName);

                try
                {
                    PDDocumentInformation docInfo = pDoc.getDocumentInformation();

                    if (docInfo != null)
                    {
                        result.Author   = docInfo.getAuthor();
                        result.Title    = docInfo.getTitle();
                        result.Summary  = docInfo.getSubject();
                        result.Keywords = docInfo.getKeywords();
                    }
                }
                finally
                {
                    // Release the document even when reading the metadata fails.
                    pDoc.close();
                }
            }
            catch (Exception ex)
            {
                // Best effort: report the failure instead of hiding it, then return what was read so far.
                System.Diagnostics.Trace.WriteLine("ReadDocInfo failed for '" + fileName + "': " + ex);
            }
            return(result);
        }
        /// <summary>
        /// Extracts the ZUGFeRD data of a PDF and formats it as a display message.
        /// </summary>
        /// <param name="filepath">Path of the PDF file.</param>
        /// <param name="message">
        /// Receives the formatted ZUGFeRD data, a notice that no ZUGFeRD data was found, or the error
        /// text when an exception occurred.
        /// </param>
        /// <returns><c>null</c> on success; otherwise the exception that was caught.</returns>
        public Exception ShowZUGFeRD(string filepath, out string message)
        {
            try
            {
                // Load the file once so an unreadable PDF fails here; the document object itself is not
                // used afterwards, so it is released straight away.
                PDDocument doc = PDDocument.load(filepath);
                doc.close();

                // Now check the contents (like MustangReaderTest).
                NE4ZUGFeRDImporter zi = new NE4ZUGFeRDImporter();
                zi.extract(filepath);

                // Read the ZUGFeRD data.
                if (zi.canParse())
                {
                    zi.parse();

                    // Return the ZUGFeRD data as a string.
                    message = string.Format("Menge: {0}\nRechnungsempfänger: {1}\nReferenz: {2}",
                                            zi.getAmount(), zi.getHolder(), zi.getForeignReference());
                }
                else
                {
                    message = "Keine ZUGFeRD Daten gefunden!";
                }

                // Success.
                return(null);
            }
            catch (Exception ex)
            {
                // InnerException is frequently null; fall back to the exception itself so that building
                // the message cannot throw from inside the handler.
                message = (ex.InnerException ?? ex).ToString();
                return(ex);
            }
        }
Exemple #13
0
        /// <summary>
        /// Extracts the text of a PDF file using PDFBox.
        /// </summary>
        /// <param name="input">Path of the PDF file to read.</param>
        /// <returns>The text returned by <c>PDFTextStripper.getText</c> for the whole document.</returns>
        public string parseUsingPDFBox(string input)
        {
            PDDocument doc = PDDocument.load(input);

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                return stripper.getText(doc);
            }
            finally
            {
                // Release the document whether or not extraction succeeded.
                doc.close();
            }
        }
Exemple #14
0
        /// <summary>
        /// Extracts the text of a PDF file.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to read.</param>
        /// <returns>The text returned by <c>PDFTextStripper.getText</c> for the whole document.</returns>
        private string GetTextFromPdfFile(string fileName)
        {
            PDDocument doc = PDDocument.load(fileName);

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                return stripper.getText(doc);
            }
            finally
            {
                // Release the document whether or not extraction succeeded.
                doc.close();
            }
        }
Exemple #15
0
        /// <summary>
        /// Parses a PDF supplied as a byte array.
        /// </summary>
        /// <param name="pdfStream">Content of the PDF file.</param>
        /// <param name="tableContainType">Table containment type, forwarded to the document-based overload.</param>
        /// <returns>The parsed model, or <c>null</c> when loading or parsing throws.</returns>
        public static PDFModel Parser(byte[] pdfStream, TableContainType tableContainType)
        {
            PDDocument reader = null;

            try
            {
                // Open the document from the in-memory bytes.
                InputStream sbs = new ByteArrayInputStream(pdfStream);
                reader = PDDocument.load(sbs);

                return Parser(reader, tableContainType);
            }
            catch (Exception)
            {
                // Any failure to load or parse is reported to the caller as "no result".
                return(null);
            }
            finally
            {
                // Close exactly once, and only when loading got far enough to produce a document;
                // reader is still null when PDDocument.load itself failed.
                if (reader != null)
                {
                    reader.close();
                }
            }
        }
Exemple #16
0
    /// <summary>
    /// Extracts the text of a PDF file, showing a message box instead of throwing when that fails.
    /// </summary>
    /// <param name="PDFFilePath">Path of the PDF file to read.</param>
    /// <returns>The extracted text, or an empty string when loading or extraction failed.</returns>
    public static String PDFText(String PDFFilePath)
    {
        PDDocument doc = null;

        try
        {
            // Load inside the try block so that load failures reach the handlers below as well.
            doc = PDDocument.load(PDFFilePath);

            PDFTextStripper stripper = new PDFTextStripper();
            return stripper.getText(doc);
        }
        catch (UnauthorizedAccessException e)
        {
            MessageBox.Show("Невозможно скопировать текст из Пдф " + PDFFilePath + ". " + e.Message, "Сообщение об ошибке");
            return("");
        }
        catch (FileLoadException FLe)
        {
            MessageBox.Show("Невозможно загрузить Пдф " + PDFFilePath + ". " + FLe.Message, "Сообщение об ошибке");
            return("");
        }
        catch (Exception)
        {
            // Fallback for every other failure: no text was obtained, so report it and return empty.
            MessageBox.Show("Невозможно загрузить Пдф " + PDFFilePath + ". ", "Сообщение об ошибке");
            return("");
        }
        finally
        {
            // doc is still null when loading itself failed.
            if (doc != null)
            {
                doc.close();
            }
        }
    }
Exemple #17
0
 /// <summary>
 /// Splits the source document and saves every resulting part as
 /// "{OutputFileName} [{index}].pdf" inside <c>OutputFilePath</c>. Does nothing when
 /// <c>CheckOutput()</c> returns false.
 /// </summary>
 /// <param name="start">Passed to <c>Splitter.setStartPage</c>.</param>
 /// <param name="end">Passed to <c>Splitter.setEndPage</c>.</param>
 /// <param name="repeatEvery">Passed to <c>Splitter.setSplitAtPage</c>.</param>
 /// <exception cref="PDFToolkitException">Wraps any exception raised while splitting or saving.</exception>
 private void Split(int start, int end, int repeatEvery)
 {
     if (!this.CheckOutput())
     {
         return;
     }

     PDFHelper.DisplayTrialPopupIfNecessary();
     try
     {
         Splitter splitter = new Splitter();
         splitter.setStartPage(start);
         splitter.setEndPage(end);
         splitter.setSplitAtPage(repeatEvery);

         List parts = splitter.split(this.pdfDocument.PDFBoxDocument);
         try
         {
             for (int i = 0; i < parts.size(); i++)
             {
                 PDDocument pDDocument = PDFHelper.AddTrialStampIfNecessary((PDDocument)parts.@get(i));
                 string     str        = string.Format("{0} [{1}].pdf", this.OutputFileName, i);
                 pDDocument.save(Path.Combine(this.OutputFilePath, str));
             }
         }
         finally
         {
             // The documents returned by Splitter.split are separate PDDocument instances that nothing
             // else closes; release them once they have been saved (or saving has failed).
             for (int i = 0; i < parts.size(); i++)
             {
                 ((PDDocument)parts.@get(i)).close();
             }
         }
     }
     catch (Exception exception)
     {
         // Keep the caught exception itself as the inner exception so its type and stack trace survive.
         throw new PDFToolkitException(exception.Message, exception);
     }
 }
Exemple #18
0
        /// <summary>
        /// Extracts the text of a PDF file and checks whether it contains the given text.
        /// Spaces are ignored on both sides, and line breaks are ignored in the PDF text.
        /// </summary>
        /// <param name="pdfFileName">The path of the PDF file.</param>
        /// <param name="textToCheck">The text to be validated.</param>
        /// <returns>
        /// <c>true</c> when the PDF text contains <paramref name="textToCheck"/>; <c>false</c> otherwise,
        /// including when the PDF yields no text at all.
        /// </returns>
        public static bool ExtractAndValidateTextFromPDF(string pdfFileName, string textToCheck)
        {
            string     result;
            PDDocument doc = PDDocument.load(pdfFileName);

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                result = stripper.getText(doc);
            }
            finally
            {
                // Release the document even when extraction fails.
                doc.close();
            }

            if (result.Length == 0)
            {
                return false;
            }

            // The extracted text is already a .NET string; re-encoding it through the ANSI code page and
            // decoding the bytes as UTF-8 would corrupt every non-ASCII character, so compare it directly.
            string outputPDF        = result.Replace(" ", null).Replace("\r", null).Replace("\n", null);
            string validationString = textToCheck.Replace(" ", null);

            return outputPDF.Contains(validationString);
        }
        /// <summary>
        /// Click handler: loads the PDF whose path is in <c>textBox1</c> and shows its text in <c>richTextBox1</c>.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void btnShowPDF_Click(object sender, EventArgs e)
        {
            PDDocument PDF = PDDocument.load(textBox1.Text);

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                richTextBox1.Text = stripper.getText(PDF);
            }
            finally
            {
                // Release the document whether or not extraction succeeded.
                PDF.close();
            }
        }
Exemple #20
0
        /// <summary>
        /// Embeds the file named by <c>attachment.filePath</c> into <paramref name="doc"/> and installs it
        /// as the document's embedded-files name tree.
        /// </summary>
        /// <param name="attachment">Attachment whose <c>filePath</c> identifies the file to embed.</param>
        /// <param name="doc">Document that receives the embedded file.</param>
        /// <returns><c>true</c> when the file was embedded.</returns>
        /// <exception cref="PDFToolkitException">Wraps any exception raised while embedding.</exception>
        internal static bool EmbedPDFAttachment(PDFAttachmentItem attachment, PDDocument doc)
        {
            try
            {
                // File specification carrying the bare file name of the attachment.
                PDComplexFileSpecification fileSpec = new PDComplexFileSpecification();
                fileSpec.setFile(System.IO.Path.GetFileName(attachment.filePath));

                // Embedded stream holding the file's bytes.
                java.io.File source  = new java.io.File(attachment.filePath);
                byte[]       content = Files.readAllBytes(source.toPath());

                PDEmbeddedFile embedded = new PDEmbeddedFile(doc, new ByteArrayInputStream(content));
                embedded.setSize(content.Length);
                embedded.setCreationDate(new GregorianCalendar());
                fileSpec.setEmbeddedFile(embedded);

                // NOTE(review): the entry is always registered under the same literal name,
                // regardless of which file is attached — confirm this is intended.
                PDEmbeddedFilesNameTreeNode leaf = new PDEmbeddedFilesNameTreeNode();
                leaf.setNames(Collections.singletonMap("My first attachment", fileSpec));

                // Root node of the tree with the leaf as its only kid.
                List kids = new ArrayList();
                kids.@add(leaf);
                PDEmbeddedFilesNameTreeNode root = new PDEmbeddedFilesNameTreeNode();
                root.setKids(kids);

                PDDocumentNameDictionary nameDictionary = new PDDocumentNameDictionary(doc.getDocumentCatalog());
                nameDictionary.setEmbeddedFiles(root);
                doc.getDocumentCatalog().setNames(nameDictionary);

                return true;
            }
            catch (Exception failure)
            {
                throw new PDFToolkitException(failure.Message, failure);
            }
        }
Exemple #21
0
        /// <summary>
        /// Reads the keywords stored in the PDF's document information and splits them into candidates.
        /// </summary>
        /// <returns>
        /// The keywords split on ',' and ';', or <c>null</c> when there are no bytes, the document has no
        /// keywords, or reading the document fails.
        /// </returns>
        public override string[] ExtractKeyWordCandidatesFromFile()
        {
            // If we have no bytes then we can't do anything.
            if (Bytes == null || Bytes.Length == 0)
            {
                // Log the problem.
                log.Error("Tried to extract keywords from empty bytes for file " + Name);
                return(null);
            }

            string text = null;

            try
            {
                java.io.ByteArrayInputStream byteStream = new java.io.ByteArrayInputStream(Bytes);
                PDDocument doc = PDDocument.load(byteStream);

                try
                {
                    // TODO Internationalize this conversion
                    text = doc.getDocumentInformation().getKeywords();
                }
                finally
                {
                    // Release the document even when reading the metadata fails.
                    doc.close();
                }
            }
            catch (Exception e)
            {
                log.Warn("Failed to get the keywords from the PDF file " + Name, e);
            }

            if (string.IsNullOrEmpty(text))
            {
                return(null);
            }

            return(text.Split(new char[] { ',', ';' }));
        }
Exemple #22
0
        /// <summary>
        /// Extracts the text of every page of a PDF file.
        /// </summary>
        /// <param name="pdfFileName">Path of the PDF file to read.</param>
        /// <returns>A dictionary mapping each page number (starting at 1) to the text extracted for that page.</returns>
        /// <exception cref="FileNotFoundException">The file does not exist.</exception>
        public static Dictionary <int, string> Extract(string pdfFileName)
        {
            if (!File.Exists(pdfFileName))
            {
                // Name the missing file instead of reporting the literal parameter name.
                throw new FileNotFoundException("PDF file not found: " + pdfFileName, pdfFileName);
            }

            var        result      = new Dictionary <int, string>();
            PDDocument pdfDocument = PDDocument.load(pdfFileName);

            try
            {
                var pdfStripper = new PDFTextStripper();

                pdfStripper.setPageSeparator(Environment.NewLine + Environment.NewLine);

                int pageCount = pdfDocument.getNumberOfPages();
                for (int i = 1; i <= pageCount; i++)
                {
                    // Restrict the stripper to a single page so each entry holds exactly one page.
                    pdfStripper.setStartPage(i);
                    pdfStripper.setEndPage(i);

                    result.Add(i, GetText(pdfStripper, pdfDocument));
                }
            }
            finally
            {
                // Release the document even when extraction fails part-way through.
                pdfDocument.close();
            }

            return(result);
        }
Exemple #23
0
        /// <summary>
        /// Get a thumbnail of the document, if possible. The thumbnail is a JPEG rendering of the first page.
        /// </summary>
        /// <param name="sizeX">The maximum X size of the thumbnail</param>
        /// <param name="sizeY">The maximum Y size of the thumbnail</param>
        /// <param name="forceFullSize">True if the thumbnail should be exactly XxY pixels and False if the thumbnail
        /// should fit inside a XxY box but should maintain its aspect ratio</param>
        /// <returns>A JPEG byte thumbnail or null if the thumbnail can't be generated</returns>
        /// <remarks>
        /// NOTE(review): <paramref name="sizeX"/>, <paramref name="sizeY"/> and <paramref name="forceFullSize"/>
        /// are not applied by this implementation; the image is returned at the size produced by
        /// <c>convertToImage()</c>.
        /// </remarks>
        public override byte[] GetThumbnail(int sizeX, int sizeY, bool forceFullSize)
        {
            // If we have no bytes then we can't do anything.
            if (Bytes == null || Bytes.Length == 0)
            {
                return(null);
            }

            try
            {
                java.io.ByteArrayInputStream byteStream = new java.io.ByteArrayInputStream(Bytes);
                PDDocument doc = PDDocument.load(byteStream);

                try
                {
                    java.util.List pages = doc.getDocumentCatalog().getAllPages();
                    if (pages.size() > 0)
                    {
                        // Take the first page of the loaded document. A freshly constructed PageDrawer
                        // has not been given a page, so it cannot supply one.
                        PDPage page = (PDPage)pages.get(0);
                        java.awt.image.BufferedImage  image = page.convertToImage();
                        java.io.ByteArrayOutputStream os    = new java.io.ByteArrayOutputStream();
                        ImageIO.write(image, "jpg", os);
                        return(os.toByteArray());
                    }
                }
                finally
                {
                    // Release the document whether or not rendering succeeded.
                    doc.close();
                }
            }
            catch (Exception e)
            {
                log.Error("Failed to get the thumbnail from the PDF file " + Name, e);
            }

            return(null);
        }
Exemple #24
0
        /// <summary>
        /// Get text from the binary using PDFBox
        /// </summary>
        /// <returns>The text of the binary or null if we could not process the text</returns>
        public override string GetTextFromDocumentBinary()
        {
            // If we have no bytes then we can't do anything.
            if (Bytes == null || Bytes.Length == 0)
            {
                // Log the problem.
                log.Error("Tried to extract text from empty bytes for file " + Name);
                return(null);
            }

            string text = null;

            try
            {
                java.io.ByteArrayInputStream byteStream = new java.io.ByteArrayInputStream(Bytes);
                PDDocument doc = PDDocument.load(byteStream);

                try
                {
                    PDFTextStripper stripper = new PDFTextStripper();
                    text = stripper.getText(doc);
                }
                finally
                {
                    // Release the document even when extraction fails.
                    doc.close();
                }
            }
            catch (Exception e)
            {
                log.Error("Failed to get the text from the PDF file " + Name, e);
            }

            return(text);
        }
            /// <summary>
            /// Extracts the text of a PDF file.
            /// </summary>
            /// <param name="filepath">Path of the PDF file to read.</param>
            /// <returns>The text returned by <c>PDFTextStripper.getText</c> for the whole document.</returns>
            public string parsePDF(string filepath)
            {
                PDDocument document = PDDocument.load(filepath);

                try
                {
                    PDFTextStripper stripper = new PDFTextStripper();
                    return stripper.getText(document);
                }
                finally
                {
                    // Release the document whether or not extraction succeeded.
                    document.close();
                }
            }
        /// <summary>
        /// Entry point: prints the text of "lopreacamasa.pdf" to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            PDDocument doc = PDDocument.load("lopreacamasa.pdf");

            try
            {
                PDFTextStripper pdfStripper = new PDFTextStripper();
                Console.Write(pdfStripper.getText(doc));
            }
            finally
            {
                // Release the document whether or not extraction succeeded.
                doc.close();
            }
        }
Exemple #27
0
        /// <summary>
        /// Walks every page of the document and records problems in <c>ErrorMessage</c> and <c>IsValid</c>:
        /// a page without a media box, a page without resources, or a page count that differs from the
        /// number of pages actually visited. Exceptions are caught and recorded the same way.
        /// </summary>
        /// <param name="doc">Document whose pages are checked.</param>
        private void CheckAllPages(PDDocument doc)
        {
            // Zero-based index of the page being inspected; after the loop it equals the number of pages visited.
            int pageIndex = 0;

            try
            {
                this.NumberOfPagesDict = doc.getNumberOfPages();

                foreach (PDPage current in doc.getPages())
                {
                    if (current.getMediaBox() == null)
                    {
                        this.IsValid      = false;
                        this.ErrorMessage = string.Format("Page number {0} has no media box", pageIndex);
                    }

                    if (current.getResources() == null)
                    {
                        this.IsValid      = false;
                        this.ErrorMessage = string.Format("Page number {0}, has no page resources", pageIndex);
                    }

                    pageIndex++;
                }

                // The count reported by the document must match the number of pages enumerated above.
                if (pageIndex != this.NumberOfPagesDict)
                {
                    this.IsValid      = false;
                    this.ErrorMessage = "Page Number Mismatch between dictionary and actual document";
                }
            }
            catch (Exception failure)
            {
                this.ErrorMessage = string.Format("PDF analysis failed on page number {0},\nWith exception {1}", pageIndex, failure.Message);
                this.IsValid      = false;
            }
        }
Exemple #28
0
        /// <summary>
        /// Extracts the text of a PDF file and writes it to a UTF-8 text file, overwriting any existing file.
        /// </summary>
        /// <param name="inpufFileName">Path of the PDF file to read.</param>
        /// <param name="outputFileName">Path of the text file to write.</param>
        public void ExtractText(string inpufFileName, string outputFileName)
        {
#if false
            IFilterTextReader.FilterReader reader = new FilterReader(inpufFileName);
            var data = reader.ReadToEnd();
            using (var writer = new StreamWriter(outputFileName, false, System.Text.Encoding.UTF8))
            {
                writer.Write(data);
            }
#else
            string     text;
            PDDocument doc = null;
            try
            {
                doc = PDDocument.load(inpufFileName);
                PDFTextStripper stripper = new PDFTextStripper();
                text = stripper.getText(doc);
            }
            finally
            {
                if (doc != null)
                {
                    doc.close();
                }
            }

            // Open (and thereby truncate) the output file only after extraction has succeeded, so a
            // failed extraction does not wipe an existing output file.
            using (var writer = new StreamWriter(outputFileName, false, System.Text.Encoding.UTF8))
            {
                writer.Write(text);
            }
#endif
        }
Exemple #29
0
        /// <summary>
        /// Loads a PDF from raw bytes and returns the character items of its first page together with
        /// the page's media box size and the document keywords.
        /// </summary>
        /// <param name="bytes">Content of the PDF, handed to LoadPdf.</param>
        /// <returns>
        /// A result carrying the items, the media box height and width (0 when the page has no media box)
        /// and the keywords (may be null when the document has no information dictionary).
        /// </returns>
        /// <remarks>
        /// A PdfNotReadableException is raised when the document has no pages, the first page has no
        /// content stream, or no items are found. Exceptions of type PdfReadException leave the method
        /// untouched; every other exception is wrapped in a new PdfReadException.
        /// NOTE(review): presumably PdfNotReadableException derives from PdfReadException - confirm,
        /// otherwise it is wrapped as well.
        /// </remarks>
        public PdfOcrResult Execute(byte[] bytes)
        {
            PDDocument document = null;

            try
            {
                LoadPdf(bytes, ref document);

                List pages = document.getDocumentCatalog().getAllPages();
                if (pages.size() == 0)
                {
                    throw new PdfNotReadableException("Pdf contains no readable content");
                }

                // Only the first page is analysed.
                var firstPage = (PDPage)pages.get(0);

                PDStream pageContents = firstPage.getContents();
                if (pageContents == null)
                {
                    throw new PdfNotReadableException("Pdf contains no readable content");
                }

                var extractor     = new PdfToCharacters();
                var resources     = firstPage.findResources();
                var contentStream = firstPage.getContents().getStream();
                var items         = extractor.GetItems(firstPage, resources, contentStream);
                if (items.Count == 0)
                {
                    throw new PdfNotReadableException("Pdf contains no readable content");
                }

                // A missing media box yields a 0 x 0 page size instead of a failure.
                var mediaBox   = firstPage.findMediaBox();
                var height     = mediaBox != null ? mediaBox.getHeight() : 0;
                var width      = mediaBox != null ? mediaBox.getWidth() : 0;
                var itemsArray = items.ToArray();

                // Keywords are optional: any failure while reading them is deliberately ignored
                // and the value assigned so far is kept.
                string keywords = "";
                try
                {
                    var information = document.getDocumentInformation();
                    keywords = information == null ? null : information.getKeywords();
                }
                catch (Exception)
                {
                }

                return new PdfOcrResult()
                {
                    Items    = itemsArray,
                    Height   = height,
                    Width    = width,
                    Keywords = keywords
                };
            }
            catch (PdfReadException)
            {
                // Already the exception type callers expect; pass it through unchanged.
                throw;
            }
            catch (Exception e)
            {
                throw new PdfReadException("Pdf could not be loaded. It is not a redable pdf.", e);
            }
            finally
            {
                if (document != null)
                {
                    document.close();
                }
            }
        }
Exemple #30
0
        /// <summary>
        /// Scratch routine: loads a PDF, queries several getters of a freshly created PDFTextStripper
        /// and then extracts the whole document text. All values are discarded.
        /// </summary>
        /// <param name="input">Path of the PDF file to load.</param>
        private static void KamilPdfTest(string input)
        {
            PDDocument pdf = null;

            try
            {
                pdf = PDDocument.load(input);
                var textStripper = new PDFTextStripper();

                // These getters are called before any text has been extracted from the document.
                Matrix textLineMatrix = textStripper.getTextLineMatrix();
                PDPage currentPage    = textStripper.getCurrentPage();
                Matrix textMatrix     = textStripper.getTextMatrix();
                int    totalChars     = textStripper.getTotalCharCnt();

                string articleStart = textStripper.getArticleStart();
                string articleEnd   = textStripper.getArticleEnd();

                // Puts the whole document text into a string - this works.
                string fullText = textStripper.getText(pdf);
                totalChars = fullText.Length;
            }
            finally
            {
                // Always release the document, even when loading or extraction fails.
                if (pdf != null)
                {
                    pdf.close();
                }
            }
        }
        /// <summary>
        /// Extracts the text of a PDF file and saves it as a single paragraph in a new DocX document.
        /// </summary>
        /// <param name="PDFpath">Path of the PDF file to convert.</param>
        /// <returns>
        /// The path of the created document as returned by fn.CreateFolderToSaveDocs, or an empty
        /// string when any step fails (all exceptions are swallowed on purpose).
        /// </returns>
        /// <remarks>
        /// The output location is derived from the fileName member, which is defined outside this method,
        /// not from <paramref name="PDFpath"/>.
        /// </remarks>
        protected internal String ConvertPDFToDoc(string PDFpath)
        {
            try
            {
                String     StringDocx = String.Empty;
                PDDocument PDFdoc     = null;

                try
                {
                    PDFTextStripper textstrip = new PDFTextStripper();

                    PDFdoc     = PDDocument.load(PDFpath);
                    StringDocx = textstrip.getText(PDFdoc);
                }
                finally
                {
                    // Close the PDF even when text extraction throws, so the document is not leaked.
                    if (PDFdoc != null)
                    {
                        PDFdoc.close();
                    }
                }

                // Generate the path where the file will be saved.
                // Disabled alternative: fn.CreateFolderToSaveDocs(fn.GenerateName())
                String DocxPath = fn.CreateFolderToSaveDocs(fileName);

                var wordDoc = DocX.Create(DocxPath);
                wordDoc.InsertParagraph(StringDocx);
                wordDoc.Save();
                // Disabled: Process.Start("winword.exe", DocxPath);
                return DocxPath;
            }
            catch (Exception)
            {
                // Best-effort contract: callers receive an empty string instead of an exception.
                return "";
            }
        }
        /// <summary>
        /// Tries to decrypt a document with each of the given passwords in turn.
        /// </summary>
        /// <param name="doc">Document of type PDDocument.</param>
        /// <param name="passwords">Candidate passwords; must contain at least one entry.</param>
        /// <returns>
        /// The non-null result of the first successful single-password attempt, or null when no
        /// password works.
        /// </returns>
        /// <exception cref="ApplicationException">The password collection is empty.</exception>
        private static PDDocument Decrypt(PDDocument doc, ICollection<string> passwords)
        {
            if (!passwords.Any())
            {
                throw new ApplicationException("PDfUtil :: Decrypt :: supplied empty password collection");
            }

            var attempt = 0;
            foreach (var password in passwords)
            {
                attempt++;

                // Passwords are secrets: log which attempt is running, never the password itself.
                Log.Debug("PdfUtil :: trying to decrypt with password " + attempt + " of " + passwords.Count, typeof(PdfUtil));

                var tmpdoc = Decrypt(doc, password);
                if (tmpdoc != null)
                {
                    return tmpdoc;
                }
            }

            return null;
        }
        /// <summary>
        /// Parses a PDF file and returns the text of the whole document.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to read.</param>
        /// <returns>The text produced by PDFTextStripper for the parsed document.</returns>
        /// <remarks>
        /// The input stream and both document objects are closed on every exit path, including when
        /// parsing or text extraction throws. Exceptions are not caught here and reach the caller.
        /// </remarks>
        public string Parse(string fileName)
        {
            // java.io is used because PDFBox is ported from Java.
            var pdfFile = new FileInputStream(fileName);
            try
            {
                var pdfParser = new PDFParser(pdfFile);

                // The document must be parsed before the COSDocument can be fetched.
                pdfParser.parse();

                // COSDocument is the in-memory representation of the PDF.
                // see https://pdfbox.apache.org/docs/1.8.4/javadocs/org/apache/pdfbox/cos/COSDocument.html
                var cosDocument = pdfParser.getDocument();
                try
                {
                    var pdDocument = new PDDocument(cosDocument);
                    try
                    {
                        // To strip only specific pages, call setStartPage/setEndPage on the stripper
                        // before getText.
                        var pdfTextStripper = new PDFTextStripper();
                        return pdfTextStripper.getText(pdDocument);
                    }
                    finally
                    {
                        // Closes all storage and deletes the temporary files.
                        pdDocument.close();
                    }
                }
                finally
                {
                    cosDocument.close();
                }
            }
            finally
            {
                pdfFile.close();
            }
        }
 /// <summary>
 /// Loads the PDF identified by _pdfDocumentName through PDDocument.load and stores the
 /// resulting document in _pdfDocument.
 /// </summary>
 public void Initialize()
 {
     _pdfDocument = PDDocument.load(_pdfDocumentName);
 }
Exemple #35
0
 /// <summary>
 /// Creates a reader for the given document; the reference is stored in the document member.
 /// </summary>
 /// <param name="document">The PDF document this reader works on.</param>
 private PdfTextReader(PDDocument document)
 {
     this.document = document;
 }
        /// <summary>
        /// Attempts to open the protection of a document with a single password.
        /// </summary>
        /// <param name="doc">Document of type PDDocument.</param>
        /// <param name="password">Password used to build the decryption material.</param>
        /// <returns>The same document instance when decryption succeeds, otherwise null.</returns>
        private static PDDocument Decrypt(PDDocument doc, string password)
        {
            // Created outside the try block: a failure here is not treated as a failed decryption.
            var material = new StandardDecryptionMaterial(password);

            try
            {
                doc.openProtection(material);
            }
            catch (Exception decryptionError)
            {
                Log.Debug("PdfUtil :: Decryption failed", decryptionError);
                return null;
            }

            return doc;
        }
 /// <summary>
 /// Lists the form fields of a PDF document by delegating to GetPDFFormFields with
 /// value retrieval switched off.
 /// </summary>
 /// <param name="document">Document whose form fields are listed.</param>
 /// <returns>The dictionary produced by GetPDFFormFields.</returns>
 public Dictionary<string, PDFField> DiscoverPDFFormFields(PDDocument document)
 {
     const bool includeValues = false;
     return GetPDFFormFields(document, includeValues);
 }
        /// <summary>
        /// Collects the AcroForm fields of a PDF document, keyed by fully qualified field name.
        /// </summary>
        /// <param name="document">Document whose form fields are read.</param>
        /// <param name="includeValues">When true, each field's value is read as well.</param>
        /// <returns>
        /// A dictionary of field descriptions. It is empty when the document has no AcroForm or the
        /// form has no field list. Fields that cannot be read or added are skipped.
        /// </returns>
        public Dictionary<string, PDFField> GetPDFFormFields(PDDocument document, bool includeValues)
        {
            var pdfFormFields = new Dictionary<string, PDFField>();

            PDDocumentCatalog docCat = document.getDocumentCatalog();
            PDAcroForm form = docCat.getAcroForm();
            if (form == null)
            {
                // The document has no interactive form, so there is nothing to list.
                return pdfFormFields;
            }

            var fields = form.getFields();
            if (fields == null)
            {
                return pdfFormFields;
            }

            var iterator = fields.iterator();
            while (iterator.hasNext())
            {
                // Advance outside the best-effort try block so that a failure while handling
                // one field can never leave the iterator stuck on the same element.
                var current = iterator.next();

                try
                {
                    PDField f = (PDField)current;

                    PDFField pdffield = new PDFField();
                    pdffield.Type = f.getFieldType();
                    pdffield.IsRequired = f.isRequired();
                    pdffield.IsReadOnly = f.isReadonly();
                    pdffield.FullName = f.getFullyQualifiedName();
                    pdffield.AlternativeName = f.getAlternateFieldName();
                    pdffield.PartialName = f.getPartialName();

                    string fieldvalue = string.Empty;

                    if (includeValues)
                    {
                        try
                        {
                            fieldvalue = f.getValue();
                        }
                        catch (Exception)
                        {
                            // sig field throws not implemented in ver 1.2.1; keep the empty value.
                        }
                    }

                    // For signature fields the signature's name is reported as the value,
                    // regardless of includeValues.
                    if (pdffield.Type == "Sig")
                    {
                        PDSignatureField sig = (PDSignatureField)f;
                        var signature = sig.getSignature();
                        if (signature != null)
                        {
                            fieldvalue = signature.getName();
                        }
                    }

                    pdffield.FieldValue = fieldvalue;

                    pdfFormFields.Add(pdffield.FullName, pdffield);
                }
                catch (Exception)
                {
                    // Deliberate best effort: a field that cannot be read (or whose name is
                    // already present) is skipped and the remaining fields are still returned.
                }
            }

            return pdfFormFields;
        }