/// <summary>
/// Extracts the text of the PDF referenced by <c>PdfFile</c>, decrypting it with
/// <c>PdfPassword</c> first when the document reports itself as encrypted.
/// </summary>
/// <returns>The text produced by <c>PDFTextStripper.getText</c> for the whole document.</returns>
public string PdfToText()
{
    PDFParser parser = new PDFParser(new BufferedInputStream(new FileInputStream(PdfFile)));
    parser.parse();
    PDDocument originalPdfDoc = parser.getPDDocument();
    try
    {
        if (originalPdfDoc.isEncrypted())
        {
            originalPdfDoc.openProtection(new StandardDecryptionMaterial(PdfPassword));
        }

        PDFTextStripper stripper = new PDFTextStripper();

        // No catch/rethrow here: letting the exception propagate keeps its original stack trace
        // (the previous "throw ex;" reset it).
        return stripper.getText(originalPdfDoc);
    }
    finally
    {
        // The document was previously never closed, leaking its underlying resources.
        originalPdfDoc.close();
    }
}
/// <summary>
/// Extracts the text of a range of pages from a PDF file.
/// </summary>
/// <param name="file">Path of the PDF document.</param>
/// <param name="startPage">First page to extract (passed to <c>setStartPage</c>).</param>
/// <param name="endPage">Last page to extract (passed to <c>setEndPage</c>).</param>
/// <returns>The extracted text, or an empty string when the file cannot be read.</returns>
public static string ExtractTXT(String file, int startPage, int endPage)
{
    String content = string.Empty;
    PDDocument document = null;
    try
    {
        document = PDDocument.load(file);

        // Text stripper that emits the text sorted by position on the page.
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setSortByPosition(true);
        stripper.setStartPage(startPage);
        stripper.setEndPage(endPage);

        content = stripper.getText(document);
    }
    catch (java.io.IOException)
    {
        // Best-effort API: an unreadable or missing file yields the content gathered so far
        // (java.io.FileNotFoundException is an IOException, so one handler covers both).
    }
    finally
    {
        // Close on every path; previously the document leaked when extraction threw.
        if (document != null)
        {
            try
            {
                document.close();
            }
            catch (java.io.IOException)
            {
                // A failure while closing must not discard text that was already extracted.
            }
        }
    }

    return content;
}
/// <summary>
/// Copies the document-information entries and the page count of a PDF into a <c>PDFInfo</c>.
/// </summary>
/// <param name="document">The open PDF document to describe.</param>
/// <returns>
/// A new <c>PDFInfo</c>. The creation and modification dates are only filled in when the
/// document provides them, formatted as long date, a space, then long time.
/// </returns>
public static PDFInfo GetPDFDoucmentInformation(PDDocument document)
{
    PDFInfo summary = new PDFInfo();
    PDDocumentInformation metadata = document.getDocumentInformation();

    summary.Author = metadata.getAuthor();

    var createdOn = metadata.getCreationDate();
    if (createdOn != null)
    {
        DateTime created = Utilities.Utils.GetDateFromJava(createdOn);
        summary.CreationDate = string.Format("{0} {1}", created.ToLongDateString(), created.ToLongTimeString());
    }

    summary.Creator = metadata.getCreator();
    summary.Keywords = metadata.getKeywords();

    var modifiedOn = metadata.getModificationDate();
    if (modifiedOn != null)
    {
        DateTime modified = Utilities.Utils.GetDateFromJava(modifiedOn);
        summary.ModificationDate = string.Format("{0} {1}", modified.ToLongDateString(), modified.ToLongTimeString());
    }

    summary.Producer = metadata.getProducer();
    summary.Subject = metadata.getSubject();
    summary.Title = metadata.getTitle();
    summary.Trapped = metadata.getTrapped();
    summary.NumberOfPages = document.getNumberOfPages();

    return summary;
}
/// <summary>
/// Opens the PDF referenced by <c>PdfFile</c> (decrypting it with <c>PdfPassword</c> when it is
/// encrypted) and, if its AcroForm contains a field called "Name", sets that field to "name".
/// </summary>
/// <returns>
/// Always an empty string. The modified document is not saved by this method, so the change
/// only exists in memory for the duration of the call.
/// </returns>
public string PdfFields()
{
    string pdfText = String.Empty;

    PDFParser parser = new PDFParser(new BufferedInputStream(new FileInputStream(PdfFile)));
    parser.parse();
    PDDocument originalPdfDoc = parser.getPDDocument();
    try
    {
        if (originalPdfDoc.isEncrypted())
        {
            originalPdfDoc.openProtection(new StandardDecryptionMaterial(PdfPassword));
        }

        PDDocumentCatalog docCatalog = originalPdfDoc.getDocumentCatalog();
        PDAcroForm acroForm = docCatalog.getAcroForm();

        // Documents without an interactive form have no AcroForm; previously this dereferenced null.
        if (acroForm != null)
        {
            PDField field = acroForm.getField("Name");
            if (field != null)
            {
                field.setValue("name");
            }
        }
    }
    finally
    {
        // Exceptions propagate unchanged (no "throw ex;", which reset the stack trace),
        // and the document is now closed on every path.
        originalPdfDoc.close();
    }

    return pdfText;
}
/// <summary>
/// Appends a semi-transparent red text watermark to a page, drawn along the page diagonal.
/// Adapted from the PDFBox example at
/// https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/util/AddWatermarkText.java?revision=1873147&view=markup
/// </summary>
/// <param name="doc">Document that owns <paramref name="page"/>.</param>
/// <param name="page">Page whose content stream is appended to.</param>
/// <param name="font">Font used to measure and render the text.</param>
/// <param name="text">Watermark text to draw.</param>
static void addWatermarkText(PDDocument doc, PDPage page, PDFont font, string text)
{
    using (PDPageContentStream content = new PDPageContentStream(doc, page, PDPageContentStream.AppendMode.APPEND, true, true))
    {
        // Arbitrary size that suits short text.
        const float fontSize = 100;

        float pageWidth = page.getMediaBox().getWidth();
        float pageHeight = page.getMediaBox().getHeight();
        float textWidth = font.getStringWidth(text) / 1000 * fontSize;

        // The text runs along the diagonal, so centre it on the diagonal's length.
        float diagonal = (float)System.Math.Sqrt(pageWidth * pageWidth + pageHeight * pageHeight);
        float rotation = (float)System.Math.Atan2(pageHeight, pageWidth);
        float offsetX = (diagonal - textWidth) / 2; // "horizontal" position in the rotated coordinate system
        float offsetY = -fontSize / 4;              // trial-and-error value that lowers the text slightly

        content.transform(Matrix.getRotateInstance(rotation, 0, 0));
        content.setFont(font, fontSize);
        // content.setRenderingMode(RenderingMode.STROKE) would give a "hollow" effect.

        PDExtendedGraphicsState translucent = new PDExtendedGraphicsState();
        translucent.setNonStrokingAlphaConstant(new Float(0.2f));
        translucent.setStrokingAlphaConstant(new Float(0.2f));
        translucent.setBlendMode(BlendMode.MULTIPLY);
        translucent.setLineWidth(new Float(3f));
        content.setGraphicsStateParameters(translucent);

        content.setNonStrokingColor(Color.red);
        content.setStrokingColor(Color.red);

        content.beginText();
        content.newLineAtOffset(offsetX, offsetY);
        content.showText(text);
        content.endText();
    }
}
/// <summary>
/// Extracts the full text of the PDF at the given path.
/// </summary>
/// <param name="PDFFilePath">Path of the PDF file to load.</param>
/// <returns>The text returned by <c>PDFTextStripper.getText</c>.</returns>
public static String PDFText(String PDFFilePath)
{
    PDDocument doc = PDDocument.load(PDFFilePath);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    finally
    {
        // Previously the document was never closed.
        doc.close();
    }
}
/// <summary>
/// Command-line entry point: stamps every page of a PDF with a watermark text and saves the result.
/// </summary>
/// <param name="args">
/// <c>args[0]</c>: original PDF file; <c>args[1]</c>: watermark text; optional <c>args[2]</c>:
/// output file name. Without it the result is written next to the original as
/// <c>watermarked_&lt;original file name&gt;</c>.
/// </param>
static void Main(string[] args)
{
    if (args == null || args.Length < 2)
    {
        System.Console.WriteLine("Usage: " + AppDomain.CurrentDomain.FriendlyName + " <original PDF filename> <watermark text> [new PDF filename]" + Environment.NewLine + " For example: " + AppDomain.CurrentDomain.FriendlyName + " myDoc.pdf \"This is a Draft\"");
        return;
    }

    string origName = args[0];
    string watermarkTxt = args[1];

    if (!System.IO.File.Exists(origName))
    {
        System.Console.WriteLine("Error: cannot find the original PDF file(" + origName + "). Please correct the filename or the path and try again.");
        return;
    }

    // Honour the optional third argument advertised in the usage text. Otherwise prefix only the
    // file-name part, so an input such as "docs/a.pdf" becomes "docs/watermarked_a.pdf" rather
    // than the non-existent "watermarked_docs/a.pdf".
    string newName;
    if (args.Length > 2 && !string.IsNullOrEmpty(args[2]))
    {
        newName = args[2];
    }
    else
    {
        string directory = System.IO.Path.GetDirectoryName(origName) ?? string.Empty;
        newName = System.IO.Path.Combine(directory, "watermarked_" + System.IO.Path.GetFileName(origName));
    }

    // NOTE: PDDocument.load() takes a java.io.File, not a System.IO file.
    PDDocument origDoc = PDDocument.load(new java.io.File(origName));
    try
    {
        PDPageTree allPages = origDoc.getPages();
        PDFont font = PDType1Font.HELVETICA_BOLD;
        for (int i = 0, len = allPages.getCount(); i < len; ++i)
        {
            PDPage pg = (PDPage)allPages.get(i);

            // Use the text supplied on the command line (it was parsed but ignored before).
            addWatermarkText(origDoc, pg, font, watermarkTxt);
        }

        origDoc.save(newName);
    }
    finally
    {
        origDoc.close();
    }
}
/// <summary>
/// Extracts the text of every page of a PDF file, one dictionary entry per page.
/// </summary>
/// <param name="pdfFileName">Path of the PDF file.</param>
/// <returns>A dictionary mapping the 1-based page number to that page's text.</returns>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
public static Dictionary<int, string> Extract(string pdfFileName)
{
    if (!File.Exists(pdfFileName))
    {
        // Report the offending path instead of the literal parameter name.
        throw new FileNotFoundException("PDF file not found: " + pdfFileName, pdfFileName);
    }

    var result = new Dictionary<int, string>();
    PDDocument pdfDocument = PDDocument.load(pdfFileName);
    try
    {
        var pdfStripper = new PDFTextStripper();
        pdfStripper.setPageSeparator(Environment.NewLine + Environment.NewLine);

        int pageCount = pdfDocument.getNumberOfPages();
        for (int i = 1; i <= pageCount; i++)
        {
            // Restrict the stripper to exactly one page per call.
            pdfStripper.setStartPage(i);
            pdfStripper.setEndPage(i);
            result.Add(i, GetText(pdfStripper, pdfDocument));
        }
    }
    finally
    {
        // Close even when extraction of a page fails.
        pdfDocument.close();
    }

    return result;
}
/// <summary>
/// Loads the given PDF (with the password, when one is supplied), records whether it is
/// password protected and runs <c>CheckAllPages</c> on it. The outcome is exposed through
/// <c>IsValid</c>, <c>IsPasswordProtected</c> and <c>ErrorMessage</c>; no exception escapes.
/// </summary>
/// <param name="fileName">Path of the PDF file to validate.</param>
/// <param name="password">Password for the document; may be null or empty.</param>
public PDFValidator(string fileName, string password)
{
    PDDocument document = null;
    try
    {
        this.IsValid = true;

        if (!System.IO.File.Exists(fileName))
        {
            Console.WriteLine("The PDF file does not Exist.");

            // A file that is not there cannot be a valid PDF; previously IsValid stayed true.
            this.ErrorMessage = "The PDF file does not Exist.";
            this.IsValid = false;
            return;
        }

        java.io.File pdfFile = new java.io.File(fileName);
        document = string.IsNullOrEmpty(password)
            ? PDDocument.load(pdfFile)
            : PDDocument.load(pdfFile, password);

        if (document.isEncrypted())
        {
            this.IsPasswordProtected = true;
        }

        this.CheckAllPages(document);
    }
    catch (InvalidPasswordException)
    {
        this.IsPasswordProtected = true;
        this.IsValid = false;
    }
    catch (Exception exception)
    {
        this.ErrorMessage = string.Format("PDF analysis failed With exception {0}", exception.Message);
        this.IsValid = false;
    }
    finally
    {
        // Close on every path. A failure while closing is reported the same way it was when
        // close() lived inside the try block.
        if (document != null)
        {
            try
            {
                document.close();
            }
            catch (Exception closeFailure)
            {
                this.ErrorMessage = string.Format("PDF analysis failed With exception {0}", closeFailure.Message);
                this.IsValid = false;
            }
        }
    }
}
/// <summary>
/// Reads the text of a PDF.
/// </summary>
/// <param name="file">
/// The uploaded PDF. When it is null or empty, the legacy hard-coded document
/// <c>G:/Read.pdf</c> is read instead, which is what this method always did before.
/// </param>
/// <returns>The text extracted from the document.</returns>
public string Read(IFormFile file)
{
    PDDocument doc;
    if (file != null && file.Length > 0)
    {
        // PDFBox needs a java.io.InputStream, so buffer the upload and wrap the bytes.
        using (var buffer = new System.IO.MemoryStream())
        {
            file.CopyTo(buffer);
            doc = PDDocument.load(new java.io.ByteArrayInputStream(buffer.ToArray()));
        }
    }
    else
    {
        // NOTE(review): legacy fallback kept for backward compatibility; confirm whether any
        // caller still relies on it before removing the hard-coded path.
        doc = PDDocument.load(@"G:/Read.pdf");
    }

    try
    {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        return pdfStripper.getText(doc);
    }
    finally
    {
        // Previously the document was never closed.
        doc.close();
    }
}
/// <summary>
/// Reads author, title, subject and keywords from a PDF's document information.
/// </summary>
/// <param name="fileName">Path of the PDF file.</param>
/// <returns>
/// An <c>Info</c> with the values that could be read. When the file cannot be loaded or
/// read, the <c>Info</c> is returned as far as it was populated (best effort, no exception).
/// </returns>
public static Info ReadDocInfo(string fileName)
{
    Info result = new Info();
    PDDocument pDoc = null;
    try
    {
        pDoc = PDDocument.load(fileName);
        PDDocumentInformation docInfo = pDoc.getDocumentInformation();
        if (docInfo != null)
        {
            result.Author = docInfo.getAuthor();
            result.Title = docInfo.getTitle();
            result.Summary = docInfo.getSubject();
            result.Keywords = docInfo.getKeywords();
        }
    }
    catch (Exception)
    {
        // Deliberate best effort: metadata is optional, so failures yield a partial/empty Info.
    }
    finally
    {
        // Previously the document was never closed.
        if (pDoc != null)
        {
            try
            {
                pDoc.close();
            }
            catch (Exception)
            {
                // Keep the never-throws contract of this method.
            }
        }
    }

    return result;
}
/// <summary>
/// Loads a PDF, extracts embedded ZUGFeRD data and formats amount, holder and foreign
/// reference into a message.
/// </summary>
/// <param name="filepath">Path of the PDF file.</param>
/// <param name="message">
/// The formatted ZUGFeRD data, a notice that none was found, or the text of the failure.
/// </param>
/// <returns><c>null</c> on success; otherwise the exception that was caught.</returns>
public Exception ShowZUGFeRD(string filepath, out string message)
{
    PDDocument doc = null;
    try
    {
        // Loading verifies that the file is a readable PDF before the importer runs.
        doc = PDDocument.load(filepath);

        // Now check the contents (like MustangReaderTest).
        NE4ZUGFeRDImporter zi = new NE4ZUGFeRDImporter();
        zi.extract(filepath);

        if (zi.canParse())
        {
            zi.parse();

            // Return the ZUGFeRD data as a string.
            message = string.Format("Menge: {0}\nRechnungsempfänger: {1}\nReferenz: {2}", zi.getAmount(), zi.getHolder(), zi.getForeignReference());
        }
        else
        {
            message = "Keine ZUGFeRD Daten gefunden!";
        }

        return null;
    }
    catch (Exception ex)
    {
        // InnerException is frequently null; dereferencing it used to throw out of this handler.
        message = (ex.InnerException ?? ex).ToString();
        return ex;
    }
    finally
    {
        // Previously the loaded document was never closed.
        if (doc != null)
        {
            try
            {
                doc.close();
            }
            catch (Exception)
            {
                // The result has already been determined; a close failure must not replace it.
            }
        }
    }
}
/// <summary>
/// Extracts the full text of a PDF using PDFBox.
/// </summary>
/// <param name="input">Path of the PDF file to load.</param>
/// <returns>The text returned by <c>PDFTextStripper.getText</c>.</returns>
public string parseUsingPDFBox(string input)
{
    PDDocument doc = PDDocument.load(input);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    finally
    {
        // Previously the document was never closed.
        doc.close();
    }
}
/// <summary>
/// Extracts the full text of the PDF at the given path.
/// </summary>
/// <param name="fileName">Path of the PDF file to load.</param>
/// <returns>The text returned by <c>PDFTextStripper.getText</c>.</returns>
private string GetTextFromPdfFile(string fileName)
{
    PDDocument doc = PDDocument.load(fileName);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    finally
    {
        // Previously the document was never closed.
        doc.close();
    }
}
/// <summary>
/// Parses a PDF supplied as an in-memory byte array.
/// </summary>
/// <param name="pdfStream">The bytes of the PDF.</param>
/// <param name="tableContainType">Table containment style, forwarded to the document-based overload.</param>
/// <returns>The parsed model, or <c>null</c> when loading or parsing fails.</returns>
public static PDFModel Parser(byte[] pdfStream, TableContainType tableContainType)
{
    PDDocument reader = null;
    try
    {
        // Open the document from the byte array.
        InputStream sbs = new ByteArrayInputStream(pdfStream);
        reader = PDDocument.load(sbs);
        return Parser(reader, tableContainType);
    }
    catch (Exception)
    {
        return null;
    }
    finally
    {
        // Single, null-safe close. The old code nulled the reader in the catch block and then
        // dereferenced it here, so every failure ended in a NullReferenceException.
        if (reader != null)
        {
            reader.close();
        }
    }
}
/// <summary>
/// Extracts the full text of a PDF, reporting failures to the user in a message box.
/// </summary>
/// <param name="PDFFilePath">Path of the PDF file to load.</param>
/// <returns>The extracted text, or an empty string when loading or extraction fails.</returns>
public static String PDFText(String PDFFilePath)
{
    PDDocument doc = null;
    try
    {
        // Loading happens inside the try so that load failures are reported like extraction failures.
        doc = PDDocument.load(PDFFilePath);
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    catch (UnauthorizedAccessException e)
    {
        MessageBox.Show("Невозможно скопировать текст из Пдф " + PDFFilePath + ". " + e.Message, "Сообщение об ошибке");
        return "";
    }
    catch (FileLoadException FLe)
    {
        MessageBox.Show("Невозможно загрузить Пдф " + PDFFilePath + ". " + FLe.Message, "Сообщение об ошибке");
        return "";
    }
    catch (Exception)
    {
        // The former filter "when (text == \"\")" could never be true (text started as " "),
        // so this generic handler was dead code; it now handles every remaining failure.
        MessageBox.Show("Невозможно загрузить Пдф " + PDFFilePath + ". ", "Сообщение об ошибке");
        return "";
    }
    finally
    {
        if (doc != null)
        {
            doc.close();
        }
    }
}
/// <summary>
/// Splits the current PDF into parts and saves each part as
/// "<c>OutputFileName</c> [index].pdf" in <c>OutputFilePath</c>. Does nothing when
/// <c>CheckOutput()</c> returns false.
/// </summary>
/// <param name="start">First page to include (passed to <c>Splitter.setStartPage</c>).</param>
/// <param name="end">Last page to include (passed to <c>Splitter.setEndPage</c>).</param>
/// <param name="repeatEvery">Number of pages per part (passed to <c>Splitter.setSplitAtPage</c>).</param>
/// <exception cref="PDFToolkitException">Wraps any failure while splitting or saving.</exception>
private void Split(int start, int end, int repeatEvery)
{
    if (!this.CheckOutput())
    {
        return;
    }

    PDFHelper.DisplayTrialPopupIfNecessary();
    try
    {
        Splitter splitter = new Splitter();
        splitter.setStartPage(start);
        splitter.setEndPage(end);
        splitter.setSplitAtPage(repeatEvery);

        List parts = splitter.split(this.pdfDocument.PDFBoxDocument);
        for (int i = 0; i < parts.size(); i++)
        {
            PDDocument part = PDFHelper.AddTrialStampIfNecessary((PDDocument)parts.@get(i));
            try
            {
                string partFileName = string.Format("{0} [{1}].pdf", this.OutputFileName, i);
                part.save(Path.Combine(this.OutputFilePath, partFileName));
            }
            finally
            {
                // Each part is its own document and must be closed once it has been written.
                part.close();
            }
        }
    }
    catch (Exception exception)
    {
        // Wrap the caught exception itself; passing exception.InnerException dropped the real
        // cause (and its stack trace) from the chain.
        throw new PDFToolkitException(exception.Message, exception);
    }
}
/// <summary>
/// Extracts the text of a PDF file and checks whether it contains the given text.
/// Spaces are ignored on both sides and line breaks are ignored in the PDF text.
/// </summary>
/// <param name="pdfFileName">Path of the PDF file.</param>
/// <param name="textToCheck">The text to look for.</param>
/// <returns><c>true</c> if the text is present; <c>false</c> otherwise or when the PDF has no text.</returns>
public static bool ExtractAndValidateTextFromPDF(string pdfFileName, string textToCheck)
{
    string extracted;
    PDDocument doc = PDDocument.load(pdfFileName);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        extracted = stripper.getText(doc);
    }
    finally
    {
        // Close even when extraction throws.
        doc.close();
    }

    if (extracted.Length == 0)
    {
        return false;
    }

    // The extracted text is already a .NET string. The former round trip through
    // Encoding.Default bytes decoded as UTF-8 corrupted non-ASCII characters whenever the
    // default encoding is not UTF-8, so it has been removed.
    string normalizedPdfText = extracted.Replace(" ", null).Replace("\r", null).Replace("\n", null);
    string normalizedExpected = textToCheck.Replace(" ", null);

    return normalizedPdfText.Contains(normalizedExpected);
}
/// <summary>
/// Click handler: loads the PDF whose path is in <c>textBox1</c> and shows its text in
/// <c>richTextBox1</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnShowPDF_Click(object sender, EventArgs e)
{
    PDDocument pdf = PDDocument.load(textBox1.Text);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        richTextBox1.Text = stripper.getText(pdf);
    }
    finally
    {
        // Previously the document was never closed.
        pdf.close();
    }
}
/// <summary>
/// Embeds the file referenced by <paramref name="attachment"/> into the document's
/// embedded-files name tree under the name "My first attachment".
/// </summary>
/// <param name="attachment">Attachment whose <c>filePath</c> points at the file to embed.</param>
/// <param name="doc">Document that receives the attachment.</param>
/// <returns><c>true</c> when the attachment was embedded.</returns>
/// <exception cref="PDFToolkitException">Wraps any failure while reading or embedding the file.</exception>
internal static bool EmbedPDFAttachment(PDFAttachmentItem attachment, PDDocument doc)
{
    try
    {
        // File specification carrying the display name of the attachment.
        PDComplexFileSpecification fileSpec = new PDComplexFileSpecification();
        fileSpec.setFile(System.IO.Path.GetFileName(attachment.filePath));

        // Read the attachment and wrap its bytes in an embedded-file stream.
        java.io.File sourceFile = new java.io.File(attachment.filePath);
        byte[] content = Files.readAllBytes(sourceFile.toPath());
        PDEmbeddedFile embedded = new PDEmbeddedFile(doc, new ByteArrayInputStream(content));
        embedded.setSize((int)content.Length);
        embedded.setCreationDate(new GregorianCalendar());
        fileSpec.setEmbeddedFile(embedded);

        // Leaf node holding the name -> file specification mapping.
        PDEmbeddedFilesNameTreeNode leaf = new PDEmbeddedFilesNameTreeNode();
        leaf.setNames(Collections.singletonMap("My first attachment", fileSpec));

        // Root node whose only kid is the leaf above.
        List kids = new ArrayList();
        kids.@add(leaf);
        PDEmbeddedFilesNameTreeNode root = new PDEmbeddedFilesNameTreeNode();
        root.setKids(kids);

        // Hook the tree into the document catalog.
        PDDocumentNameDictionary names = new PDDocumentNameDictionary(doc.getDocumentCatalog());
        names.setEmbeddedFiles(root);
        doc.getDocumentCatalog().setNames(names);

        return true;
    }
    catch (Exception failure)
    {
        throw new PDFToolkitException(failure.Message, failure);
    }
}
/// <summary>
/// Reads the keywords entry of the PDF held in <c>Bytes</c> and splits it on ',' and ';'.
/// </summary>
/// <returns>
/// The keyword candidates, or <c>null</c> when there are no bytes, the PDF cannot be read
/// or it has no keywords.
/// </returns>
public override string[] ExtractKeyWordCandidatesFromFile()
{
    // If we have no bytes then we can't do anything.
    if (Bytes == null || Bytes.Length == 0)
    {
        // The message used to say "creation date", a copy/paste slip from another extractor.
        log.Error("Tried to extract keywords from empty bytes for file " + Name);
        return null;
    }

    string text = null;
    PDDocument doc = null;
    try
    {
        java.io.ByteArrayInputStream byteStream = new java.io.ByteArrayInputStream(Bytes);
        doc = PDDocument.load(byteStream);

        // TODO Internationalize this conversion
        text = doc.getDocumentInformation().getKeywords();
    }
    catch (Exception e)
    {
        log.Warn("Failed to get the keywords from the PDF file " + Name, e);
    }
    finally
    {
        // Previously the document was never closed.
        if (doc != null)
        {
            try
            {
                doc.close();
            }
            catch (Exception e)
            {
                log.Warn("Failed to close the PDF file " + Name, e);
            }
        }
    }

    if (string.IsNullOrEmpty(text))
    {
        return null;
    }

    return text.Split(new char[] { ',', ';' });
}
/// <summary>
/// Extracts the text of every page of a PDF file, one dictionary entry per page.
/// </summary>
/// <param name="pdfFileName">Path of the PDF file.</param>
/// <returns>A dictionary mapping the 1-based page number to that page's text.</returns>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
public static Dictionary<int, string> Extract(string pdfFileName)
{
    if (!File.Exists(pdfFileName))
    {
        // Report the offending path instead of the literal parameter name.
        throw new FileNotFoundException("PDF file not found: " + pdfFileName, pdfFileName);
    }

    var result = new Dictionary<int, string>();
    PDDocument pdfDocument = PDDocument.load(pdfFileName);
    try
    {
        var pdfStripper = new PDFTextStripper();
        pdfStripper.setPageSeparator(Environment.NewLine + Environment.NewLine);

        int pageCount = pdfDocument.getNumberOfPages();
        for (int i = 1; i <= pageCount; i++)
        {
            // Restrict the stripper to exactly one page per call.
            pdfStripper.setStartPage(i);
            pdfStripper.setEndPage(i);
            result.Add(i, GetText(pdfStripper, pdfDocument));
        }
    }
    finally
    {
        // Close even when extraction of a page fails.
        pdfDocument.close();
    }

    return result;
}
/// <summary>
/// Get a thumbnail of the document, if possible. The thumbnail is a JPEG rendering of the
/// first page of the PDF held in <c>Bytes</c>.
/// </summary>
/// <param name="sizeX">The maximum X size of the thumbnail.</param>
/// <param name="sizeY">The maximum Y size of the thumbnail.</param>
/// <param name="forceFullSize">True if the thumbnail should be exactly XxY pixels and false if the
/// thumbnail should fit inside an XxY box but should maintain its aspect ratio.</param>
/// <returns>A JPEG byte thumbnail or null if the thumbnail can't be generated.</returns>
public override byte[] GetThumbnail(int sizeX, int sizeY, bool forceFullSize)
{
    // If we have no bytes then we can't do anything.
    if (Bytes == null || Bytes.Length == 0)
    {
        return null;
    }

    // NOTE(review): sizeX, sizeY and forceFullSize are not applied here; the image is returned
    // at the size produced by convertToImage(). Confirm whether callers rescale it.
    PDDocument doc = null;
    try
    {
        java.io.ByteArrayInputStream byteStream = new java.io.ByteArrayInputStream(Bytes);
        doc = PDDocument.load(byteStream);

        java.util.List pages = doc.getDocumentCatalog().getAllPages();
        if (pages.size() > 0)
        {
            // Render the document's own first page. The old code asked a brand-new PageDrawer
            // for its page, which is not a page of this document.
            PDPage page = (PDPage)pages.get(0);
            java.awt.image.BufferedImage image = page.convertToImage();

            java.io.ByteArrayOutputStream os = new java.io.ByteArrayOutputStream();
            ImageIO.write(image, "jpg", os);
            return os.toByteArray();
        }
    }
    catch (Exception e)
    {
        log.Error("Failed to get the thumbnail from the PDF file " + Name, e);
    }
    finally
    {
        // Previously the document was never closed.
        if (doc != null)
        {
            try
            {
                doc.close();
            }
            catch (Exception e)
            {
                log.Error("Failed to close the PDF file " + Name, e);
            }
        }
    }

    return null;
}
/// <summary>
/// Get text from the binary in <c>Bytes</c> using PDFBox.
/// </summary>
/// <returns>The text of the binary or null if we could not process the text.</returns>
public override string GetTextFromDocumentBinary()
{
    // If we have no bytes then we can't do anything.
    if (Bytes == null || Bytes.Length == 0)
    {
        log.Error("Tried to extract text from empty bytes for file " + Name);
        return null;
    }

    string text = null;
    PDDocument doc = null;
    try
    {
        java.io.ByteArrayInputStream byteStream = new java.io.ByteArrayInputStream(Bytes);
        doc = PDDocument.load(byteStream);
        PDFTextStripper stripper = new PDFTextStripper();
        text = stripper.getText(doc);
    }
    catch (Exception e)
    {
        log.Error("Failed to get the text from the PDF file " + Name, e);
    }
    finally
    {
        // Previously the document was never closed.
        if (doc != null)
        {
            try
            {
                doc.close();
            }
            catch (Exception e)
            {
                log.Error("Failed to close the PDF file " + Name, e);
            }
        }
    }

    return text;
}
/// <summary>
/// Extracts the full text of the PDF at the given path.
/// </summary>
/// <param name="filepath">Path of the PDF file to load.</param>
/// <returns>The text returned by <c>PDFTextStripper.getText</c>.</returns>
public string parsePDF(string filepath)
{
    PDDocument document = PDDocument.load(filepath);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(document);
    }
    finally
    {
        // Previously the document was never closed.
        document.close();
    }
}
/// <summary>
/// Entry point: writes the text of "lopreacamasa.pdf" to the console.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    PDDocument doc = PDDocument.load("lopreacamasa.pdf");
    try
    {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        Console.Write(pdfStripper.getText(doc));
    }
    finally
    {
        // Previously the document was never closed.
        doc.close();
    }
}
/// <summary>
/// Walks every page of the document and records problems in <c>ErrorMessage</c> and
/// <c>IsValid</c>: a page without media box, a page without resources, or a page count that
/// differs from the one reported by the document. Exceptions are caught and reported the same way.
/// </summary>
/// <param name="doc">The open document to inspect.</param>
private void CheckAllPages(PDDocument doc)
{
    // Number of pages fully inspected so far; also used as the page number in messages.
    int pagesVisited = 0;
    try
    {
        this.NumberOfPagesDict = doc.getNumberOfPages();

        foreach (PDPage current in doc.getPages())
        {
            bool mediaBoxMissing = current.getMediaBox() == null;
            if (mediaBoxMissing)
            {
                this.ErrorMessage = string.Format("Page number {0} has no media box", pagesVisited);
                this.IsValid = false;
            }

            bool resourcesMissing = current.getResources() == null;
            if (resourcesMissing)
            {
                this.ErrorMessage = string.Format("Page number {0}, has no page resources", pagesVisited);
                this.IsValid = false;
            }

            pagesVisited += 1;
        }

        if (pagesVisited != this.NumberOfPagesDict)
        {
            this.ErrorMessage = "Page Number Mismatch between dictionary and actual document";
            this.IsValid = false;
        }
    }
    catch (Exception failure)
    {
        this.ErrorMessage = string.Format("PDF analysis failed on page number {0},\nWith exception {1}", pagesVisited, failure.Message);
        this.IsValid = false;
    }
}
/// <summary>
/// Extracts the text of a PDF file and writes it to a UTF-8 text file, overwriting any
/// existing file.
/// </summary>
/// <param name="inpufFileName">Path of the PDF file to read.</param>
/// <param name="outputFileName">Path of the text file to write.</param>
public void ExtractText(string inpufFileName, string outputFileName)
{
#if false
    IFilterTextReader.FilterReader reader = new FilterReader(inpufFileName);
    var data = reader.ReadToEnd();
    using (var writer = new StreamWriter(outputFileName, false, System.Text.Encoding.UTF8))
    {
        writer.Write(data);
    }
#else
    PDDocument pdf = null;
    try
    {
        pdf = PDDocument.load(inpufFileName);
        var textStripper = new PDFTextStripper();

        // The output file is opened before the text is extracted, as in the original flow.
        using (var output = new StreamWriter(outputFileName, false, System.Text.Encoding.UTF8))
        {
            string extracted = textStripper.getText(pdf);
            output.Write(extracted);
        }
    }
    finally
    {
        if (pdf != null)
        {
            pdf.close();
        }
    }
#endif
}
/// <summary>
/// Loads a PDF from bytes and extracts the character items of its first page together with
/// the page size and the document keywords.
/// </summary>
/// <param name="bytes">The bytes of the PDF.</param>
/// <returns>Items, height, width and keywords of the first page.</returns>
/// <exception cref="PdfNotReadableException">
/// The PDF has no pages, the first page has no content stream, or no items were found.
/// </exception>
/// <exception cref="PdfReadException">Any other failure while loading or reading the PDF.</exception>
public PdfOcrResult Execute(byte[] bytes)
{
    PDDocument document = null;
    try
    {
        LoadPdf(bytes, ref document);

        List pageList = document.getDocumentCatalog().getAllPages();
        if (pageList.size() == 0)
        {
            throw new PdfNotReadableException("Pdf contains no readable content");
        }

        // Only the first page is processed.
        PDPage firstPage = (PDPage)pageList.get(0);
        PDStream contentStream = firstPage.getContents();
        if (contentStream == null)
        {
            throw new PdfNotReadableException("Pdf contains no readable content");
        }

        var pageResources = firstPage.findResources();
        var rawStream = firstPage.getContents().getStream();
        var items = new PdfToCharacters().GetItems(firstPage, pageResources, rawStream);
        if (items.Count == 0)
        {
            throw new PdfNotReadableException("Pdf contains no readable content");
        }

        var pageBox = firstPage.findMediaBox();
        var height = pageBox?.getHeight() ?? 0;
        var width = pageBox?.getWidth() ?? 0;
        var itemsArray = items.ToArray();

        var keywords = "";
        try
        {
            keywords = document.getDocumentInformation()?.getKeywords();
        }
        catch (Exception)
        {
            // We do not know whether PDFBox can fail here (no keywords, or something else).
            // Keywords are optional, so any failure is ignored.
        }

        var outcome = new PdfOcrResult()
        {
            Items = itemsArray,
            Height = height,
            Width = width,
            Keywords = keywords
        };
        return outcome;
    }
    catch (PdfReadException)
    {
        // Already the right exception type; let it through untouched.
        throw;
    }
    catch (Exception e)
    {
        throw new PdfReadException("Pdf could not be loaded. It is not a redable pdf.", e);
    }
    finally
    {
        document?.close();
    }
}
/// <summary>
/// Scratch routine that loads a PDF, queries several <c>PDFTextStripper</c> properties and
/// extracts the whole text. All results are discarded; the document is always closed.
/// </summary>
/// <param name="input">Path of the PDF file to load.</param>
private static void KamilPdfTest(string input)
{
    PDDocument pdfDocument = null;
    try
    {
        pdfDocument = PDDocument.load(input);
        var textStripper = new PDFTextStripper();

        // textStripper.getText(pdfDocument);
        Matrix lineMatrix = textStripper.getTextLineMatrix();
        // int page_nr = textStripper.getCurrentPageNo();
        PDPage currentPage = textStripper.getCurrentPage();
        Matrix textMatrix = textStripper.getTextMatrix();
        int characterCount = textStripper.getTotalCharCnt();
        string articleStart = textStripper.getArticleStart();
        string articleEnd = textStripper.getArticleEnd();

        // Puts the whole text into a string - works.
        string fullText = textStripper.getText(pdfDocument);
        characterCount = fullText.Length;
    }
    finally
    {
        if (pdfDocument != null)
        {
            pdfDocument.close();
        }
    }
}
/// <summary>
/// Extracts the text of a PDF and writes it as a single paragraph into a new Word document.
/// </summary>
/// <param name="PDFpath">Path of the PDF file to convert.</param>
/// <returns>The path of the created document, or an empty string when the conversion fails.</returns>
protected internal String ConvertPDFToDoc(string PDFpath)
{
    try
    {
        String extractedText;
        PDDocument pdfDoc = PDDocument.load(PDFpath);
        try
        {
            PDFTextStripper textStripper = new PDFTextStripper();
            extractedText = textStripper.getText(pdfDoc);
        }
        finally
        {
            // Close the PDF even when extraction throws.
            pdfDoc.close();
        }

        // Generate the path where the document is saved.
        String docxPath = fn.CreateFolderToSaveDocs(fileName);

        // Dispose the Word document once it has been saved.
        using (var wordDoc = DocX.Create(docxPath))
        {
            wordDoc.InsertParagraph(extractedText);
            wordDoc.Save();
        }

        return docxPath;
    }
    catch (Exception)
    {
        // Callers receive "" as the failure signal; this contract is kept unchanged.
        return "";
    }
}
/// <summary>
/// Tries to decrypt a document with the given passwords, in order, stopping at the first
/// one that works.
/// </summary>
/// <param name="doc">Document of type PDDocument.</param>
/// <param name="passwords">Candidate passwords.</param>
/// <returns>Decrypted document (PDDocument) or null if decryption fails with every password.</returns>
/// <exception cref="ApplicationException">The password collection is empty.</exception>
private static PDDocument Decrypt(PDDocument doc, ICollection<string> passwords)
{
    if (!passwords.Any())
    {
        throw new ApplicationException("PDfUtil :: Decrypt :: supplied empty password collection");
    }

    int attempt = 0;
    foreach (var password in passwords)
    {
        attempt++;

        // Log only the position of the candidate: passwords must never be written to logs.
        Log.Debug("PdfUtil :: trying to decrypt with password candidate " + attempt + " of " + passwords.Count, typeof(PdfUtil));

        var decrypted = Decrypt(doc, password);
        if (decrypted != null)
        {
            return decrypted;
        }
    }

    return null;
}
/// <summary>
/// Parses a PDF file with <c>PDFParser</c> and returns its text.
/// </summary>
/// <param name="fileName">Path of the PDF file.</param>
/// <returns>The text of the whole document.</returns>
public string Parse(string fileName)
{
    // Load the file via java.io because PDFBox is ported from Java.
    var pdfFile = new FileInputStream(fileName);
    try
    {
        var pdfParser = new PDFParser(pdfFile);

        // Parse the document so that the COSDocument becomes available.
        pdfParser.parse();

        // COSDocument is the in-memory representation of the PDF, see
        // https://pdfbox.apache.org/docs/1.8.4/javadocs/org/apache/pdfbox/cos/COSDocument.html
        var cosDocument = pdfParser.getDocument();
        try
        {
            var pdDocument = new PDDocument(cosDocument);
            try
            {
                var pdfTextStripper = new PDFTextStripper();

                // To strip only specific pages, call setStartPage/setEndPage on the stripper here.
                return pdfTextStripper.getText(pdDocument);
            }
            finally
            {
                // Closes all storage and deletes the temporary files.
                pdDocument.close();
            }
        }
        finally
        {
            cosDocument.close();
        }
    }
    finally
    {
        // Everything is now released even when parsing or extraction throws.
        pdfFile.close();
    }
}
/// <summary>
/// Loads the PDF named by <c>_pdfDocumentName</c> and stores it in <c>_pdfDocument</c>.
/// </summary>
public void Initialize()
{
    var loadedDocument = PDDocument.load(_pdfDocumentName);
    _pdfDocument = loadedDocument;
}
/// <summary>
/// Creates a reader over an already opened PDF document.
/// </summary>
/// <param name="document">The document stored in the <c>document</c> field.</param>
private PdfTextReader(PDDocument document)
{
    this.document = document;
}
/// <summary>
/// Tries to decrypt a document with the given password.
/// </summary>
/// <param name="doc">Document of type PDDocument.</param>
/// <param name="password">Password of type string.</param>
/// <returns>
/// The same document instance once <c>openProtection</c> succeeded, or null if decryption
/// fails (the failure is logged at debug level).
/// </returns>
private static PDDocument Decrypt(PDDocument doc, string password)
{
    var material = new StandardDecryptionMaterial(password);
    try
    {
        doc.openProtection(material);
    }
    catch (Exception failure)
    {
        Log.Debug("PdfUtil :: Decryption failed", failure);
        return null;
    }

    return doc;
}
/// <summary>
/// Lists the form fields of a PDF without reading their values.
/// </summary>
/// <param name="document">The open PDF document.</param>
/// <returns>The result of <c>GetPDFFormFields</c> called with <c>includeValues</c> set to false.</returns>
public Dictionary<string, PDFField> DiscoverPDFFormFields(PDDocument document)
{
    return GetPDFFormFields(document, includeValues: false);
}
/// <summary>
/// Collects the AcroForm fields of a PDF, keyed by fully qualified field name.
/// </summary>
/// <param name="document">The open PDF document.</param>
/// <param name="includeValues">
/// When true, each field's value is read as well. For signature fields the signer name is
/// used as the value regardless of this flag.
/// </param>
/// <returns>
/// The discovered fields; empty when the document has no form. Fields that cannot be read
/// (or whose name is already present) are skipped.
/// </returns>
public Dictionary<string, PDFField> GetPDFFormFields(PDDocument document, bool includeValues)
{
    Dictionary<string, PDFField> pdfFormFields = new Dictionary<string, PDFField>();

    PDDocumentCatalog docCat = document.getDocumentCatalog();
    PDAcroForm form = docCat.getAcroForm();
    if (form == null)
    {
        // Documents without an interactive form simply have no fields.
        return pdfFormFields;
    }

    var fields = form.getFields();
    if (fields == null)
    {
        return pdfFormFields;
    }

    var iterator = fields.iterator();
    while (iterator.hasNext())
    {
        try
        {
            PDFField pdffield = new PDFField();
            PDField f = (PDField)iterator.next();

            pdffield.Type = f.getFieldType();
            pdffield.IsRequired = f.isRequired();
            pdffield.IsReadOnly = f.isReadonly();
            pdffield.FullName = f.getFullyQualifiedName();
            pdffield.AlternativeName = f.getAlternateFieldName();
            pdffield.PartialName = f.getPartialName();

            string fieldvalue = string.Empty;
            if (includeValues)
            {
                try
                {
                    fieldvalue = f.getValue();
                }
                catch (Exception)
                {
                    // Signature fields throw "not implemented" in PDFBox 1.2.1; keep the empty value.
                }
            }

            if (pdffield.Type == "Sig")
            {
                PDSignatureField sig = (PDSignatureField)f;
                var signature = sig.getSignature();
                if (signature != null)
                {
                    fieldvalue = signature.getName();
                }
            }

            pdffield.FieldValue = fieldvalue;
            pdfFormFields.Add(pdffield.FullName, pdffield);
        }
        catch (Exception)
        {
            // Best effort: a field that cannot be read must not abort the whole listing.
        }
    }

    return pdfFormFields;
}