/// <summary>
/// Loads a PDF from raw bytes and extracts the character items of its first page, together with
/// the page's media-box dimensions and the document keywords.
/// </summary>
/// <param name="bytes">Raw PDF content; handed to <c>LoadPdf</c> unchanged.</param>
/// <returns>
/// A <see cref="PdfOcrResult"/> with the first page's items, the media-box height and width
/// (0 when the page has no media box) and the document keywords (may be null).
/// </returns>
/// <exception cref="PdfNotReadableException">
/// Raised when the document has no pages, the first page has no content stream, or no items were
/// extracted. NOTE(review): it only reaches the caller unchanged if it derives from
/// <see cref="PdfReadException"/>; otherwise it is wrapped like any other failure - confirm.
/// </exception>
/// <exception cref="PdfReadException">
/// Any other failure, with the original exception attached as the inner exception.
/// </exception>
public PdfOcrResult Execute(byte[] bytes)
{
    PDDocument pdf = null;
    try
    {
        LoadPdf(bytes, ref pdf);

        List pages = pdf.getDocumentCatalog().getAllPages();
        if (pages.size() == 0)
        {
            throw new PdfNotReadableException("Pdf contains no readable content");
        }

        // Only the first page is ever processed.
        PDPage firstPage = (PDPage)pages.get(0);
        PDStream pageContents = firstPage.getContents();
        if (pageContents == null)
        {
            throw new PdfNotReadableException("Pdf contains no readable content");
        }

        var extracted = new PdfToCharacters().GetItems(
            firstPage,
            firstPage.findResources(),
            firstPage.getContents().getStream());
        if (extracted.Count == 0)
        {
            throw new PdfNotReadableException("Pdf contains no readable content");
        }

        // A page without a media box is reported as 0 x 0.
        var box = firstPage.findMediaBox();
        var pageHeight = box == null ? 0 : box.getHeight();
        var pageWidth = box == null ? 0 : box.getWidth();
        var itemArray = extracted.ToArray();

        // Keywords are a nice-to-have: any failure while reading them is deliberately ignored
        // and the empty default is kept.
        string keywordText = "";
        try
        {
            var docInfo = pdf.getDocumentInformation();
            keywordText = docInfo == null ? null : docInfo.getKeywords();
        }
        catch (Exception)
        {
        }

        return new PdfOcrResult()
        {
            Items = itemArray,
            Height = pageHeight,
            Width = pageWidth,
            Keywords = keywordText
        };
    }
    catch (Exception e) when (!(e is PdfReadException))
    {
        // PdfReadException (and subclasses) pass through untouched; everything else is wrapped.
        throw new PdfReadException("Pdf could not be loaded. It is not a redable pdf.", e);
    }
    finally
    {
        if (pdf != null)
        {
            pdf.close();
        }
    }
}
/// <summary>
/// Reads author, title, subject and keywords from the document information of a PDF file.
/// </summary>
/// <param name="fileName">Path of the PDF file, passed to <c>PDDocument.load</c>.</param>
/// <returns>
/// An <see cref="Info"/> populated from the document information. When the file cannot be read,
/// or it has no document information, the fields that were not reached keep their defaults.
/// This method never throws.
/// </returns>
public static Info ReadDocInfo(string fileName)
{
    Info result = new Info();
    PDDocument pDoc = null;
    try
    {
        pDoc = PDDocument.load(fileName);
        PDDocumentInformation docInfo = pDoc.getDocumentInformation();
        if (docInfo != null)
        {
            result.Author = docInfo.getAuthor();
            result.Title = docInfo.getTitle();
            result.Summary = docInfo.getSubject();
            result.Keywords = docInfo.getKeywords();
        }
    }
    catch (Exception ex)
    {
        // Best effort by design: an unreadable PDF yields a (partially) empty Info, not an error.
        System.Diagnostics.Debug.WriteLine("ReadDocInfo failed for " + fileName + ": " + ex.Message);
    }
    finally
    {
        // The original never closed the document, leaking the underlying file handle.
        if (pDoc != null)
        {
            try
            {
                pDoc.close();
            }
            catch (Exception ex)
            {
                // Keep the never-throws contract even if closing fails.
                System.Diagnostics.Debug.WriteLine("ReadDocInfo could not close " + fileName + ": " + ex.Message);
            }
        }
    }

    return result;
}
/// <summary>
/// Extracts keyword candidates from the keywords entry of this PDF's document information.
/// </summary>
/// <returns>
/// The keywords entry split on ',' and ';', or null when <c>Bytes</c> is null or empty, the PDF
/// cannot be read, or the keywords entry is null or empty.
/// </returns>
public override string[] ExtractKeyWordCandidatesFromFile()
{
    // If we have no bytes then we can't do anything.
    if (Bytes == null || Bytes.Length == 0)
    {
        // Log the problem. (The message previously said "creation date", copied from another extractor.)
        log.Error("Tried to extract keywords from empty bytes for file " + Name);
        return null;
    }

    string text = null;
    PDDocument doc = null;
    try
    {
        java.io.ByteArrayInputStream byteStream = new java.io.ByteArrayInputStream(Bytes);
        doc = PDDocument.load(byteStream);

        // TODO Internationalize this conversion
        text = doc.getDocumentInformation().getKeywords();
    }
    catch (Exception e)
    {
        log.Warn("Failed to get the keywords from the PDF file " + Name, e);
    }
    finally
    {
        // The original never closed the document; release it on every path.
        if (doc != null)
        {
            try
            {
                doc.close();
            }
            catch (Exception e)
            {
                log.Warn("Failed to close the PDF file " + Name, e);
            }
        }
    }

    if (string.IsNullOrEmpty(text))
    {
        return null;
    }

    return text.Split(new char[] { ',', ';' });
}
/// <summary>
/// Copies the document-information entries and the page count of a PDF into a <see cref="PDFInfo"/>.
/// </summary>
/// <param name="document">The loaded PDF document to read from.</param>
/// <returns>
/// A new <see cref="PDFInfo"/>. <c>CreationDate</c> and <c>ModificationDate</c> are only assigned
/// when the document provides them, formatted as the long date, a space, and the long time.
/// </returns>
public static PDFInfo GetPDFDoucmentInformation(PDDocument document)
{
    PDFInfo summary = new PDFInfo();
    PDDocumentInformation meta = document.getDocumentInformation();

    summary.Author = meta.getAuthor();

    // Dates are optional; leave the property untouched when the entry is absent.
    var created = meta.getCreationDate();
    if (created != null)
    {
        DateTime createdAt = Utilities.Utils.GetDateFromJava(created);
        summary.CreationDate = string.Concat(createdAt.ToLongDateString(), " ", createdAt.ToLongTimeString());
    }

    summary.Creator = meta.getCreator();
    summary.Keywords = meta.getKeywords();

    var modified = meta.getModificationDate();
    if (modified != null)
    {
        DateTime modifiedAt = Utilities.Utils.GetDateFromJava(modified);
        summary.ModificationDate = string.Concat(modifiedAt.ToLongDateString(), " ", modifiedAt.ToLongTimeString());
    }

    summary.Producer = meta.getProducer();
    summary.Subject = meta.getSubject();
    summary.Title = meta.getTitle();
    summary.Trapped = meta.getTrapped();
    summary.NumberOfPages = document.getNumberOfPages();

    return summary;
}
/// <summary>
/// Copies the document-information entries of a PDF into a <see cref="PDFDocumentInformation"/>.
/// </summary>
/// <param name="pdfDocument">The loaded PDF document to read from.</param>
/// <returns>
/// A new <see cref="PDFDocumentInformation"/> carrying author, creation date, creator, keywords,
/// modification date, producer, subject, title and trapped state.
/// </returns>
/// <remarks>
/// Both dates go through <c>ConvertJavaDateToCSharp</c> without a null check here.
/// NOTE(review): that helper presumably copes with a missing date - confirm.
/// </remarks>
internal PDFDocumentInformation GetDocumentInformation(PDDocument pdfDocument)
{
    PDDocumentInformation documentInformation = pdfDocument.getDocumentInformation();

    // The original also called getCreationDate() once more and discarded the result; that dead
    // statement is removed, since the value is read again when CreationDate is assigned.
    return new PDFDocumentInformation()
    {
        Author = documentInformation.getAuthor(),
        CreationDate = this.ConvertJavaDateToCSharp(documentInformation.getCreationDate()),
        Creator = documentInformation.getCreator(),
        Keywords = documentInformation.getKeywords(),
        ModifiedDate = this.ConvertJavaDateToCSharp(documentInformation.getModificationDate()),
        Producer = documentInformation.getProducer(),
        Subject = documentInformation.getSubject(),
        Title = documentInformation.getTitle(),
        Trapped = documentInformation.getTrapped()
    };
}
/// <summary>
/// Initializes the converter: copies the input PDF into a per-instance temp folder, rebuilds it
/// page by page into a fresh document saved at <c>tempFileName</c>, reloads that saved file into
/// <c>doc</c>, and writes the embedded ICC profile and font.xml resources to disk.
/// </summary>
/// <param name="fileName">Path of the input PDF; stored in <c>inputFileName</c> and copied to the temp folder.</param>
/// <param name="output">Output path; stored in <c>outPutFileName</c>. Its file name is reused for the temp file.</param>
/// <param name="PDFAFlavour">Requested flavour; cast to int, passed to <c>SetFlavour</c> and stored.</param>
/// <remarks>
/// Any exception raised in the constructor terminates the whole process via
/// <c>Environment.Exit(104)</c> instead of propagating to the caller.
/// </remarks>
public PDF2PDFaConverter(string fileName, string output, AquaforestPDFAFlavour PDFAFlavour)
{
    try
    {
        // Route commons-logging to its no-op implementation.
        // NOTE(review): this sets an environment variable, not a Java system property - confirm it takes effect.
        Environment.SetEnvironmentVariable("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog");
        int pDFAFlavour = (int)PDFAFlavour;
        this.SetFlavour(pDFAFlavour);
        this.PDFAFlavour = pDFAFlavour;
        this.outPutFileName = output;
        this.inputFileName = fileName;
        // Temp file lives under <temp>\aquaforest\pdftoolkit\<new guid>\<output file name>.
        string tempPath = Path.GetTempPath();
        Guid guid = Guid.NewGuid();
        this.tempFileName = Path.Combine(tempPath, string.Concat("aquaforest\\pdftoolkit\\", guid.ToString(), "\\", Path.GetFileName(output)));
        string directoryName = Path.GetDirectoryName(this.tempFileName);
        if (!Directory.Exists(directoryName))
        {
            Directory.CreateDirectory(directoryName);
        }
        // Work on a copy of the input (named "password.pdf") so the original file is never opened by PDFBox.
        string str = Path.Combine(directoryName, "password.pdf");
        System.IO.File.Copy(this.inputFileName, str, true);
        PDDocument pDDocument = PDDocument.load(new java.io.File(str));
        this.doc = new PDDocument();
        // Raise the new document's PDF version to a minimum of 1.4 (case 1) or 1.7 (cases 2 and 3).
        // NOTE(review): pdfaversion is presumably assigned by SetFlavour above - not visible here.
        switch (this.pdfaversion)
        {
            case 1:
            {
                if (this.doc.getDocument().getVersion() < 1.4f)
                {
                    this.doc.getDocument().setVersion(1.4f);
                }
                break;
            }
            case 2:
            {
                if (this.doc.getDocument().getVersion() < 1.7f)
                {
                    this.doc.getDocument().setVersion(1.7f);
                }
                break;
            }
            case 3:
            {
                if (this.doc.getDocument().getVersion() < 1.7f)
                {
                    this.doc.getDocument().setVersion(1.7f);
                }
                break;
            }
        }
        // Move every page of the source document into the new one.
        foreach (PDPage page in pDDocument.getPages())
        {
            this.doc.addPage(page);
        }
        // Carry over document information and the catalog-level entries from the source.
        this.doc.setDocumentInformation(pDDocument.getDocumentInformation());
        this.doc.getDocumentCatalog().setDocumentOutline(pDDocument.getDocumentCatalog().getDocumentOutline());
        this.doc.getDocumentCatalog().setAcroForm(pDDocument.getDocumentCatalog().getAcroForm());
        this.doc.getDocumentCatalog().setLanguage(pDDocument.getDocumentCatalog().getLanguage());
        this.doc.getDocumentCatalog().setMetadata(pDDocument.getDocumentCatalog().getMetadata());
        this.doc.getDocumentCatalog().setPageLabels(pDDocument.getDocumentCatalog().getPageLabels());
        this.doc.getDocumentCatalog().setViewerPreferences(pDDocument.getDocumentCatalog().getViewerPreferences());
        this.doc.getDocumentCatalog().setPageMode(pDDocument.getDocumentCatalog().getPageMode());
        this.doc.getDocumentCatalog().setPageLayout(pDDocument.getDocumentCatalog().getPageLayout());
        // Persist the rebuilt document, then reopen it from disk. The source document is only
        // closed after the save, because the new document still references its pages until then.
        this.doc.save(this.tempFileName);
        this.doc.close();
        this.doc = PDDocument.load(new java.io.File(this.tempFileName));
        if (pDDocument != null)
        {
            pDDocument.close();
            pDDocument = null;
        }
        // Best effort: write the embedded ICC profile and font.xml as files with relative paths
        // (resolved against the process's current directory). Every failure here is swallowed.
        // NOTE(review): the resource streams are never disposed, and a single Stream.Read call is
        // not guaranteed to fill the buffer.
        // NOTE(review): a missing resource makes GetManifestResourceStream return null, and the
        // resulting exception is hidden by the empty catch below.
        try
        {
            Assembly executingAssembly = Assembly.GetExecutingAssembly();
            Stream manifestResourceStream = executingAssembly.GetManifestResourceStream("Aquaforest.PDF.sRGB_IEC61966-2-1_black_scaled.icc");
            using (FileStream fileStream = System.IO.File.Create("sRGB_IEC61966-2-1_black_scaled.icc", (int)manifestResourceStream.Length))
            {
                byte[] numArray = new byte[checked (manifestResourceStream.Length)];
                manifestResourceStream.Read(numArray, 0, (int)numArray.Length);
                fileStream.Write(numArray, 0, (int)numArray.Length);
            }
            manifestResourceStream = executingAssembly.GetManifestResourceStream("Aquaforest.PDF.font.xml");
            using (FileStream fileStream1 = System.IO.File.Create("font.xml", (int)manifestResourceStream.Length))
            {
                byte[] numArray1 = new byte[checked (manifestResourceStream.Length)];
                manifestResourceStream.Read(numArray1, 0, (int)numArray1.Length);
                fileStream1.Write(numArray1, 0, (int)numArray1.Length);
            }
        }
        catch
        {
        }
        // Recorded even when the extraction above failed.
        this.iccString = "sRGB_IEC61966-2-1_black_scaled.icc";
    }
    catch (Exception exception)
    {
        // NOTE(review): kills the host process with exit code 104; the exception itself is dropped.
        Environment.Exit(104);
    }
}
/// <summary>
/// Builds the Lucene document for a PDF file from its extracted text, its PDF metadata and the
/// values stored on the matching <see cref="GEN_FILE"/> record.
/// </summary>
/// <param name="filePath">Path of the PDF on disk, passed to <c>PDDocument.load</c>.</param>
/// <param name="doc">Record supplying file name, short name, title, summary and id.</param>
/// <returns>
/// A Lucene <see cref="Document"/>. File name, short name, title, document id and resource type
/// are always added; author, keywords, summary, header and text are added only when non-blank.
/// </returns>
/// <remarks>
/// A PDF that cannot be read is not an error: the failure is written to the debug output and the
/// document is built from the record's values alone.
/// </remarks>
public static Document ParseDocument(string filePath, GEN_FILE doc)
{
    string author = null;
    string keywords = null;
    string summary = null;
    string text = null;

    PDDocument document = null;
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        document = PDDocument.load(filePath);
        text = stripper.getText(document);

        PDDocumentInformation info = document.getDocumentInformation();
        author = info.getAuthor();
        keywords = info.getKeywords();
        summary = info.getSubject();
    }
    catch (Exception ex)
    {
        Debug.WriteLine("Exception in reading file: " + filePath + " ex: " + ex.Message);
    }
    finally
    {
        // Previously close() was skipped whenever loading or text extraction threw, leaking the file.
        if (document != null)
        {
            try
            {
                document.close();
            }
            catch (Exception closeEx)
            {
                Debug.WriteLine("Exception in reading file: " + filePath + " ex: " + closeEx.Message);
            }
        }
    }

    Document lucDoc = new Document();
    string filename = Path.GetFileNameWithoutExtension(doc.File_Name);
    string short_name = doc.Short_Name;
    string title = doc.Title;
    string header = doc.Summary;
    string doc_id = doc.Gen_File_Id.ToString();
    Debug.WriteLine("DocID: " + doc_id);

    // NOTE(review): the record's keywords are concatenated here but the result is never added to
    // the index - looks like an omission; confirm which field (if any) it should feed.
    StringBuilder keyTextBuilder = new StringBuilder();
    foreach (FILE_KEYWORDS keywordobj in doc.FILE_KEYWORDS.ToList())
    {
        keyTextBuilder.Append(keywordobj.Keyword + " ");
    }
    string keyword = keyTextBuilder.ToString();

    lucDoc.Add(new Field(FieldNames.FILE_NAME, filename, Field.Store.YES, Field.Index.ANALYZED));

    // Optional fields: skip null, empty and whitespace-only values.
    if (!String.IsNullOrWhiteSpace(author))
    {
        lucDoc.Add(new Field(FieldNames.AUTHOR, author, Field.Store.YES, Field.Index.ANALYZED));
    }
    if (!String.IsNullOrWhiteSpace(keywords))
    {
        lucDoc.Add(new Field(FieldNames.KEYWORDS, keywords, Field.Store.YES, Field.Index.ANALYZED));
    }
    if (!String.IsNullOrWhiteSpace(summary))
    {
        lucDoc.Add(new Field(FieldNames.SUMMARY, summary, Field.Store.YES, Field.Index.ANALYZED));
    }

    lucDoc.Add(new Field(FieldNames.SHORT_NAME, short_name, Field.Store.YES, Field.Index.ANALYZED));
    lucDoc.Add(new Field(FieldNames.TITLE, title, Field.Store.YES, Field.Index.ANALYZED));

    if (!String.IsNullOrWhiteSpace(header))
    {
        lucDoc.Add(new Field(FieldNames.HEADER, header, Field.Store.YES, Field.Index.ANALYZED));
    }
    if (!String.IsNullOrWhiteSpace(text))
    {
        lucDoc.Add(new Field(FieldNames.TEXT, text, Field.Store.YES, Field.Index.ANALYZED));
    }

    // The id is stored but not searchable; the resource type is indexed as a single exact token.
    lucDoc.Add(new Field(FieldNames.DOC_ID, doc_id, Field.Store.YES, Field.Index.NO));
    lucDoc.Add(new Field(FieldNames.RESOURCE_TYPE, ResourceTypeEnum.Resource_Doc.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));

    return lucDoc;
}