/// <summary>
/// Extracts the text of every page of a PDF file, one entry per page.
/// </summary>
/// <param name="pdfFileName">Path of the PDF file to read.</param>
/// <returns>
/// A dictionary keyed by 1-based page number whose value is the text that
/// GetText produced for that single page.
/// </returns>
/// <exception cref="FileNotFoundException">
/// Thrown when <paramref name="pdfFileName"/> does not exist.
/// </exception>
public static Dictionary<int, string> Extract(string pdfFileName)
{
    if (!File.Exists(pdfFileName))
    {
        // Report the actual path (previously the literal text "pdfFileName" was used as the message).
        throw new FileNotFoundException("PDF file not found: " + pdfFileName, pdfFileName);
    }

    var result = new Dictionary<int, string>();
    PDDocument pdfDocument = PDDocument.load(pdfFileName);
    try
    {
        var pdfStripper = new PDFTextStripper();
        pdfStripper.setPageSeparator(Environment.NewLine + Environment.NewLine);

        // Restrict the stripper to exactly one page per pass so each page's text is captured separately.
        int pageCount = pdfDocument.getNumberOfPages();
        for (int i = 1; i <= pageCount; i++)
        {
            pdfStripper.setStartPage(i);
            pdfStripper.setEndPage(i);
            result.Add(i, GetText(pdfStripper, pdfDocument));
        }
    }
    finally
    {
        // Always release the document, even when extraction of a page fails.
        pdfDocument.close();
    }

    return result;
}
/// <summary>
/// Reads a PDF file and returns its text split per page.
/// </summary>
/// <param name="pdfFileName">Path of the PDF file to read.</param>
/// <returns>
/// A dictionary mapping each 1-based page number to the text GetText returned
/// while the stripper was limited to that page.
/// </returns>
/// <exception cref="FileNotFoundException">
/// Thrown when <paramref name="pdfFileName"/> does not exist.
/// </exception>
public static Dictionary<int, string> Extract(string pdfFileName)
{
    if (!File.Exists(pdfFileName))
    {
        // Include the real path in the exception instead of the parameter's name.
        throw new FileNotFoundException("PDF file not found: " + pdfFileName, pdfFileName);
    }

    var result = new Dictionary<int, string>();
    PDDocument pdfDocument = PDDocument.load(pdfFileName);
    try
    {
        var pdfStripper = new PDFTextStripper();
        pdfStripper.setPageSeparator(Environment.NewLine + Environment.NewLine);

        int pageCount = pdfDocument.getNumberOfPages();
        for (int i = 1; i <= pageCount; i++)
        {
            // Start page == end page: extract one page at a time.
            pdfStripper.setStartPage(i);
            pdfStripper.setEndPage(i);
            result.Add(i, GetText(pdfStripper, pdfDocument));
        }
    }
    finally
    {
        // Close the document on every path so a failing page does not leak it.
        pdfDocument.close();
    }

    return result;
}
/// <summary>
/// Get a thumbnail of the document, if possible. The thumbnail is a JPEG
/// rendering of the first page of the PDF held in <c>Bytes</c>.
/// </summary>
/// <remarks>
/// NOTE(review): <paramref name="sizeX"/>, <paramref name="sizeY"/> and
/// <paramref name="forceFullSize"/> are not applied by this implementation; the
/// image is returned at whatever size the page renders to. Confirm whether callers
/// rely on scaling.
/// </remarks>
/// <param name="sizeX">The maximum X size of the thumbnail</param>
/// <param name="sizeY">The maximum Y size of the thumbnail</param>
/// <param name="forceFullSize">True if the thumbnail should be exactly XxY pixels and False if the thumbnail
/// should fit inside a XxY box but should maintain its aspect ratio</param>
/// <returns>A JPEG byte thumbnail or null if the thumbnail can't be generated</returns>
public override byte[] GetThumbnail(int sizeX, int sizeY, bool forceFullSize)
{
    // If we have no bytes then we can't do anything.
    if (Bytes == null || Bytes.Length == 0)
    {
        return null;
    }

    PDDocument doc = null;
    try
    {
        doc = PDDocument.load(new java.io.ByteArrayInputStream(Bytes));
        java.util.List pages = doc.getDocumentCatalog().getAllPages();
        if (pages.size() > 0)
        {
            // Render the first page of the loaded document. The previous code asked a
            // freshly constructed PageDrawer for its page, which is unrelated to 'doc'.
            PDPage page = (PDPage)pages.get(0);
            java.awt.image.BufferedImage image = page.convertToImage();

            java.io.ByteArrayOutputStream os = new java.io.ByteArrayOutputStream();
            ImageIO.write(image, "jpg", os);
            return os.toByteArray();
        }
    }
    catch (Exception e)
    {
        log.Error("Failed to get the thumbnail from the PDF file " + Name, e);
    }
    finally
    {
        // Release the document on every path; a failure to close must not mask the result.
        if (doc != null)
        {
            try
            {
                doc.close();
            }
            catch (Exception e)
            {
                log.Error("Failed to close the PDF file " + Name, e);
            }
        }
    }

    return null;
}
/// <summary>
/// Walks every page of <paramref name="doc"/> and records problems in
/// <c>ErrorMessage</c> / <c>IsValid</c>: a page without a media box, a page
/// without resources, or a page count that differs from the number of pages
/// actually enumerated. Any exception raised during the walk is also recorded
/// as a validation failure rather than propagated.
/// </summary>
/// <param name="doc">The document whose pages are checked.</param>
private void CheckAllPages(PDDocument doc)
{
    // Number of pages fully processed so far; used as the (zero-based) page number in messages.
    int pagesSeen = 0;

    try
    {
        NumberOfPagesDict = doc.getNumberOfPages();

        foreach (PDPage current in doc.getPages())
        {
            if (current.getMediaBox() == null)
            {
                ErrorMessage = string.Format("Page number {0} has no media box", pagesSeen);
                IsValid = false;
            }

            if (current.getResources() == null)
            {
                ErrorMessage = string.Format("Page number {0}, has no page resources", pagesSeen);
                IsValid = false;
            }

            pagesSeen++;
        }

        // The declared page count must match what iteration actually produced.
        if (NumberOfPagesDict != pagesSeen)
        {
            ErrorMessage = "Page Number Mismatch between dictionary and actual document";
            IsValid = false;
        }
    }
    catch (Exception ex)
    {
        ErrorMessage = string.Format("PDF analysis failed on page number {0},\nWith exception {1}", pagesSeen, ex.Message);
        IsValid = false;
    }
}
/// <summary>
/// Runs a <c>PDFHocr</c> text pass over <paramref name="document"/> and hands the
/// collected pages to <c>GetHocrFromPageList</c> together with
/// <paramref name="outputfile"/>.
/// </summary>
/// <param name="document">The PDF document to process.</param>
/// <param name="outputfile">Output target passed to <c>GetHocrFromPageList</c>.</param>
/// <param name="useWords">Value assigned to <c>PDFHocr.getHOCRByWords</c>.</param>
/// <returns>True when the whole pass completed; false if any exception was thrown.</returns>
internal static bool CreateHocrFileFromPDF(PDDocument document, string outputfile, bool useWords)
{
    try
    {
        PDFHocr hocr = new PDFHocr();
        hocr.getHOCRByWords = useWords;
        hocr.setSortByPosition(true);
        hocr.setStartPage(0);
        hocr.setEndPage(document.getNumberOfPages());

        PDFHelper.DisplayTrialPopupIfNecessary();
        if (PDFHelper.AddStamp)
        {
            // NOTE(review): presumably a trial-mode cap on processed pages - confirm.
            hocr.setEndPage(3);
        }

        // The written text itself is discarded; only the lists filled during the pass are used.
        hocr.writeText(document, new OutputStreamWriter(new ByteArrayOutputStream()));

        // Lines still pending after the pass are moved into one final page.
        if (hocr.lineList != null && hocr.lineList.Count > 0)
        {
            HocrPageModel trailingPage = new HocrPageModel();
            trailingPage.Lines.AddRange(hocr.SortLineList(hocr.lineList));
            hocr.pageList.Add(trailingPage);
            hocr.lineList.Clear();
        }

        hocr.GetHocrFromPageList(hocr.pageList, outputfile);
        return true;
    }
    catch (Exception)
    {
        return false;
    }
}
/// <summary>
/// Runs a <c>PDFHocr</c> text pass over <paramref name="document"/> and returns the
/// page models it collected.
/// </summary>
/// <param name="document">The PDF document to process.</param>
/// <returns>
/// The <c>pageList</c> built by the pass, or null if any exception was thrown.
/// </returns>
internal static List<HocrPageModel> GetPageWordDetails(PDDocument document)
{
    try
    {
        PDFHocr hocr = new PDFHocr();
        hocr.setSortByPosition(true);
        hocr.setStartPage(0);
        hocr.setEndPage(document.getNumberOfPages());

        // The written text is thrown away; the pass is run for the lists it fills.
        Writer sink = new OutputStreamWriter(new ByteArrayOutputStream());

        PDFHelper.DisplayTrialPopupIfNecessary();
        if (PDFHelper.AddStamp)
        {
            // NOTE(review): presumably a trial-mode cap on processed pages - confirm.
            hocr.setEndPage(3);
        }

        hocr.writeText(document, sink);

        // Lines still pending after the pass are moved into one final page.
        if (hocr.lineList != null && hocr.lineList.Count > 0)
        {
            HocrPageModel trailingPage = new HocrPageModel();
            trailingPage.Lines.AddRange(hocr.SortLineList(hocr.lineList));
            hocr.pageList.Add(trailingPage);
            hocr.lineList.Clear();
        }

        return hocr.pageList;
    }
    catch (Exception)
    {
        return null;
    }
}
/// <summary>
/// Fills <paramref name="outline"/> from <paramref name="bookmarentry"/> and
/// recursively appends one child outline item per entry in
/// <c>bookmarentry.BookmarkItems</c>.
/// </summary>
/// <param name="bookmarentry">Bookmark description (page, title, child bookmarks).</param>
/// <param name="document">Document providing the destination pages.</param>
/// <param name="outline">Outline item that receives the destination, title and children.</param>
/// <returns>True when processing finished; false if any exception was thrown.</returns>
internal static bool AddBookmarkTooutline(PDFBookmarkItem bookmarentry, PDDocument document, PDOutlineItem outline)
{
    try
    {
        // Destination and title are only set when the bookmark's page is not past the last page.
        if (bookmarentry.BookMarkPage <= document.getNumberOfPages())
        {
            // BookMarkPage is converted to a zero-based index for getPage.
            PDPage targetPage = document.getPage(bookmarentry.BookMarkPage - 1);
            PDPageFitWidthDestination destination = new PDPageFitWidthDestination();
            destination.setPage(targetPage);
            outline.setDestination(destination);
            outline.setTitle(bookmarentry.BookmarkTitle);
        }

        if (bookmarentry.BookmarkItems != null && bookmarentry.BookmarkItems.Count > 0)
        {
            foreach (PDFBookmarkItem child in bookmarentry.BookmarkItems)
            {
                // The child's own success flag is not inspected; it is appended regardless.
                PDOutlineItem childOutline = new PDOutlineItem();
                PDFHelper.AddBookmarkTooutline(child, document, childOutline);
                outline.addLast(childOutline);
            }
        }

        return true;
    }
    catch (Exception)
    {
        return false;
    }
}
/// <summary>
/// Copies the document information entries and the page count of
/// <paramref name="document"/> into a new <c>PDFInfo</c>.
/// </summary>
/// <param name="document">The PDF document to describe.</param>
/// <returns>
/// A <c>PDFInfo</c> populated from the document. The creation and modification
/// dates are only set when the document provides them, formatted as the long date
/// string, a space, and the long time string.
/// </returns>
public static PDFInfo GetPDFDoucmentInformation(PDDocument document)
{
    PDFInfo summary = new PDFInfo();
    PDDocumentInformation metadata = document.getDocumentInformation();

    summary.Author = metadata.getAuthor();

    if (metadata.getCreationDate() != null)
    {
        DateTime created = Utilities.Utils.GetDateFromJava(metadata.getCreationDate());
        summary.CreationDate = string.Concat(created.ToLongDateString(), " ", created.ToLongTimeString());
    }

    summary.Creator = metadata.getCreator();
    summary.Keywords = metadata.getKeywords();

    if (metadata.getModificationDate() != null)
    {
        DateTime modified = Utilities.Utils.GetDateFromJava(metadata.getModificationDate());
        summary.ModificationDate = string.Concat(modified.ToLongDateString(), " ", modified.ToLongTimeString());
    }

    summary.Producer = metadata.getProducer();
    summary.Subject = metadata.getSubject();
    summary.Title = metadata.getTitle();
    summary.Trapped = metadata.getTrapped();
    summary.NumberOfPages = document.getNumberOfPages();

    return summary;
}
/// <summary>
/// Loads the PDF at <c>_path</c>, runs an <c>ExtractText</c> pass for every page and
/// stores the results: the extractor's <c>_nodes</c> go into <c>TextPosition</c> and its
/// <c>_text</c> into <c>Pages</c>, both keyed by 1-based page number. Afterwards the
/// page texts are appended to <c>Content</c> in page order.
/// </summary>
private void Run()
{
    PDDocument doc = PDDocument.load(_path);
    try
    {
        int pageCount = doc.getNumberOfPages();
        for (int i = 1; i <= pageCount; i++)
        {
            ExtractText et = new ExtractText();
            et.Extract(doc, i);
            TextPosition.Add(i, et._nodes);
            Pages.Add(i, et._text);
        }
    }
    finally
    {
        // The document is only needed during extraction; close it on every path
        // (previously it was never closed).
        doc.close();
    }

    if (Pages == null)
    {
        return;
    }

    // Keys are the consecutive page numbers 1..Count added above.
    for (int i = 1; i <= Pages.Keys.Count; i++)
    {
        Content += Pages[i];
    }
}
/// <summary>
/// Loads <paramref name="file"/> as a PDF and runs this stripper's <c>writeText</c>
/// pass over either a single page or the whole document. The written text is
/// discarded. Failures are not propagated to the caller.
/// </summary>
/// <param name="pageNum">
/// Page to process, or -1 to process every page from 1 to the last page.
/// </param>
/// <param name="file">The PDF file to load.</param>
public void Process(int pageNum, File file)
{
    PDDocument document = null;
    try
    {
        document = PDDocument.load(file);

        if (pageNum == -1)
        {
            this.setStartPage(1);
            this.setEndPage(document.getNumberOfPages());
        }
        else
        {
            this.setStartPage(pageNum);
            this.setEndPage(pageNum);
        }

        // Throw-away sink: the pass is run for its side effects on this stripper.
        Writer dumpy = new OutputStreamWriter(new ByteArrayOutputStream());
        this.writeText(document, dumpy);
    }
    catch (System.Exception ex)
    {
        // Still best-effort (no rethrow), but leave a trace instead of failing silently.
        System.Diagnostics.Trace.TraceError("PDF processing failed: " + ex);
    }
    finally
    {
        // Release the document on every path (previously it was never closed).
        if (document != null)
        {
            try
            {
                document.close();
            }
            catch (System.Exception ex)
            {
                System.Diagnostics.Trace.TraceError("Closing the PDF document failed: " + ex);
            }
        }
    }
}