예제 #1
0
        /// <summary>
        /// Extracts the text of a PDF file page by page.
        /// </summary>
        /// <param name="pdfFileName">Path of the PDF file to read.</param>
        /// <returns>
        /// A dictionary whose keys are page numbers starting at 1 and whose values are
        /// the strings returned by <c>GetText</c> for the corresponding page.
        /// </returns>
        /// <exception cref="FileNotFoundException">
        /// <paramref name="pdfFileName"/> does not refer to an existing file.
        /// </exception>
        public static Dictionary <int, string> Extract(string pdfFileName)
        {
            if (!File.Exists(pdfFileName))
            {
                // Report the offending path through FileName instead of using the
                // parameter name as the exception message.
                throw new FileNotFoundException("The PDF file could not be found.", pdfFileName);
            }

            var        result      = new Dictionary <int, string>();
            PDDocument pdfDocument = PDDocument.load(pdfFileName);

            try
            {
                var pdfStripper = new PDFTextStripper();
                pdfStripper.setPageSeparator(Environment.NewLine + Environment.NewLine);

                // The page count does not change while reading, so query it once.
                int pageCount = pdfDocument.getNumberOfPages();

                for (int i = 1; i <= pageCount; i++)
                {
                    // Restrict the stripper to exactly one page per pass.
                    pdfStripper.setStartPage(i);
                    pdfStripper.setEndPage(i);

                    result.Add(i, GetText(pdfStripper, pdfDocument));
                }
            }
            finally
            {
                // Release the document even when extraction of a page fails.
                pdfDocument.close();
            }

            return(result);
        }
예제 #2
0
        /// <summary>
        /// Reads a PDF file and collects the text of each page separately.
        /// </summary>
        /// <param name="pdfFileName">Path of the PDF file to read.</param>
        /// <returns>
        /// A dictionary mapping each 1-based page number to the string returned by
        /// <c>GetText</c> for that page.
        /// </returns>
        /// <exception cref="FileNotFoundException">
        /// <paramref name="pdfFileName"/> does not refer to an existing file.
        /// </exception>
        public static Dictionary <int, string> Extract(string pdfFileName)
        {
            if (!File.Exists(pdfFileName))
            {
                // Carry the missing path in the exception rather than the literal
                // parameter name.
                throw new FileNotFoundException("The PDF file could not be found.", pdfFileName);
            }

            var        result      = new Dictionary <int, string>();
            PDDocument pdfDocument = PDDocument.load(pdfFileName);

            try
            {
                var pdfStripper = new PDFTextStripper();
                pdfStripper.setPageSeparator(Environment.NewLine + Environment.NewLine);

                // Query the page count once instead of on every loop iteration.
                int pageCount = pdfDocument.getNumberOfPages();

                for (int i = 1; i <= pageCount; i++)
                {
                    // Start and end page are both set to i so only that page is read.
                    pdfStripper.setStartPage(i);
                    pdfStripper.setEndPage(i);

                    result.Add(i, GetText(pdfStripper, pdfDocument));
                }
            }
            finally
            {
                // Always close the document, including when a page fails to extract.
                pdfDocument.close();
            }

            return(result);
        }
예제 #3
0
        /// <summary>
        /// Get a thumbnail of the document, if possible.
        /// </summary>
        /// <param name="sizeX">The maximum X size of the thumbnail</param>
        /// <param name="sizeY">The maximum Y size of the thumbnail</param>
        /// <param name="forceFullSize">True if the thumbnail should be exactly XxY pixels and False if the thumbnail
        /// should fit inside a XxY box but should maintain its aspect ratio</param>
        /// <returns>A JPEG byte thumbnail or null if the thumbnail can't be generated</returns>
        /// <remarks>
        /// NOTE(review): this implementation renders the first page as-is; it does not apply
        /// <paramref name="sizeX"/>, <paramref name="sizeY"/> or <paramref name="forceFullSize"/>.
        /// </remarks>
        public override byte[] GetThumbnail(int sizeX, int sizeY, bool forceFullSize)
        {
            // If we have no bytes then we can't do anything.
            if (Bytes == null || Bytes.Length == 0)
            {
                return(null);
            }

            PDDocument doc = null;

            try
            {
                doc = PDDocument.load(new java.io.ByteArrayInputStream(Bytes));

                java.util.List pages = doc.getDocumentCatalog().getAllPages();
                if (pages.size() > 0)
                {
                    // Take the first page from the loaded document. A freshly constructed
                    // PageDrawer has not been given a page to draw, so it cannot supply one.
                    PDPage page = (PDPage)pages.get(0);

                    java.awt.image.BufferedImage  image = page.convertToImage();
                    java.io.ByteArrayOutputStream os    = new java.io.ByteArrayOutputStream();
                    ImageIO.write(image, "jpg", os);
                    return(os.toByteArray());
                }
            }
            catch (Exception e)
            {
                log.Error("Failed to get the thumbnail from the PDF file " + Name, e);
            }
            finally
            {
                // Release the document on every path; a failure to close must not
                // turn a best-effort thumbnail request into an exception.
                if (doc != null)
                {
                    try
                    {
                        doc.close();
                    }
                    catch (Exception closeError)
                    {
                        log.Error("Failed to close the PDF file " + Name, closeError);
                    }
                }
            }

            return(null);
        }
예제 #4
0
        /// <summary>
        /// Walks every page of <paramref name="doc"/> and records problems in
        /// <c>ErrorMessage</c> / <c>IsValid</c>: a page without a media box, a page
        /// without resources, or a page count that disagrees with the number of
        /// pages actually enumerated. Any exception is converted into an error
        /// message as well.
        /// </summary>
        /// <param name="doc">The document whose pages are inspected.</param>
        private void CheckAllPages(PDDocument doc)
        {
            // Zero-based counter of pages visited so far; it is also the number
            // that appears in the error messages.
            int visited = 0;

            try
            {
                this.NumberOfPagesDict = doc.getNumberOfPages();

                foreach (PDPage current in doc.getPages())
                {
                    if (current.getMediaBox() == null)
                    {
                        this.ErrorMessage = string.Format("Page number {0} has no media box", visited);
                        this.IsValid      = false;
                    }

                    // A later finding overwrites the message of an earlier one.
                    if (current.getResources() == null)
                    {
                        this.ErrorMessage = string.Format("Page number {0}, has no page resources", visited);
                        this.IsValid      = false;
                    }

                    visited++;
                }

                if (this.NumberOfPagesDict != visited)
                {
                    this.ErrorMessage = "Page Number Mismatch between dictionary and actual document";
                    this.IsValid      = false;
                }
            }
            catch (Exception failure)
            {
                this.ErrorMessage = string.Format("PDF analysis failed on page number {0},\nWith exception {1}", visited, failure.Message);
                this.IsValid      = false;
            }
        }
예제 #5
0
파일: PDFHocr.cs 프로젝트: jehan2898/root
        /// <summary>
        /// Runs a <c>PDFHocr</c> extraction over <paramref name="document"/> and hands the
        /// collected pages to <c>GetHocrFromPageList</c> together with <paramref name="outputfile"/>.
        /// </summary>
        /// <param name="document">The PDF document to process.</param>
        /// <param name="outputfile">Output target passed through to <c>GetHocrFromPageList</c>.</param>
        /// <param name="useWords">Value assigned to the extractor's <c>getHOCRByWords</c> member.</param>
        /// <returns>True when every step completed; false when any exception was thrown.</returns>
        internal static bool CreateHocrFileFromPDF(PDDocument document, string outputfile, bool useWords)
        {
            try
            {
                PDFHocr extractor = new PDFHocr();
                extractor.getHOCRByWords = useWords;
                extractor.setSortByPosition(true);
                extractor.setStartPage(0);
                extractor.setEndPage(document.getNumberOfPages());

                PDFHelper.DisplayTrialPopupIfNecessary();
                if (PDFHelper.AddStamp)
                {
                    // When AddStamp is set the end page is capped at 3
                    // (presumably a trial-mode limit; confirm against PDFHelper).
                    extractor.setEndPage(3);
                }

                // The written text itself is discarded; only the lists the
                // extractor fills while writing are used below.
                extractor.writeText(document, new OutputStreamWriter(new ByteArrayOutputStream()));

                // Lines still pending after writeText are flushed into one more page.
                if (extractor.lineList != null && extractor.lineList.Count > 0)
                {
                    HocrPageModel trailingPage = new HocrPageModel();
                    trailingPage.Lines.AddRange(extractor.SortLineList(extractor.lineList));
                    extractor.pageList.Add(trailingPage);
                    extractor.lineList.Clear();
                }

                extractor.GetHocrFromPageList(extractor.pageList, outputfile);
                return(true);
            }
            catch (Exception)
            {
                // Best-effort API: any failure is reported only through the return value.
                return(false);
            }
        }
예제 #6
0
파일: PDFHocr.cs 프로젝트: jehan2898/root
        /// <summary>
        /// Runs a <c>PDFHocr</c> extraction over <paramref name="document"/> and returns
        /// the page list the extractor collected.
        /// </summary>
        /// <param name="document">The PDF document to process.</param>
        /// <returns>The extractor's <c>pageList</c>, or null when any exception was thrown.</returns>
        internal static List <HocrPageModel> GetPageWordDetails(PDDocument document)
        {
            try
            {
                PDFHocr extractor = new PDFHocr();
                extractor.setSortByPosition(true);
                extractor.setStartPage(0);
                extractor.setEndPage(document.getNumberOfPages());

                // The written text is thrown away; only the extractor's lists matter.
                Writer sink = new OutputStreamWriter(new ByteArrayOutputStream());

                PDFHelper.DisplayTrialPopupIfNecessary();
                if (PDFHelper.AddStamp)
                {
                    // When AddStamp is set the end page is capped at 3
                    // (presumably a trial-mode limit; confirm against PDFHelper).
                    extractor.setEndPage(3);
                }

                extractor.writeText(document, sink);

                // Lines still pending after writeText are flushed into one more page.
                if (extractor.lineList != null && extractor.lineList.Count > 0)
                {
                    HocrPageModel trailingPage = new HocrPageModel();
                    trailingPage.Lines.AddRange(extractor.SortLineList(extractor.lineList));
                    extractor.pageList.Add(trailingPage);
                    extractor.lineList.Clear();
                }

                return(extractor.pageList);
            }
            catch (Exception)
            {
                // Callers receive null instead of an exception on failure.
                return(null);
            }
        }
예제 #7
0
파일: PDFHelper.cs 프로젝트: jehan2898/root
        /// <summary>
        /// Fills <paramref name="outline"/> from <paramref name="bookmarentry"/> and recursively
        /// appends one child outline item per entry in <c>BookmarkItems</c>.
        /// </summary>
        /// <param name="bookmarentry">Bookmark description: page number, title and child items.</param>
        /// <param name="document">Document that supplies the destination pages.</param>
        /// <param name="outline">Outline item that receives the destination, title and children.</param>
        /// <returns>True when no exception occurred at this level; false otherwise.</returns>
        internal static bool AddBookmarkTooutline(PDFBookmarkItem bookmarentry, PDDocument document, PDOutlineItem outline)
        {
            try
            {
                // Destination and title are only set when the page number does not
                // exceed the page count; BookMarkPage is converted to a zero-based index.
                if (bookmarentry.BookMarkPage <= document.getNumberOfPages())
                {
                    PDPage targetPage = document.getPage(bookmarentry.BookMarkPage - 1);
                    PDPageFitWidthDestination destination = new PDPageFitWidthDestination();
                    destination.setPage(targetPage);
                    outline.setDestination(destination);
                    outline.setTitle(bookmarentry.BookmarkTitle);
                }

                if (bookmarentry.BookmarkItems != null && bookmarentry.BookmarkItems.Count > 0)
                {
                    foreach (PDFBookmarkItem child in bookmarentry.BookmarkItems)
                    {
                        // The child's own result is not inspected; the item is
                        // appended regardless.
                        PDOutlineItem childOutline = new PDOutlineItem();
                        PDFHelper.AddBookmarkTooutline(child, document, childOutline);
                        outline.addLast(childOutline);
                    }
                }

                return(true);
            }
            catch (Exception)
            {
                return(false);
            }
        }
        /// <summary>
        /// Copies the document information entries and the page count of
        /// <paramref name="document"/> into a new <c>PDFInfo</c>.
        /// </summary>
        /// <param name="document">The document to describe.</param>
        /// <returns>
        /// A <c>PDFInfo</c> with the copied values. <c>CreationDate</c> and
        /// <c>ModificationDate</c> are only assigned when the document provides them.
        /// </returns>
        public static PDFInfo GetPDFDoucmentInformation(PDDocument document)
        {
            PDDocumentInformation metadata = document.getDocumentInformation();
            PDFInfo details = new PDFInfo();

            details.Author = metadata.getAuthor();

            var created = metadata.getCreationDate();
            if (created != null)
            {
                // Dates are stored as "<long date> <long time>" text.
                DateTime createdAt = Utilities.Utils.GetDateFromJava(created);
                details.CreationDate = createdAt.ToLongDateString() + " " + createdAt.ToLongTimeString();
            }

            details.Creator  = metadata.getCreator();
            details.Keywords = metadata.getKeywords();

            var modified = metadata.getModificationDate();
            if (modified != null)
            {
                DateTime modifiedAt = Utilities.Utils.GetDateFromJava(modified);
                details.ModificationDate = modifiedAt.ToLongDateString() + " " + modifiedAt.ToLongTimeString();
            }

            details.Producer      = metadata.getProducer();
            details.Subject       = metadata.getSubject();
            details.Title         = metadata.getTitle();
            details.Trapped       = metadata.getTrapped();
            details.NumberOfPages = document.getNumberOfPages();

            return details;
        }
예제 #9
0
        /// <summary>
        /// Loads the PDF at <c>_path</c>, runs an <c>ExtractText</c> pass for every page,
        /// stores each page's nodes in <c>TextPosition</c> and its text in <c>Pages</c>
        /// (both keyed by 1-based page number), and appends all page texts to <c>Content</c>.
        /// </summary>
        private void Run()
        {
            PDDocument doc = PDDocument.load(_path);

            try
            {
                // The page count is fixed while reading, so query it once.
                int pageCount = doc.getNumberOfPages();

                for (int i = 1; i <= pageCount; i++)
                {
                    ExtractText et = new ExtractText();
                    et.Extract(doc, i);
                    TextPosition.Add(i, et._nodes);
                    Pages.Add(i, et._text);
                }
            }
            finally
            {
                // Release the document even when a page fails to extract.
                // NOTE(review): assumes the stored nodes do not read lazily from
                // the document after this point - confirm against ExtractText.
                doc.close();
            }

            if (Pages == null)
            {
                return;
            }

            int storedPages = Pages.Keys.Count;
            if (storedPages == 0)
            {
                // Nothing to append; leave Content untouched.
                return;
            }

            // Build the combined text in one buffer instead of re-allocating the
            // whole string for every page.
            var combined = new System.Text.StringBuilder(Content);
            for (int i = 1; i <= storedPages; i++)
            {
                combined.Append(Pages[i]);
            }

            Content = combined.ToString();
        }
예제 #10
0
 /// <summary>
 /// Loads <paramref name="file"/> and runs this stripper's <c>writeText</c> over the
 /// requested page range, discarding the written text.
 /// </summary>
 /// <param name="pageNum">
 /// Page number to process, or -1 to process every page of the document.
 /// </param>
 /// <param name="file">The PDF file to load.</param>
 /// <remarks>
 /// Failures do not propagate to the caller; they are reported through
 /// <see cref="System.Diagnostics.Trace"/>.
 /// </remarks>
 public void Process(int pageNum, File file)
 {
     try
     {
         PDDocument document = PDDocument.load(file);
         try
         {
             if (pageNum == -1)
             {
                 // -1 selects the whole document.
                 this.setStartPage(1);
                 this.setEndPage(document.getNumberOfPages());
             }
             else
             {
                 this.setStartPage(pageNum);
                 this.setEndPage(pageNum);
             }

             // The text output is thrown away; writeText is run for its effect
             // on this stripper instance.
             Writer sink = new OutputStreamWriter(new ByteArrayOutputStream());
             this.writeText(document, sink);
         }
         finally
         {
             // Release the document on every path.
             // NOTE(review): assumes nothing collected during writeText reads
             // lazily from the document afterwards - confirm against callers.
             document.close();
         }
     }
     catch (Exception ex)
     {
         // Keep the best-effort contract, but leave a trace of the failure
         // instead of swallowing it silently.
         System.Diagnostics.Trace.TraceError("Failed to process PDF file '{0}': {1}", file, ex);
     }
 }