Пример #1
0
        /// <summary>
        /// Reads the first page of a PDF and returns its character items, page size and document keywords.
        /// </summary>
        /// <param name="bytes">Raw PDF content; passed to <c>LoadPdf</c> to open the document.</param>
        /// <returns>
        /// A <see cref="PdfOcrResult"/> holding the items found on the first page, the media box
        /// height and width (0 when the page has no media box) and the document keywords
        /// (an empty string when they are missing or cannot be read).
        /// </returns>
        /// <exception cref="PdfReadException">
        /// Rethrown unchanged when raised inside the method; every other exception type is wrapped
        /// in a new <see cref="PdfReadException"/> with the original as inner exception.
        /// NOTE(review): <c>PdfNotReadableException</c> presumably derives from <c>PdfReadException</c>
        /// and therefore reaches the caller unwrapped - confirm against its declaration.
        /// </exception>
        public PdfOcrResult Execute(byte[] bytes)
        {
            PDDocument document = null;

            try
            {
                LoadPdf(bytes, ref document);

                List allPages = document.getDocumentCatalog().getAllPages();
                if (allPages.size() == 0)
                {
                    throw new PdfNotReadableException("Pdf contains no readable content");
                }

                // Only the first page is processed.
                PDPage page = (PDPage)allPages.get(0);

                PDStream contents = page.getContents();
                if (contents == null)
                {
                    throw new PdfNotReadableException("Pdf contains no readable content");
                }

                // Reuse the stream that was just null-checked instead of asking the page for it again.
                var items = new PdfToCharacters().GetItems(page, page.findResources(), contents.getStream());
                if (items.Count == 0)
                {
                    throw new PdfNotReadableException("Pdf contains no readable content");
                }

                var mediaBox = page.findMediaBox();

                var height     = mediaBox?.getHeight() ?? 0;
                var width      = mediaBox?.getWidth() ?? 0;
                var itemsArray = items.ToArray();

                var keywords = "";
                try
                {
                    // Keep the empty-string default when the information dictionary or the
                    // keywords entry is missing, instead of overwriting it with null.
                    keywords = document.getDocumentInformation()?.getKeywords() ?? "";
                }
                catch (Exception)
                {
                    // Deliberate best effort: keywords are optional, so a failure here must not
                    // fail the whole extraction.
                }

                return(new PdfOcrResult()
                {
                    Items = itemsArray, Height = height, Width = width, Keywords = keywords
                });
            }
            catch (PdfReadException)
            {
                throw;
            }
            catch (Exception e)
            {
                throw new PdfReadException("Pdf could not be loaded. It is not a redable pdf.", e);
            }
            finally
            {
                // Always release the document, including on the error paths above.
                document?.close();
            }
        }
Пример #2
0
        /// <summary>
        /// Reads author, title, subject and keywords from the information dictionary of a PDF file.
        /// </summary>
        /// <param name="fileName">Path of the PDF file to open.</param>
        /// <returns>
        /// An <see cref="Info"/> instance. Its fields are only filled in when the document could be
        /// loaded and exposes an information dictionary; otherwise a freshly constructed
        /// <see cref="Info"/> is returned.
        /// </returns>
        public static Info ReadDocInfo(string fileName)
        {
            Info result = new Info();

            try
            {
                PDDocument pDoc = PDDocument.load(fileName);
                try
                {
                    PDDocumentInformation docInfo = pDoc.getDocumentInformation();

                    if (docInfo != null)
                    {
                        result.Author   = docInfo.getAuthor();
                        result.Title    = docInfo.getTitle();
                        result.Summary  = docInfo.getSubject();
                        result.Keywords = docInfo.getKeywords();
                    }
                }
                finally
                {
                    // Release the document even when reading the metadata fails.
                    pDoc.close();
                }
            }
            catch (Exception)
            {
                // Best effort by design: any failure (unreadable file, broken metadata, close error)
                // leaves the remaining fields of the result untouched.
            }
            return(result);
        }
Пример #3
0
        /// <summary>
        /// Extracts keyword candidates from the "keywords" metadata entry of the PDF held in <c>Bytes</c>.
        /// </summary>
        /// <returns>
        /// The keywords entry split on ',' and ';' (entries are not trimmed), or <c>null</c> when
        /// there are no bytes, the document cannot be read, or the keywords entry is null or empty.
        /// </returns>
        public override string[] ExtractKeyWordCandidatesFromFile()
        {
            // If we have no bytes then we can't do anything.
            if (Bytes == null || Bytes.Length == 0)
            {
                // Log the problem.
                log.Error("Tried to extract keywords from empty bytes for file " + Name);
                return(null);
            }

            string text = null;

            try
            {
                java.io.ByteArrayInputStream byteStream = new java.io.ByteArrayInputStream(Bytes);
                PDDocument doc = PDDocument.load(byteStream);
                try
                {
                    // TODO Internationalize this conversion
                    text = doc.getDocumentInformation().getKeywords();
                }
                finally
                {
                    // Release the document whether or not the metadata could be read.
                    doc.close();
                }
            }
            catch (Exception e)
            {
                log.Warn("Failed to get the keywords from the PDF file " + Name, e);
            }

            string[] returnText = null;

            if (!string.IsNullOrEmpty(text))
            {
                returnText = text.Split(new char[] { ',', ';' });
            }

            return(returnText);
        }
        /// <summary>
        /// Copies the information dictionary entries and the page count of a PDFBox document
        /// into a new <see cref="PDFInfo"/>.
        /// </summary>
        /// <param name="document">Document whose metadata and number of pages are read.</param>
        /// <returns>
        /// The populated <see cref="PDFInfo"/>. The creation and modification date strings are only
        /// assigned when the document provides the corresponding date.
        /// </returns>
        public static PDFInfo GetPDFDoucmentInformation(PDDocument document)
        {
            PDDocumentInformation metadata = document.getDocumentInformation();
            PDFInfo result = new PDFInfo();

            result.Author = metadata.getAuthor();

            // Creation date, rendered as "<long date> <long time>".
            var created = metadata.getCreationDate();
            if (created != null)
            {
                DateTime createdAt = Utilities.Utils.GetDateFromJava(created);
                result.CreationDate = string.Concat(createdAt.ToLongDateString(), " ", createdAt.ToLongTimeString());
            }

            result.Creator  = metadata.getCreator();
            result.Keywords = metadata.getKeywords();

            // Modification date, same rendering as the creation date.
            var modified = metadata.getModificationDate();
            if (modified != null)
            {
                DateTime modifiedAt = Utilities.Utils.GetDateFromJava(modified);
                result.ModificationDate = string.Concat(modifiedAt.ToLongDateString(), " ", modifiedAt.ToLongTimeString());
            }

            result.Producer      = metadata.getProducer();
            result.Subject       = metadata.getSubject();
            result.Title         = metadata.getTitle();
            result.Trapped       = metadata.getTrapped();
            result.NumberOfPages = document.getNumberOfPages();

            return result;
        }
Пример #5
0
        /// <summary>
        /// Copies the entries of a PDFBox information dictionary into a new
        /// <see cref="PDFDocumentInformation"/>.
        /// </summary>
        /// <param name="pdfDocument">Document whose information dictionary is read.</param>
        /// <returns>
        /// The populated <see cref="PDFDocumentInformation"/>; both dates are passed through
        /// <c>ConvertJavaDateToCSharp</c>.
        /// </returns>
        internal PDFDocumentInformation GetDocumentInformation(PDDocument pdfDocument)
        {
            PDDocumentInformation  documentInformation    = pdfDocument.getDocumentInformation();
            PDFDocumentInformation pDFDocumentInformation = new PDFDocumentInformation()
            {
                Author = documentInformation.getAuthor()
            };

            // The creation date is read once; the former stand-alone getCreationDate() call
            // discarded its result and only repeated the lookup.
            pDFDocumentInformation.CreationDate = this.ConvertJavaDateToCSharp(documentInformation.getCreationDate());
            pDFDocumentInformation.Creator      = documentInformation.getCreator();
            pDFDocumentInformation.Keywords     = documentInformation.getKeywords();
            pDFDocumentInformation.ModifiedDate = this.ConvertJavaDateToCSharp(documentInformation.getModificationDate());
            pDFDocumentInformation.Producer     = documentInformation.getProducer();
            pDFDocumentInformation.Subject      = documentInformation.getSubject();
            pDFDocumentInformation.Title        = documentInformation.getTitle();
            pDFDocumentInformation.Trapped      = documentInformation.getTrapped();
            return(pDFDocumentInformation);
        }
Пример #6
0
        /// <summary>
        /// Prepares a conversion: copies the input PDF into a per-instance temp directory, rebuilds it
        /// as a new document (pages plus selected catalog entries), saves and reloads that copy into
        /// <c>this.doc</c>, and writes two embedded resources to disk.
        /// </summary>
        /// <param name="fileName">Path of the input PDF; stored in <c>inputFileName</c>.</param>
        /// <param name="output">Path of the output file; stored in <c>outPutFileName</c>. Its file name is reused for the temp file.</param>
        /// <param name="PDFAFlavour">Requested flavour; its integer value is passed to <c>SetFlavour</c> and stored.</param>
        /// <remarks>
        /// Any exception raised in the constructor body (outside the inner resource-extraction try)
        /// terminates the process via <c>Environment.Exit(104)</c> rather than propagating to the caller.
        /// </remarks>
        public PDF2PDFaConverter(string fileName, string output, AquaforestPDFAFlavour PDFAFlavour)
        {
            try
            {
                // Points the commons-logging "Log" setting at the NoOpLog implementation
                // (presumably to silence PDFBox logging - confirm).
                Environment.SetEnvironmentVariable("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog");
                int pDFAFlavour = (int)PDFAFlavour;
                this.SetFlavour(pDFAFlavour);
                this.PDFAFlavour    = pDFAFlavour;
                this.outPutFileName = output;
                this.inputFileName  = fileName;
                // Temp file: <temp path>\aquaforest\pdftoolkit\<new guid>\<file name of output>.
                string tempPath = Path.GetTempPath();
                Guid   guid     = Guid.NewGuid();
                this.tempFileName = Path.Combine(tempPath, string.Concat("aquaforest\\pdftoolkit\\", guid.ToString(), "\\", Path.GetFileName(output)));
                string directoryName = Path.GetDirectoryName(this.tempFileName);
                if (!Directory.Exists(directoryName))
                {
                    Directory.CreateDirectory(directoryName);
                }
                // Work on a copy of the input ("password.pdf" in the temp directory), not on the input itself.
                string str = Path.Combine(directoryName, "password.pdf");
                System.IO.File.Copy(this.inputFileName, str, true);
                PDDocument pDDocument = PDDocument.load(new java.io.File(str));
                this.doc = new PDDocument();
                // Raise the new document's PDF version to at least 1.4 (case 1) or 1.7 (cases 2 and 3).
                // NOTE(review): pdfaversion is not assigned in this constructor; presumably SetFlavour sets it - confirm.
                switch (this.pdfaversion)
                {
                case 1:
                {
                    if (this.doc.getDocument().getVersion() < 1.4f)
                    {
                        this.doc.getDocument().setVersion(1.4f);
                    }
                    break;
                }

                case 2:
                {
                    if (this.doc.getDocument().getVersion() < 1.7f)
                    {
                        this.doc.getDocument().setVersion(1.7f);
                    }
                    break;
                }

                case 3:
                {
                    if (this.doc.getDocument().getVersion() < 1.7f)
                    {
                        this.doc.getDocument().setVersion(1.7f);
                    }
                    break;
                }
                }
                // Move every page of the loaded copy into the new document.
                foreach (PDPage page in pDDocument.getPages())
                {
                    this.doc.addPage(page);
                }
                // Carry over the information dictionary and the listed catalog entries.
                this.doc.setDocumentInformation(pDDocument.getDocumentInformation());
                this.doc.getDocumentCatalog().setDocumentOutline(pDDocument.getDocumentCatalog().getDocumentOutline());
                this.doc.getDocumentCatalog().setAcroForm(pDDocument.getDocumentCatalog().getAcroForm());
                this.doc.getDocumentCatalog().setLanguage(pDDocument.getDocumentCatalog().getLanguage());
                this.doc.getDocumentCatalog().setMetadata(pDDocument.getDocumentCatalog().getMetadata());
                this.doc.getDocumentCatalog().setPageLabels(pDDocument.getDocumentCatalog().getPageLabels());
                this.doc.getDocumentCatalog().setViewerPreferences(pDDocument.getDocumentCatalog().getViewerPreferences());
                this.doc.getDocumentCatalog().setPageMode(pDDocument.getDocumentCatalog().getPageMode());
                this.doc.getDocumentCatalog().setPageLayout(pDDocument.getDocumentCatalog().getPageLayout());
                // Persist the rebuilt document, then reopen it from the temp file. The source copy is
                // closed only after the save, since its pages were still referenced by this.doc until then.
                this.doc.save(this.tempFileName);
                this.doc.close();
                this.doc = PDDocument.load(new java.io.File(this.tempFileName));
                if (pDDocument != null)
                {
                    pDDocument.close();
                    pDDocument = null;
                }
                try
                {
                    // Write the embedded ICC profile and font.xml to files with relative paths,
                    // i.e. into the process's current directory.
                    // NOTE(review): the resource streams are never disposed, and a single Stream.Read call
                    // is not guaranteed by its contract to fill the buffer - confirm this is acceptable.
                    Assembly executingAssembly      = Assembly.GetExecutingAssembly();
                    Stream   manifestResourceStream = executingAssembly.GetManifestResourceStream("Aquaforest.PDF.sRGB_IEC61966-2-1_black_scaled.icc");
                    using (FileStream fileStream = System.IO.File.Create("sRGB_IEC61966-2-1_black_scaled.icc", (int)manifestResourceStream.Length))
                    {
                        byte[] numArray = new byte[checked (manifestResourceStream.Length)];
                        manifestResourceStream.Read(numArray, 0, (int)numArray.Length);
                        fileStream.Write(numArray, 0, (int)numArray.Length);
                    }
                    manifestResourceStream = executingAssembly.GetManifestResourceStream("Aquaforest.PDF.font.xml");
                    using (FileStream fileStream1 = System.IO.File.Create("font.xml", (int)manifestResourceStream.Length))
                    {
                        byte[] numArray1 = new byte[checked (manifestResourceStream.Length)];
                        manifestResourceStream.Read(numArray1, 0, (int)numArray1.Length);
                        fileStream1.Write(numArray1, 0, (int)numArray1.Length);
                    }
                }
                catch
                {
                    // Failures while extracting the embedded resources are ignored.
                }
                this.iccString = "sRGB_IEC61966-2-1_black_scaled.icc";
            }
            catch (Exception exception)
            {
                // NOTE(review): any failure ends the whole process with exit code 104; the exception
                // is neither logged nor rethrown, and documents opened above are not closed first.
                Environment.Exit(104);
            }
        }
Пример #7
0
        /// <summary>
        /// Builds a Lucene <see cref="Document"/> for a stored file from its database record and,
        /// when the PDF can be read, from its text and metadata.
        /// </summary>
        /// <param name="filePath">Path of the PDF file to read text, author, keywords and subject from.</param>
        /// <param name="doc">Database record supplying file name, short name, title, summary and id.</param>
        /// <returns>
        /// The Lucene document. File name, short name, title, document id and resource type are always
        /// added; author, keywords, summary, header and text are added only when not blank.
        /// </returns>
        /// <remarks>
        /// Failures while reading the PDF are written to the debug output and otherwise ignored, so the
        /// document is still indexed from the database fields alone.
        /// </remarks>
        public static Document ParseDocument(string filePath, GEN_FILE doc)
        {
            string author   = null;
            string keywords = null;
            string summary  = null;
            string text     = null;

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                PDDocument      document = PDDocument.load(filePath);
                try
                {
                    text = stripper.getText(document);
                    PDDocumentInformation info = document.getDocumentInformation();
                    author   = info.getAuthor();
                    keywords = info.getKeywords();
                    summary  = info.getSubject();
                }
                finally
                {
                    // Close the document even when text extraction or metadata access throws.
                    document.close();
                }
            }
            catch (Exception ex)
            {
                Debug.WriteLine("Exception in reading file: " + filePath + " ex: " + ex.Message);
            }

            Document lucDoc     = new Document();
            string   filename   = Path.GetFileNameWithoutExtension(doc.File_Name);
            string   short_name = doc.Short_Name;
            string   title      = doc.Title;
            string   header     = doc.Summary;
            string   doc_id     = doc.Gen_File_Id.ToString();

            Debug.WriteLine("DocID: " + doc_id);

            StringBuilder keyTextBuilder = new StringBuilder();

            foreach (FILE_KEYWORDS keywordobj in doc.FILE_KEYWORDS.ToList())
            {
                keyTextBuilder.Append(keywordobj.Keyword + " ");
            }
            // NOTE(review): this concatenation of the database keywords is never added to the index;
            // it is kept as-is because enumerating FILE_KEYWORDS may have effects callers rely on - confirm intent.
            string keyword = keyTextBuilder.ToString();

            lucDoc.Add(new Field(FieldNames.FILE_NAME, filename, Field.Store.YES, Field.Index.ANALYZED));
            if (!String.IsNullOrWhiteSpace(author))
            {
                lucDoc.Add(new Field(FieldNames.AUTHOR, author, Field.Store.YES, Field.Index.ANALYZED));
            }

            if (!String.IsNullOrWhiteSpace(keywords))
            {
                lucDoc.Add(new Field(FieldNames.KEYWORDS, keywords, Field.Store.YES, Field.Index.ANALYZED));
            }

            if (!String.IsNullOrWhiteSpace(summary))
            {
                lucDoc.Add(new Field(FieldNames.SUMMARY, summary, Field.Store.YES, Field.Index.ANALYZED));
            }
            lucDoc.Add(new Field(FieldNames.SHORT_NAME, short_name, Field.Store.YES, Field.Index.ANALYZED));
            lucDoc.Add(new Field(FieldNames.TITLE, title, Field.Store.YES, Field.Index.ANALYZED));

            if (!String.IsNullOrWhiteSpace(header))
            {
                lucDoc.Add(new Field(FieldNames.HEADER, header, Field.Store.YES, Field.Index.ANALYZED));
            }
            if (!String.IsNullOrWhiteSpace(text))
            {
                lucDoc.Add(new Field(FieldNames.TEXT, text, Field.Store.YES, Field.Index.ANALYZED));
            }

            // The id is stored but not indexed; the resource type is indexed as a single untokenized term.
            lucDoc.Add(new Field(FieldNames.DOC_ID, doc_id, Field.Store.YES, Field.Index.NO));
            lucDoc.Add(new Field(FieldNames.RESOURCE_TYPE, ResourceTypeEnum.Resource_Doc.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));


            return(lucDoc);
        }