コード例 #1
0
ファイル: PDFMerger.cs プロジェクト: jehan2898/root
 /// <summary>
 /// Merges the given PDF files, in list order, into a single output file.
 /// When <c>PDFHelper.AddStamp</c> is set, the merge result is written to a temporary
 /// file, passed through <c>PDFHelper.AddTrialStampIfNecessary</c> and then saved.
 /// </summary>
 /// <param name="sourcePDFs">Paths of the PDF files to merge.</param>
 /// <param name="outputFile">Path of the merged PDF to write.</param>
 public void MergePDFs(List <string> sourcePDFs, string outputFile)
 {
     PDFHelper.DisplayTrialPopupIfNecessary();
     foreach (string sourcePDF in sourcePDFs)
     {
         this.mergeUtility.addSource(sourcePDF);
     }

     if (!PDFHelper.AddStamp)
     {
         this.mergeUtility.setDestinationFileName(outputFile);
         this.mergeUtility.mergeDocuments();
         return;
     }

     // The stamp helper works on a loaded PDDocument, so merge into a temporary file first.
     string tempFile = Path.Combine(Path.GetTempPath(), string.Concat(Path.GetRandomFileName(), ".pdf"));
     PDDocument merged = null;
     try
     {
         this.mergeUtility.setDestinationFileName(tempFile);
         this.mergeUtility.mergeDocuments();
         merged = PDDocument.load(new java.io.File(tempFile));
         merged = PDFHelper.AddTrialStampIfNecessary(merged);
         merged.save(outputFile);
     }
     finally
     {
         // Release the document and remove the temporary file even when a step above failed.
         try
         {
             if (merged != null)
             {
                 merged.close();
             }
         }
         finally
         {
             if (System.IO.File.Exists(tempFile))
             {
                 System.IO.File.Delete(tempFile);
             }
         }
     }
 }
コード例 #2
0
        /// <summary>
        /// Extracts the text of a PDF file page by page.
        /// </summary>
        /// <param name="pdfFileName">Path of the PDF file to read.</param>
        /// <returns>A dictionary keyed by 1-based page number whose values are the results of GetText for that page.</returns>
        /// <exception cref="FileNotFoundException">The file does not exist.</exception>
        public static Dictionary <int, string> Extract(string pdfFileName)
        {
            if (!File.Exists(pdfFileName))
            {
                // Report the actual path rather than the literal parameter name.
                throw new FileNotFoundException("PDF file not found.", pdfFileName);
            }

            var        result      = new Dictionary <int, string>();
            PDDocument pdfDocument = PDDocument.load(pdfFileName);

            try
            {
                var pdfStripper = new PDFTextStripper();
                pdfStripper.setPageSeparator(Environment.NewLine + Environment.NewLine);

                int pageCount = pdfDocument.getNumberOfPages();
                for (int page = 1; page <= pageCount; page++)
                {
                    // Restrict the stripper to a single page per call.
                    pdfStripper.setStartPage(page);
                    pdfStripper.setEndPage(page);

                    result.Add(page, GetText(pdfStripper, pdfDocument));
                }
            }
            finally
            {
                // Close the document even when text extraction fails.
                pdfDocument.close();
            }

            return result;
        }
コード例 #3
0
        /// <summary>
        /// Extracts the text of a PDF file and checks whether it contains the given text,
        /// ignoring spaces (and, in the PDF text, line breaks).
        /// </summary>
        /// <param name="pdfFileName">Path of the PDF file to load.</param>
        /// <param name="textToCheck">The text to be validated.</param>
        /// <returns>True if the text is present; false if it is absent or the PDF yields no text.</returns>
        public static bool ExtractAndValidateTextFromPDF(string pdfFileName, string textToCheck)
        {
            string     result;
            PDDocument doc = PDDocument.load(pdfFileName);

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                result = stripper.getText(doc);
            }
            finally
            {
                // Close the document even when extraction throws.
                doc.close();
            }

            if (result.Length == 0)
            {
                return false;
            }

            // NOTE(review): re-decoding the default-encoded bytes as UTF-8 can alter
            // non-ASCII characters; kept as-is to preserve existing results — confirm intent.
            byte[] bytes            = Encoding.Default.GetBytes(result);
            string decodedresult    = Encoding.UTF8.GetString(bytes);
            string outputPDF        = decodedresult.Replace(" ", null).Replace("\r", null).Replace("\n", null);
            string validationString = textToCheck.Replace(" ", null);

            return outputPDF.Contains(validationString);
        }
コード例 #4
0
        /// <summary>
        /// Get a thumbnail of the document, if possible. The thumbnail is rendered from the first page.
        /// </summary>
        /// <param name="sizeX">The maximum X size of the thumbnail.
        /// NOTE(review): not applied by this implementation — the page is rendered at its default size.</param>
        /// <param name="sizeY">The maximum Y size of the thumbnail. NOTE(review): not applied, see sizeX.</param>
        /// <param name="forceFullSize">True if the thumbnail should be exactly XxY pixels and False if the thumbnail
        /// should fit inside a XxY box but should maintain its aspect ratio. NOTE(review): not applied, see sizeX.</param>
        /// <returns>A JPEG byte thumbnail or null if the thumbnail can't be generated</returns>
        public override byte[] GetThumbnail(int sizeX, int sizeY, bool forceFullSize)
        {
            // If we have no bytes then we can't do anything.
            if (Bytes == null || Bytes.Length == 0)
            {
                return null;
            }

            PDDocument doc = null;

            try
            {
                java.io.ByteArrayInputStream byteStream = new java.io.ByteArrayInputStream(Bytes);
                doc = PDDocument.load(byteStream);

                java.util.List pages = doc.getDocumentCatalog().getAllPages();
                if (pages.size() > 0)
                {
                    // Take the first page of the loaded document; a freshly constructed
                    // PageDrawer has not been handed a page to draw yet.
                    PDPage page = (PDPage)pages.get(0);
                    java.awt.image.BufferedImage  image = page.convertToImage();
                    java.io.ByteArrayOutputStream os    = new java.io.ByteArrayOutputStream();
                    ImageIO.write(image, "jpg", os);
                    return os.toByteArray();
                }
            }
            catch (Exception e)
            {
                log.Error("Failed to get the thumbnail from the PDF file " + Name, e);
            }
            finally
            {
                if (doc != null)
                {
                    try
                    {
                        doc.close();
                    }
                    catch (Exception e)
                    {
                        // A failed close must not discard an already rendered thumbnail.
                        log.Error("Failed to close the PDF file " + Name, e);
                    }
                }
            }

            return null;
        }
コード例 #5
0
        /// <summary>
        /// Reads the keywords entry of the PDF document information and splits it on ',' and ';'.
        /// </summary>
        /// <returns>The keyword candidates, or null when there are no bytes, no keywords,
        /// or the document could not be read.</returns>
        public override string[] ExtractKeyWordCandidatesFromFile()
        {
            string text = null;

            // If we have no bytes then we can't do anything.
            if (Bytes == null || Bytes.Length == 0)
            {
                // Log the problem.
                log.Error("Tried to extract keywords from empty bytes for file " + Name);
                return null;
            }

            PDDocument doc = null;

            try
            {
                java.io.ByteArrayInputStream byteStream = new java.io.ByteArrayInputStream(Bytes);
                doc = PDDocument.load(byteStream);

                // TODO Internationalize this conversion
                text = doc.getDocumentInformation().getKeywords();
            }
            catch (Exception e)
            {
                log.Warn("Failed to get the keywords from the PDF file " + Name, e);
            }
            finally
            {
                if (doc != null)
                {
                    try
                    {
                        doc.close();
                    }
                    catch (Exception e)
                    {
                        // Keep whatever keywords were already read.
                        log.Warn("Failed to close the PDF file " + Name, e);
                    }
                }
            }

            if (string.IsNullOrEmpty(text))
            {
                return null;
            }

            return text.Split(new char[] { ',', ';' });
        }
コード例 #6
0
            /// <summary>
            /// Loads a PDF file and returns its text as extracted by PDFTextStripper.
            /// </summary>
            /// <param name="filepath">Path of the PDF file.</param>
            /// <returns>The extracted text.</returns>
            public string parsePDF(string filepath)
            {
                PDDocument document = PDDocument.load(filepath);

                try
                {
                    PDFTextStripper stripper = new PDFTextStripper();
                    return stripper.getText(document);
                }
                finally
                {
                    // The document was previously never closed.
                    document.close();
                }
            }
コード例 #7
0
        /// <summary>
        /// Get text from the binary using PDFBox
        /// </summary>
        /// <returns>The text of the binary or null if we could not process the text</returns>
        public override string GetTextFromDocumentBinary()
        {
            string text = null;

            // If we have no bytes then we can't do anything.
            if (Bytes == null || Bytes.Length == 0)
            {
                // Log the problem.
                log.Error("Tried to extract text from empty bytes for file " + Name);
                return null;
            }

            PDDocument doc = null;

            try
            {
                java.io.ByteArrayInputStream byteStream = new java.io.ByteArrayInputStream(Bytes);
                doc = PDDocument.load(byteStream);
                PDFTextStripper stripper = new PDFTextStripper();
                text = stripper.getText(doc);
            }
            catch (Exception e)
            {
                log.Error("Failed to get the text from the PDF file " + Name, e);
            }
            finally
            {
                if (doc != null)
                {
                    try
                    {
                        doc.close();
                    }
                    catch (Exception e)
                    {
                        // Keep the text that was already extracted.
                        log.Error("Failed to close the PDF file " + Name, e);
                    }
                }
            }

            return text;
        }
コード例 #8
0
ファイル: PdfController.cs プロジェクト: 943885179/CoreDemo
        /// <summary>
        /// Reads the text of a PDF document.
        /// </summary>
        /// <param name="file">Uploaded file. NOTE(review): currently unused — the method always
        /// reads the hard-coded path below; confirm whether the upload should be read instead.</param>
        /// <returns>The text extracted by PDFTextStripper.</returns>
        public string Read(IFormFile file)
        {
            PDDocument doc = PDDocument.load(@"G:/Read.pdf");

            try
            {
                PDFTextStripper pdfStripper = new PDFTextStripper();
                return pdfStripper.getText(doc);
            }
            finally
            {
                // The document was previously never closed.
                doc.close();
            }
        }
コード例 #9
0
ファイル: PDFParser.cs プロジェクト: mlnethub/thrinax
        /// <summary>
        /// Parses a PDF supplied as a byte array.
        /// </summary>
        /// <param name="pdfStream">The PDF content as bytes.</param>
        /// <param name="tableContainType">Table containment style, passed through to the document-based overload.</param>
        /// <returns>The parsed model, or null when loading or parsing throws.</returns>
        public static PDFModel Parser(byte[] pdfStream, TableContainType tableContainType)
        {
            PDDocument reader = null;

            try
            {
                InputStream sbs = new ByteArrayInputStream(pdfStream);
                reader = PDDocument.load(sbs);
                return Parser(reader, tableContainType);
            }
            catch (Exception)
            {
                return null;
            }
            finally
            {
                // Single, null-safe close: the reader is null when load itself failed, and the
                // previous catch-then-finally sequence dereferenced a reader it had just nulled.
                if (reader != null)
                {
                    reader.close();
                }
            }
        }
コード例 #10
0
        /// <summary>
        /// Command-line entry point: adds a text watermark to every page of a PDF.
        /// </summary>
        /// <param name="args">args[0] = original PDF filename, args[1] = watermark text,
        /// optional args[2] = new PDF filename (defaults to "watermarked_" + original file name,
        /// placed next to the original).</param>
        static void Main(string[] args)
        {
            if (args == null || args.Length < 2)
            {
                System.Console.WriteLine("Usage: " + AppDomain.CurrentDomain.FriendlyName + " <original PDF filename> <watermark text> [new PDF filename]" + Environment.NewLine +
                                         "  For example: " + AppDomain.CurrentDomain.FriendlyName + " myDoc.pdf \"This is a Draft\"");
                return;
            }

            string origName     = args[0];
            string watermarkTxt = args[1];
            if (!System.IO.File.Exists(origName))
            {
                System.Console.WriteLine("Error: cannot find the original PDF file(" + origName + "). Please correct the filename or the path and try again.");
                return;
            }

            // Honour the optional third argument; otherwise prefix only the file name so that
            // an input such as "docs\a.pdf" yields "docs\watermarked_a.pdf".
            string newName;
            if (args.Length > 2)
            {
                newName = args[2];
            }
            else
            {
                string directory = System.IO.Path.GetDirectoryName(origName) ?? string.Empty;
                newName = System.IO.Path.Combine(directory, "watermarked_" + System.IO.Path.GetFileName(origName));
            }

            PDDocument origDoc = PDDocument.load(new java.io.File(origName)); // NOTE: PDDocument.load() only takes java.io.File, not System.IO.File from C#.Net
            try
            {
                PDPageTree allPages = origDoc.getPages();
                PDFont     font     = PDType1Font.HELVETICA_BOLD;
                for (int i = 0, len = allPages.getCount(); i < len; ++i)
                {
                    PDPage pg = (PDPage)allPages.get(i);
                    // Use the text given on the command line instead of a fixed string.
                    addWatermarkText(origDoc, pg, font, watermarkTxt);
                }

                origDoc.save(newName);
            }
            finally
            {
                origDoc.close();
            }
        }
コード例 #11
0
ファイル: pdfbox.cs プロジェクト: mayurjansari/pdfComm
        /// <summary>
        /// Loads a PDF file and returns its text as extracted by PDFTextStripper.
        /// </summary>
        /// <param name="PDFFilePath">Path of the PDF file.</param>
        /// <returns>The extracted text.</returns>
        public static String PDFText(String PDFFilePath)
        {
            PDDocument doc = PDDocument.load(PDFFilePath);

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                return stripper.getText(doc);
            }
            finally
            {
                // The document was previously never closed.
                doc.close();
            }
        }
コード例 #12
0
        /// <summary>
        /// Extracts the text of a PDF file page by page.
        /// </summary>
        /// <param name="pdfFileName">Path of the PDF file to read.</param>
        /// <returns>A dictionary keyed by 1-based page number whose values are the results of GetText for that page.</returns>
        /// <exception cref="FileNotFoundException">The file does not exist.</exception>
        public static Dictionary <int, string> Extract(string pdfFileName)
        {
            if (!File.Exists(pdfFileName))
            {
                // Report the actual path rather than the literal parameter name.
                throw new FileNotFoundException("PDF file not found.", pdfFileName);
            }

            var        result      = new Dictionary <int, string>();
            PDDocument pdfDocument = PDDocument.load(pdfFileName);

            try
            {
                var pdfStripper = new PDFTextStripper();
                pdfStripper.setPageSeparator(Environment.NewLine + Environment.NewLine);

                int pageCount = pdfDocument.getNumberOfPages();
                for (int page = 1; page <= pageCount; page++)
                {
                    // Restrict the stripper to a single page per call.
                    pdfStripper.setStartPage(page);
                    pdfStripper.setEndPage(page);

                    result.Add(page, GetText(pdfStripper, pdfDocument));
                }
            }
            finally
            {
                // Close the document even when text extraction fails.
                pdfDocument.close();
            }

            return result;
        }
コード例 #13
0
ファイル: PDFValidator.cs プロジェクト: jehan2898/root
 /// <summary>
 /// Loads the PDF (with the password when one is given), records whether it is encrypted
 /// and runs <c>CheckAllPages</c> on it. Failures are reported through <c>IsValid</c>,
 /// <c>IsPasswordProtected</c> and <c>ErrorMessage</c> rather than thrown.
 /// </summary>
 /// <param name="fileName">Path of the PDF file to validate.</param>
 /// <param name="password">Password for the document; null or empty to load without one.</param>
 public PDFValidator(string fileName, string password)
 {
     try
     {
         this.IsValid = true;
         if (!System.IO.File.Exists(fileName))
         {
             // NOTE(review): IsValid stays true for a missing file — confirm this is intended.
             Console.WriteLine("The PDF file does not Exist.");
             return;
         }

         java.io.File pdfFile    = new java.io.File(fileName);
         PDDocument   pDDocument = string.IsNullOrEmpty(password)
             ? PDDocument.load(pdfFile)
             : PDDocument.load(pdfFile, password);

         try
         {
             if (pDDocument.isEncrypted())
             {
                 this.IsPasswordProtected = true;
             }
             this.CheckAllPages(pDDocument);
         }
         finally
         {
             // Close the document even when a check throws.
             pDDocument.close();
         }
     }
     catch (InvalidPasswordException)
     {
         this.IsPasswordProtected = true;
         this.IsValid             = false;
     }
     catch (Exception exception)
     {
         this.ErrorMessage = string.Format("PDF analysis failed With exception {0}", exception.Message);
         this.IsValid      = false;
     }
 }
コード例 #14
0
ファイル: PDFMerger.cs プロジェクト: jehan2898/root
 /// <summary>
 /// Recursively processes a directory tree: sub-directories first, then merges the "*.pdf"
 /// files found directly in <paramref name="sourceDirectoryInfo"/> into
 /// "&lt;directory name&gt;.pdf" inside <c>outputFolder</c>.
 /// </summary>
 /// <param name="sourceDirectoryInfo">Directory whose PDFs (and sub-directories) are merged.</param>
 private void GetSubFolders(DirectoryInfo sourceDirectoryInfo)
 {
     foreach (DirectoryInfo subDirectory in sourceDirectoryInfo.GetDirectories())
     {
         this.GetSubFolders(subDirectory);
     }

     // A fresh utility per directory, created after the recursion has finished with the field.
     this.mergeUtility = new PDFMergerUtility();
     foreach (FileInfo fileInfo in sourceDirectoryInfo.GetFiles("*.pdf"))
     {
         this.mergeUtility.addSource(fileInfo.FullName);
     }

     string destination = Path.Combine(this.outputFolder, string.Concat(sourceDirectoryInfo.Name, ".pdf"));
     if (!PDFHelper.AddStamp)
     {
         this.mergeUtility.setDestinationFileName(destination);
         this.mergeUtility.mergeDocuments();
         return;
     }

     string tempFile = Path.Combine(Path.GetTempPath(), string.Concat("aquaforest\\pdftoolkit\\", Path.GetRandomFileName(), ".pdf"));
     // The temporary sub-folder may not exist yet; CreateDirectory is a no-op when it does.
     System.IO.Directory.CreateDirectory(Path.GetDirectoryName(tempFile));

     PDDocument pDDocument = null;
     try
     {
         this.mergeUtility.setDestinationFileName(tempFile);
         this.mergeUtility.mergeDocuments();
         pDDocument = PDDocument.load(new java.io.File(tempFile));
         pDDocument = PDFHelper.AddTrialStampIfNecessary(pDDocument);
         pDDocument.save(destination);
     }
     finally
     {
         // Release the document (previously never closed) and always remove the temporary file.
         try
         {
             if (pDDocument != null)
             {
                 pDDocument.close();
             }
         }
         finally
         {
             if (System.IO.File.Exists(tempFile))
             {
                 System.IO.File.Delete(tempFile);
             }
         }
     }
 }
コード例 #15
0
ファイル: LuceneClasses.cs プロジェクト: EugCrow/PdfSearch
    /// <summary>
    /// Loads a PDF file and returns its text as extracted by PDFTextStripper.
    /// For the handled exception types a message box is shown and an empty string is returned;
    /// other exceptions propagate to the caller.
    /// </summary>
    /// <param name="PDFFilePath">Path of the PDF file.</param>
    /// <returns>The extracted text, or "" when a handled error occurred.</returns>
    public static String PDFText(String PDFFilePath)
    {
        PDDocument doc  = null;
        var        text = " ";

        try
        {
            // Load inside the try so the handlers below also see load failures.
            doc = PDDocument.load(PDFFilePath);
            PDFTextStripper stripper = new PDFTextStripper();
            text = stripper.getText(doc);
            return text;
        }
        catch (UnauthorizedAccessException e)
        {
            MessageBox.Show("Невозможно скопировать текст из Пдф " + PDFFilePath + ". " + e.Message, "Сообщение об ошибке");
            return "";
        }
        catch (FileLoadException FLe)
        {
            MessageBox.Show("Невозможно загрузить Пдф " + PDFFilePath + ". " + FLe.Message, "Сообщение об ошибке");
            return "";
        }
        // NOTE(review): text is still " " whenever load/getText throws, so this filter
        // does not match on those paths — confirm what condition was intended.
        catch when(text == "")
        {
            MessageBox.Show("Невозможно загрузить Пдф " + PDFFilePath + ". ", "Сообщение об ошибке");
            return "";
        }
        finally
        {
            // doc is null when load itself failed.
            if (doc != null)
            {
                doc.close();
            }
        }
    }
        /// <summary>
        /// Extracts the text of a PDF and writes it as a single paragraph into a new DocX document.
        /// </summary>
        /// <param name="PDFpath">Path of the PDF file to convert.</param>
        /// <returns>The path of the created document, or "" when any step throws.</returns>
        protected internal String ConvertPDFToDoc(string PDFpath)
        {
            try
            {
                String     StringDocx;
                PDDocument PDFdoc = PDDocument.load(PDFpath);

                try
                {
                    PDFTextStripper textstrip = new PDFTextStripper();
                    StringDocx = textstrip.getText(PDFdoc);
                }
                finally
                {
                    // Close the PDF even when text extraction fails.
                    PDFdoc.close();
                }

                // Build the path where the document will be saved.
                String DocxPath = fn.CreateFolderToSaveDocs(fileName);
                var    wordDoc  = DocX.Create(DocxPath);
                wordDoc.InsertParagraph(StringDocx);
                wordDoc.Save();
                return DocxPath;
            }
            catch (Exception)
            {
                // Callers receive an empty path on any failure.
                return "";
            }
        }
コード例 #17
0
        /// <summary>
        /// Extracts the text of a downloaded PDF file.
        /// </summary>
        /// <param name="filename">Name passed to checkFileExists and getPDFFilePath to locate the PDF.</param>
        /// <returns>The extracted text, or "" when extraction failed (the failure is logged).</returns>
        public string ExtractTextFromPdf(string filename)
        {
            String text        = "";
            bool   fileMissing = false;

            try
            {
                if (checkFileExists(filename))
                {
                    _log.Info(filename + "exists in the download folder");
                    PDDocument doc = null;
                    try
                    {
                        doc = PDDocument.load(getPDFFilePath(filename));
                        PDFTextStripper stripper = new PDFTextStripper();
                        text = stripper.getText(doc);
                    }
                    catch (Exception e)
                    {
                        _log.Info("Exception in Extracting data from file " + filename + ".pdf" + e.StackTrace);
                    }
                    finally
                    {
                        if (doc != null)
                        {
                            doc.close();
                        }
                    }
                }
                else
                {
                    fileMissing = true;
                }
            }
            catch (Exception e)
            {
                _log.Info("Exception in extracting text from PDF: " + e.Message);
            }

            // Fail outside the try block: inside it, the exception raised by Assert.Fail
            // was caught by the catch-all above and only logged.
            if (fileMissing)
            {
                Assert.Fail("PDF file not found in 'Downloads' folder.");
            }

            return text;
        }
コード例 #18
0
        /**
         * Extracts the text of a range of pages.
         * @param file path of the PDF document
         * @param startPage first page to extract
         * @param endPage last page to extract
         * @return the extracted text, or an empty string when an I/O error occurs
         */
        public static string ExtractTXT(String file, int startPage, int endPage)
        {
            String     content  = string.Empty;
            PDDocument document = null;

            try
            {
                document = PDDocument.load(file);
                PDFTextStripper stripper = new PDFTextStripper();

                // Output the text sorted by position.
                stripper.setSortByPosition(true);

                // Limit extraction to the requested page range.
                stripper.setStartPage(startPage);
                stripper.setEndPage(endPage);

                content = stripper.getText(document);
            }
            catch (java.io.FileNotFoundException)
            {
                // Best effort: a missing file yields an empty result.
            }
            catch (java.io.IOException)
            {
                // Best effort: an unreadable file yields an empty result.
            }
            finally
            {
                // Close the document on every path; it previously leaked when extraction threw.
                if (document != null)
                {
                    try
                    {
                        document.close();
                    }
                    catch (java.io.IOException)
                    {
                        // As before, an I/O error while closing does not discard the text.
                    }
                }
            }

            return content;
        }
        /// <summary>
        /// Reads the ZUGFeRD data embedded in a PDF and formats it as a message.
        /// </summary>
        /// <param name="filepath">Path of the PDF file.</param>
        /// <param name="message">Receives the formatted ZUGFeRD data, a "no data found" text,
        /// or the error details when an exception occurred.</param>
        /// <returns>null on success; otherwise the exception that was caught.</returns>
        public Exception ShowZUGFeRD(string filepath, out string message)
        {
            try
            {
                // The loaded document is not used further, so release it immediately;
                // loading still fails early for a file PDFBox cannot open.
                PDDocument doc = PDDocument.load(filepath);
                doc.close();

                // now check the contents (like MustangReaderTest)
                NE4ZUGFeRDImporter zi = new NE4ZUGFeRDImporter();
                zi.extract(filepath);

                // Read the ZUGFeRD data
                if (zi.canParse())
                {
                    zi.parse();

                    // Return the ZUGFeRD data as a string
                    message = string.Format("Menge: {0}\nRechnungsempfänger: {1}\nReferenz: {2}",
                                            zi.getAmount(), zi.getHolder(), zi.getForeignReference());
                }
                else
                {
                    message = "Keine ZUGFeRD Daten gefunden!";
                }

                //return ok
                return null;
            }
            catch (Exception ex)
            {
                // InnerException is null for most exceptions; fall back to the exception itself
                // instead of throwing a NullReferenceException from the handler.
                message = (ex.InnerException ?? ex).ToString();
                return ex;
            }
        }
コード例 #20
0
        /// <summary>
        /// Reads author, title, subject and keywords from the document information of a PDF file.
        /// </summary>
        /// <param name="fileName">Path of the PDF file.</param>
        /// <returns>An Info object; its fields are left unset when the document has no
        /// information or an exception occurs.</returns>
        public static Info ReadDocInfo(string fileName)
        {
            Info       result = new Info();
            PDDocument pDoc   = null;

            try
            {
                pDoc = PDDocument.load(fileName);

                PDDocumentInformation docInfo = pDoc.getDocumentInformation();

                if (docInfo != null)
                {
                    result.Author   = docInfo.getAuthor();
                    result.Title    = docInfo.getTitle();
                    result.Summary  = docInfo.getSubject();
                    result.Keywords = docInfo.getKeywords();
                }
            }
            catch (Exception)
            {
                // Best effort: return whatever was read before the failure.
            }
            finally
            {
                // The document was previously never closed.
                if (pDoc != null)
                {
                    try
                    {
                        pDoc.close();
                    }
                    catch (Exception)
                    {
                        // Same best-effort policy as above.
                    }
                }
            }

            return result;
        }
コード例 #21
0
        /// <summary>
        /// Loads the PDF whose path is in textBox1 and shows its extracted text in richTextBox1.
        /// </summary>
        private void btnShowPDF_Click(object sender, EventArgs e)
        {
            PDDocument PDF = PDDocument.load(textBox1.Text);

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                richTextBox1.Text = stripper.getText(PDF);
            }
            finally
            {
                // The document was previously never closed.
                PDF.close();
            }
        }
コード例 #22
0
        /// <summary>
        /// Experimental routine: loads a PDF, queries several PDFTextStripper getters and
        /// extracts the full text. All results are kept in locals only.
        /// </summary>
        /// <param name="input">Path of the PDF file to load.</param>
        private static void KamilPdfTest(string input)
        {
            PDDocument pdfDocument = null;

            try
            {
                pdfDocument = PDDocument.load(input);
                PDFTextStripper textStripper = new PDFTextStripper();

                // Stripper state is queried before any text has been extracted.
                Matrix textLineMatrix = textStripper.getTextLineMatrix();
                PDPage currentPage    = textStripper.getCurrentPage();
                Matrix textMatrix     = textStripper.getTextMatrix();
                int    characterCount = textStripper.getTotalCharCnt();

                string articleStart = textStripper.getArticleStart();
                string articleEnd   = textStripper.getArticleEnd();

                // Puts the whole text into a string - works.
                string fullText = textStripper.getText(pdfDocument);
                characterCount = fullText.Length;
            }
            finally
            {
                if (pdfDocument != null)
                {
                    pdfDocument.close();
                }
            }
        }
コード例 #23
0
ファイル: ImportText.cs プロジェクト: webwahab/YouWrite
        /// <summary>
        /// Loads a PDF file and returns its text as extracted by PDFTextStripper.
        /// </summary>
        /// <param name="input">Path of the PDF file.</param>
        /// <returns>The extracted text.</returns>
        public string parseUsingPDFBox(string input)
        {
            PDDocument doc = PDDocument.load(input);

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                return stripper.getText(doc);
            }
            finally
            {
                // The document was previously never closed.
                doc.close();
            }
        }
コード例 #24
0
ファイル: PDFParser.cs プロジェクト: radtek/funfuncode
        /// <summary>
        /// Extracts the text of a PDF file with PDFTextStripper and writes it to a UTF-8 text file.
        /// </summary>
        /// <param name="inpufFileName">Path of the PDF file to read.</param>
        /// <param name="outputFileName">Path of the text file to write (overwritten, not appended).</param>
        public void ExtractText(string inpufFileName, string outputFileName)
        {
#if false
            IFilterTextReader.FilterReader reader = new FilterReader(inpufFileName);
            var data = reader.ReadToEnd();
            using (var writer = new StreamWriter(outputFileName, false, System.Text.Encoding.UTF8))
            {
                writer.Write(data);
            }
#else
            PDDocument pdf = null;
            try
            {
                pdf = PDDocument.load(inpufFileName);
                PDFTextStripper textStripper = new PDFTextStripper();

                // The output file is opened before the text is extracted.
                using (StreamWriter output = new StreamWriter(outputFileName, false, System.Text.Encoding.UTF8))
                {
                    string extracted = textStripper.getText(pdf);
                    output.Write(extracted);
                }
            }
            finally
            {
                if (pdf != null)
                {
                    pdf.close();
                }
            }
#endif
        }
コード例 #25
0
        /// <summary>
        /// Prints the text of "lopreacamasa.pdf" to the console.
        /// </summary>
        static void Main(string[] args)
        {
            PDDocument doc = PDDocument.load("lopreacamasa.pdf");

            try
            {
                PDFTextStripper pdfStripper = new PDFTextStripper();
                Console.Write(pdfStripper.getText(doc));
            }
            finally
            {
                // The document was previously never closed.
                doc.close();
            }
        }
コード例 #26
0
        /// <summary>
        /// Loads a PDF file and returns its text as extracted by PDFTextStripper.
        /// </summary>
        /// <param name="fileName">Path of the PDF file.</param>
        /// <returns>The extracted text.</returns>
        private string GetTextFromPdfFile(string fileName)
        {
            PDDocument doc = PDDocument.load(fileName);

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                return stripper.getText(doc);
            }
            finally
            {
                // The document was previously never closed.
                doc.close();
            }
        }
コード例 #27
0
        /// <summary>
        /// Loads a PDF file and returns its text as extracted by PDFTextStripper.
        /// </summary>
        /// <param name="path">Path of the PDF file.</param>
        /// <returns>The extracted text.</returns>
        private static string ReadPdf(string path)
        {
            PDDocument doc = PDDocument.load(path);

            try
            {
                PDFTextStripper stripper = new PDFTextStripper();
                return stripper.getText(doc);
            }
            finally
            {
                // The document was previously never closed.
                doc.close();
            }
        }
コード例 #28
0
 /// <summary>
 /// Loads the stock quotes report PDF from a fixed path into the <c>doc</c> member.
 /// </summary>
 public ReportReaderTest_11062013()
 {
     // Earlier location, kept for reference:
     // C:\Users\Arnold\Documents\Projects\PSEGet3\trunk\PSEGetTest\stockQuotes_11062013.pdf
     const string reportPath = @"C:\PSEGet\Reports\stockQuotes_12172013.pdf";

     doc = PDDocument.load(reportPath);
 }
コード例 #29
0
        /// <summary>
        /// Loads the given PDF file and returns its text as extracted by PDFTextStripper.
        /// </summary>
        /// <param name="fileName">The PDF file; its FullName is used as the path.</param>
        /// <returns>The extracted text.</returns>
        public static string PdfFileReader(FileInfo fileName)
        {
            PDDocument doc = PDDocument.load(fileName.FullName);

            try
            {
                PDFTextStripper pdfStripper = new PDFTextStripper();
                return pdfStripper.getText(doc);
            }
            finally
            {
                // The document was previously never closed.
                doc.close();
            }
        }
コード例 #30
0
ファイル: PdfboxComponent.cs プロジェクト: wingfay/studio
 /// <summary>
 /// Opens the PDF and opens a UTF-8 writer on the output path, then closes both.
 /// The call that would write the document text is commented out, so no text is written.
 /// </summary>
 /// <param name="absoluteFilePath">Path of the PDF file to load.</param>
 /// <param name="outputPath">Path handed to the PrintWriter.</param>
 public void ToTxt(string absoluteFilePath, string outputPath)
 {
     java.io.File sourceFile = new java.io.File(absoluteFilePath);

     using (PDDocument document = PDDocument.load(sourceFile))
     {
         Writer textWriter = new PrintWriter(outputPath, "utf-8");
         // Disabled: new PDFDomTree().writeText(document, textWriter);
         textWriter.close();
     }
 }