/// <summary>
/// Merges the given PDF files, in the order supplied, into a single output file.
/// </summary>
/// <remarks>
/// When <c>PDFHelper.AddStamp</c> is set, the merge result is first written to a
/// temporary file, passed through <c>PDFHelper.AddTrialStampIfNecessary</c> and only
/// then saved to <paramref name="outputFile"/>. The temporary file is removed and the
/// document is closed even if stamping or saving fails.
/// </remarks>
/// <param name="sourcePDFs">Paths of the PDF files to merge.</param>
/// <param name="outputFile">Path of the merged PDF to write.</param>
public void MergePDFs(List<string> sourcePDFs, string outputFile)
{
    PDFHelper.DisplayTrialPopupIfNecessary();

    foreach (string sourcePDF in sourcePDFs)
    {
        this.mergeUtility.addSource(sourcePDF);
    }

    if (!PDFHelper.AddStamp)
    {
        this.mergeUtility.setDestinationFileName(outputFile);
        this.mergeUtility.mergeDocuments();
        return;
    }

    string tempFile = Path.Combine(Path.GetTempPath(), string.Concat(Path.GetRandomFileName(), ".pdf"));
    PDDocument merged = null;
    try
    {
        this.mergeUtility.setDestinationFileName(tempFile);
        this.mergeUtility.mergeDocuments();

        merged = PDDocument.load(new java.io.File(tempFile));
        merged = PDFHelper.AddTrialStampIfNecessary(merged);
        merged.save(outputFile);
    }
    finally
    {
        // Release the document before deleting the file it was loaded from.
        if (merged != null)
        {
            merged.close();
        }

        if (System.IO.File.Exists(tempFile))
        {
            System.IO.File.Delete(tempFile);
        }
    }
}
/// <summary>
/// Extracts the text of every page of a PDF file.
/// </summary>
/// <param name="pdfFileName">Path of the PDF file to read.</param>
/// <returns>A dictionary mapping the 1-based page number to the text obtained for that page.</returns>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
public static Dictionary<int, string> Extract(string pdfFileName)
{
    if (!File.Exists(pdfFileName))
    {
        throw new FileNotFoundException("PDF file not found.", pdfFileName);
    }

    var result = new Dictionary<int, string>();
    PDDocument pdfDocument = PDDocument.load(pdfFileName);
    try
    {
        var pdfStripper = new PDFTextStripper();
        pdfStripper.setPageSeparator(Environment.NewLine + Environment.NewLine);

        int pageCount = pdfDocument.getNumberOfPages();
        for (int page = 1; page <= pageCount; page++)
        {
            // Restrict the stripper to a single page so each entry holds one page's text.
            pdfStripper.setStartPage(page);
            pdfStripper.setEndPage(page);
            result.Add(page, GetText(pdfStripper, pdfDocument));
        }
    }
    finally
    {
        // Close the document even if extraction fails part-way through.
        pdfDocument.close();
    }

    return result;
}
/// <summary>
/// Extracts the text of a PDF file and checks whether it contains the given text,
/// ignoring spaces and line breaks.
/// </summary>
/// <param name="pdfFileName">Path of the PDF file to load.</param>
/// <param name="textToCheck">The text to look for; spaces in it are ignored.</param>
/// <returns><c>true</c> if the text is found; <c>false</c> if it is not found or the PDF yields no text.</returns>
public static bool ExtractAndValidateTextFromPDF(string pdfFileName, string textToCheck)
{
    string extracted;
    PDDocument doc = PDDocument.load(pdfFileName);
    try
    {
        extracted = new PDFTextStripper().getText(doc);
    }
    finally
    {
        // Close the document even when text extraction throws.
        doc.close();
    }

    if (string.IsNullOrEmpty(extracted))
    {
        return false;
    }

    // NOTE(review): this re-interprets the default-encoding bytes as UTF-8, which can
    // alter non-ASCII characters when Encoding.Default is not UTF-8. Kept as-is to
    // preserve existing matching behaviour — confirm whether it is still required.
    byte[] bytes = Encoding.Default.GetBytes(extracted);
    string decoded = Encoding.UTF8.GetString(bytes);

    string haystack = decoded.Replace(" ", null).Replace("\r", null).Replace("\n", null);
    string needle = textToCheck.Replace(" ", null);
    return haystack.Contains(needle);
}
/// <summary>
/// Get a thumbnail of the document, if possible, by rendering its first page as a JPEG.
/// </summary>
/// <remarks>
/// The size parameters are not applied by this implementation: the returned image is
/// whatever <c>PDPage.convertToImage()</c> produces for the first page.
/// </remarks>
/// <param name="sizeX">The maximum X size of the thumbnail (currently unused).</param>
/// <param name="sizeY">The maximum Y size of the thumbnail (currently unused).</param>
/// <param name="forceFullSize">Whether the thumbnail should be exactly XxY pixels (currently unused).</param>
/// <returns>The JPEG bytes of the thumbnail, or null if the thumbnail can't be generated.</returns>
public override byte[] GetThumbnail(int sizeX, int sizeY, bool forceFullSize)
{
    // If we have no bytes then we can't do anything.
    if (Bytes == null || Bytes.Length == 0)
    {
        return null;
    }

    byte[] thumbnail = null;
    try
    {
        PDDocument doc = PDDocument.load(new java.io.ByteArrayInputStream(Bytes));
        try
        {
            java.util.List pages = doc.getDocumentCatalog().getAllPages();
            if (pages.size() > 0)
            {
                // Render the first page of the loaded document.
                PDPage page = (PDPage)pages.get(0);
                java.awt.image.BufferedImage image = page.convertToImage();

                java.io.ByteArrayOutputStream os = new java.io.ByteArrayOutputStream();
                ImageIO.write(image, "jpg", os);
                thumbnail = os.toByteArray();
            }
        }
        finally
        {
            doc.close();
        }
    }
    catch (Exception e)
    {
        log.Error("Failed to get the thumbnail from the PDF file " + Name, e);
    }

    return thumbnail;
}
/// <summary>
/// Reads the keywords entry of the PDF document information and splits it into candidates.
/// </summary>
/// <returns>
/// The keywords split on ',' and ';', or null when there are no bytes, no keywords,
/// or the document could not be read.
/// </returns>
public override string[] ExtractKeyWordCandidatesFromFile()
{
    // If we have no bytes then we can't do anything.
    if (Bytes == null || Bytes.Length == 0)
    {
        log.Error("Tried to extract keywords from empty bytes for file " + Name);
        return null;
    }

    string text = null;
    try
    {
        PDDocument doc = PDDocument.load(new java.io.ByteArrayInputStream(Bytes));
        try
        {
            // TODO Internationalize this conversion
            text = doc.getDocumentInformation().getKeywords();
        }
        finally
        {
            doc.close();
        }
    }
    catch (Exception e)
    {
        log.Warn("Failed to get the keywords from the PDF file " + Name, e);
    }

    if (string.IsNullOrEmpty(text))
    {
        return null;
    }

    return text.Split(new char[] { ',', ';' });
}
/// <summary>
/// Loads the PDF at the given path and returns its text as produced by <c>PDFTextStripper</c>.
/// </summary>
/// <param name="filepath">Path of the PDF file to load.</param>
/// <returns>The extracted text.</returns>
public string parsePDF(string filepath)
{
    PDDocument document = PDDocument.load(filepath);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(document);
    }
    finally
    {
        // Always release the document, including when extraction throws.
        document.close();
    }
}
/// <summary>
/// Get text from the binary using PDFBox.
/// </summary>
/// <returns>The text of the binary, or null if we could not process the text.</returns>
public override string GetTextFromDocumentBinary()
{
    // If we have no bytes then we can't do anything.
    if (Bytes == null || Bytes.Length == 0)
    {
        log.Error("Tried to extract text from empty bytes for file " + Name);
        return null;
    }

    string text = null;
    try
    {
        PDDocument doc = PDDocument.load(new java.io.ByteArrayInputStream(Bytes));
        try
        {
            PDFTextStripper stripper = new PDFTextStripper();
            text = stripper.getText(doc);
        }
        finally
        {
            // Close inside the outer try so a failing close is logged rather than thrown.
            doc.close();
        }
    }
    catch (Exception e)
    {
        log.Error("Failed to get the text from the PDF file " + Name, e);
    }

    return text;
}
/// <summary>
/// Reads the text of an uploaded PDF.
/// </summary>
/// <remarks>
/// When no upload (or an empty one) is supplied, the method falls back to the
/// hard-coded path that was previously always used.
/// </remarks>
/// <param name="file">The uploaded PDF file.</param>
/// <returns>The text extracted by <c>PDFTextStripper</c>.</returns>
public string Read(IFormFile file)
{
    PDDocument doc = null;
    try
    {
        if (file != null && file.Length > 0)
        {
            // Buffer the upload so it can be handed to PDFBox as a Java input stream.
            using (var buffer = new System.IO.MemoryStream())
            {
                file.CopyTo(buffer);
                doc = PDDocument.load(new java.io.ByteArrayInputStream(buffer.ToArray()));
            }
        }
        else
        {
            // Legacy behaviour: fixed sample document.
            doc = PDDocument.load(@"G:/Read.pdf");
        }

        PDFTextStripper pdfStripper = new PDFTextStripper();
        return pdfStripper.getText(doc);
    }
    finally
    {
        if (doc != null)
        {
            doc.close();
        }
    }
}
/// <summary>
/// Parses a PDF supplied as an in-memory byte array.
/// </summary>
/// <param name="pdfStream">The bytes of the PDF document.</param>
/// <param name="tableContainType">The table containment style passed through to the document-based overload.</param>
/// <returns>The parsed model, or null if loading or parsing throws.</returns>
public static PDFModel Parser(byte[] pdfStream, TableContainType tableContainType)
{
    PDDocument reader = null;
    try
    {
        InputStream sbs = new ByteArrayInputStream(pdfStream);
        reader = PDDocument.load(sbs);
        return Parser(reader, tableContainType);
    }
    catch (Exception)
    {
        return null;
    }
    finally
    {
        // reader is still null when load itself failed, so guard the close.
        if (reader != null)
        {
            reader.close();
        }
    }
}
/// <summary>
/// Command-line entry point: adds a text watermark to every page of a PDF.
/// </summary>
/// <param name="args">
/// <c>args[0]</c> is the original PDF filename, <c>args[1]</c> the watermark text and the
/// optional <c>args[2]</c> the output filename. Without it, the output is written next to
/// the original as <c>watermarked_&lt;original file name&gt;</c>.
/// </param>
static void Main(string[] args)
{
    if (args == null || args.Length < 2)
    {
        System.Console.WriteLine("Usage: " + AppDomain.CurrentDomain.FriendlyName + " <original PDF filename> <watermark text> [new PDF filename]" + Environment.NewLine + " For example: " + AppDomain.CurrentDomain.FriendlyName + " myDoc.pdf \"This is a Draft\"");
        return;
    }

    string origName = args[0];
    string watermarkTxt = args[1];

    if (!System.IO.File.Exists(origName))
    {
        System.Console.WriteLine("Error: cannot find the original PDF file(" + origName + "). Please correct the filename or the path and try again.");
        return;
    }

    string newName;
    if (args.Length > 2 && !string.IsNullOrEmpty(args[2]))
    {
        newName = args[2];
    }
    else
    {
        // Prefix only the file name so inputs that include a directory still resolve.
        string directory = System.IO.Path.GetDirectoryName(origName);
        string prefixedName = "watermarked_" + System.IO.Path.GetFileName(origName);
        newName = string.IsNullOrEmpty(directory) ? prefixedName : System.IO.Path.Combine(directory, prefixedName);
    }

    // NOTE: PDDocument.load() only takes java.io.File, not System.IO.File from C#.Net
    PDDocument origDoc = PDDocument.load(new java.io.File(origName));
    try
    {
        PDPageTree allPages = origDoc.getPages();
        PDFont font = PDType1Font.HELVETICA_BOLD;
        for (int i = 0, len = allPages.getCount(); i < len; ++i)
        {
            PDPage pg = (PDPage)allPages.get(i);
            addWatermarkText(origDoc, pg, font, watermarkTxt);
        }

        origDoc.save(newName);
    }
    finally
    {
        origDoc.close();
    }
}
/// <summary>
/// Loads the PDF at the given path and returns its text as produced by <c>PDFTextStripper</c>.
/// </summary>
/// <param name="PDFFilePath">Path of the PDF file to load.</param>
/// <returns>The extracted text.</returns>
public static String PDFText(String PDFFilePath)
{
    PDDocument doc = PDDocument.load(PDFFilePath);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    finally
    {
        // Always release the document, including when extraction throws.
        doc.close();
    }
}
/// <summary>
/// Extracts the text of every page of a PDF file.
/// </summary>
/// <param name="pdfFileName">Path of the PDF file to read.</param>
/// <returns>A dictionary mapping the 1-based page number to the text obtained for that page.</returns>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
public static Dictionary<int, string> Extract(string pdfFileName)
{
    if (!File.Exists(pdfFileName))
    {
        throw new FileNotFoundException("PDF file not found.", pdfFileName);
    }

    var result = new Dictionary<int, string>();
    PDDocument pdfDocument = PDDocument.load(pdfFileName);
    try
    {
        var pdfStripper = new PDFTextStripper();
        pdfStripper.setPageSeparator(Environment.NewLine + Environment.NewLine);

        int pageCount = pdfDocument.getNumberOfPages();
        for (int page = 1; page <= pageCount; page++)
        {
            // Restrict the stripper to a single page so each entry holds one page's text.
            pdfStripper.setStartPage(page);
            pdfStripper.setEndPage(page);
            result.Add(page, GetText(pdfStripper, pdfDocument));
        }
    }
    finally
    {
        // Close the document even if extraction fails part-way through.
        pdfDocument.close();
    }

    return result;
}
/// <summary>
/// Loads the given PDF (optionally with a password), records whether it is encrypted
/// and runs <c>CheckAllPages</c> on it.
/// </summary>
/// <remarks>
/// An invalid password sets <c>IsPasswordProtected</c> and clears <c>IsValid</c>; any other
/// failure clears <c>IsValid</c> and stores a description in <c>ErrorMessage</c>.
/// NOTE(review): a missing file only writes a console message and leaves
/// <c>IsValid</c> true — confirm that this is intended.
/// </remarks>
/// <param name="fileName">Path of the PDF file to validate.</param>
/// <param name="password">Password used to open the document; may be null or empty.</param>
public PDFValidator(string fileName, string password)
{
    try
    {
        this.IsValid = true;

        if (!System.IO.File.Exists(fileName))
        {
            Console.WriteLine("The PDF file does not Exist.");
            return;
        }

        java.io.File pdfFile = new java.io.File(fileName);
        PDDocument document = string.IsNullOrEmpty(password)
            ? PDDocument.load(pdfFile)
            : PDDocument.load(pdfFile, password);
        try
        {
            if (document.isEncrypted())
            {
                this.IsPasswordProtected = true;
            }

            this.CheckAllPages(document);
        }
        finally
        {
            // Release the document even when page checks throw.
            document.close();
        }
    }
    catch (InvalidPasswordException)
    {
        this.IsPasswordProtected = true;
        this.IsValid = false;
    }
    catch (Exception exception)
    {
        this.ErrorMessage = string.Format("PDF analysis failed With exception {0}", exception.Message);
        this.IsValid = false;
    }
}
/// <summary>
/// Recursively walks a directory tree and, for each directory, merges its <c>*.pdf</c>
/// files into <c>&lt;outputFolder&gt;\&lt;directory name&gt;.pdf</c>.
/// </summary>
/// <remarks>
/// Sub-directories are processed first. When <c>PDFHelper.AddStamp</c> is set, the merge
/// goes through a temporary file that is stamped via
/// <c>PDFHelper.AddTrialStampIfNecessary</c> before being saved to the destination.
/// </remarks>
/// <param name="sourceDirectoryInfo">The directory to process.</param>
private void GetSubFolders(DirectoryInfo sourceDirectoryInfo)
{
    foreach (DirectoryInfo subDirectory in sourceDirectoryInfo.GetDirectories())
    {
        this.GetSubFolders(subDirectory);
    }

    // A fresh merger per directory so sources from sub-directories are not carried over.
    this.mergeUtility = new PDFMergerUtility();
    foreach (FileInfo pdfFile in sourceDirectoryInfo.GetFiles("*.pdf"))
    {
        this.mergeUtility.addSource(pdfFile.FullName);
    }

    string destination = Path.Combine(this.outputFolder, string.Concat(sourceDirectoryInfo.Name, ".pdf"));

    if (!PDFHelper.AddStamp)
    {
        this.mergeUtility.setDestinationFileName(destination);
        this.mergeUtility.mergeDocuments();
        return;
    }

    string tempFile = Path.Combine(Path.GetTempPath(), string.Concat("aquaforest\\pdftoolkit\\", Path.GetRandomFileName(), ".pdf"));

    // Make sure the temp sub-directory exists; this is a no-op when it already does.
    System.IO.Directory.CreateDirectory(Path.GetDirectoryName(tempFile));

    PDDocument merged = null;
    try
    {
        this.mergeUtility.setDestinationFileName(tempFile);
        this.mergeUtility.mergeDocuments();

        merged = PDDocument.load(new java.io.File(tempFile));
        merged = PDFHelper.AddTrialStampIfNecessary(merged);
        merged.save(destination);
    }
    finally
    {
        // Release the document before deleting the file it was loaded from.
        if (merged != null)
        {
            merged.close();
        }

        if (System.IO.File.Exists(tempFile))
        {
            System.IO.File.Delete(tempFile);
        }
    }
}
/// <summary>
/// Loads the PDF at the given path and returns its text, reporting failures to the
/// user through a message box.
/// </summary>
/// <param name="PDFFilePath">Path of the PDF file to load.</param>
/// <returns>The extracted text, or an empty string when loading or extraction fails.</returns>
public static String PDFText(String PDFFilePath)
{
    PDDocument doc = null;
    try
    {
        // Load inside the try so that load failures are reported like extraction failures.
        doc = PDDocument.load(PDFFilePath);
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    catch (UnauthorizedAccessException e)
    {
        MessageBox.Show("Невозможно скопировать текст из Пдф " + PDFFilePath + ". " + e.Message, "Сообщение об ошибке");
        return "";
    }
    catch (FileLoadException FLe)
    {
        MessageBox.Show("Невозможно загрузить Пдф " + PDFFilePath + ". " + FLe.Message, "Сообщение об ошибке");
        return "";
    }
    catch (Exception)
    {
        // Generic fallback for every other failure.
        MessageBox.Show("Невозможно загрузить Пдф " + PDFFilePath + ". ", "Сообщение об ошибке");
        return "";
    }
    finally
    {
        if (doc != null)
        {
            doc.close();
        }
    }
}
/// <summary>
/// Extracts the text of a PDF and writes it as a single paragraph into a new Word document.
/// </summary>
/// <param name="PDFpath">Path of the PDF file to convert.</param>
/// <returns>
/// The path of the created document as returned by <c>fn.CreateFolderToSaveDocs</c>,
/// or an empty string if any step throws.
/// </returns>
protected internal String ConvertPDFToDoc(string PDFpath)
{
    try
    {
        String extractedText;
        PDDocument pdfDocument = PDDocument.load(PDFpath);
        try
        {
            extractedText = new PDFTextStripper().getText(pdfDocument);
        }
        finally
        {
            // Close the PDF even when extraction fails.
            pdfDocument.close();
        }

        // Build the path where the document will be saved.
        String docxPath = fn.CreateFolderToSaveDocs(fileName);

        using (var wordDoc = DocX.Create(docxPath))
        {
            wordDoc.InsertParagraph(extractedText);
            wordDoc.Save();
        }

        return docxPath;
    }
    catch (Exception)
    {
        // Best-effort conversion: callers receive an empty path on failure.
        return "";
    }
}
/// <summary>
/// Extracts the text of a downloaded PDF file.
/// </summary>
/// <remarks>
/// A missing file fails the test via <c>Assert.Fail</c>. The existence check result is
/// evaluated outside of any catch-all handler so that the assertion failure is not
/// swallowed. Extraction errors are logged and yield an empty string.
/// </remarks>
/// <param name="filename">The file name passed to <c>checkFileExists</c> and <c>getPDFFilePath</c>.</param>
/// <returns>The extracted text, or an empty string if extraction failed.</returns>
public string ExtractTextFromPdf(string filename)
{
    string text = "";

    bool fileExists;
    try
    {
        fileExists = checkFileExists(filename);
    }
    catch (Exception e)
    {
        _log.Info("Exception in extracting text from PDF: " + e.Message);
        return text;
    }

    if (!fileExists)
    {
        Assert.Fail("PDF file not found in 'Downloads' folder.");
        return text;
    }

    try
    {
        _log.Info(filename + "exists in the download folder");
        PDDocument doc = null;
        try
        {
            doc = PDDocument.load(getPDFFilePath(filename));
            PDFTextStripper stripper = new PDFTextStripper();
            text = stripper.getText(doc);
        }
        catch (Exception e)
        {
            // Log the message first, then the stack trace.
            _log.Info("Exception in Extracting data from file " + filename + ".pdf: " + e.Message);
            _log.Info("Exception in Extracting data from file " + filename + ".pdf" + e.StackTrace);
        }
        finally
        {
            if (doc != null)
            {
                doc.close();
            }
        }
    }
    catch (Exception e)
    {
        _log.Info("Exception in extracting text from PDF: " + e.Message);
    }

    return text;
}
/// <summary>
/// Extracts the text of a range of pages from a PDF file.
/// </summary>
/// <param name="file">Path of the PDF document.</param>
/// <param name="startPage">First page to extract.</param>
/// <param name="endPage">Last page to extract.</param>
/// <returns>The extracted text, or an empty string if the file cannot be found or read.</returns>
public static string ExtractTXT(String file, int startPage, int endPage)
{
    String content = string.Empty;
    try
    {
        PDDocument document = PDDocument.load(file);
        try
        {
            // Create the text stripper.
            PDFTextStripper stripper = new PDFTextStripper();
            // Emit text sorted by position.
            stripper.setSortByPosition(true);
            // Set the first and last page of the range.
            stripper.setStartPage(startPage);
            stripper.setEndPage(endPage);
            // Read the text content.
            content = stripper.getText(document);
        }
        finally
        {
            // Close inside the outer try so an IOException from close is handled below.
            document.close();
        }
    }
    catch (java.io.IOException ex)
    {
        // java.io.FileNotFoundException is a java.io.IOException, so this covers both.
        // The method stays best-effort, but the failure is now traceable.
        System.Diagnostics.Debug.WriteLine("ExtractTXT failed for " + file + ": " + ex);
    }

    return content;
}
/// <summary>
/// Reads the ZUGFeRD data embedded in a PDF and formats it as a message.
/// </summary>
/// <param name="filepath">Path of the PDF file to inspect.</param>
/// <param name="message">
/// On success, the formatted ZUGFeRD data or a notice that none was found; on failure,
/// the text of the innermost available exception.
/// </param>
/// <returns>null on success; otherwise the exception that was caught.</returns>
public Exception ShowZUGFeRD(string filepath, out string message)
{
    try
    {
        // Loading verifies that PDFBox can open the file. The importer reads the file by
        // path on its own, so the document is released straight away.
        PDDocument doc = PDDocument.load(filepath);
        doc.close();

        // Now check the contents (like MustangReaderTest).
        NE4ZUGFeRDImporter zi = new NE4ZUGFeRDImporter();
        zi.extract(filepath);

        // Read the ZUGFeRD data.
        if (zi.canParse())
        {
            zi.parse();

            // Return the ZUGFeRD data as a string.
            message = string.Format("Menge: {0}\nRechnungsempfänger: {1}\nReferenz: {2}", zi.getAmount(), zi.getHolder(), zi.getForeignReference());
        }
        else
        {
            message = "Keine ZUGFeRD Daten gefunden!";
        }

        return null;
    }
    catch (Exception ex)
    {
        // InnerException is null for most exceptions; fall back to the exception itself.
        message = (ex.InnerException ?? ex).ToString();
        return ex;
    }
}
/// <summary>
/// Reads author, title, subject and keywords from the document information of a PDF.
/// </summary>
/// <param name="fileName">Path of the PDF file to read.</param>
/// <returns>
/// An <c>Info</c> populated from the document information; its fields are left untouched
/// when the document has no information entry or cannot be read.
/// </returns>
public static Info ReadDocInfo(string fileName)
{
    Info result = new Info();
    try
    {
        PDDocument pDoc = PDDocument.load(fileName);
        try
        {
            PDDocumentInformation docInfo = pDoc.getDocumentInformation();
            if (docInfo != null)
            {
                result.Author = docInfo.getAuthor();
                result.Title = docInfo.getTitle();
                result.Summary = docInfo.getSubject();
                result.Keywords = docInfo.getKeywords();
            }
        }
        finally
        {
            // Close inside the outer try so a failing close stays best-effort as well.
            pDoc.close();
        }
    }
    catch (Exception ex)
    {
        // Still best-effort, but leave a trace instead of silently discarding the error.
        System.Diagnostics.Debug.WriteLine("ReadDocInfo failed for " + fileName + ": " + ex);
    }

    return result;
}
/// <summary>
/// Click handler: loads the PDF whose path is in <c>textBox1</c> and shows its text in
/// <c>richTextBox1</c>.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">The event data.</param>
private void btnShowPDF_Click(object sender, EventArgs e)
{
    PDDocument pdf = PDDocument.load(textBox1.Text);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        richTextBox1.Text = stripper.getText(pdf);
    }
    finally
    {
        // Release the document after every click, including when extraction throws.
        pdf.close();
    }
}
/// <summary>
/// Experimental routine: loads a PDF, queries several <c>PDFTextStripper</c> getters and
/// extracts the full text. All results are kept in locals only.
/// </summary>
/// <param name="input">Path of the PDF file to load.</param>
private static void KamilPdfTest(string input)
{
    PDDocument document = null;
    try
    {
        document = PDDocument.load(input);
        var textStripper = new PDFTextStripper();

        // Stripper state is queried before any text has been extracted.
        Matrix lineMatrix = textStripper.getTextLineMatrix();
        PDPage currentPage = textStripper.getCurrentPage();
        Matrix textMatrix = textStripper.getTextMatrix();
        int charCount = textStripper.getTotalCharCnt();
        string articleStart = textStripper.getArticleStart();
        string articleEnd = textStripper.getArticleEnd();

        // Puts the whole text into a string - works.
        string fullText = textStripper.getText(document);
        charCount = fullText.Length;
    }
    finally
    {
        if (document != null)
        {
            document.close();
        }
    }
}
/// <summary>
/// Loads the PDF at the given path and returns its text as produced by <c>PDFTextStripper</c>.
/// </summary>
/// <param name="input">Path of the PDF file to load.</param>
/// <returns>The extracted text.</returns>
public string parseUsingPDFBox(string input)
{
    PDDocument doc = PDDocument.load(input);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    finally
    {
        // Always release the document, including when extraction throws.
        doc.close();
    }
}
/// <summary>
/// Extracts the text of a PDF file and writes it to a UTF-8 text file.
/// </summary>
/// <remarks>
/// The output file is opened (overwriting any existing file) before the text is
/// extracted. The IFilter-based variant is compiled out.
/// </remarks>
/// <param name="inpufFileName">Path of the PDF file to read.</param>
/// <param name="outputFileName">Path of the text file to write.</param>
public void ExtractText(string inpufFileName, string outputFileName)
{
#if false
    IFilterTextReader.FilterReader reader = new FilterReader(inpufFileName);
    var data = reader.ReadToEnd();
    using (var writer = new StreamWriter(outputFileName, false, System.Text.Encoding.UTF8))
    {
        writer.Write(data);
    }
#else
    PDDocument document = null;
    try
    {
        document = PDDocument.load(inpufFileName);
        var textStripper = new PDFTextStripper();

        var utf8Writer = new StreamWriter(outputFileName, false, System.Text.Encoding.UTF8);
        try
        {
            string extractedText = textStripper.getText(document);
            utf8Writer.Write(extractedText);
        }
        finally
        {
            utf8Writer.Dispose();
        }
    }
    finally
    {
        if (document != null)
        {
            document.close();
        }
    }
#endif
}
/// <summary>
/// Command-line entry point: writes the text of a PDF file to the console.
/// </summary>
/// <param name="args">
/// Optional: <c>args[0]</c> is the PDF to read. Without arguments the previously
/// hard-coded file <c>lopreacamasa.pdf</c> is used.
/// </param>
static void Main(string[] args)
{
    string pdfPath = (args != null && args.Length > 0) ? args[0] : "lopreacamasa.pdf";

    PDDocument doc = PDDocument.load(pdfPath);
    try
    {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        Console.Write(pdfStripper.getText(doc));
    }
    finally
    {
        // Always release the document, including when extraction throws.
        doc.close();
    }
}
/// <summary>
/// Loads the PDF at the given path and returns its text as produced by <c>PDFTextStripper</c>.
/// </summary>
/// <param name="fileName">Path of the PDF file to load.</param>
/// <returns>The extracted text.</returns>
private string GetTextFromPdfFile(string fileName)
{
    PDDocument doc = PDDocument.load(fileName);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    finally
    {
        // Always release the document, including when extraction throws.
        doc.close();
    }
}
/// <summary>
/// Loads the PDF at the given path and returns its text as produced by <c>PDFTextStripper</c>.
/// </summary>
/// <param name="path">Path of the PDF file to load.</param>
/// <returns>The extracted text.</returns>
private static string ReadPdf(string path)
{
    PDDocument doc = PDDocument.load(path);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    finally
    {
        // Always release the document, including when extraction throws.
        doc.close();
    }
}
/// <summary>
/// Loads the stock-quotes report PDF used by this test class into <c>doc</c>.
/// </summary>
/// <remarks>
/// NOTE(review): the class name refers to 11062013 while the loaded report is dated
/// 12172013 — confirm which file is intended.
/// </remarks>
public ReportReaderTest_11062013()
{
    // Earlier location of the report, kept for reference:
    // C:\Users\Arnold\Documents\Projects\PSEGet3\trunk\PSEGetTest\stockQuotes_11062013.pdf
    const string reportPath = @"C:\PSEGet\Reports\stockQuotes_12172013.pdf";

    doc = PDDocument.load(reportPath);
}
/// <summary>
/// Loads the PDF described by the given <see cref="FileInfo"/> and returns its text as
/// produced by <c>PDFTextStripper</c>.
/// </summary>
/// <param name="fileName">The PDF file to load; its <c>FullName</c> is used as the path.</param>
/// <returns>The extracted text.</returns>
public static string PdfFileReader(FileInfo fileName)
{
    PDDocument doc = PDDocument.load(fileName.FullName);
    try
    {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        return pdfStripper.getText(doc);
    }
    finally
    {
        // Always release the document, including when extraction throws.
        doc.close();
    }
}
/// <summary>
/// Loads a PDF and opens a UTF-8 writer on the output path.
/// </summary>
/// <remarks>
/// The call that would write the document's text is commented out, so nothing is
/// written through the writer before it is closed.
/// </remarks>
/// <param name="absoluteFilePath">Path of the PDF file to load.</param>
/// <param name="outputPath">Path of the output file.</param>
public void ToTxt(string absoluteFilePath, string outputPath)
{
    java.io.File sourceFile = new java.io.File(absoluteFilePath);
    using (PDDocument document = PDDocument.load(sourceFile))
    {
        Writer writer = new PrintWriter(outputPath, "utf-8");
        // Disabled export step:
        //new PDFDomTree().writeText(pdf, output);
        writer.close();
    }
}