/// <summary>
/// Parses a PDF supplied as an in-memory byte array.
/// </summary>
/// <param name="pdfStream">Raw bytes of the PDF document.</param>
/// <param name="tableContainType">Table containment style, forwarded to the document-based Parser overload.</param>
/// <returns>The parsed model, or <c>null</c> when loading or parsing throws.</returns>
public static PDFModel Parser(byte[] pdfStream, TableContainType tableContainType)
{
    PDDocument reader = null;
    try
    {
        InputStream source = new ByteArrayInputStream(pdfStream);
        reader = PDDocument.load(source);
        return Parser(reader, tableContainType);
    }
    catch (Exception)
    {
        // Best effort: any failure is reported to the caller as a null result.
        return null;
    }
    finally
    {
        // reader is still null when PDDocument.load itself failed, so guard the
        // close; the unguarded call used to raise NullReferenceException here.
        if (reader != null)
        {
            reader.close();
        }
    }
}
/// <summary>
/// Extracts the text of each page of a PDF file.
/// </summary>
/// <param name="pdfFileName">Path of the PDF file to read.</param>
/// <returns>Page text keyed by 1-based page number.</returns>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
public static Dictionary<int, string> Extract(string pdfFileName)
{
    if (!File.Exists(pdfFileName))
    {
        // Report the missing path itself rather than the parameter's name.
        throw new FileNotFoundException("PDF file not found.", pdfFileName);
    }

    var result = new Dictionary<int, string>();
    PDDocument pdfDocument = PDDocument.load(pdfFileName);
    try
    {
        var pdfStripper = new PDFTextStripper();
        pdfStripper.setPageSeparator(Environment.NewLine + Environment.NewLine);

        int pageCount = pdfDocument.getNumberOfPages();
        for (int i = 1; i <= pageCount; i++)
        {
            // Restrict the stripper to a single page per call.
            pdfStripper.setStartPage(i);
            pdfStripper.setEndPage(i);
            result.Add(i, GetText(pdfStripper, pdfDocument));
        }
    }
    finally
    {
        // Release the document even when extraction of a page fails.
        pdfDocument.close();
    }

    return result;
}
/// <summary>
/// Loads the given PDF (optionally with a password), records whether it is
/// encrypted, and runs the page checks.
/// </summary>
/// <param name="fileName">Path of the PDF file to validate.</param>
/// <param name="password">Password used to open the file; null or empty means no password.</param>
public PDFValidator(string fileName, string password)
{
    try
    {
        this.IsValid = true;
        if (!System.IO.File.Exists(fileName))
        {
            // NOTE(review): a missing file only prints a message and leaves IsValid == true,
            // exactly as before - confirm that is the intended contract.
            Console.WriteLine("The PDF file does not Exist.");
            return;
        }

        PDDocument document = string.IsNullOrEmpty(password)
            ? PDDocument.load(new java.io.File(fileName))
            : PDDocument.load(new java.io.File(fileName), password);
        try
        {
            if (document.isEncrypted())
            {
                this.IsPasswordProtected = true;
            }
            this.CheckAllPages(document);
        }
        finally
        {
            // Close the document even when the page checks throw.
            document.close();
        }
    }
    catch (InvalidPasswordException)
    {
        this.IsPasswordProtected = true;
        this.IsValid = false;
    }
    catch (Exception exception)
    {
        this.ErrorMessage = string.Format("PDF analysis failed With exception {0}", exception.Message);
        this.IsValid = false;
    }
}
/// <summary>
/// Command-line entry point: stamps a watermark text on every page of a PDF and saves a copy.
/// </summary>
/// <param name="args">
/// args[0] = original PDF filename, args[1] = watermark text,
/// args[2] (optional) = output filename; defaults to "watermarked_" + the original file name,
/// placed in the original file's directory.
/// </param>
static void Main(string[] args)
{
    if (args == null || args.Length < 2)
    {
        System.Console.WriteLine("Usage: " + AppDomain.CurrentDomain.FriendlyName + " <original PDF filename> <watermark text> [new PDF filename]" + Environment.NewLine + " For example: " + AppDomain.CurrentDomain.FriendlyName + " myDoc.pdf \"This is a Draft\"");
        return;
    }

    string origName = args[0];
    string watermarkTxt = args[1];
    if (!System.IO.File.Exists(origName))
    {
        System.Console.WriteLine("Error: cannot find the original PDF file(" + origName + "). Please correct the filename or the path and try again.");
        return;
    }

    // Honour the optional third argument; otherwise prefix only the file name so that
    // an input such as "docs\a.pdf" does not become the invalid "watermarked_docs\a.pdf".
    string newName = args.Length > 2
        ? args[2]
        : System.IO.Path.Combine(
            System.IO.Path.GetDirectoryName(origName) ?? string.Empty,
            "watermarked_" + System.IO.Path.GetFileName(origName));

    // NOTE: PDDocument.load() only takes java.io.File, not System.IO.File from C#.Net
    PDDocument origDoc = PDDocument.load(new java.io.File(origName));
    try
    {
        PDPageTree allPages = origDoc.getPages();
        PDFont font = PDType1Font.HELVETICA_BOLD;
        for (int i = 0, len = allPages.getCount(); i < len; ++i)
        {
            PDPage pg = (PDPage)allPages.get(i);
            // Use the text supplied on the command line (it was previously ignored).
            addWatermarkText(origDoc, pg, font, watermarkTxt);
        }
        origDoc.save(newName);
    }
    finally
    {
        origDoc.close();
    }
}
/// <summary>
/// Extracts the text of a PDF and writes it as a single paragraph into a new DocX document.
/// </summary>
/// <param name="PDFpath">Path of the PDF file to convert.</param>
/// <returns>Path of the created document, or an empty string when any step throws.</returns>
protected internal String ConvertPDFToDoc(string PDFpath)
{
    try
    {
        String extractedText;
        PDDocument pdfDoc = PDDocument.load(PDFpath);
        try
        {
            extractedText = new PDFTextStripper().getText(pdfDoc);
        }
        finally
        {
            // Close the PDF even when text extraction fails.
            pdfDoc.close();
        }

        // Build the path where the output file is saved.
        String docxPath = fn.CreateFolderToSaveDocs(fileName);
        var wordDoc = DocX.Create(docxPath);
        wordDoc.InsertParagraph(extractedText);
        wordDoc.Save();
        return docxPath;
    }
    catch (Exception)
    {
        // Callers receive an empty string for every failure.
        return "";
    }
}
/// <summary>
/// Extracts the text of a downloaded PDF file.
/// </summary>
/// <param name="filename">Name passed to checkFileExists and getPDFFilePath to locate the file.</param>
/// <returns>The extracted text, or an empty string when extraction fails.</returns>
public string ExtractTextFromPdf(string filename)
{
    String text = "";
    bool fileMissing = false;
    try
    {
        if (checkFileExists(filename))
        {
            _log.Info(filename + "exists in the download folder");
            PDDocument doc = null;
            try
            {
                doc = PDDocument.load(getPDFFilePath(filename));
                PDFTextStripper stripper = new PDFTextStripper();
                text = stripper.getText(doc);
            }
            catch (Exception e)
            {
                // Logged once; the same line used to be written twice.
                _log.Info("Exception in Extracting data from file " + filename + ".pdf" + e.StackTrace);
            }
            finally
            {
                if (doc != null)
                {
                    doc.close();
                }
            }
        }
        else
        {
            fileMissing = true;
        }
    }
    catch (Exception e)
    {
        _log.Info("Exception in extracting text from PDF: " + e.Message);
    }

    // Raised outside the try block so the catch-all above cannot swallow the assertion failure.
    if (fileMissing)
    {
        Assert.Fail("PDF file not found in 'Downloads' folder.");
    }

    return text;
}
/// <summary>
/// Merges the given PDF files into one output file. When PDFHelper.AddStamp is set, the
/// merge goes through a temporary file that is passed to PDFHelper.AddTrialStampIfNecessary
/// before being saved.
/// </summary>
/// <param name="sourcePDFs">Paths of the PDF files to merge, in order.</param>
/// <param name="outputFile">Path of the merged PDF to write.</param>
public void MergePDFs(List<string> sourcePDFs, string outputFile)
{
    PDFHelper.DisplayTrialPopupIfNecessary();
    foreach (string sourcePDF in sourcePDFs)
    {
        this.mergeUtility.addSource(sourcePDF);
    }

    if (!PDFHelper.AddStamp)
    {
        this.mergeUtility.setDestinationFileName(outputFile);
        this.mergeUtility.mergeDocuments();
        return;
    }

    string tempFile = Path.Combine(Path.GetTempPath(), string.Concat(Path.GetRandomFileName(), ".pdf"));
    PDDocument merged = null;
    try
    {
        this.mergeUtility.setDestinationFileName(tempFile);
        this.mergeUtility.mergeDocuments();
        merged = PDDocument.load(new java.io.File(tempFile));
        merged = PDFHelper.AddTrialStampIfNecessary(merged);
        merged.save(outputFile);
    }
    finally
    {
        // Always release the document and remove the temporary file, even on failure.
        try
        {
            if (merged != null)
            {
                merged.close();
            }
        }
        finally
        {
            if (System.IO.File.Exists(tempFile))
            {
                System.IO.File.Delete(tempFile);
            }
        }
    }
}
/// <summary>
/// Returns the text of a PDF file, showing a message box and returning an empty string on failure.
/// </summary>
/// <param name="PDFFilePath">Path of the PDF file to read.</param>
/// <returns>The extracted text, or an empty string when loading or extraction fails.</returns>
public static String PDFText(String PDFFilePath)
{
    PDDocument doc = null;
    try
    {
        // Loading happens inside the try so that load failures reach the handlers below.
        doc = PDDocument.load(PDFFilePath);
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    catch (UnauthorizedAccessException e)
    {
        MessageBox.Show("Невозможно скопировать текст из Пдф " + PDFFilePath + ". " + e.Message, "Сообщение об ошибке");
        return "";
    }
    catch (FileLoadException FLe)
    {
        MessageBox.Show("Невозможно загрузить Пдф " + PDFFilePath + ". " + FLe.Message, "Сообщение об ошибке");
        return "";
    }
    catch (Exception)
    {
        // The previous filter `when (text == "")` could never be true (text started as " "),
        // which made this fallback handler unreachable.
        MessageBox.Show("Невозможно загрузить Пдф " + PDFFilePath + ". ", "Сообщение об ошибке");
        return "";
    }
    finally
    {
        if (doc != null)
        {
            doc.close();
        }
    }
}
/// <summary>
/// Loads a PDF from bytes and collects the items of its first page together with the
/// page's media-box size and the document keywords.
/// </summary>
/// <param name="bytes">Raw PDF bytes handed to LoadPdf.</param>
/// <returns>Items, height, width and keywords of the first page.</returns>
/// <exception cref="PdfNotReadableException">The document has no pages, no contents, or no items.</exception>
/// <exception cref="PdfReadException">Any other failure, wrapped with the original exception as inner.</exception>
public PdfOcrResult Execute(byte[] bytes)
{
    const string noContentMessage = "Pdf contains no readable content";
    PDDocument document = null;
    try
    {
        LoadPdf(bytes, ref document);

        List pageList = document.getDocumentCatalog().getAllPages();
        if (pageList.size() == 0)
        {
            throw new PdfNotReadableException(noContentMessage);
        }

        // Only the first page is processed.
        PDPage firstPage = (PDPage)pageList.get(0);
        PDStream pageContents = firstPage.getContents();
        if (pageContents == null)
        {
            throw new PdfNotReadableException(noContentMessage);
        }

        var extractor = new PdfToCharacters();
        var resources = firstPage.findResources();
        var contentStream = firstPage.getContents().getStream();
        var found = extractor.GetItems(firstPage, resources, contentStream);
        if (found.Count == 0)
        {
            throw new PdfNotReadableException(noContentMessage);
        }

        var box = firstPage.findMediaBox();
        var pageHeight = box == null ? 0 : box.getHeight();
        var pageWidth = box == null ? 0 : box.getWidth();
        var foundArray = found.ToArray();

        var documentKeywords = "";
        try
        {
            var info = document.getDocumentInformation();
            documentKeywords = info == null ? null : info.getKeywords();
        }
        catch (Exception)
        {
            // Keywords are optional: it is unknown whether PDFBox can fail here, so any
            // failure is deliberately ignored and the default value is kept.
        }

        var outcome = new PdfOcrResult();
        outcome.Items = foundArray;
        outcome.Height = pageHeight;
        outcome.Width = pageWidth;
        outcome.Keywords = documentKeywords;
        return outcome;
    }
    catch (PdfReadException)
    {
        throw;
    }
    catch (Exception e)
    {
        throw new PdfReadException("Pdf could not be loaded. It is not a redable pdf.", e);
    }
    finally
    {
        if (document != null)
        {
            document.close();
        }
    }
}
/// <summary>
/// Extracts the text of a PDF file and checks whether it contains the given text,
/// ignoring spaces and line breaks.
/// </summary>
/// <param name="pdfFileName">Path of the PDF file.</param>
/// <param name="textToCheck">The text to look for; spaces in it are ignored.</param>
/// <returns><c>true</c> if the text is present; <c>false</c> otherwise, including when the PDF has no text.</returns>
public static bool ExtractAndValidateTextFromPDF(string pdfFileName, string textToCheck)
{
    string result;
    PDDocument doc = PDDocument.load(pdfFileName);
    try
    {
        result = new PDFTextStripper().getText(doc);
    }
    finally
    {
        // Close the document even when extraction throws.
        doc.close();
    }

    if (result.Length == 0)
    {
        return false;
    }

    // The text is compared as-is: re-encoding it through Encoding.Default and decoding
    // the bytes as UTF-8 corrupted every non-ASCII character on ANSI code pages.
    string outputPDF = result.Replace(" ", null).Replace("\r", null).Replace("\n", null);
    string validationString = textToCheck.Replace(" ", null);
    return outputPDF.Contains(validationString);
}
/// <summary>
/// Scratch routine that loads a PDF, queries several PDFTextStripper accessors and
/// extracts the full text. Nothing is returned; the values are only held in locals.
/// </summary>
/// <param name="input">Path of the PDF file to load.</param>
private static void KamilPdfTest(string input)
{
    PDDocument pdf = PDDocument.load(input);
    try
    {
        var textStripper = new PDFTextStripper();

        // stripper.getText(doc);
        Matrix lineMatrix = textStripper.getTextLineMatrix();
        // int page_nr = stripper.getCurrentPageNo();
        PDPage currentPage = textStripper.getCurrentPage();
        Matrix textMatrix = textStripper.getTextMatrix();
        int charCount = textStripper.getTotalCharCnt();
        string articleStart = textStripper.getArticleStart();
        string articleEnd = textStripper.getArticleEnd();

        // Puts the whole text into a string - works.
        string fullText = textStripper.getText(pdf);
        charCount = fullText.Length;
    }
    finally
    {
        pdf.close();
    }
}
/// <summary>
/// Writes the text of a PDF file to a UTF-8 text file, overwriting any existing file.
/// </summary>
/// <param name="inpufFileName">Path of the PDF file to read.</param>
/// <param name="outputFileName">Path of the text file to write.</param>
public void ExtractText(string inpufFileName, string outputFileName)
{
#if false
    IFilterTextReader.FilterReader reader = new FilterReader(inpufFileName);
    var data = reader.ReadToEnd();
    using (var writer = new StreamWriter(outputFileName, false, System.Text.Encoding.UTF8))
    {
        writer.Write(data);
    }
#else
    PDDocument pdf = PDDocument.load(inpufFileName);
    try
    {
        var textStripper = new PDFTextStripper();
        // The output file is opened before the text is extracted.
        using (StreamWriter output = new StreamWriter(outputFileName, false, System.Text.Encoding.UTF8))
        {
            output.Write(textStripper.getText(pdf));
        }
    }
    finally
    {
        pdf.close();
    }
#endif
}
/// <summary>
/// Extracts the text of each page of a PDF file.
/// </summary>
/// <param name="pdfFileName">Path of the PDF file to read.</param>
/// <returns>Page text keyed by 1-based page number.</returns>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
public static Dictionary<int, string> Extract(string pdfFileName)
{
    if (!File.Exists(pdfFileName))
    {
        // Report the missing path itself rather than the parameter's name.
        throw new FileNotFoundException("PDF file not found.", pdfFileName);
    }

    var result = new Dictionary<int, string>();
    PDDocument pdfDocument = PDDocument.load(pdfFileName);
    try
    {
        var pdfStripper = new PDFTextStripper();
        pdfStripper.setPageSeparator(Environment.NewLine + Environment.NewLine);

        int pageCount = pdfDocument.getNumberOfPages();
        for (int i = 1; i <= pageCount; i++)
        {
            // Restrict the stripper to a single page per call.
            pdfStripper.setStartPage(i);
            pdfStripper.setEndPage(i);
            result.Add(i, GetText(pdfStripper, pdfDocument));
        }
    }
    finally
    {
        // Release the document even when extraction of a page fails.
        pdfDocument.close();
    }

    return result;
}
/// <summary>
/// Extracts the text of a range of pages from a PDF file.
/// </summary>
/// <param name="file">Path of the PDF document.</param>
/// <param name="startPage">First page to extract.</param>
/// <param name="endPage">Last page to extract.</param>
/// <returns>The extracted text, or an empty string when an I/O error occurs.</returns>
public static string ExtractTXT(String file, int startPage, int endPage)
{
    String content = string.Empty;
    try
    {
        PDDocument document = PDDocument.load(file);
        try
        {
            // Text stripper used to pull the text out of the document.
            PDFTextStripper stripper = new PDFTextStripper();
            // Emit text ordered by its position on the page.
            stripper.setSortByPosition(true);
            stripper.setStartPage(startPage);
            stripper.setEndPage(endPage);
            content = stripper.getText(document);
        }
        finally
        {
            // Close the document even when extraction throws.
            document.close();
        }
    }
    catch (java.io.IOException)
    {
        // Best effort: I/O failures (java.io.FileNotFoundException is a subclass of
        // java.io.IOException) leave whatever content was obtained so far.
    }
    return content;
}
/**
 * string getFilePath(string path)
 * {
 * // Specify the path to save the uploaded file to.
 * string savePath = "C:\\Users\\DR.AKUL\\Documents\\Visual Studio 2010\\Projects\\PlagijatorFinder\\PlagijatorFinder\\uploadFiles\\";
 *
 * // Get the name of the file to upload.
 * string fileName = FileUpload1.FileName;
 *
 * // Create the path and file name to check for duplicates.
 * path = savePath + fileName + ".txt";
 * return path;
 * }
 **/

/// <summary>
/// Returns the text content of a PDF file.
/// </summary>
/// <param name="filename">Path of the PDF file to read.</param>
/// <returns>The text extracted by PDFTextStripper.</returns>
private static string parseUsingPDFBox(string filename)
{
    PDDocument doc = PDDocument.load(filename);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    finally
    {
        // Close the document even when extraction throws.
        doc.close();
    }
}
/// <summary>
/// Returns the text content of a PDF file.
/// </summary>
/// <param name="path">Path of the PDF file to read.</param>
/// <returns>The text extracted by PDFTextStripper.</returns>
public string ParseFile(string path)
{
    PDDocument doc = PDDocument.load(path);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    finally
    {
        // Close the document even when extraction throws.
        doc.close();
    }
}
/// <summary>
/// Click handler: reads the configured PDF, collects every "FA" + 8 digits identifier
/// found in its text, and removes the dataPackItem elements with those ids from the XML
/// file named in the filePath text box (after writing a ".backup" copy).
/// </summary>
private void button_Click(object sender, RoutedEventArgs e)
{
    PDDocument doc = null;
    Mouse.OverrideCursor = Cursors.Wait;
    try
    {
        doc = PDDocument.load(Properties.Settings.Default.PdfPath);
        PDFTextStripper stripper = new PDFTextStripper();
        string data = stripper.getText(doc);
        MatchCollection matches = Regex.Matches(data, "FA\\d{8}", RegexOptions.IgnoreCase);
        if (matches.Count > 0)
        {
            try
            {
                XmlDocument xml = new XmlDocument();
                xml.Load(this.filePath.Text);
                // Keep a copy of the untouched file before modifying it.
                xml.Save(this.filePath.Text + ".backup");

                var manager = new XmlNamespaceManager(xml.NameTable);
                manager.AddNamespace("dat", "http://www.stormware.cz/schema/version_2/data.xsd");

                int count = 0;
                foreach (Match item in matches)
                {
                    string request = "/dat:dataPack/dat:dataPackItem[@id=\"" + item.Value + "\"]";
                    try
                    {
                        foreach (XmlNode node in xml.SelectNodes(request, manager))
                        {
                            node.ParentNode.RemoveChild(node);
                            count++;
                        }
                    }
                    catch (Exception)
                    {
                        // Best effort: a failure for one id must not stop the remaining ids.
                    }
                }

                xml.Save(this.filePath.Text);
                MessageBox.Show("Erased " + count + " items.");
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.Message + "\nPlease check request XPath syntax.", "Error", MessageBoxButton.OK);
            }
        }
    }
    finally
    {
        // Restore the cursor on every path; it used to stay on "wait" whenever an
        // exception escaped this handler.
        try
        {
            if (doc != null)
            {
                doc.close();
            }
        }
        finally
        {
            Mouse.OverrideCursor = null;
        }
    }
}
/// <summary>
/// Returns the text content of a PDF file.
/// </summary>
/// <param name="path">Path of the PDF file to read.</param>
/// <returns>The text extracted by PDFTextStripper.</returns>
string get_text_from_pdf_by_pdfbox(string path)
{
    PDDocument pdffile = PDDocument.load(new java.io.File(path));
    try
    {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        return pdfStripper.getText(pdffile);
    }
    finally
    {
        // Close the document even when extraction throws.
        pdffile.close();
    }
}
/// <summary>
/// Extracts the text of a PDF file and splits it on the given separators.
/// </summary>
/// <param name="file">Path of the PDF file to read.</param>
/// <param name="split">Separator strings passed to string.Split.</param>
/// <param name="option">Split options passed to string.Split.</param>
/// <returns>The pieces of the extracted text.</returns>
private static string[] PDFToTextPDFBox(string file, string[] split, StringSplitOptions option)
{
    string pdftext;
    PDDocument doc = PDDocument.load(file);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        pdftext = stripper.getText(doc);
    }
    finally
    {
        // Close the document even when extraction throws.
        doc.close();
    }
    return pdftext.Split(split, option);
}
/// <summary>
/// Searches the text of a PDF for a keyword (case-insensitively); on a hit, increments
/// pdfCount and queues Add_Item for the file. Encrypted files and all failures are skipped silently.
/// </summary>
/// <param name="keyWord">Text to search for.</param>
/// <param name="path">Path of the PDF file to search.</param>
public void pdfExtract(string keyWord, string path)
{
    PDDocument doc = null;
    try
    {
        doc = PDDocument.load(path);
        if (doc.isEncrypted())
        {
            return;
        }

        string text = new PDFTextStripper().getText(doc);
        // NOTE(review): the previous code compared against an identifier spelled `keyword`,
        // leaving this parameter unused; confirm no separate field of that name was intended.
        bool isExist = text.IndexOf(keyWord, System.StringComparison.OrdinalIgnoreCase) >= 0;
        if (isExist)
        {
            pdfCount++;
            Task.Run(() => { Add_Item(Path.GetFileNameWithoutExtension(path), path, Path.GetExtension(path)); });
            Thread.Sleep(200);
        }
    }
    catch
    {
        // Best effort: a file that cannot be read is skipped.
    }
    finally
    {
        // Single cleanup point for every path; close failures are ignored as before.
        try
        {
            if (doc != null)
            {
                doc.close();
            }
        }
        catch
        {
        }
    }
}
/// <summary>
/// Writes the text of a PDF file to a text file encoded as gb2312, overwriting any existing file.
/// </summary>
/// <param name="pdfName">Path of the PDF file to read.</param>
/// <param name="txtfileName">Path of the text file to write.</param>
public static void pdf2txt(string pdfName, string txtfileName)
{
    string text;
    PDDocument doc = PDDocument.load(pdfName);
    try
    {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        text = pdfStripper.getText(doc);
    }
    finally
    {
        // Close the document even when extraction throws.
        doc.close();
    }

    // The using block closes the writer even when the write fails.
    using (StreamWriter writer = new StreamWriter(txtfileName, false, Encoding.GetEncoding("gb2312")))
    {
        writer.Write(text);
    }
}
/// <summary>
/// Loads the PDF at _location.FullName and returns its text.
/// </summary>
/// <returns>The text extracted by PDFTextStripper.</returns>
private string GetTextFromSimplePdf()
{
    PDDocument pdf = PDDocument.load(_location.FullName);
    try
    {
        PDFTextStripper textStripper = new PDFTextStripper();
        string contents = textStripper.getText(pdf);
        return contents;
    }
    finally
    {
        // The document is closed whether or not extraction succeeds.
        pdf.close();
    }
}
/// <summary>
/// Parses the PDF file at the given path.
/// </summary>
/// <param name="pdfFileName">Path of the PDF file.</param>
/// <param name="tableContainType">Table containment style, forwarded to the document-based Parser overload.</param>
/// <returns>The parsed model, or <c>null</c> when the file does not exist or loading/parsing throws.</returns>
public static PDFModel Parser(string pdfFileName, TableContainType tableContainType)
{
    if (!System.IO.File.Exists(pdfFileName))
    {
        return null;
    }

    PDDocument reader = null;
    try
    {
        reader = PDDocument.load(new java.io.File(pdfFileName));
        return Parser(reader, tableContainType);
    }
    catch (Exception)
    {
        // Best effort: any failure is reported to the caller as a null result.
        return null;
    }
    finally
    {
        // Single cleanup point; the extra close in the catch block was redundant.
        if (reader != null)
        {
            reader.close();
        }
    }
}
/// <summary>
/// Reads a PDF file and returns its text.
/// </summary>
/// <param name="filename">Path of the PDF file to read.</param>
/// <returns>The text extracted by PDFTextStripper.</returns>
public static string ReadPdfAsText(string filename)
{
    PDDocument pdf = PDDocument.load(filename);
    try
    {
        PDFTextStripper textStripper = new PDFTextStripper();
        string extracted = textStripper.getText(pdf);
        return extracted;
    }
    finally
    {
        // The document is closed whether or not extraction succeeds.
        pdf.close();
    }
}
/// <summary>
/// Splits the current document into one PDF per top-level bookmark. Part i holds the
/// pages from bookmark i's destination page up to (not including) the next bookmark's
/// destination page, and is saved as "{OutputFileName} [i].pdf" in OutputFilePath.
/// </summary>
/// <exception cref="PDFToolkitException">Any failure while splitting or saving.</exception>
public void SplitByTopLevelBookmarks()
{
    if (!this.CheckOutput())
    {
        return;
    }

    PDFHelper.DisplayTrialPopupIfNecessary();
    try
    {
        PDDocumentCatalog documentCatalog = this.pdfDocument.PDFBoxDocument.getDocumentCatalog();
        PDDocumentOutline documentOutline = documentCatalog.getDocumentOutline();
        if (documentOutline == null)
        {
            Console.WriteLine("This document does not contain any bookmarks.");
            return;
        }

        // Collect the page index of every top-level bookmark, then the total page
        // count as the end boundary of the last part.
        PDPageTree pages = documentCatalog.getPages();
        List<int> boundaries = new List<int>();
        for (PDOutlineItem bookmark = documentOutline.getFirstChild(); bookmark != null; bookmark = bookmark.getNextSibling())
        {
            PDPage destination = bookmark.findDestinationPage(this.pdfDocument.PDFBoxDocument);
            boundaries.Add(pages.indexOf(destination));
        }
        boundaries.Add(pages.getCount());

        for (int i = 0; i < boundaries.Count - 1; i++)
        {
            int firstPage = boundaries[i];
            int endPage = boundaries[i + 1];
            PDDocument part = new PDDocument();
            try
            {
                for (int j = firstPage; j < endPage; j++)
                {
                    part.addPage(this.pdfDocument.PDFBoxDocument.getPage(j));
                }
                part = PDFHelper.AddTrialStampIfNecessary(part);
                string partName = string.Format("{0} [{1}].pdf", this.OutputFileName, i);
                part.save(Path.Combine(this.OutputFilePath, partName));
            }
            finally
            {
                // Close the part even when adding pages or saving fails.
                if (part != null)
                {
                    part.close();
                }
            }
        }
    }
    catch (Exception exception)
    {
        // NOTE(review): only the message and the inner exception are forwarded, as before;
        // the caught exception itself (and its stack trace) is not attached - confirm intent.
        throw new PDFToolkitException(exception.Message, exception.InnerException);
    }
}
/// <summary>
/// Loads the PDF at the given path and returns its text.
/// </summary>
/// <param name="filePath">Path of the PDF file whose contents are read.</param>
/// <returns>The text extracted by PDFTextStripper.</returns>
private string ReadPDF(string filePath)
{
    PDDocument pdf = PDDocument.load(filePath);
    try
    {
        var textStripper = new PDFTextStripper();
        string contents = textStripper.getText(pdf);
        return contents;
    }
    finally
    {
        // The document is closed whether or not extraction succeeds.
        pdf.close();
    }
}
/// <summary>
/// Extracts the text of the PDF file at the given path.
/// </summary>
/// <param name="input">Path of the PDF file to read.</param>
/// <returns>The text extracted by PDFTextStripper.</returns>
private static string parseUsingPDFBox(string input)
{
    PDDocument pdf = PDDocument.load(input);
    try
    {
        var textStripper = new PDFTextStripper();
        string extracted = textStripper.getText(pdf);
        return extracted;
    }
    finally
    {
        // Runs on success and on failure alike.
        pdf.close();
    }
}
/// <summary>
/// Extracts the text of the PDF file at the given path.
/// </summary>
/// <param name="path">Path of the PDF file to read.</param>
/// <returns>The text extracted by PDFTextStripper.</returns>
private static string ExtractTextFromPdf(string path)
{
    PDDocument pdf = PDDocument.load(path);
    try
    {
        var textStripper = new PDFTextStripper();
        string pdfText = textStripper.getText(pdf);
        return pdfText;
    }
    finally
    {
        // Runs on success and on failure alike.
        pdf.close();
    }
}
/// <summary>
/// Loads the specified PDF and returns its text content.
/// </summary>
/// <param name="pdfFilePath">The file path of the PDF.</param>
/// <returns>The text extracted by PDFTextStripper.</returns>
public static string GetPDFText(string pdfFilePath)
{
    PDDocument pdf = PDDocument.load(pdfFilePath);
    try
    {
        var textStripper = new PDFTextStripper();
        string pdfText = textStripper.getText(pdf);
        return pdfText;
    }
    finally
    {
        // The document is closed whether or not extraction succeeds.
        pdf.close();
    }
}
/// <summary>
/// Strips the text from a PDF, given a file path.
/// </summary>
/// <param name="input">Path of the PDF file to read.</param>
/// <returns>The text extracted by PDFTextStripper.</returns>
private static string ParseUsingPDFBox(string input)
{
    PDDocument pdf = PDDocument.load(input);
    try
    {
        var textStripper = new PDFTextStripper();
        string stripped = textStripper.getText(pdf);
        return stripped;
    }
    finally
    {
        // Runs on success and on failure alike.
        pdf.close();
    }
}
/// <summary>
/// Parses a PDF file with PDFParser and returns its text.
/// </summary>
/// <param name="fileName">Path of the PDF file to read.</param>
/// <returns>The text extracted by PDFTextStripper.</returns>
public string Parse(string fileName)
{
    // Load in file. Using java.io because pdfbox is ported from java.
    var pdfFile = new FileInputStream(fileName);
    try
    {
        // Load file into the pdf parser.
        var pdfParser = new PDFParser(pdfFile);

        // Parse the document, so that we can get it for the COSDocument.
        pdfParser.parse();

        /* COSDocument is the in-memory representation of the PDF.
           see https://pdfbox.apache.org/docs/1.8.4/javadocs/org/apache/pdfbox/cos/COSDocument.html */
        var cosDocument = pdfParser.getDocument();
        try
        {
            var pdDocument = new PDDocument(cosDocument);
            try
            {
                var pdfTextStripper = new PDFTextStripper();

                /* Needed for only stripping specific pages
                   pdfTextStripper.setStartPage(0);
                   pdfTextStripper.setEndPage(pdDocument.getNumberOfPages()); */

                return pdfTextStripper.getText(pdDocument);
            }
            finally
            {
                // This closes all storage and deletes the tmp files.
                pdDocument.close();
            }
        }
        finally
        {
            // Same close sequence as before, but now also reached when extraction throws.
            cosDocument.close();
        }
    }
    finally
    {
        // The input stream was never closed before.
        pdfFile.close();
    }
}