/// <summary>
/// Extracts the text of an entire PDF file.
/// </summary>
/// <param name="PDFFilePath">Path of the PDF file to load.</param>
/// <returns>The text returned by <c>PDFTextStripper.getText</c> for the document.</returns>
public static String PDFText(String PDFFilePath)
{
    PDDocument doc = PDDocument.load(PDFFilePath);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    finally
    {
        // Close the document even when extraction throws.
        doc.close();
    }
}
/// <summary>
/// Extracts the text of every page of a PDF, keyed by 1-based page number.
/// </summary>
/// <param name="pdfFileName">Path of the PDF file to read.</param>
/// <returns>
/// A dictionary mapping each page number (starting at 1) to the value
/// <c>GetText</c> returns for that page.
/// </returns>
/// <exception cref="FileNotFoundException">
/// Thrown when <paramref name="pdfFileName"/> does not exist.
/// </exception>
public static Dictionary<int, string> Extract(string pdfFileName)
{
    if (!File.Exists(pdfFileName))
    {
        // Report the missing path instead of the literal parameter name.
        throw new FileNotFoundException("PDF file not found.", pdfFileName);
    }

    var result = new Dictionary<int, string>();
    PDDocument pdfDocument = PDDocument.load(pdfFileName);
    try
    {
        var pdfStripper = new PDFTextStripper();
        pdfStripper.setPageSeparator(Environment.NewLine + Environment.NewLine);

        int pageCount = pdfDocument.getNumberOfPages();
        for (int i = 1; i <= pageCount; i++)
        {
            // Restrict the stripper to exactly one page per iteration.
            pdfStripper.setStartPage(i);
            pdfStripper.setEndPage(i);
            result.Add(i, GetText(pdfStripper, pdfDocument));
        }
    }
    finally
    {
        // Close the document even when a page fails to extract.
        pdfDocument.close();
    }

    return result;
}
/// <summary>
/// Reads the text of a PDF document.
/// </summary>
/// <param name="file">
/// The uploaded file. NOTE(review): this parameter is currently not used;
/// the method always loads the fixed path <c>G:/Read.pdf</c>. A previous,
/// now removed, commented-out implementation saved the upload to a
/// timestamped file under a "pdf" directory first - confirm the intended
/// behaviour.
/// </param>
/// <returns>The text extracted from the PDF.</returns>
public string Read(IFormFile file)
{
    PDDocument doc = PDDocument.load(@"G:/Read.pdf");
    try
    {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        return pdfStripper.getText(doc);
    }
    finally
    {
        // Close the document even when extraction throws.
        doc.close();
    }
}
/// <summary>
/// Extracts the text of a range of pages from a PDF document.
/// </summary>
/// <param name="file">Path of the PDF document.</param>
/// <param name="startPage">First page to extract.</param>
/// <param name="endPage">Last page to extract.</param>
/// <returns>
/// The extracted text, or an empty string when a Java I/O error occurs
/// (including the file not being found).
/// </returns>
public static string ExtractTXT(String file, int startPage, int endPage)
{
    String content = string.Empty;
    PDDocument document = null;
    try
    {
        document = PDDocument.load(file);

        PDFTextStripper stripper = new PDFTextStripper();
        // Output the text sorted by position.
        stripper.setSortByPosition(true);
        stripper.setStartPage(startPage);
        stripper.setEndPage(endPage);

        content = stripper.getText(document);
    }
    catch (java.io.IOException)
    {
        // Best effort by design: I/O failures (java.io.FileNotFoundException is an
        // IOException subclass) yield whatever content was obtained, normally "".
    }
    finally
    {
        if (document != null)
        {
            try
            {
                // Close on every path, not only after a successful extraction.
                document.close();
            }
            catch (java.io.IOException)
            {
                // A failed close must not discard text that was already extracted.
            }
        }
    }

    return content;
}
/// <summary>
/// Parses the PDF referenced by <c>PdfFile</c> and returns its text, opening the
/// document protection with <c>PdfPassword</c> when the document is encrypted.
/// </summary>
/// <returns>The text extracted from the document.</returns>
public string PdfToText()
{
    BufferedInputStream input = new BufferedInputStream(new FileInputStream(PdfFile));
    try
    {
        PDDocument originalPdfDoc = null;
        try
        {
            PDFParser parser = new PDFParser(input);
            parser.parse();
            originalPdfDoc = parser.getPDDocument();

            if (originalPdfDoc.isEncrypted())
            {
                originalPdfDoc.openProtection(new StandardDecryptionMaterial(PdfPassword));
            }

            // Exceptions propagate unchanged; the former "throw ex;" reset the stack trace.
            PDFTextStripper stripper = new PDFTextStripper();
            return stripper.getText(originalPdfDoc);
        }
        finally
        {
            if (originalPdfDoc != null)
            {
                originalPdfDoc.close();
            }
        }
    }
    finally
    {
        // Close the file stream on every path.
        input.close();
    }
}
/// <summary>
/// Extracts the text of a downloaded PDF file.
/// </summary>
/// <param name="filename">
/// File name passed to <c>checkFileExists</c> and <c>getPDFFilePath</c>.
/// </param>
/// <returns>
/// The extracted text, or an empty string when extraction fails. When the file
/// is not found the method calls <c>Assert.Fail</c>.
/// </returns>
public string ExtractTextFromPdf(string filename)
{
    String text = "";

    bool fileExists;
    try
    {
        fileExists = checkFileExists(filename);
    }
    catch (Exception e)
    {
        _log.Info("Exception in extracting text from PDF: " + e.Message);
        return text;
    }

    if (!fileExists)
    {
        // Kept outside any catch-all so the assertion failure actually reaches the
        // test runner; previously the surrounding catch (Exception) swallowed it.
        Assert.Fail("PDF file not found in 'Downloads' folder.");
        return text;
    }

    try
    {
        _log.Info(filename + " exists in the download folder");

        PDDocument doc = null;
        try
        {
            doc = PDDocument.load(getPDFFilePath(filename));
            PDFTextStripper stripper = new PDFTextStripper();
            text = stripper.getText(doc);
        }
        catch (Exception e)
        {
            // Logged once, including message and stack trace.
            _log.Info("Exception in Extracting data from file " + filename + ".pdf: " + e);
        }
        finally
        {
            if (doc != null)
            {
                doc.close();
            }
        }
    }
    catch (Exception e)
    {
        // Covers failures raised while closing the document.
        _log.Info("Exception in extracting text from PDF: " + e.Message);
    }

    return text;
}
/// <summary>
/// Extracts the text of a PDF and saves it as a single paragraph in a new
/// DocX document.
/// </summary>
/// <param name="PDFpath">Path of the PDF file to convert.</param>
/// <returns>
/// The path returned by <c>fn.CreateFolderToSaveDocs(fileName)</c> where the
/// document was saved, or an empty string when any step throws.
/// </returns>
protected internal String ConvertPDFToDoc(string PDFpath)
{
    try
    {
        String extractedText;
        PDDocument pdfDoc = PDDocument.load(PDFpath);
        try
        {
            PDFTextStripper textStripper = new PDFTextStripper();
            extractedText = textStripper.getText(pdfDoc);
        }
        finally
        {
            // Close the PDF even when extraction throws.
            pdfDoc.close();
        }

        // Generate the path where the output file is saved.
        String docxPath = fn.CreateFolderToSaveDocs(fileName);
        var wordDoc = DocX.Create(docxPath);
        wordDoc.InsertParagraph(extractedText);
        wordDoc.Save();
        return docxPath;
    }
    catch (Exception)
    {
        // Callers receive "" as the failure indicator.
        return "";
    }
}
/// <summary>
/// Extracts the text of a PDF file, reporting failures to the user in a
/// message box.
/// </summary>
/// <param name="PDFFilePath">Path of the PDF file to load.</param>
/// <returns>The extracted text, or an empty string when loading or extraction fails.</returns>
public static String PDFText(String PDFFilePath)
{
    PDDocument doc = null;
    try
    {
        // Loading happens inside the try so load failures reach the handlers below.
        doc = PDDocument.load(PDFFilePath);
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    catch (UnauthorizedAccessException e)
    {
        MessageBox.Show("Невозможно скопировать текст из Пдф " + PDFFilePath + ". " + e.Message, "Сообщение об ошибке");
        return "";
    }
    catch (FileLoadException FLe)
    {
        MessageBox.Show("Невозможно загрузить Пдф " + PDFFilePath + ". " + FLe.Message, "Сообщение об ошибке");
        return "";
    }
    catch (Exception)
    {
        // Fallback handler. The former filter (text == "") could never be true
        // because text was initialised to " ", so this branch was unreachable.
        MessageBox.Show("Невозможно загрузить Пдф " + PDFFilePath + ". ", "Сообщение об ошибке");
        return "";
    }
    finally
    {
        if (doc != null)
        {
            doc.close();
        }
    }
}
/// <summary>
/// Extracts the text of a PDF file using PDFBox.
/// </summary>
/// <param name="input">Path of the PDF file to load.</param>
/// <returns>The text extracted from the document.</returns>
public string parseUsingPDFBox(string input)
{
    PDDocument doc = PDDocument.load(input);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    finally
    {
        // Close the document on every path; it was previously never closed.
        doc.close();
    }
}
/// <summary>
/// Writes the text of a PDF file to a UTF-8 text file.
/// </summary>
/// <param name="inpufFileName">Path of the PDF file to read.</param>
/// <param name="outputFileName">Path of the text file to create or overwrite.</param>
public void ExtractText(string inpufFileName, string outputFileName)
{
#if false
    IFilterTextReader.FilterReader reader = new FilterReader(inpufFileName);
    var data = reader.ReadToEnd();
    using (var writer = new StreamWriter(outputFileName, false, System.Text.Encoding.UTF8))
    {
        writer.Write(data);
    }
#else
    PDDocument pdf = null;
    try
    {
        pdf = PDDocument.load(inpufFileName);
        var textStripper = new PDFTextStripper();

        // The output file is opened before extraction starts, as in the original flow.
        using (StreamWriter output = new StreamWriter(outputFileName, false, System.Text.Encoding.UTF8))
        {
            string extracted = textStripper.getText(pdf);
            output.Write(extracted);
        }
    }
    finally
    {
        if (pdf != null)
        {
            pdf.close();
        }
    }
#endif
}
/// <summary>
/// Extracts the text of a PDF file.
/// </summary>
/// <param name="fileName">Path of the PDF file to load.</param>
/// <returns>The text extracted from the document.</returns>
private string GetTextFromPdfFile(string fileName)
{
    PDDocument doc = PDDocument.load(fileName);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    finally
    {
        // Close the document on every path; it was previously never closed.
        doc.close();
    }
}
/// <summary>
/// Extracts the text of a PDF file and checks whether it contains the given
/// text, ignoring spaces and line breaks.
/// </summary>
/// <param name="pdfFileName">Path of the PDF file.</param>
/// <param name="textToCheck">The text to look for; spaces in it are ignored.</param>
/// <returns>
/// <c>true</c> when the whitespace-stripped PDF text contains the
/// space-stripped <paramref name="textToCheck"/>; <c>false</c> otherwise,
/// including when the PDF yields no text.
/// </returns>
public static bool ExtractAndValidateTextFromPDF(string pdfFileName, string textToCheck)
{
    string result;
    PDDocument doc = PDDocument.load(pdfFileName);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        result = stripper.getText(doc);
    }
    finally
    {
        // Close the document even when extraction throws.
        doc.close();
    }

    if (result.Length == 0)
    {
        return false;
    }

    // The extracted text is already a .NET string. The former round trip through
    // Encoding.Default bytes decoded as UTF-8 could only corrupt non-ASCII
    // characters, so the text is compared directly.
    string outputPDF = result.Replace(" ", null).Replace("\r", null).Replace("\n", null);
    string validationString = textToCheck.Replace(" ", null);
    return outputPDF.Contains(validationString);
}
/// <summary>
/// Prints the text of <c>lopreacamasa.pdf</c> to the console.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    PDDocument doc = PDDocument.load("lopreacamasa.pdf");
    try
    {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        Console.Write(pdfStripper.getText(doc));
    }
    finally
    {
        // Close the document on every path; it was previously never closed.
        doc.close();
    }
}
/// <summary>
/// Loads the PDF whose path is in <c>textBox1</c> and shows its text in
/// <c>richTextBox1</c>.
/// </summary>
/// <param name="sender">Event source.</param>
/// <param name="e">Event data.</param>
private void btnShowPDF_Click(object sender, EventArgs e)
{
    PDDocument PDF = PDDocument.load(textBox1.Text);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        richTextBox1.Text = stripper.getText(PDF);
    }
    finally
    {
        // Close the document on every path; it was previously never closed.
        PDF.close();
    }
}
/// <summary>
/// Experimental routine: loads a PDF, reads several stripper properties and
/// extracts the full text. All results are discarded.
/// </summary>
/// <param name="input">Path of the PDF file to load.</param>
private static void KamilPdfTest(string input)
{
    PDDocument pdfDocument = null;
    try
    {
        pdfDocument = PDDocument.load(input);
        var textStripper = new PDFTextStripper();

        // Stripper state is queried before any text has been extracted.
        Matrix lineMatrix = textStripper.getTextLineMatrix();
        PDPage currentPage = textStripper.getCurrentPage();
        Matrix textMatrix = textStripper.getTextMatrix();
        int charCount = textStripper.getTotalCharCnt();
        string articleStart = textStripper.getArticleStart();
        string articleEnd = textStripper.getArticleEnd();

        // Puts the whole text into a string - works.
        string fullText = textStripper.getText(pdfDocument);
        charCount = fullText.Length;
    }
    finally
    {
        if (pdfDocument != null)
        {
            pdfDocument.close();
        }
    }
}
/// <summary>
/// Extracts the text of every page of a PDF, keyed by 1-based page number.
/// </summary>
/// <param name="pdfFileName">Path of the PDF file to read.</param>
/// <returns>
/// A dictionary mapping each page number (starting at 1) to the value
/// <c>GetText</c> returns for that page.
/// </returns>
/// <exception cref="FileNotFoundException">
/// Thrown when <paramref name="pdfFileName"/> does not exist.
/// </exception>
public static Dictionary<int, string> Extract(string pdfFileName)
{
    if (!File.Exists(pdfFileName))
    {
        // Report the missing path instead of the literal parameter name.
        throw new FileNotFoundException("PDF file not found.", pdfFileName);
    }

    var result = new Dictionary<int, string>();
    PDDocument pdfDocument = PDDocument.load(pdfFileName);
    try
    {
        var pdfStripper = new PDFTextStripper();
        pdfStripper.setPageSeparator(Environment.NewLine + Environment.NewLine);

        int pageCount = pdfDocument.getNumberOfPages();
        for (int i = 1; i <= pageCount; i++)
        {
            // Restrict the stripper to exactly one page per iteration.
            pdfStripper.setStartPage(i);
            pdfStripper.setEndPage(i);
            result.Add(i, GetText(pdfStripper, pdfDocument));
        }
    }
    finally
    {
        // Close the document even when a page fails to extract.
        pdfDocument.close();
    }

    return result;
}
/// <summary>
/// Gets the text from the binary content (<c>Bytes</c>) using PDFBox.
/// </summary>
/// <returns>The text of the binary, or null if the text could not be processed.</returns>
public override string GetTextFromDocumentBinary()
{
    // Without bytes there is nothing to extract.
    if (Bytes == null || Bytes.Length == 0)
    {
        log.Error("Tried to extract text from empty bytes for file " + Name);
        return null;
    }

    string text = null;
    PDDocument doc = null;
    try
    {
        java.io.ByteArrayInputStream byteStream = new java.io.ByteArrayInputStream(Bytes);
        doc = PDDocument.load(byteStream);
        PDFTextStripper stripper = new PDFTextStripper();
        text = stripper.getText(doc);
    }
    catch (Exception e)
    {
        log.Error("Failed to get the text from the PDF file " + Name, e);
    }
    finally
    {
        if (doc != null)
        {
            try
            {
                // The document was previously never closed.
                doc.close();
            }
            catch (Exception e)
            {
                // A failed close must not discard text that was already extracted.
                log.Error("Failed to close the PDF file " + Name, e);
            }
        }
    }

    return text;
}
/// <summary>
/// Extracts the text of a PDF file.
/// </summary>
/// <param name="filepath">Path of the PDF file to load.</param>
/// <returns>The text extracted from the document.</returns>
public string parsePDF(string filepath)
{
    PDDocument document = PDDocument.load(filepath);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(document);
    }
    finally
    {
        // Close the document on every path; it was previously never closed.
        document.close();
    }
}
/// <summary>
/// Extracts the text of a PDF file.
/// </summary>
/// <param name="path">Path of the PDF file to load.</param>
/// <returns>The text extracted from the document.</returns>
public string ParseFile(string path)
{
    PDDocument doc = PDDocument.load(path);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    finally
    {
        // Close in finally so the document is also released when getText throws.
        doc.close();
    }
}
/// <summary>
/// Reads the text of a PDF file.
/// </summary>
/// <param name="path">Path of the PDF file to load.</param>
/// <returns>The text extracted from the document.</returns>
private static string ReadPdf(string path)
{
    PDDocument doc = PDDocument.load(path);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    finally
    {
        // Close the document on every path; it was previously never closed.
        doc.close();
    }
}
/// <summary>
/// Reads the text of a PDF file.
/// </summary>
/// <param name="fileName">The PDF file; its <c>FullName</c> is the path that is loaded.</param>
/// <returns>The text extracted from the document.</returns>
public static string PdfFileReader(FileInfo fileName)
{
    PDDocument doc = PDDocument.load(fileName.FullName);
    try
    {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        return pdfStripper.getText(doc);
    }
    finally
    {
        // Close the document on every path; it was previously never closed.
        doc.close();
    }
}
/// <summary>
/// Extracts the text of a PDF file.
/// </summary>
/// <param name="path">Path of the PDF file to load.</param>
/// <returns>The text extracted from the document.</returns>
public string ParseFile(string path)
{
    PDDocument doc = PDDocument.load(path);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    finally
    {
        // Close in finally so the document is also released when getText throws.
        doc.close();
    }
}
/// <summary>
/// Converts every <c>*.pdf</c> file directly inside a folder to a gb2312-encoded
/// text file, written to a new timestamp-named subfolder, while updating the
/// progress label and progress bar.
/// </summary>
/// <param name="rootDir">Folder that contains the PDF files.</param>
/// <returns>Path of the subfolder that receives the text files.</returns>
private string convertPdfToTxt(string rootDir)
{
    DirectoryInfo root = new DirectoryInfo(rootDir);
    FileInfo[] files = root.GetFiles("*.pdf");
    if (files.Length == 0)
    {
        MessageBox.Show("文件夹下不包含pdf文件.");
    }

    string bakFolderName = rootDir + "\\" + DateTime.Now.ToString("yyyyMMddHHmmss");
    DirectoryInfo rootBak = new DirectoryInfo(bakFolderName);
    if (!rootBak.Exists)
    {
        rootBak.Create();
    }

    SetPbValue(0);

    // Step 1: convert each PDF to text.
    for (int i = 0; i < files.Length; i++)
    {
        try
        {
            // Completion percentage, computed once and used for label and bar.
            decimal percent = decimal.Round(Convert.ToDecimal(i + 1) / Convert.ToDecimal(files.Length) * 100, 2);
            SetLableText(string.Format("pdf转txt.已处理{0},共{1},完成比例:{2}%", i + 1, files.Length, percent.ToString()), lblInfo);
            SetPbValue((int)percent);

            FileInfo pdfFile = files[i];
            // Derive the name from the real extension instead of assuming it is
            // exactly four characters long.
            string txtFilePath = bakFolderName + "\\" + Path.GetFileNameWithoutExtension(pdfFile.Name) + ".txt";
            FileInfo txtFile = new FileInfo(txtFilePath);
            if (!txtFile.Exists)
            {
                using (FileStream stream = txtFile.Create())
                {
                }
            }

            string text;
            PDDocument doc = PDDocument.load(pdfFile.FullName);
            try
            {
                PDFTextStripper pdfStripper = new PDFTextStripper();
                text = pdfStripper.getText(doc);
            }
            finally
            {
                // The document was previously never closed.
                doc.close();
            }

            // using guarantees the writer is flushed and closed even when Write throws.
            using (StreamWriter swPdfChange = new StreamWriter(txtFile.FullName, false, Encoding.GetEncoding("gb2312")))
            {
                swPdfChange.Write(text);
            }
        }
        catch (Exception)
        {
            // Best effort by design: a file that fails is skipped and the batch continues.
            continue;
        }
    }

    this.SetLableText(bakFolderName, txtTxtFolder);
    return bakFolderName;
}
/// <summary>
/// Extracts the text of a PDF file and splits it on carriage-return characters.
/// </summary>
/// <param name="path">Path of the PDF file to load.</param>
/// <returns>The extracted text split on <c>'\r'</c>.</returns>
public static String[] PdfToText(string path)
{
    PDDocument doc = PDDocument.load(path);
    try
    {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        return pdfStripper.getText(doc).Split('\r');
    }
    finally
    {
        // Close the document on every path; it was previously never closed.
        doc.close();
    }
}
/// <summary>
/// Loads the PDF whose path is in <c>textBox1</c>, shows its text in
/// <c>richTextBox1</c> and starts speaking it asynchronously with a female voice.
/// </summary>
/// <param name="sender">Event source.</param>
/// <param name="e">Event data.</param>
private void button2_Click(object sender, EventArgs e)
{
    PDDocument doc = PDDocument.load(textBox1.Text);
    try
    {
        PDFTextStripper striper = new PDFTextStripper();
        richTextBox1.Text = striper.getText(doc);
    }
    finally
    {
        // Close the document on every path; it was previously never closed.
        doc.close();
    }

    speechsynth.SelectVoiceByHints(VoiceGender.Female);
    speechsynth.SpeakAsync("" + richTextBox1.Text);
}
/**
 * string getFilePath(string path)
 * {
 *     // Specify the path to save the uploaded file to.
 *     string savePath = "C:\\Users\\DR.AKUL\\Documents\\Visual Studio 2010\\Projects\\PlagijatorFinder\\PlagijatorFinder\\uploadFiles\\";
 *
 *     // Get the name of the file to upload.
 *     string fileName = FileUpload1.FileName;
 *
 *     // Create the path and file name to check for duplicates.
 *     path = savePath + fileName + ".txt";
 *     return path;
 * }
 **/

/// <summary>
/// Extracts the text of a PDF file using PDFBox.
/// </summary>
/// <param name="filename">Path of the PDF file to load.</param>
/// <returns>The text extracted from the document.</returns>
private static string parseUsingPDFBox(string filename)
{
    PDDocument doc = PDDocument.load(filename);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    finally
    {
        // Close in finally so the document is also released when getText throws.
        doc.close();
    }
}
/// <summary>
/// Reads the configured PDF, collects every identifier matching
/// <c>FA</c> followed by eight digits, and removes the <c>dataPackItem</c>
/// nodes with those ids from the XML file named in <c>filePath</c>. The XML is
/// first saved with a <c>.backup</c> suffix.
/// </summary>
/// <param name="sender">Event source.</param>
/// <param name="e">Event data.</param>
private void button_Click(object sender, RoutedEventArgs e)
{
    PDDocument doc = null;
    Mouse.OverrideCursor = Cursors.Wait;
    try
    {
        doc = PDDocument.load(Properties.Settings.Default.PdfPath);
        PDFTextStripper stripper = new PDFTextStripper();
        string data = stripper.getText(doc);

        MatchCollection match = Regex.Matches(data, "FA\\d{8}", RegexOptions.IgnoreCase);
        if (match.Count > 0)
        {
            try
            {
                XmlDocument xml = new XmlDocument();
                xml.Load(this.filePath.Text);
                // Keep a copy of the untouched file before removing anything.
                xml.Save(this.filePath.Text + ".backup");

                var manager = new XmlNamespaceManager(xml.NameTable);
                manager.AddNamespace("dat", "http://www.stormware.cz/schema/version_2/data.xsd");

                int count = 0;
                foreach (var item in match)
                {
                    string request = "/dat:dataPack/dat:dataPackItem[@id=\"" + item + "\"]";
                    try
                    {
                        foreach (XmlNode node in xml.SelectNodes(request, manager))
                        {
                            node.ParentNode.RemoveChild(node);
                            count++;
                        }
                    }
                    catch (Exception)
                    {
                        // Best effort by design: a failing id is skipped and the rest are processed.
                    }
                }

                xml.Save(this.filePath.Text);
                MessageBox.Show("Erased " + count + " items.");
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.Message + "\nPlease check request XPath syntax.", "Error", MessageBoxButton.OK);
            }
        }
    }
    finally
    {
        try
        {
            if (doc != null)
            {
                doc.close();
            }
        }
        finally
        {
            // Reset the cursor on every path; previously an exception left the
            // wait cursor active.
            Mouse.OverrideCursor = null;
        }
    }
}
/// <summary>
/// Extracts the text of a PDF file and splits it with the given separators.
/// </summary>
/// <param name="file">Path of the PDF file to load.</param>
/// <param name="split">Separator strings passed to <c>string.Split</c>.</param>
/// <param name="option">Split options passed to <c>string.Split</c>.</param>
/// <returns>The extracted text split by <paramref name="split"/>.</returns>
private static string[] PDFToTextPDFBox(string file, string[] split, StringSplitOptions option)
{
    string pdftext;
    PDDocument doc = PDDocument.load(file);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        pdftext = stripper.getText(doc);
    }
    finally
    {
        // Close in finally so the document is also released when getText throws.
        doc.close();
    }

    return pdftext.Split(split, option);
}
/// <summary>
/// Extracts the text of a PDF and trims trailing whitespace.
/// </summary>
/// <param name="filePath">
/// NOTE(review): this parameter is not used; the document is loaded from
/// <c>fullFilePath</c>, which is defined outside this method - confirm which
/// path is intended.
/// </param>
/// <returns>The extracted text with trailing whitespace removed.</returns>
public string ExtractTextFromPdf(string filePath)
{
    var doc = PDDocument.load(fullFilePath);
    try
    {
        var stripper = new PDFTextStripper();
        return stripper.getText(doc).TrimEnd();
    }
    finally
    {
        // Close the document on every path; it was previously never closed.
        doc.close();
    }
}
/// <summary>
/// Verifies that the last entry of <c>PSEReportString</c>, built from the
/// extracted text of <c>doc</c>, is the grand-total footnote.
/// </summary>
public void TestReaderLastLine()
{
    string expected = "*** Grand total includes main,oddlot and block sale transactions";

    string reportText = new PDFTextStripper().getText(doc).TrimEnd();
    var reportReader = new PSEReportReader(reportText);

    int lastIndex = reportReader.PSEReportString.Count - 1;
    string actual = reportReader.PSEReportString[lastIndex].Trim();

    Assert.AreEqual(expected, actual);
}
/// <summary>
/// Extracts the text of a PDF file using PDFBox.
/// </summary>
/// <param name="path">Path of the PDF file to load.</param>
/// <returns>The text extracted from the document.</returns>
string get_text_from_pdf_by_pdfbox(string path)
{
    PDDocument pdffile = PDDocument.load(new java.io.File(path));
    try
    {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        return pdfStripper.getText(pdffile);
    }
    finally
    {
        // Close in finally so the document is also released when getText throws.
        pdffile.close();
    }
}
/// <summary>
/// Extracts the text of a PDF supplied as a .NET stream.
/// </summary>
/// <param name="stream">Stream with the PDF content; wrapped in a <c>JavaIoWrapper</c> for loading.</param>
/// <returns>The text extracted from the document.</returns>
public string GetStringFromPdfStream(Stream stream)
{
    // If loading throws there is no document yet, so nothing needs closing.
    PDDocument pdf = PDDocument.load(new JavaIoWrapper(stream));
    try
    {
        return new PDFTextStripper().getText(pdf);
    }
    finally
    {
        if (pdf != null)
        {
            pdf.close();
        }
    }
}
/// <summary>
/// Parses a PDF media item and returns its content as a string.
/// </summary>
/// <param name="mediaItem">
/// The media item; when its MIME type is not <c>PdfMimetype</c> an empty
/// string is returned.
/// </param>
/// <returns>
/// The text of the PDF, or an empty string when the item is not a PDF,
/// decryption fails, or parsing throws.
/// </returns>
public static string ParsePdf(MediaItem mediaItem)
{
    if (mediaItem.MimeType != PdfMimetype)
    {
        return string.Empty;
    }

    PDDocument loaded = null;
    PDDocument doc = null;
    ikvm.io.InputStreamWrapper wrapper = null;
    try
    {
        var stream = mediaItem.GetMediaStream();
        wrapper = new ikvm.io.InputStreamWrapper(stream);
        loaded = PDDocument.load(wrapper);
        doc = loaded;

        if (doc.isEncrypted())
        {
            string[] pwArray = LoadPasswords();
            doc = Decrypt(loaded, pwArray);
            if (doc == null)
            {
                Log.Warn("PdfUtil :: ParsePDF :: Decryption Failed for: [" + mediaItem.Name + "]", typeof(PdfUtil));
                return string.Empty;
            }

            Log.Debug("PdfUtil :: ParsePDF :: Successfully decrypted [" + mediaItem.Name + "]", typeof(PdfUtil));
        }

        var stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    catch (Exception ex)
    {
        Log.Error("PdfUtil :: ParsePDF :: Error parsing pdf: [" + mediaItem.Name + "]", ex);
        return string.Empty;
    }
    finally
    {
        try
        {
            if (doc != null)
            {
                doc.close();
            }
            else if (loaded != null)
            {
                // Decryption returned null: the loaded document would otherwise
                // be left open.
                loaded.close();
            }
        }
        finally
        {
            // Close the wrapper independently of the document; previously it
            // stayed open whenever doc was null.
            if (wrapper != null)
            {
                wrapper.close();
            }
        }
    }
}
/// <summary>
/// Converts the PDF files in <c>FolderLocation</c> to text files with the same
/// name and a <c>.txt</c> extension, skipping PDFs that already have one.
/// The conversions run in parallel.
/// </summary>
public static void ConvertPDFFilesToTextFiles()
{
    var allFiles = Directory.GetFiles(FolderLocation);

    // Text files that already exist. These come from the full directory listing;
    // previously the PDF list itself was searched for ".txt", which is always
    // empty, so nothing was ever skipped.
    var existingTextFiles = allFiles
        .Where(f => string.Equals(Path.GetExtension(f), ".txt", System.StringComparison.OrdinalIgnoreCase))
        .ToArray();

    // Only convert PDF files that do not have a text file yet.
    var pdfFilesToConvert = allFiles
        .Where(f => string.Equals(Path.GetExtension(f), ".pdf", System.StringComparison.OrdinalIgnoreCase))
        .Where(f => !existingTextFiles.Contains(Path.ChangeExtension(f, ".txt"), System.StringComparer.OrdinalIgnoreCase))
        .ToList();

    // Parallelize the PDF conversion.
    pdfFilesToConvert.AsParallel().ForAll(filePath =>
    {
        string text;
        var doc = PDDocument.load(filePath);
        try
        {
            var pdfStripper = new PDFTextStripper();
            text = pdfStripper.getText(doc);
        }
        finally
        {
            // The document was previously never closed.
            doc.close();
        }

        // Replace only the extension, never a ".pdf" elsewhere in the path.
        File.WriteAllText(Path.ChangeExtension(filePath, ".txt"), text);
    });
}
/// <summary>
/// Extracts the text of a PDF response into <c>propertyBag.Text</c>.
/// </summary>
/// <param name="crawler">The crawler (not used by this step).</param>
/// <param name="propertyBag">Carries the status code and response bytes, and receives the text.</param>
/// <returns>A completed task whose result is always <c>true</c>.</returns>
public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    bool hasPdfPayload = propertyBag.StatusCode == HttpStatusCode.OK && propertyBag.Response != null;
    if (hasPdfPayload)
    {
        // If loading throws there is no document yet, so nothing needs closing.
        PDDocument pdf = PDDocument.load(new ByteArrayInputStream(propertyBag.Response));
        try
        {
            propertyBag.Text = new PDFTextStripper().getText(pdf);
        }
        finally
        {
            if (pdf != null)
            {
                pdf.close();
            }
        }
    }

    return Task.FromResult(true);
}
/// <summary>
/// Parses a PDF file with <c>PDFParser</c> and returns its text.
/// </summary>
/// <param name="fileName">Path of the PDF file to parse.</param>
/// <returns>The text stripped from the document.</returns>
public string Parse(string fileName)
{
    // Load the file through java.io because PDFBox is ported from Java.
    var pdfFile = new FileInputStream(fileName);
    try
    {
        var pdfParser = new PDFParser(pdfFile);
        // Parse the document so the COSDocument becomes available.
        pdfParser.parse();

        // COSDocument is the in-memory representation of the PDF, see
        // https://pdfbox.apache.org/docs/1.8.4/javadocs/org/apache/pdfbox/cos/COSDocument.html
        var cosDocument = pdfParser.getDocument();
        try
        {
            var pdDocument = new PDDocument(cosDocument);
            try
            {
                var pdfTextStripper = new PDFTextStripper();
                return pdfTextStripper.getText(pdDocument);
            }
            finally
            {
                // Closes all storage and deletes the temporary files.
                pdDocument.close();
            }
        }
        finally
        {
            cosDocument.close();
        }
    }
    finally
    {
        // The input stream was previously never closed.
        pdfFile.close();
    }
}
/// <summary>
/// Searches the first pages of a PDF for an ISBN, one page range at a time, so
/// later pages are only read when earlier ranges yield nothing.
/// </summary>
/// <param name="filename">Path of the PDF file to search.</param>
/// <returns>
/// A <c>ResultISBN</c> with the ISBN and the text of the range it was found in,
/// or <c>ResultISBN(null, null)</c> when none is found or an error occurs.
/// Errors are appended to <c>log_Parser.txt</c>.
/// </returns>
private ResultISBN parseISBNwithPDFBox(string filename)
{
    // Start/end pages of the successive search ranges (values unchanged).
    int[][] pageRanges =
    {
        new[] { 0, 3 },
        new[] { 3, 7 },
        new[] { 7, 10 },
    };

    try
    {
        PDDocument doc = PDDocument.load(filename);
        try
        {
            foreach (int[] range in pageRanges)
            {
                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setStartPage(range[0]);
                stripper.setEndPage(range[1]);

                string rezultat = stripper.getText(doc);
                string isbn = (new ISBN()).getISBNFromContent(rezultat);
                if (isbn != null)
                {
                    return new ResultISBN(isbn, rezultat);
                }
            }

            return new ResultISBN(null, null);
        }
        finally
        {
            // The document was previously never closed on any path.
            doc.close();
        }
    }
    catch (Exception e)
    {
        File.AppendAllText("log_Parser.txt", DateTime.Now.ToShortDateString() + " " + DateTime.Now.ToShortTimeString() + ": " + e.Message + " " + filename + Environment.NewLine);
        return new ResultISBN(null, null);
    }
}
/// <summary>
/// Extracts the text of a PDF file.
/// </summary>
/// <param name="filepath">Path of the PDF file to load.</param>
/// <returns>The text extracted from the document.</returns>
public string parsePDF(string filepath)
{
    PDDocument document = PDDocument.load(filepath);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(document);
    }
    finally
    {
        // Close the document on every path; it was previously never closed.
        document.close();
    }
}
/// <summary>
/// Background work: loads the PDF named by <c>_fileName</c>, advances the
/// progress form, and stores the extracted text in <c>_result</c>.
/// <c>_success</c> records whether this completed without an exception.
/// The form is hidden afterwards on every path.
/// </summary>
/// <param name="sender">Event source.</param>
/// <param name="e">Event data.</param>
private void worker_DoWork(object sender, DoWorkEventArgs e)
{
    try
    {
        PDDocument doc = PDDocument.load(_fileName);
        try
        {
            EmptyMethod step = StepForm;
            if (step != null)
            {
                form.Invoke(step);
            }

            PDFTextStripper stripper = new PDFTextStripper();
            _result = stripper.getText(doc);
            _success = true;
        }
        finally
        {
            // The document was previously never closed.
            doc.close();
        }
    }
    catch
    {
        _success = false;
    }
    finally
    {
        EmptyMethod hide = HideForm;
        if (hide != null)
        {
            form.Invoke(hide);
        }
    }
}
/// <summary>
/// Returns the full text of <c>document</c>.
/// </summary>
/// <returns>The text a new <c>PDFTextStripper</c> extracts from <c>document</c>.</returns>
public override string ReadToEnd()
{
    string fullText = new PDFTextStripper().getText(document);
    return fullText;
}
/// <summary>
/// Returns the full text of <c>_pdfDocument</c>.
/// </summary>
/// <returns>The text a new <c>PDFTextStripper</c> extracts from <c>_pdfDocument</c>.</returns>
public string ReadDocument()
{
    string documentText = new PDFTextStripper().getText(_pdfDocument);
    return documentText;
}
/// <summary>
/// Extracts the text of a PDF file.
/// </summary>
/// <param name="fileName">Path of the PDF file to load.</param>
/// <returns>The text extracted from the document.</returns>
private string GetTextFromPdfFile(string fileName)
{
    PDDocument doc = PDDocument.load(fileName);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    finally
    {
        // Close the document on every path; it was previously never closed.
        doc.close();
    }
}
/// <summary>
/// Extracts the text of a PDF file, closing the document afterwards.
/// </summary>
/// <param name="path">Path of the PDF file to load.</param>
/// <returns>The text extracted from the document.</returns>
private static string ExtractTextFromPdf(string path)
{
    // If loading throws there is no document yet, so nothing needs closing.
    PDDocument pdf = PDDocument.load(path);
    try
    {
        return new PDFTextStripper().getText(pdf);
    }
    finally
    {
        if (pdf != null)
        {
            pdf.close();
        }
    }
}
/// <summary>
/// Extracts the text of <c>2.pdf</c>, pulls the organisation and person fields
/// out of it with regular expressions, and shows the raw text in
/// <c>textBox4</c> and the " / "-joined fields in <c>textBox3</c>.
/// A field whose pattern does not match is shown as an empty string.
/// </summary>
private void parsePDF()
{
    string text;
    PDDocument doc = PDDocument.load("2.pdf");
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        text = stripper.getText(doc);
    }
    finally
    {
        // The document was previously never closed.
        doc.close();
    }

    const RegexOptions options = RegexOptions.Multiline | RegexOptions.IgnoreCase;

    var orgName = Regex.Match(text, "Сокращенное наименование (.*)\"", options).Groups[1].Value;
    var inn = Regex.Match(text, "ИНН (.*)(\n[^0-9])?.*\r", options).Groups[1].Value;
    var kpp = Regex.Match(text, "КПП (.*)(\n[^0-9])?.*\r", options).Groups[1].Value;
    var post = Regex.Match(text, "Должность (.*)(\n[^0-9])?.*\r", options).Groups[1].Value;
    var lastName = Regex.Match(text, "Фамилия (.*)(\n[^0-9])?.*\r", options).Groups[1].Value;
    var firstName = Regex.Match(text, "Имя (.*)(\n[^0-9])?.*\r", options).Groups[1].Value;
    var surName = Regex.Match(text, "Отчество (.*)(\n[^0-9])?.*\r", options).Groups[1].Value;
    var ogrn = Regex.Match(text, "ОГРН (.*)(\n[^0-9])?.*\r", options).Groups[1].Value;
    var postIndex = Regex.Match(text, "Почтовый индекс (.*)(\n[^0-9])?.*\r", options).Groups[1].Value;
    var city = Regex.Match(text, "Субъект Российской Федерации (.*)(\n[^0-9])?.*\r", options).Groups[1].Value;

    // The label parentheses are escaped with a backslash. The former "/(" and "/)"
    // opened a capture group, so Groups[1] held label text instead of the value.
    // NOTE(review): assumes the PDF text contains the labels with literal
    // parentheses - confirm against a sample document.
    var street = Regex.Match(text, "Улица \\(проспект, переулок и т.д.\\) (.*)(\n[^0-9])?.*\r", options).Groups[1].Value;
    var house = Regex.Match(text, "Дом \\(владение и т.п.\\) (.*)(\n[^0-9])?.*\r", options).Groups[1].Value;

    textBox4.Text = text;
    textBox3.Text = orgName + " / " + inn + " / " + kpp + " / " + post + " / " + lastName + " / " + firstName + " / " + surName + " / " + ogrn + " / " + postIndex + " / " + city + " / " + street + " / " + house;
}
/// <summary>
/// Parses a PDF and returns the text of its first ten pages followed by the
/// text of its final pages (from page <c>count - 10</c> to the last page,
/// never overlapping the first ten).
/// </summary>
/// <param name="filename">Path of the PDF file to load.</param>
/// <returns>The concatenated text, or <c>null</c> when the file cannot be loaded.</returns>
public static string ParseUsingPdfBox(string filename)
{
    PDDocument doc;
    try
    {
        doc = PDDocument.load(filename);
    }
    catch
    {
        // A document that cannot be loaded is reported as null.
        return null;
    }

    try
    {
        var sb = new StringBuilder();
        var stripper = new PDFTextStripper();

        // Use the real page count. getEndPage() on a fresh stripper only returns
        // the stripper's default end-page setting, so the tail range was never
        // inside the document.
        int lastPage = doc.getNumberOfPages();

        stripper.setStartPage(1);
        stripper.setEndPage(10);
        sb.Append(stripper.getText(doc));

        // Start the tail after page 10 so short documents are not duplicated.
        int tailStart = lastPage - 10;
        if (tailStart < 11)
        {
            tailStart = 11;
        }

        if (tailStart <= lastPage)
        {
            stripper.setStartPage(tailStart);
            stripper.setEndPage(lastPage);
            sb.Append(stripper.getText(doc));
        }

        return sb.ToString();
    }
    finally
    {
        // Close the document even when extraction throws.
        doc.close();
    }
}
/// <summary>
/// Extracts the text of a PDF file and either returns it or writes it to a file.
/// </summary>
/// <param name="inputFile">Path of the PDF file to load.</param>
/// <param name="outputFile">
/// Path of the file to write the text to; when null or empty the text is
/// returned instead.
/// </param>
/// <returns>
/// The extracted text when <paramref name="outputFile"/> is null or empty;
/// otherwise an empty string after the text has been written.
/// </returns>
private static string ParseUsingPDFBox(string inputFile, string outputFile)
{
    string result;
    var doc = PDDocument.load(inputFile);
    try
    {
        var stripper = new PDFTextStripper();
        result = stripper.getText(doc);
    }
    finally
    {
        // The document was previously never closed.
        doc.close();
    }

    if (string.IsNullOrEmpty(outputFile))
    {
        return result;
    }

    using (var sw = new StreamWriter(outputFile))
    {
        sw.WriteLine(result);
    }

    return string.Empty;
}
/// <summary>
/// Extracts the text of a PDF file using PDFBox.
/// </summary>
/// <param name="filename">Path of the PDF file to load.</param>
/// <returns>The text extracted from the document.</returns>
private static string parseUsingPDFBox(string filename)
{
    PDDocument doc = PDDocument.load(filename);
    try
    {
        PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(doc);
    }
    finally
    {
        // Close the document on every path; it was previously never closed.
        doc.close();
    }
}