/// <summary>
/// Extracts plain text from the leading pages of a PDF file.
/// </summary>
/// <param name="path">Path of the PDF file to read.</param>
/// <param name="pagesToReturn">
/// Number of leading pages to extract. <c>-1</c>, or any value greater than or equal to the
/// document's page count, extracts every page. Zero and other negative values extract nothing
/// and produce an empty string.
/// </param>
/// <returns>
/// The concatenated text of the requested pages, or <c>null</c> when the file cannot be
/// read or parsed (the exception is logged and written to the console).
/// </returns>
private string GetTextFromPDF(string path, int pagesToReturn)
{
    try
    {
        StringBuilder text = new StringBuilder();
        using (PdfReader reader = new PdfReader(path))
        {
            bool wantAllPages = pagesToReturn == -1 || pagesToReturn >= reader.NumberOfPages;
            int lastPage = wantAllPages ? reader.NumberOfPages : pagesToReturn;

            for (int page = 1; page <= lastPage; page++)
            {
                // A fresh strategy per page: SimpleTextExtractionStrategy accumulates
                // everything it has rendered, so sharing one instance across pages makes
                // every call return the text of all previous pages again.
                var strategy = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
                text.Append(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, page, strategy));
            }
        }
        return text.ToString();
    }
    catch (Exception e)
    {
        Log.AddMessage(e.ToString(), "Error");
        Console.WriteLine(e.ToString() + " : " + e.InnerException);
        return null;
    }
}
/// <summary>
/// Reads the text of every page of a PDF file.
/// </summary>
/// <param name="filepath">Path of the PDF file to read.</param>
/// <param name="docText">
/// Receives the concatenated page text followed by a single trailing space, or
/// <see cref="string.Empty"/> when extraction fails.
/// </param>
/// <returns><c>true</c> on success; <c>false</c> when any exception occurred (its message is written to the console).</returns>
public static bool ReadPdfText(string filepath, out string docText)
{
    docText = string.Empty;
    try
    {
        StringBuilder sbDocText = new StringBuilder();

        // 'using' guarantees the reader is released even when a page fails to parse;
        // previously Close() was skipped on any exception.
        using (PdfReader pdfReader = new PdfReader(filepath))
        {
            int numberOfPages = pdfReader.NumberOfPages;
            for (int page = 1; page <= numberOfPages; ++page)
            {
                iTextSharp.text.pdf.parser.ITextExtractionStrategy strategy =
                    new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
                sbDocText.Append(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
            }
        }

        sbDocText.Append(" ");
        docText = sbDocText.ToString();
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
        return false;
    }
    return true;
}
/// <summary>
/// Extracts the text of a PDF from <paramref name="startPage"/> through the last page.
/// </summary>
/// <param name="PdfFileName">Path of the PDF file to read.</param>
/// <param name="startPage">1-based number of the first page to extract.</param>
/// <returns>
/// The concatenated text of the selected pages; an empty string when
/// <paramref name="startPage"/> is beyond the last page.
/// </returns>
/// <remarks>Exceptions raised by the PDF library are not caught here and propagate to the caller.</remarks>
private string GetTableAsStringFromPDF(string PdfFileName, int startPage)
{
    System.Text.StringBuilder output = new System.Text.StringBuilder();

    // The reader was previously never closed; 'using' releases it on every path.
    using (iTextSharp.text.pdf.PdfReader pdfReader = new iTextSharp.text.pdf.PdfReader(PdfFileName))
    {
        for (int page = startPage; page <= pdfReader.NumberOfPages; page++)
        {
            iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy strategy =
                new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
            output.Append(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
        }
    }

    return output.ToString();
}
/// <summary>
/// Splits a multi-page PDF into one temporary PDF per recognised pay stub.
/// </summary>
/// <param name="filename">Path of the source PDF.</param>
/// <returns>
/// One <c>PayStub</c> per page for which <c>FindPayStubMatch</c> returned a non-null result,
/// each with <c>Filename</c> set to the single-page PDF written to the temp directory.
/// Pages without a match are skipped.
/// </returns>
/// <remarks>
/// When the stub carries a non-empty password the output file is AES-128 encrypted with that
/// password as both user and owner password, printing allowed, metadata left unencrypted.
/// </remarks>
public List<PayStub> GenerateIndividualStubs(string filename)
{
    List<PayStub> stubs = new List<PayStub>();

    using (var reader = new PdfReader(filename))
    {
        for (var page = 1; page <= reader.NumberOfPages; page++)
        {
            // Extract the page text and try to associate it with a pay stub.
            var strategy = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
            var currentText = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, page, strategy);
            var paystub = FindPayStubMatch(currentText);
            if (paystub == null)
            {
                continue;
            }

            var outFileName = System.IO.Path.GetTempPath() + Guid.NewGuid() + ".pdf";

            // Copy this single page into an in-memory PDF first. The 'using' blocks make
            // sure the buffer, the intermediate reader and the output file are released
            // even when copying, stamping or encryption throws.
            using (var stream = new MemoryStream())
            {
                var pdfDoc = new Document(reader.GetPageSizeWithRotation(page));
                var pdf = new PdfCopy(pdfDoc, stream);
                pdf.CloseStream = false; // keep the buffer open so it can be re-read below
                pdfDoc.Open();
                pdf.AddPage(pdf.GetImportedPage(reader, page));
                pdf.Close();
                pdfDoc.Close();
                stream.Position = 0;

                using (var pageReader = new PdfReader(stream))
                using (var output = new FileStream(outFileName, FileMode.Create))
                {
                    var stamper = new PdfStamper(pageReader, output);
                    if (!string.IsNullOrEmpty(paystub.Password))
                    {
                        // NOTE(review): ASCII encoding turns non-ASCII password characters
                        // into '?'. Left unchanged so existing passwords keep working.
                        byte[] password = Encoding.ASCII.GetBytes(paystub.Password);
                        stamper.SetEncryption(
                            password,
                            password,
                            PdfWriter.ALLOW_PRINTING,
                            PdfWriter.ENCRYPTION_AES_128 | PdfWriter.DO_NOT_ENCRYPT_METADATA);
                    }
                    stamper.Close();
                }
            }

            paystub.Filename = outFileName;
            stubs.Add(paystub);
        }
    }

    return stubs;
}
/// <summary>
/// Builds a <c>TDocs</c> record for a PDF file and fills it with the extracted text.
/// </summary>
/// <param name="fi">The PDF file to process.</param>
/// <returns>
/// A record whose <c>Path</c>, <c>Name</c>, lower-cased <c>Extension</c> and <c>Title</c>
/// (file name without extension) are always set. <c>Content</c> is set only when extraction
/// finished without an exception; on failure the exception message is written to the debug output.
/// </returns>
/// <remarks>
/// Extraction stops early, keeping the pages read so far, once more than <c>timeLimit</c>
/// milliseconds have elapsed. <c>timeLimit</c> is a member declared outside this method.
/// </remarks>
public TDocs Process(FileInfo fi)
{
    StringBuilder strBuilder = new StringBuilder();
    PdfReader pdfReader = null;

    TDocs tdoc = new TDocs();
    tdoc.Path = fi.FullName;
    tdoc.Name = fi.Name;
    tdoc.Extension = fi.Extension.ToLower();
    // Substring(0, LastIndexOf('.')) threw for names without a dot; this yields the same
    // title for every name that has one and the full name otherwise.
    tdoc.Title = System.IO.Path.GetFileNameWithoutExtension(tdoc.Name);

    try
    {
        // Stopwatch is unaffected by system clock adjustments, unlike DateTime.Now.
        Stopwatch elapsed = Stopwatch.StartNew();
        pdfReader = new PdfReader(fi.FullName);
        int numberOfPages = pdfReader.NumberOfPages;

        for (int page = 1; page <= numberOfPages; ++page)
        {
            if (elapsed.Elapsed.TotalMilliseconds > timeLimit)
            {
                break;
            }

            iTextSharp.text.pdf.parser.ITextExtractionStrategy strategy =
                new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
            strBuilder.Append(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
        }

        tdoc.Content = strBuilder.ToString();
    }
    catch (Exception ex)
    {
        Debug.WriteLine(ex.Message);
    }
    finally
    {
        if (pdfReader != null)
        {
            pdfReader.Close();
        }
    }

    return tdoc;
}
/// <summary>
/// Comparison harness: extracts text from sample PDFs with several libraries and writes
/// each result to its own text file in the working directory.
/// </summary>
/// <remarks>
/// Results recorded by the original author for each sample file:
/// <list type="bullet">
/// <item>iTextSharp PdfReader: letter.pdf fails ("Index was outside the bounds of the array."),
/// employe-1.pdf ok, feuille_de_paie.pdf fails ("Rebuild failed: trailer not found.;
/// Original message: PDF startxref not found."), modele-bulletin-de-salaire.pdf ok.</item>
/// <item>PdfSharp PdfReader.Open: letter.pdf ok, employe-1.pdf ok, feuille_de_paie.pdf fails
/// ("Non-negative number required."), modele-bulletin-de-salaire.pdf ok.</item>
/// <item>PDFParser (iTextSharp based): only employe-1.pdf ok; letter.pdf, feuille_de_paie.pdf
/// and modele-bulletin-de-salaire.pdf fail.</item>
/// <item>PdfTextExtractor.GetText (PdfSharp based): letter.pdf ok, employe-1.pdf ok,
/// feuille_de_paie.pdf fails ("Non-negative number required."),
/// modele-bulletin-de-salaire.pdf ok.</item>
/// </list>
/// </remarks>
static void Main(string[] args)
{
    ITextParse.ExtractText("employe-1.pdf", "iparse.txt");

    // iTextSharp: page-by-page extraction with the simple strategy
    // (LocationTextExtractionStrategy is the alternative that was tried).
    using (iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader("modele-bulletin-de-salaire.pdf"))
    {
        StringBuilder text = new StringBuilder();
        for (int page = 1; page <= reader.NumberOfPages; page++)
        {
            iTextSharp.text.pdf.parser.ITextExtractionStrategy strategy =
                new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();

            // The extracted string is appended as-is. The former round trip through
            // Encoding.Default and UTF-8 could only lose information: characters outside
            // the default code page were replaced, everything else came back unchanged.
            text.Append(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, page, strategy));
        }

        using (System.IO.StreamWriter file = new System.IO.StreamWriter("itextsharp.txt"))
        {
            file.WriteLine(text);
        }
    }

    // PdfSharp: walk the pages and dump every extracted fragment.
    using (var _document = PdfReader.Open("letter.pdf", PdfDocumentOpenMode.ReadOnly))
    {
        using (System.IO.StreamWriter file = new System.IO.StreamWriter("pdfsharp.txt"))
        {
            foreach (PdfPage page in _document.Pages)
            {
                var text = ExtractText(page);
                foreach (string s in text)
                {
                    file.Write(s);
                }
            }
        }
    }

    // PDFParser (iTextSharp based).
    PDFParser parser = new PDFParser();
    parser.ExtractText("employe-1.pdf", "pdfparser.txt");

    // PdfTextExtractor (PdfSharp based).
    string text2 = PdfTextExtractor.GetText("modele-bulletin-de-salaire.pdf");
    using (System.IO.StreamWriter file2 = new System.IO.StreamWriter("PdfTextExtractor.txt"))
    {
        file2.Write(text2);
    }
}
/// <summary>
/// Recursively imports the .pdf and .txt files below <paramref name="srcPath"/> into the
/// article table and writes a text sidecar for every imported PDF.
/// </summary>
/// <param name="srcPath">Directory to scan; subdirectories are processed recursively.</param>
/// <remarks>
/// A file is imported only when no article row with the same name (file name without its
/// 4-character extension) exists yet and its full path contains one of the known category
/// folder names delimited by path separators on both sides. The stored paths are relative,
/// starting at that category folder. Any exception aborts the scan of the current directory
/// and is shown in a message box.
/// </remarks>
private void InputDir(string srcPath)
{
    // Length of ".pdf" / ".txt", the only extensions accepted below.
    const int extensionLength = 4;

    // Category folder names that anchor the relative path. Built once per call instead of
    // once per file; a plain ordinal search is equivalent to the former literal regexes.
    string[] categoryFolders = { "科学研究", "社会科学", "文学艺术", "应用科学", "哲学", "自然科学" };

    try
    {
        DirectoryInfo dir = new DirectoryInfo(srcPath);

        // All files and subdirectories of the directory.
        foreach (FileSystemInfo entry in dir.GetFileSystemInfos())
        {
            if (entry is DirectoryInfo)
            {
                InputDir(entry.FullName);
                continue;
            }

            bool isPdf = entry.Extension == ".pdf" || entry.Extension == ".PDF";
            bool isTxt = entry.Extension == ".txt" || entry.Extension == ".TXT";
            if (!isPdf && !isTxt)
            {
                continue;
            }

            string fullName = entry.FullName;
            string paperName = entry.Name.Substring(0, entry.Name.Length - extensionLength);

            // Skip files that already have a record in the article table.
            var existing = (from wz in wz_dt where wz.文章名 == paperName select wz).ToList();
            if (existing.Count != 0)
            {
                continue;
            }

            foreach (string category in categoryFolders)
            {
                int index = fullName.IndexOf(category, StringComparison.Ordinal);
                if (index < 0)
                {
                    continue;
                }

                // The category name must be a whole path segment. The bounds check keeps
                // a match at either end of the path from throwing and aborting the scan.
                int afterIndex = index + category.Length;
                if (index == 0 || afterIndex >= fullName.Length)
                {
                    continue;
                }
                char before = fullName[index - 1];
                char after = fullName[afterIndex];
                bool isWholeSegment = (before == '/' || before == '\\') && (after == '/' || after == '\\');
                if (!isWholeSegment)
                {
                    continue;
                }

                // Create the article record with paths relative to the category folder.
                DataRow newRow = wz_dt.NewRow();
                newRow["文章名"] = paperName;
                string relativePath = fullName.Substring(index);
                if (isPdf)
                {
                    newRow["文件"] = ".\\" + relativePath;
                }
                newRow["text文件"] = ".\\" + relativePath.Substring(0, relativePath.Length - extensionLength) + ".txt";
                wz_dt.Rows.Add(newRow);
                wz_ta.Update(wz_dt);

                // Write the PDF content to a .txt file next to the PDF.
                if (isPdf)
                {
                    string textPath = fullName.Substring(0, fullName.Length - extensionLength) + ".txt";
                    WritePdfTextFile(fullName, textPath);
                }

                // One record per file: without this, a path containing two category
                // names tried to add the same row twice, which throws.
                break;
            }
        }
    }
    catch (Exception e)
    {
        MessageBox.Show(e.Message);
    }
}

/// <summary>
/// Extracts the text of <paramref name="pdfPath"/> and appends it to <paramref name="textPath"/>.
/// Extraction errors are appended to mylog.log in the application base directory; whatever
/// text was extracted before the error is still written.
/// </summary>
private static void WritePdfTextFile(string pdfPath, string textPath)
{
    System.Text.StringBuilder text = new System.Text.StringBuilder();
    try
    {
        using (PdfReader pdfReader = new PdfReader(pdfPath))
        {
            int numberOfPages = pdfReader.NumberOfPages;
            for (int page = 1; page <= numberOfPages; ++page)
            {
                iTextSharp.text.pdf.parser.ITextExtractionStrategy strategy =
                    new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
                text.Append(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
            }
        }
    }
    catch (Exception ex)
    {
        using (StreamWriter wlog = File.AppendText(System.AppDomain.CurrentDomain.SetupInformation.ApplicationBase + "\\mylog.log"))
        {
            // The failing file's path is now part of the entry; it used to be missing.
            wlog.WriteLine("出错文件:" + pdfPath + " 原因:" + ex.ToString());
        }
    }

    // Append mode, as before.
    using (StreamWriter fileWriter = new StreamWriter(textPath, true))
    {
        fileWriter.Write(text.ToString());
    }
}
/// <summary>
/// Merges the body files of a brief into one PDF at <paramref name="src"/>, inserting blank
/// pages around divider pages and padding so the result suits the requested binding.
/// </summary>
/// <param name="srcFiles">
/// Candidate files. Covers, inside covers, sideways pages, foldouts, z-folds and unrecognized
/// files are excluded; the rest are merged in ascending <c>Rank</c> order.
/// </param>
/// <param name="src">Path of the combined PDF; the file is created or overwritten.</param>
/// <param name="bind">
/// For <c>SaddleStitch</c> the last file is padded until the total page count is divisible
/// by 4. For <c>PerfectBind</c>, when divider pages were found, a second pass inserts blanks
/// in the appendix so text after a run of blank pages starts on an odd-numbered page.
/// </param>
/// <returns>
/// <c>true</c> on success; <c>false</c> when no file remains after filtering or when any
/// exception occurred (written to the debug output).
/// </returns>
/// <remarks>
/// A page counts as a divider when it has no non-whitespace text, or when it has fewer than
/// 50 characters and mentions "APPENDIX"/"Appendix".
/// </remarks>
public static bool CombineBriefPages_AddingBlanks(
    List<CockleFilePdf> srcFiles, string src, TypeOfBindEnum bind)
{
    // Work list without cover material and special formats, ordered by rank.
    List<CockleFilePdf> files = new List<CockleFilePdf>(
        srcFiles
            .Where(f => f.FileType != SourceFileTypeEnum.Cover)
            .Where(f => f.FileType != SourceFileTypeEnum.InsideCv)
            .Where(f => f.FileType != SourceFileTypeEnum.SidewaysPage)
            .Where(f => f.FileType != SourceFileTypeEnum.Brief_Foldout)
            .Where(f => f.FileType != SourceFileTypeEnum.Brief_ZFold)
            .Where(f => f.FileType != SourceFileTypeEnum.App_Foldout)
            .Where(f => f.FileType != SourceFileTypeEnum.App_ZFold)
            .Where(f => f.FileType != SourceFileTypeEnum.Unrecognized)
            .OrderBy(f => f.Rank));

    if (files.Count < 1)
    {
        return false;
    }

    int pageCount = 0;
    bool hasDividers = false;
    bool firstAppFileFound = false;
    int firstPageOfApp = -1;

    // Every reader opened during the merge; closed once the merged file is complete.
    List<iTextSharp.text.pdf.PdfReader> openReaders = new List<iTextSharp.text.pdf.PdfReader>();

    try
    {
        try
        {
            using (var stream = new System.IO.FileStream(src, System.IO.FileMode.Create))
            {
                iTextSharp.text.Document pdfdoc = new iTextSharp.text.Document(iTextSharp.text.PageSize.LETTER);
                iTextSharp.text.pdf.PdfCopy pdfcopy = new iTextSharp.text.pdf.PdfCopy(pdfdoc, stream);
                pdfdoc.Open();

                for (int i = 0; i < files.Count; i++)
                {
                    CockleFilePdf f = files[i];

                    iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader(f.FullName);
                    openReaders.Add(reader);

                    // The stamper is used only for InsertPage, which edits the reader's
                    // page tree in memory. It is deliberately never closed: closing it
                    // would write a second document into the shared output stream.
                    iTextSharp.text.pdf.PdfStamper stamper = new iTextSharp.text.pdf.PdfStamper(reader, stream);

                    // Look for divider pages in this file.
                    List<int> divider_pages = new List<int>();
                    iTextSharp.text.pdf.parser.PdfReaderContentParser parser =
                        new iTextSharp.text.pdf.parser.PdfReaderContentParser(reader);
                    for (int j = 1; j <= reader.NumberOfPages; j++)
                    {
                        iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy extract =
                            new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
                        string textFromPage = parser.ProcessContent(j, extract).GetResultantText();

                        bool isBlank = !System.Text.RegularExpressions.Regex.IsMatch(textFromPage, @"\S");
                        bool isShortAppendixTitle = textFromPage.Length < 50
                            && System.Text.RegularExpressions.Regex.IsMatch(textFromPage, @"A(PPENDIX|ppendix)");
                        if (isBlank || isShortAppendixTitle)
                        {
                            divider_pages.Add(j);
                        }
                    }

                    if (divider_pages.Count > 0)
                    {
                        hasDividers = true;
                        int k = 0; // pages inserted so far; shifts the later divider positions
                        foreach (int page in divider_pages)
                        {
                            stamper.InsertPage(page + k, reader.GetPageSizeWithRotation(1));
                            k++;
                        }
                    }

                    // Add a blank page if needed to make an even number.
                    if (f.FileType == SourceFileTypeEnum.Index
                        || f.FileType == SourceFileTypeEnum.Brief
                        || f.FileType == SourceFileTypeEnum.App_Index
                        || f.FileType == SourceFileTypeEnum.Motion
                        || f.FileType == SourceFileTypeEnum.Divider_Page)
                    {
                        f.AssignNeedsBlankPage(files, reader.NumberOfPages);
                        if (f.NeedsBlankPage)
                        {
                            stamper.InsertPage(reader.NumberOfPages + 1, reader.GetPageSizeWithRotation(1));
                        }
                    }

                    // With the last document, pad the total to a multiple of 4.
                    if (bind == TypeOfBindEnum.SaddleStitch && i == files.Count - 1)
                    {
                        while ((pageCount + reader.NumberOfPages) % 4 != 0)
                        {
                            stamper.InsertPage(reader.NumberOfPages + 1, reader.GetPageSizeWithRotation(1));
                        }
                    }

                    // Remember where the first appendix file starts in the merged document.
                    if (!firstAppFileFound && f.FileType == SourceFileTypeEnum.App_File)
                    {
                        firstAppFileFound = true;
                        firstPageOfApp = pageCount + 1;
                    }

                    // Add the (possibly padded) document to the output.
                    iTextSharp.text.pdf.PdfReader paddedCopy = new iTextSharp.text.pdf.PdfReader(reader);
                    openReaders.Add(paddedCopy);
                    pdfcopy.AddDocument(paddedCopy);
                    pageCount += reader.NumberOfPages;
                }

                pdfcopy.Close();
                pdfdoc.CloseDocument();
            }
        }
        finally
        {
            // The source readers were previously never closed.
            foreach (iTextSharp.text.pdf.PdfReader opened in openReaders)
            {
                opened.Close();
            }
        }

        // Final pass over the appendix so text after dividers starts on an odd page.
        // Skipped when no appendix file was merged: firstPageOfApp is then still -1, and
        // scanning from page -1 used to throw and turn a finished merge into a 'false' result.
        if (bind == TypeOfBindEnum.PerfectBind && hasDividers && firstPageOfApp > 0)
        {
            string dest = (System.IO.Path.GetDirectoryName(src) + @"\temp " + DateTime.Now.ToString("ddMMyyyyhhmmssffff"));
            try
            {
                using (var stream = new System.IO.FileStream(dest, System.IO.FileMode.Create))
                {
                    iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader(src);
                    try
                    {
                        iTextSharp.text.pdf.PdfStamper stamper = new iTextSharp.text.pdf.PdfStamper(reader, stream);
                        iTextSharp.text.pdf.parser.PdfReaderContentParser parser =
                            new iTextSharp.text.pdf.parser.PdfReaderContentParser(reader);

                        // Current run of consecutive blank pages.
                        List<int> group_list = new List<int>();

                        // NumberOfPages grows as pages are inserted, so it is re-read each iteration.
                        for (int x = firstPageOfApp; x <= reader.NumberOfPages; x++)
                        {
                            iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy extract =
                                new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
                            string textFromPage = parser.ProcessContent(x, extract).GetResultantText();

                            if (!System.Text.RegularExpressions.Regex.IsMatch(textFromPage, @"\S"))
                            {
                                // Blank page: start a run or extend the current one.
                                if (group_list.Count == 0 || group_list.Contains(x - 1))
                                {
                                    group_list.Add(x);
                                }
                            }
                            else
                            {
                                // First text page after a run: if the run ended on an odd
                                // page, add one more blank so this text lands on an odd page.
                                if (group_list.Count > 0 && group_list.Last() % 2 == 1)
                                {
                                    stamper.InsertPage(group_list.Last() + 1, reader.GetPageSizeWithRotation(1));
                                }
                                group_list.Clear();
                            }
                        }

                        stamper.Close();
                    }
                    finally
                    {
                        reader.Close();
                    }
                }
            }
            catch
            {
                // Do not leave the temporary file behind; 'src' is still the merged file.
                if (System.IO.File.Exists(dest))
                {
                    System.IO.File.Delete(dest);
                }
                throw;
            }

            System.IO.File.Delete(src);
            System.IO.File.Move(dest, src);
        }
    }
    catch (Exception excpt)
    {
        System.Diagnostics.Debug.WriteLine(excpt);
        return false;
    }

    return true;
}
/// <summary>
/// Splits every PDF in a directory into chunks of <paramref name="step"/> pages and names
/// each chunk after the barcode found on its pages.
/// </summary>
/// <param name="pdfPath">Directory that is searched (non-recursively) for *.pdf files.</param>
/// <param name="step">Number of pages per output file; must be at least 2.</param>
/// <param name="outPath">
/// Output directory. When empty (or null) the files are written relative to the current directory.
/// </param>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="step"/> is less than 2.</exception>
/// <exception cref="Exception">
/// No PDF file was found, a document has an odd number of pages, a chunk lacks one of its
/// barcodes, or the barcodes of a chunk do not match.
/// </exception>
/// <remarks>
/// A barcode is the text "18810154" followed by 12 digits. On the second page of a chunk
/// every occurrence is collected; on the other pages only the first one, which also becomes
/// the output file name. An existing output file with the same name is overwritten. The
/// elapsed time is written to the console.
/// </remarks>
public static void SplitPdf(string pdfPath, int step, string outPath = "")
{
    // A chunk needs at least two barcode hits and pages other than the second contribute
    // at most one each, so smaller steps could never succeed.
    if (step < 2)
    {
        throw new ArgumentOutOfRangeException(nameof(step), "Шаг должен быть не меньше 2.");
    }

    const string barcodePattern = @"18810154\d{12,12}";

    var sw = Stopwatch.StartNew();
    string path = string.IsNullOrEmpty(outPath) ? "" : outPath + "\\";

    // Materialised once: the lazy enumeration was walked twice and could also pick up
    // files written by this method when the output directory is the input directory.
    var files = Directory.EnumerateFiles(pdfPath, "*.pdf").ToList();
    if (files.Count == 0)
    {
        throw new Exception("По указанному пути PDF файлы не найдены.");
    }

    foreach (var file in files)
    {
        using (var reader = new PdfReader(file))
        {
            if (reader.NumberOfPages % 2 != 0)
            {
                throw new Exception("В документе нечетное количество страниц");
            }

            for (int i = 1; i <= reader.NumberOfPages; i += step)
            {
                // Pass 1: read the barcodes. Nothing is written yet, so a validation
                // failure no longer leaves a half-written "{i}.pdf" behind.
                string barcode = string.Empty;
                var matches = new List<string>();
                for (int j = 0; j < step; j++)
                {
                    var parsedText = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, i + j);
                    if (j == 1)
                    {
                        foreach (Match item in Regex.Matches(parsedText, barcodePattern))
                        {
                            matches.Add(item.Value);
                        }
                    }
                    else
                    {
                        Match pageMatch = Regex.Match(parsedText, barcodePattern);
                        if (pageMatch.Success)
                        {
                            barcode = pageMatch.Value;
                            matches.Add(pageMatch.Value);
                        }
                    }
                }

                // An empty barcode means no page other than the second carried one;
                // that used to produce a file literally named ".pdf".
                if (matches.Count < 2 || barcode.Length == 0)
                {
                    throw new Exception($"В {Path.GetFileName(file)} около страницы {i} не найден один из штрихкодов.");
                }
                if (!matches.GetRange(1, matches.Count - 1).Contains(matches.First()))
                {
                    throw new Exception($"Файл: {Path.GetFileName(file)} Штрихкоды не совпадают около страницы {i}.");
                }

                // Pass 2: copy the pages straight into the barcode-named file.
                // FileMode.Create replaces an existing file, matching the former delete-then-move.
                using (var fs = new FileStream($"{path}{barcode}.pdf", FileMode.Create))
                {
                    var doc = new Document();
                    var copy = new PdfCopy(doc, fs);
                    doc.Open();
                    for (int j = 0; j < step; j++)
                    {
                        copy.AddPage(copy.GetImportedPage(reader, i + j));
                    }
                    doc.Close();
                }
            }
        }
    }

    sw.Stop();
    Console.WriteLine($"Elapsed: {sw.ElapsedMilliseconds} ms.");
}