/// <summary>Extracts all images (of types that iTextSharp knows how to decode) from a PDF file.</summary>
/// <param name="filename">Path of the PDF file to read.</param>
/// <returns>
/// A dictionary whose keys are suggested file names (PDF file name without extension,
/// four-digit page number, four-digit image index within the page, followed by the
/// string the listener stored for the image) and whose values are the decoded images.
/// </returns>
public static Dictionary<string, System.Drawing.Image> ExtractImages(string filename)
{
    var result = new Dictionary<string, System.Drawing.Image>();

    using (var pdf = new PdfReader(filename))
    {
        var contentParser = new PdfReaderContentParser(pdf);

        for (var pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
        {
            // A fresh listener per page, so only this page's images are reported.
            var pageListener = new ImageRenderListener();
            contentParser.ProcessContent(pageNumber, pageListener);

            if (pageListener.Images.Count <= 0)
            {
                continue;
            }

            Console.WriteLine("Found {0} images on page {1}.", pageListener.Images.Count, pageNumber);

            // Image numbering restarts at 1 on every page.
            var imageIndex = 1;
            foreach (var entry in pageListener.Images)
            {
                var suggestedName = string.Format(
                    "{0}_Page_{1}_Image_{2}{3}",
                    System.IO.Path.GetFileNameWithoutExtension(filename),
                    pageNumber.ToString("D4"),
                    imageIndex.ToString("D4"),
                    entry.Value);

                result.Add(suggestedName, entry.Key);
                imageIndex++;
            }
        }
    }

    return result;
}
/// <summary>
/// Parses one page of a PDF file and returns the items collected by a
/// <c>MyRenderListenerSimple</c>.
/// </summary>
/// <param name="reader">a PdfReader</param>
/// <param name="page">the page number of the page that needs to be parsed</param>
/// <param name="header_height">the height of the top margin (not used by this implementation)</param>
/// <returns>the list of items gathered by the render listener</returns>
public List<MyItem> GetContentItems(PdfReader reader, int page, float header_height)
{
    MyRenderListenerSimple listener = new MyRenderListenerSimple();
    new PdfReaderContentParser(reader).ProcessContent(page, listener);
    return listener.Items;
}
/// <summary>
/// Attaches two region-filtered text strategies to one MultiFilteredRenderListener,
/// processes page 1 of "test.pdf" glyph by glyph, and checks the text seen in each region.
/// </summary>
virtual public void TestWithMultiFilteredRenderListener()
{
    PdfReader pdfReader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "test.pdf");
    PdfReaderContentParser contentParser = new PdfReaderContentParser(pdfReader);
    MultiFilteredRenderListener multiListener = new MultiFilteredRenderListener();

    // Both regions share the same vertical band; only the x range differs.
    float bandY1 = 841.9f - 151;
    float bandY2 = 841.9f - 163;

    Rectangle firstRegion = new Rectangle(122, bandY1, 144, bandY2);
    ITextExtractionStrategy firstRegionText = multiListener.AttachRenderListener(
        new LocationTextExtractionStrategy(),
        new RegionTextRenderFilter(firstRegion));

    Rectangle secondRegion = new Rectangle(156, bandY1, 169, bandY2);
    ITextExtractionStrategy secondRegionText = multiListener.AttachRenderListener(
        new LocationTextExtractionStrategy(),
        new RegionTextRenderFilter(secondRegion));

    contentParser.ProcessContent(1, new GlyphRenderListener(multiListener));

    Assert.AreEqual("Your", firstRegionText.GetResultantText());
    Assert.AreEqual("dju", secondRegionText.GetResultantText());
}
/// <summary>
/// Runs every page of a PDF through a <c>PDFImageListener</c> (which is given the
/// extraction directory) and reports how many images were found, optionally
/// merging them with <c>ImageJoiner</c>.
/// </summary>
/// <param name="filename">Path of the PDF file to process.</param>
/// <param name="dirForExtractions">Directory handed to the image listener and the joiner.</param>
/// <param name="divider">Total progress amount spread evenly over the pages.</param>
/// <param name="checkResult">When true, compare the image count with the page count.</param>
/// <param name="joinImages">When true and the counts differ, merge the extracted images.</param>
/// <returns>Page count and image counts before and after merging.</returns>
public static PdfExtractionResult PDF_ExportImage(string filename, string dirForExtractions, int divider, bool checkResult, bool joinImages)
{
    var details = new PdfExtractionResult();
    DataAccess.Instance.g_curProgress = 0;
    evnt_UpdateCurBar();

    var pdfReader = new PdfReader(filename);
    try
    {
        var pdfParser = new PdfReaderContentParser(pdfReader);
        var pdfListener = new PDFImageListener(dirForExtractions);

        int pageCount = pdfReader.NumberOfPages;
        // Floating-point division: with zero pages the step is never used because the loop is skipped.
        double progressPerPage = (double)divider / pageCount;
        details.Pages = pageCount;

        for (int i = 1; i <= pageCount; i++)
        {
            pdfListener.PageIndex = i;
            // iTextSharp reports the page content to the listener.
            pdfParser.ProcessContent(i, pdfListener);
            DataAccess.Instance.g_curProgress += progressPerPage;
            evnt_UpdateCurBar();
        }

        details.ImagesBeforeMerge = pdfListener.ImagesList.Count;
        details.ImagesAfterMerge = details.ImagesBeforeMerge;

        if (checkResult && pageCount != details.ImagesBeforeMerge)
        {
            Dictionary<PageImageIndex, Image> imagesList = pdfListener.ImagesList;
            if (joinImages)
            {
                ImageJoiner joiner = new ImageJoiner();
                imagesList = joiner.Merge(pdfListener.ImagesList, dirForExtractions);
            }
            details.ImagesAfterMerge = imagesList.Count;
            // A remaining mismatch between page count and image count is tolerated;
            // callers can detect it by comparing Pages with ImagesAfterMerge.
        }
    }
    finally
    {
        // Close the reader even when parsing or merging throws.
        pdfReader.Close();
    }

    // Images are not written here; the original notes they are written directly to file because of memory problems.
    return details;
}
/// <summary>
/// Parses one page of a PDF file into a list of TextItem and ImageItem objects.
/// </summary>
/// <param name="reader">a PdfReader</param>
/// <param name="page">the page number of the page that needs to be parsed</param>
/// <param name="header_height">the height of the top margin; subtracted from the top of the page size</param>
/// <returns>a list of TextItem and ImageItem objects</returns>
public List<MyItem> GetContentItems(PdfReader reader, int page, float header_height)
{
    // The listener receives the y coordinate where the header area begins.
    float headerBoundary = reader.GetPageSize(page).Top - header_height;
    MyRenderListener listener = new MyRenderListener(headerBoundary);

    new PdfReaderContentParser(reader).ProcessContent(page, listener);
    return listener.Items;
}
/// <summary>
/// Builds a simple PDF containing "ABCD" on a LETTER page rotated twice and feeds its
/// first page to a CharacterPositionRenderListener.
/// </summary>
public virtual void TestCharacterRenderInfos()
{
    var twiceRotatedLetter = PageSize.LETTER.Rotate().Rotate();
    byte[] pdfBytes = CreateSimplePdf(twiceRotatedLetter, "ABCD");

    PdfReader pdfReader = new PdfReader(pdfBytes);
    CharacterPositionRenderListener listener = new CharacterPositionRenderListener();
    new PdfReaderContentParser(pdfReader).ProcessContent(FIRST_PAGE, listener);
}
/// <summary>Checks whether a specified page of a PDF file contains images.</summary>
/// <param name="filename">Path of the PDF file to inspect.</param>
/// <param name="pageNumber">Number of the page to process.</param>
/// <returns>True if the page contains at least one image; false otherwise.</returns>
public static bool PageContainsImages(string filename, int pageNumber)
{
    using (var pdf = new PdfReader(filename))
    {
        var imageListener = new ImageRenderListener();
        new PdfReaderContentParser(pdf).ProcessContent(pageNumber, imageListener);

        bool hasImages = imageListener.Images.Count > 0;
        return hasImages;
    }
}
/// <summary>
/// Feeds the content stream of one page to a <c>Test_iTextSharp.PdfContentStreamProcessor</c>
/// built around the supplied strategy.
/// </summary>
/// <param name="reader">Reader of the PDF document.</param>
/// <param name="page">Number of the page to process.</param>
/// <param name="strategy">Strategy that receives the render events.</param>
public static void ProcessContentPage(PdfReader reader, int page, Test_iTextSharp.ITextExtractionStrategy strategy)
{
    // The page's resource dictionary is handed to the processor together with the content bytes.
    PdfDictionary pageDic = reader.GetPageN(page);
    PdfDictionary resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);

    Test_iTextSharp.PdfContentStreamProcessor processor = new Test_iTextSharp.PdfContentStreamProcessor(strategy);
    byte[] bytes = ContentByteUtils.GetContentBytesForPage(reader, page);
    processor.ProcessContent(bytes, resourcesDic);
}
/// <summary>
/// Extracts glyph-level text from a fixed rectangle on page 1 of "Sample.pdf"
/// and expects it to be "Your ".
/// </summary>
virtual public void Test2()
{
    PdfReader pdfReader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "Sample.pdf");
    PdfReaderContentParser contentParser = new PdfReaderContentParser(pdfReader);

    // Build the listener chain from the inside out: location strategy -> region filter -> glyph splitter.
    LocationTextExtractionStrategy locationStrategy = new LocationTextExtractionStrategy();
    RegionTextRenderFilter regionFilter = new RegionTextRenderFilter(new Rectangle(111, 855, 136, 867));
    FilteredTextRenderListener filteredListener = new FilteredTextRenderListener(locationStrategy, regionFilter);
    GlyphTextRenderListener glyphListener = new GlyphTextRenderListener(filteredListener);

    String extractedText = contentParser.ProcessContent(1, glyphListener).GetResultantText();

    Assert.AreEqual("Your ", extractedText);
}
// ---------------------------------------------------------------------------
/// <summary>
/// Writes a zip archive to <paramref name="stream"/> containing the PREFACE file and a
/// RESULT entry holding the text of every page, as extracted by PdfTextExtractor.
/// </summary>
/// <param name="stream">Destination stream for the zip archive.</param>
public void Write(Stream stream)
{
    using (ZipFile zip = new ZipFile())
    {
        zip.AddFile(PREFACE, "");

        PdfReader reader = new PdfReader(PREFACE);
        try
        {
            StringBuilder sb = new StringBuilder();
            for (int i = 1; i <= reader.NumberOfPages; i++)
            {
                sb.AppendLine(PdfTextExtractor.GetTextFromPage(reader, i));
            }
            zip.AddEntry(RESULT, sb.ToString());
        }
        finally
        {
            // Close the reader once extraction is done, even if a page fails to parse.
            reader.Close();
        }

        zip.Save(stream);
    }
}
// ---------------------------------------------------------------------------
/// <summary>
/// Writes a zip archive to <paramref name="stream"/> containing the PREFACE file and a
/// RESULT entry holding the text of every page, extracted with a fresh
/// LocationTextExtractionStrategy per page.
/// </summary>
/// <param name="stream">Destination stream for the zip archive.</param>
public void Write(Stream stream)
{
    using (ZipFile zip = new ZipFile())
    {
        zip.AddFile(PREFACE, "");

        PdfReader reader = new PdfReader(PREFACE);
        try
        {
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            StringBuilder sb = new StringBuilder();
            for (int i = 1; i <= reader.NumberOfPages; i++)
            {
                ITextExtractionStrategy strategy = parser.ProcessContent(i, new LocationTextExtractionStrategy());
                sb.AppendLine(strategy.GetResultantText());
            }
            zip.AddEntry(RESULT, sb.ToString());
        }
        finally
        {
            // Close the reader once extraction is done, even if a page fails to parse.
            reader.Close();
        }

        zip.Save(stream);
    }
}
/// <summary>
/// Extracts glyph-level text from a fixed rectangle on page 1 of "test.pdf"
/// and expects it to be "1234\nt5678".
/// </summary>
virtual public void Test1()
{
    PdfReader pdfReader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "test.pdf");
    PdfReaderContentParser contentParser = new PdfReaderContentParser(pdfReader);

    // Region corners; the y values are written as offsets from 842.
    float left = 203;
    float right = 224;
    float firstY = 842 - 44;
    float secondY = 842 - 93;
    Rectangle region = new Rectangle(left, firstY, right, secondY);

    // Build the listener chain from the inside out: location strategy -> region filter -> glyph splitter.
    LocationTextExtractionStrategy locationStrategy = new LocationTextExtractionStrategy();
    RegionTextRenderFilter regionFilter = new RegionTextRenderFilter(region);
    FilteredTextRenderListener filteredListener = new FilteredTextRenderListener(locationStrategy, regionFilter);
    GlyphTextRenderListener glyphListener = new GlyphTextRenderListener(filteredListener);

    String extractedText = contentParser.ProcessContent(1, glyphListener).GetResultantText();

    Assert.AreEqual("1234\nt5678", extractedText);
}
// ===========================================================================
/// <summary>
/// Creates the ImageTypes PDF, stores it in a zip archive, then runs every page through a
/// MyImageRenderListener and adds each collected image to the archive under the name the
/// listener recorded for it.
/// </summary>
/// <param name="stream">Destination stream for the zip archive.</param>
public void Write(Stream stream)
{
    ImageTypes it = new ImageTypes();
    using (ZipFile zip = new ZipFile())
    {
        byte[] pdf = it.CreatePdf();
        zip.AddEntry(Utility.ResultFileName(it.ToString() + ".pdf"), pdf);

        MyImageRenderListener listener = new MyImageRenderListener();
        PdfReader reader = new PdfReader(pdf);
        try
        {
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            for (int i = 1; i <= reader.NumberOfPages; i++)
            {
                parser.ProcessContent(i, listener);
            }
        }
        finally
        {
            // The listener has collected everything it needs; close the reader even on failure.
            reader.Close();
        }

        // ImageNames and MyImages are read as parallel lists: entry i of one belongs to entry i of the other.
        for (int i = 0; i < listener.MyImages.Count; ++i)
        {
            zip.AddEntry(listener.ImageNames[i], listener.MyImages[i]);
        }

        zip.Save(stream);
    }
}
/// <summary>
/// Copies <paramref name="input"/> to <paramref name="output"/> and strokes a thin red
/// rectangle on top of each page for every rectangle reported by the render listener.
/// </summary>
/// <param name="input">Path of the source PDF.</param>
/// <param name="output">Path of the PDF to create (overwritten if it exists).</param>
/// <param name="singleCharacters">
/// When true a MyCharacterRenderListener is used, otherwise a MyRenderListener.
/// </param>
private void ParseAndHighlight(String input, String output, bool singleCharacters)
{
    PdfReader reader = new PdfReader(input);
    try
    {
        // Nested using blocks release stamper and stream in the same order as the
        // original explicit Close() calls, and also when an exception interrupts the work.
        using (FileStream fos = new FileStream(output, FileMode.Create))
        using (PdfStamper stamper = new PdfStamper(reader, fos))
        {
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            // A single listener instance is shared by all pages.
            MyRenderListener myRenderListener = singleCharacters ? new MyCharacterRenderListener() : new MyRenderListener();

            for (int pageNum = 1; pageNum <= reader.NumberOfPages; pageNum++)
            {
                List<Rectangle> rectangles = parser.ProcessContent(pageNum, myRenderListener).GetRectangles();

                PdfContentByte canvas = stamper.GetOverContent(pageNum);
                canvas.SetLineWidth(0.5f);
                canvas.SetColorStroke(BaseColor.RED);
                foreach (Rectangle rectangle in rectangles)
                {
                    canvas.Rectangle(rectangle.Left, rectangle.Bottom, rectangle.Width, rectangle.Height);
                    canvas.Stroke();
                }
            }
        }
    }
    finally
    {
        reader.Close();
    }
}
// ---------------------------------------------------------------------------
/// <summary>
/// Writes a zip archive to <paramref name="stream"/> containing the PREFACE file and a
/// RESULT entry: a stamped copy of PREFACE in which each page has a rectangle stroked
/// around the area reported by a TextMarginFinder.
/// </summary>
/// <param name="stream">Destination stream for the zip archive.</param>
public void Write(Stream stream)
{
    using (ZipFile zip = new ZipFile())
    {
        zip.AddFile(PREFACE, "");

        PdfReader reader = new PdfReader(PREFACE);
        try
        {
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            using (MemoryStream ms = new MemoryStream())
            {
                using (PdfStamper stamper = new PdfStamper(reader, ms))
                {
                    for (int i = 1; i <= reader.NumberOfPages; i++)
                    {
                        // A new finder per page, so the rectangle reflects only that page.
                        TextMarginFinder finder = parser.ProcessContent(i, new TextMarginFinder());
                        PdfContentByte cb = stamper.GetOverContent(i);
                        cb.Rectangle(
                            finder.GetLlx(), finder.GetLly(),
                            finder.GetWidth(), finder.GetHeight()
                        );
                        cb.Stroke();
                    }
                }
                // The stamper must be disposed before the bytes are read so the PDF is complete.
                zip.AddEntry(RESULT, ms.ToArray());
            }
        }
        finally
        {
            reader.Close();
        }

        zip.Save(stream);
    }
}
/// <summary>
/// Loads a file into the editor based on the text after the last '.' in its name:
/// docx through DocxToText, rtf as rich text, pdf through iTextSharp text extraction,
/// and everything else as plain text. Old .doc files are rejected with a message box.
/// Afterwards the "text" and "punkt" menu items are disabled and unchecked.
/// </summary>
/// <param name="fileName">Path of the file to open.</param>
private void openFile(string fileName)
{
    // Same extension rule as before (case-sensitive, last dot-separated token), computed once.
    string[] fileSplit = fileName.Split('.');
    string extension = fileSplit[fileSplit.Length - 1];

    if (extension == "docx")
    {
        DocxToText dtt = new DocxToText(fileName);
        richTextBoxEditor.Text = dtt.ExtractText();
    }
    else if (extension == "doc")
    {
        MessageBox.Show("Tyvärr stödjer inte programmet det gamla wordformatet (.doc). Prova med att spara om det till det nya formatet (.docx), eller som en textfil", "Fel filformat", MessageBoxButtons.OK, MessageBoxIcon.Stop);
    }
    else if (extension == "rtf")
    {
        richTextBoxEditor.LoadFile(fileName, RichTextBoxStreamType.RichText);
    }
    else if (extension == "pdf")
    {
        richTextBoxEditor.Clear();

        PdfReader pdfread = new PdfReader(fileName);
        try
        {
            PdfReaderContentParser pdfparser = new PdfReaderContentParser(pdfread);

            // Collect all pages first and assign the editor text once, instead of
            // re-reading and re-setting the control's text for every page.
            System.Text.StringBuilder allPages = new System.Text.StringBuilder();
            for (int i = 1; i <= pdfread.NumberOfPages; i++)
            {
                ITextExtractionStrategy strategy = pdfparser.ProcessContent(i, new SimpleTextExtractionStrategy());
                allPages.Append(strategy.GetResultantText());
            }
            richTextBoxEditor.Text = allPages.ToString();
        }
        finally
        {
            // Close the reader even if extraction fails.
            pdfread.Close();
        }
    }
    else
    {
        // "txt" and any unknown extension are both loaded as plain text.
        richTextBoxEditor.LoadFile(fileName, RichTextBoxStreamType.PlainText);
    }

    textToolStripMenuItem.Enabled = false;
    punktToolStripMenuItem.Enabled = false;
    textToolStripMenuItem.Checked = false;
    punktToolStripMenuItem.Checked = false;
}
/// <summary>
/// Rebuilds a PDF page by page and stamps the current date/time twice on every page,
/// at positions that depend on <paramref name="tituloModeloCodigo"/>.
/// </summary>
/// <param name="pdf">Stream containing the source PDF.</param>
/// <param name="tituloModeloCodigo">
/// Document model code: 19 and 20 select specific footer positions; any other value
/// leaves both text positions at (0, 0).
/// </param>
/// <returns>A new MemoryStream holding the stamped PDF bytes.</returns>
public static MemoryStream AdicionarDataHoraControleAcesso(Stream pdf, int tituloModeloCodigo)
{
    // Ignore the password of protected PDFs (static flag on PdfReader).
    PdfReader.unethicalreading = true;

    BaseColor corTexto = BaseColor.BLACK;
    PdfReader reader = new PdfReader(pdf);
    try
    {
        using (MemoryStream ms = new MemoryStream())
        {
            Document doc = new Document(reader.GetPageSizeWithRotation(1));
            PdfWriter wrt = PdfWriter.GetInstance(doc, ms);
            wrt.PageEvent = null;
            doc.Open();

            #region PDF pages

            for (int i = 1; i <= reader.NumberOfPages; i++)
            {
                doc.SetPageSize(reader.GetPageSizeWithRotation(i));
                doc.NewPage();

                PdfContentByte cb = wrt.DirectContentUnder;
                cb.SaveState();
                cb.SetColorFill(corTexto);
                cb.BeginText();
                cb.SetFontAndSize(arial16.BaseFont, 5);

                PdfImportedPage page = wrt.GetImportedPage(reader, i);

                float x1, y, x2;
                y = x1 = x2 = 0f;

                switch (tituloModeloCodigo)
                {
                    case 19: // Forest Activity Registration Certificate
                        y = doc.PageSize.Bottom + doc.BottomMargin * 3 - 10f;
                        x1 = doc.PageSize.Width / 4;
                        x2 = (doc.PageSize.Width / 4) * 3;
                        break;

                    case 20: // Chainsaw Possession and Use License
                        y = doc.PageSize.Bottom + doc.BottomMargin * 3 - 15f;
                        x1 = doc.PageSize.Width / 4;
                        x2 = (doc.PageSize.Width / 4) * 3 - 20f;
                        break;

                    default:
                        break;
                }

                // Evaluated per page, as before, so each page carries its own timestamp.
                string texto = DateTime.Now.ToString("dd/M/yyyy H:mm:ss");

                // Each rotation stamps the two footer texts, then places the imported
                // page with the matrix matching that rotation.
                switch (doc.PageSize.Rotation)
                {
                    case 0:
                        cb.ShowTextAligned(Element.ALIGN_LEFT, texto, x1, y, 0); // footer
                        cb.ShowTextAligned(Element.ALIGN_LEFT, texto, x2, y, 0); // footer
                        cb.AddTemplate(page, 1f, 0, 0, 1f, 0, 0);
                        break;

                    case 90:
                        cb.ShowTextAligned(Element.ALIGN_LEFT, texto, x1, y, 0); // footer
                        cb.ShowTextAligned(Element.ALIGN_LEFT, texto, x2, y, 0); // footer
                        cb.AddTemplate(page, 0, -1f, 1f, 0, 0, doc.PageSize.Height);
                        break;

                    case 180:
                        cb.ShowTextAligned(Element.ALIGN_LEFT, texto, x1, y, 0); // footer
                        cb.ShowTextAligned(Element.ALIGN_LEFT, texto, x2, y, 0); // footer
                        cb.AddTemplate(page, -1f, 0, 0, -1f, doc.PageSize.Width, doc.PageSize.Height);
                        break;

                    case 270:
                        cb.ShowTextAligned(Element.ALIGN_LEFT, texto, x1, y, 0); // footer
                        cb.ShowTextAligned(Element.ALIGN_LEFT, texto, x2, y, 0); // footer
                        cb.AddTemplate(page, 0, 1.0F, -1.0F, 0, doc.PageSize.Width, 0);
                        break;
                }

                cb.EndText();
                cb.RestoreState();

                cb.SaveState();
                cb.ResetGrayFill();
                cb.RestoreState();
            }

            #endregion

            doc.Close();

            // Copy the bytes into a fresh stream; the working stream is disposed on leaving the using block.
            return new MemoryStream(ms.ToArray());
        }
    }
    finally
    {
        // Closed only after doc.Close(), because the imported pages are read from this reader.
        reader.Close();
    }
}
/// <summary>
/// Runs every page of a PDF through a <c>PDFImageListener</c> (which is given the
/// extraction directory) and reports how many images were found, optionally
/// merging them with <c>ImageJoiner</c>.
/// </summary>
/// <param name="filename">Path of the PDF file to process.</param>
/// <param name="dirForExtractions">Directory handed to the image listener and the joiner.</param>
/// <param name="divider">Total progress amount spread evenly over the pages.</param>
/// <param name="checkResult">When true, compare the image count with the page count.</param>
/// <param name="joinImages">When true and the counts differ, merge the extracted images.</param>
/// <param name="osx">When true, a trailing "/" is appended to <paramref name="dirForExtractions"/>.</param>
/// <returns>Page count and image counts before and after merging.</returns>
public static PdfExtractionResult PDF_ExportImage(string filename, string dirForExtractions, int divider, bool checkResult, bool joinImages, bool osx)
{
    var details = new PdfExtractionResult();
    DataAccess.Instance.g_curProgress = 0;
    evnt_UpdateCurBar();

    if (osx)
    {
        dirForExtractions = dirForExtractions + "/";
    }

    var pdfReader = new PdfReader(filename);
    try
    {
        var pdfParser = new PdfReaderContentParser(pdfReader);
        var pdfListener = new PDFImageListener(dirForExtractions);

        int pageCount = pdfReader.NumberOfPages;
        // Floating-point division: with zero pages the step is never used because the loop is skipped.
        double progressPerPage = (double)divider / pageCount;
        details.Pages = pageCount;

        for (int i = 1; i <= pageCount; i++)
        {
            pdfListener.PageIndex = i;
            // iTextSharp reports the page content to the listener.
            pdfParser.ProcessContent(i, pdfListener);
            DataAccess.Instance.g_curProgress += progressPerPage;
            evnt_UpdateCurBar();
        }

        details.ImagesBeforeMerge = pdfListener.ImagesList.Count;
        details.ImagesAfterMerge = details.ImagesBeforeMerge;

        if (checkResult && pageCount != details.ImagesBeforeMerge)
        {
            Dictionary<PageImageIndex, Image> imagesList = pdfListener.ImagesList;
            if (joinImages)
            {
                ImageJoiner joiner = new ImageJoiner();
                imagesList = joiner.Merge(pdfListener.ImagesList, dirForExtractions);
            }
            details.ImagesAfterMerge = imagesList.Count;
            // A remaining mismatch between page count and image count is tolerated;
            // callers can detect it by comparing Pages with ImagesAfterMerge.
        }
    }
    finally
    {
        // Close the reader even when parsing or merging throws.
        pdfReader.Close();
    }

    // Images are not written here; the original notes they are written directly to file because of memory problems.
    return details;
}
/// <summary>
/// Builds a new single-page PDF from page 1 of the input: licence/copyright paragraphs are
/// added as right-aligned text and the original page is placed as a template, shifted by the
/// image location reported by MyImageRenderListener.
/// </summary>
/// <param name="inputStream">Stream holding the source PDF.</param>
/// <returns>A MemoryStream positioned at 0 that contains the new PDF.</returns>
/// <exception cref="InvalidOperationException">The page rotation is neither 0 nor 90.</exception>
public MemoryStream AddVARLicenceBlocks(MemoryStream inputStream)
{
    PdfReader reader = new PdfReader((byte[])inputStream.ToArray());
    MemoryStream outputStream = new MemoryStream();

    try
    {
        var rotatedPageSize = reader.GetPageSizeWithRotation(1);

        using (Document document = new Document(rotatedPageSize, 0, 0, 0, 0))
        {
            using (PdfWriter writer = PdfWriter.GetInstance(document, outputStream))
            {
                document.Open();

                PdfImportedPage importedPage = writer.GetImportedPage(reader, 1);

                var pageRotation = reader.GetPageRotation(1);
                var pageHeight = rotatedPageSize.Height;

                var titlefont = FontFactory.GetFont(BaseFont.COURIER, 7, Font.NORMAL);

                // title1 + title2 carry the notice on two lines (rotation 0);
                // title1and2 carries the same notice on one line (rotation 90).
                var title1 = new Paragraph(20, "Reproduced from Admiralty digital Notices to Mariners by permission of the Controller of Her Majesty’s Stationery", titlefont);
                var title2 = new Paragraph("Office and the UK Hydrographic Office", titlefont);
                var title1and2 = new Paragraph(20, "Reproduced from Admiralty digital Notices to Mariners by permission of the Controller of Her Majesty’s Stationery Office and the UK Hydrographic Office", titlefont);
                var title3 = new Paragraph("HO " + UKHOVARLicenceNumber + " © British Crown Copyright " + UKHOCopyRightYear, titlefont);

                title1.Alignment = Element.ALIGN_RIGHT;
                title2.Alignment = Element.ALIGN_RIGHT;
                title1and2.Alignment = Element.ALIGN_RIGHT;
                title3.Alignment = Element.ALIGN_RIGHT;

                title1.IndentationRight = 15;
                title2.IndentationRight = 15;
                title1and2.IndentationRight = 15;
                title3.IndentationRight = 15;

                PdfReaderContentParser parser = new PdfReaderContentParser(reader);
                MyImageRenderListener listener = new MyImageRenderListener();
                parser.ProcessContent(1, listener);

                var imgWidth = listener.ImgWidth;
                var imgHeight = listener.ImgHeight;
                // NOTE(review): ctmWidth/ctmHeight are not used below; kept because the getters
                // belong to a project type that is not visible here.
                var ctmWidth = listener.CtmWidth;
                var ctmHeight = listener.CtmHeight;
                var xlocation = listener.Xlocation;
                var ylocation = listener.Ylocation;

                switch (pageRotation)
                {
                    case 0:
                        document.Add(title1);
                        document.Add(title2);
                        document.Add(title3);
                        // Shift the page so the image location lands at (15, 15).
                        writer.DirectContent.AddTemplate(importedPage, 1f, 0, 0, 1f, 15 - xlocation, -ylocation + 15);
                        break;

                    case 90:
                        document.Add(title1and2);
                        document.Add(title3);
                        // The page is on its side, so the listener's y location drives the
                        // horizontal offset and its x location drives the vertical one.
                        //   15 - ylocation: horizontal translation, leaving an offset of 15.
                        //   pageHeight - (pageHeight - xlocation) + 15: vertical translation. A value of
                        //   pageHeight would leave the page where it was; it is lowered by the distance
                        //   (pageHeight - xlocation) and then offset by 15.
                        writer.DirectContent.AddTemplate(importedPage, 0, -1f, 1f, 0, 15 - ylocation, pageHeight - (pageHeight - xlocation) + 15);
                        break;

                    default:
                        throw new InvalidOperationException(string.Format("Unexpected page rotation: [{0}].", pageRotation));
                }

                // Keep outputStream open after the document is closed so it can be rewound and returned.
                writer.CloseStream = false;
                document.Close();

                Console.WriteLine(pageRotation + "\t" + imgWidth + "\t" + imgHeight + "\t" + xlocation + "\t" + ylocation);
            }
        }
    }
    finally
    {
        // Closed after the document, because the imported page is read from this reader.
        reader.Close();
    }

    outputStream.Position = 0;
    return outputStream;
}
/// <summary>
/// Opens a PDF and fills TotalPages, FirstPage, LastPage and, for foldout types, Rotation.
/// Cover types use the physical page range. Combined types record only the page count.
/// All other types read a printed page number from each page's extracted text into Pages
/// (1-based; -2 marks a page without any non-whitespace text).
/// </summary>
/// <param name="src">Path of the PDF file.</param>
/// <param name="type">Kind of source file; selects how page numbers are determined.</param>
public PageRangePdf(string src, SourceFileTypeEnum type)
{
    this.FirstPage = -1;
    this.LastPage = -1;
    this.Rotation = ROTATE_ENUM.NONE;

    if (type == SourceFileTypeEnum.Cover || type == SourceFileTypeEnum.InsideCv)
    {
        using (PdfReader reader = new PdfReader(src))
        {
            this.TotalPages = reader.NumberOfPages;
            if (this.TotalPages == 1)
            {
                this.FirstPage = 1;
                this.LastPage = 1;
            }
            else if (this.TotalPages > 1)
            {
                this.FirstPage = 1;
                this.LastPage = this.TotalPages;
            }
            else
            {
                this.TotalPages = -1;
            }
        }
    }
    else if (type == SourceFileTypeEnum.Combined_Pdf || type == SourceFileTypeEnum.Combined_Pdf_No_FOs)
    {
        using (PdfReader reader = new PdfReader(src))
        {
            TotalPages = reader.NumberOfPages;
            FirstPage = null;
            LastPage = null;
        }
    }
    else
    {
        bool isFoldout = type == SourceFileTypeEnum.App_Foldout
            || type == SourceFileTypeEnum.App_ZFold
            || type == SourceFileTypeEnum.Brief_Foldout
            || type == SourceFileTypeEnum.Brief_ZFold;

        using (PdfReader reader = new PdfReader(src))
        {
            this.TotalPages = reader.NumberOfPages;
            // Index 0 is unused so that Pages[i] lines up with the 1-based page number.
            this.Pages = new int[this.TotalPages + 1];
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);

            for (int i = 1; i <= reader.NumberOfPages; i++)
            {
                SimpleTextExtractionStrategy extract = new SimpleTextExtractionStrategy();
                var extractedText = parser.ProcessContent(i, extract);
                string textFromPage = extractedText.GetResultantText();

                // here, check for blank page: means it's a divider page
                if (System.Text.RegularExpressions.Regex.Matches(textFromPage, @"\S").Count == 0)
                {
                    this.Pages[i] = -2; // -2 indicates blank page
                }
                else
                {
                    int posNewLine = textFromPage.IndexOf('\n');
                    string strPageNum = "";
                    string firstLine = "";
                    int j = 0;

                    // Try each detection strategy in turn until one yields a number.
                    while (strPageNum.Equals("") && Pages[i] == 0)
                    {
                        // test for classic page number
                        if (j == 0)
                        {
                            firstLine = FirstLineOf(textFromPage, posNewLine);
                            strPageNum = new String(firstLine.Where(Char.IsDigit).ToArray());
                        }
                        // test for roman numeral
                        else if (j == 1)
                        {
                            firstLine = FirstLineOf(textFromPage, posNewLine);
                            char[] removeNewlineAndSpace = firstLine.Replace(" ", "").Replace("\n", "").ToArray();
                            int n = Roman_Parse(removeNewlineAndSpace);
                            if (n != 0)
                            {
                                Pages[i] = n;
                            }
                        }
                        // search for App. on page
                        else if (j == 2)
                        {
                            var matches = System.Text.RegularExpressions.Regex.Matches(textFromPage, @"(App.?)( *)(\d+)");
                            if (matches.Count > 0)
                            {
                                strPageNum = matches[0].Groups[3].Value;
                            }
                        }
                        // test alternative foldout numbering style
                        else if (j == 3)
                        {
                            var matches = System.Text.RegularExpressions.Regex.Matches(textFromPage, @"(\d+)([Aa])");
                            if (matches.Count > 0)
                            {
                                strPageNum = matches[0].Groups[1].Value;
                            }
                        }
                        // foldouts only: take the first run of digits anywhere in the text
                        else if (j == 4)
                        {
                            if (isFoldout)
                            {
                                while (!textFromPage.Equals("") && !Char.IsDigit(textFromPage[0]))
                                {
                                    textFromPage = textFromPage.Substring(1, textFromPage.Length - 1);
                                }
                                string digits = String.Empty;
                                int k = 0;
                                // Bounds check added: the text may end while still inside the digit run.
                                while (k < textFromPage.Length && Char.IsDigit(textFromPage[k]))
                                {
                                    digits = digits + textFromPage[k++];
                                }
                                strPageNum = new String(digits.Where(Char.IsDigit).ToArray());
                            }
                        }
                        else
                        {
                            break;
                        }
                        j++;
                    } // end while

                    if (Pages[i] == 0)
                    {
                        int intPageNum;
                        if (int.TryParse(strPageNum, out intPageNum))
                        {
                            Pages[i] = intPageNum;
                        }
                    } // end parse number

                    // GET LOCATION OF FIRST LINE FOR FOLDOUTS
                    // THIS WILL GIVE US ROTATION THAT IS NEEDED TO GET PAGE NUMBER ON TOP
                    if (isFoldout)
                    {
                        // attempt to get location, if foldout, of number found
                        MyLocationTextExtractionStrategy extract_loc = new MyLocationTextExtractionStrategy();
                        var extractedText_loc = parser.ProcessContent(i, extract_loc);
                        string textFromPage_loc = extractedText_loc.GetResultantText();
                        // NOTE(review): this feeds page 1 (hard-coded) into the same strategy and the
                        // result is unused — confirm whether page i was intended or the call can go.
                        var ex = PdfTextExtractor.GetTextFromPage(reader, 1, extract_loc);

                        float llx = float.NaN;
                        float urx = float.NaN;
                        float ury = float.NaN;
                        float lly = float.NaN;

                        // The last chunk whose text contains the page number wins.
                        foreach (var p in extract_loc.myPoints)
                        {
                            var a = p.Text;
                            if (this.Pages[i] > 0 && a.Contains(this.Pages[i].ToString()))
                            {
                                llx = p.Rect.Left;
                                lly = p.Rect.Bottom;
                                ury = p.Rect.Top;
                                urx = p.Rect.Right;
                            }
                        }

                        // get page dimensions
                        var page_size = reader.GetPageSize(i);
                        var page_width = page_size.Width;
                        var page_height = page_size.Height;

                        // find which side
                        if (page_height > page_width)
                        {
                            float mid_point = page_width / 2;
                            // If no chunk matched, the coordinates stay NaN, every comparison is false
                            // and the rotation falls through to NONE.
                            if (llx < mid_point && lly < mid_point && urx < mid_point && ury < mid_point)
                            {
                                this.Rotation = ROTATE_ENUM.CLOCKWISE;
                            }
                            else if (llx > mid_point && lly > mid_point && urx > mid_point && ury > mid_point)
                            {
                                this.Rotation = ROTATE_ENUM.COUNTERCLOCKWISE;
                            }
                            else
                            {
                                // do nothing
                                this.Rotation = ROTATE_ENUM.NONE;
                            }
                        }
                    }
                } // end else
            } // end for loop for reader

            // CAPTURE FIRST AND LAST PAGE NUMBER
            // capture first page number (base 1)
            this.FirstPage = Pages[1];

            // capture first page for files with first page number blank
            if (this.Pages[1] == 0)
            {
                // check second page
                if (this.TotalPages > 1 && this.Pages[2] > 0)
                {
                    this.Pages[1] = this.Pages[2] - 1;
                }
            }

            // skip actual first page, if a divider page
            if (this.Pages[1] == -2 && this.TotalPages > 1 && this.Pages[2] > 0)
            {
                this.FirstPage = this.Pages[2];
            }
            else
            {
                this.FirstPage = this.Pages[1];
            }

            this.LastPage = Pages[TotalPages];
        } // end using statement
    }
}

// Returns the text before the first newline, or the whole text when it has no newline.
private static string FirstLineOf(string text, int newlineIndex)
{
    return newlineIndex >= 0 ? text.Substring(0, newlineIndex) : text;
}
/// <summary>
/// Runs every page of a PDF through a MyImageRenderListener (constructed with the output
/// folder text, the status list view and this form) and updates the progress bar after each page.
/// </summary>
/// <param name="inputFileObject">Path of the PDF file, passed as an object; must be a string.</param>
/// <exception cref="ArgumentException"><paramref name="inputFileObject"/> is not a string.</exception>
private void ExtractPDFImages(object inputFileObject)
{
    string inputFile = inputFileObject as string;
    if (inputFile == null)
    {
        // Fail with a clear message instead of passing null on to PdfReader.
        throw new ArgumentException("Expected the input PDF path as a string.", "inputFileObject");
    }

    PdfReader reader = new PdfReader(inputFile);
    try
    {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        MyImageRenderListener listener = new MyImageRenderListener(outputFolderLabel.Text, this.statusListView, this);

        int pageCount = reader.NumberOfPages;
        for (int i = 1; i <= pageCount; i++)
        {
            parser.ProcessContent(i, listener);

            // Percentage of pages done, rounded up; marshalled to the control via Invoke.
            int progress = (int)Math.Ceiling((float)i / (float)pageCount * 100f);
            progressBarBook.Invoke(new SetInt(SetBookProgress), progress);
        }
    }
    finally
    {
        // Close the reader even if a page fails to parse.
        reader.Close();
    }
}
/// <summary>
/// Processes the last page of "inlineImages01.pdf" with a LocationTextExtractionStrategy.
/// There are no assertions; the test only requires that processing completes.
/// </summary>
public void TestInlineImageWithUnsupportedDecodeFilter()
{
    PdfReader pdfReader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "inlineImages01.pdf");
    int lastPage = pdfReader.NumberOfPages;

    PdfReaderContentParser contentParser = new PdfReaderContentParser(pdfReader);
    contentParser.ProcessContent(lastPage, new LocationTextExtractionStrategy());
}
/// <summary>
/// Reads the text of a document page by page into <c>doc.DocBodyDic</c> (page number to text).
/// Paths containing "pdf" are read with iTextSharp; otherwise paths containing "doc" are
/// read through Word automation, with up to two attempts.
/// </summary>
/// <param name="pdfFile">Path of the file to read.</param>
/// <param name="doc">Document whose DocBodyDic receives the page texts.</param>
/// <param name="pages">Set to the page count when a PDF is read; untouched otherwise.</param>
/// <returns>True when the file was read; false on failure or for unsupported paths.</returns>
public bool ReadPdf(string pdfFile, ref Documents doc, ref int pages)
{
    bool success = false;
    try
    {
        if (pdfFile.ToLower().Contains("pdf"))
        {
            // Characters stripped from both ends of each page: CR, LF, tab, space, non-breaking space.
            char[] trimChars = { '\r', '\n', '\t', (char)32, (char)160 };

            PdfReader r = new PdfReader(pdfFile);
            try
            {
                pages = r.NumberOfPages;
                PdfReaderContentParser parser = new PdfReaderContentParser(r);

                for (int i = 1; i <= pages; i++)
                {
                    ITextExtractionStrategy st = parser.ProcessContent<SimpleTextExtractionStrategy>(i, new SimpleTextExtractionStrategy());
                    string text = st.GetResultantText().Trim(trimChars);

                    if (string.IsNullOrEmpty(text))
                    {
                        // Fall back to PdfTextExtractor when the simple strategy finds nothing.
                        text = PdfTextExtractor.GetTextFromPage(r, i).Trim(trimChars);
                    }

                    if (!string.IsNullOrEmpty(text))
                    {
                        doc.DocBodyDic.Add(i, text);
                    }
                }
            }
            finally
            {
                // Close the reader even if a page fails to parse.
                r.Close();
            }

            success = true;
        }
        else if (pdfFile.ToLower().Contains("doc"))
        {
            MsWord.Application newApp = null;
            MsWord.Document msdoc = null;
            try
            {
                int retry = 2;
                while (retry > 0)
                {
                    try
                    {
                        // A new Word instance per attempt; the previous one is quit in the finally block.
                        newApp = new MsWord.Application();
                        System.Threading.Thread.Sleep(1000);
                        msdoc = newApp.Documents.Open(pdfFile);
                        System.Threading.Thread.Sleep(1000);

                        object nothing = Missing.Value;
                        MsWord.WdStatistic stat = MsWord.WdStatistic.wdStatisticPages;
                        int num = msdoc.ComputeStatistics(stat, ref nothing);

                        for (int i = 1; i <= num; i++)
                        {
                            // Pages stored by an earlier, partially successful attempt are kept.
                            if (doc.DocBodyDic.ContainsKey(i))
                            {
                                continue;
                            }

                            object objWhat = MsWord.WdGoToItem.wdGoToPage;
                            object objWhich = MsWord.WdGoToDirection.wdGoToAbsolute;
                            object objPage = (object)i;
                            MsWord.Range range1 = msdoc.GoTo(ref objWhat, ref objWhich, ref objPage, ref nothing);
                            MsWord.Range range2 = range1.GoToNext(MsWord.WdGoToItem.wdGoToPage);

                            object objStart = range1.Start;
                            object objEnd = range2.Start;
                            // Same start means there is no next page: read to the end of the document.
                            if (range1.Start == range2.Start)
                            {
                                objEnd = msdoc.Characters.Count;
                            }

                            Console.ForegroundColor = ConsoleColor.Red;
                            Console.WriteLine("DEBUG: Path: {0}, {1}-{2}........", pdfFile, objStart, objEnd);
                            Console.ResetColor();

                            if ((int)objStart <= (int)objEnd)
                            {
                                string innerText = msdoc.Range(ref objStart, ref objEnd).Text;
                                doc.DocBodyDic.Add(i, innerText);
                            }
                        }

                        success = true;
                        break;
                    }
                    catch (Exception ex)
                    {
                        Console.ForegroundColor = ConsoleColor.Red;
                        Console.WriteLine("Retry to read word {0}, Exception: {1}..", pdfFile, ex.ToString());
                        Console.ResetColor();
                        System.Threading.Thread.Sleep(1000);
                        retry--;
                    }
                    finally
                    {
                        if (newApp != null)
                        {
                            newApp.NormalTemplate.Saved = true;
                            if (msdoc != null)
                            {
                                msdoc.Close(false);
                            }
                            newApp.Quit();
                        }

                        // Drop the references so a retry never reuses an application that was already quit.
                        msdoc = null;
                        newApp = null;
                    }
                }
            }
            catch (Exception e)
            {
                // Cleanup failures end the retry loop; report them instead of hiding them.
                Console.WriteLine("Failed to release Word while reading {0}: {1}", pdfFile, e);
            }
        }
    }
    catch (Exception ex)
    {
        // Still best-effort (the method returns false), but the cause is no longer silent.
        Console.WriteLine("Failed to read {0}: {1}", pdfFile, ex);
    }

    return success;
}
/*private void convertJpegToPDFUsingItextSharp(object obj)
 * {
 *     throw new NotImplementedException();
 * }*/

/// <summary>
/// Renders two PDF pages into the <c>tem</c> bitmap. For each page, the first image reported by
/// the render listener is split into a top and a bottom half; each half is rotated by
/// Rotate270FlipNone and drawn into the next 595x842 slot, stacked vertically.
/// <c>tem1</c> receives a copy of the finished bitmap.
/// </summary>
/// <param name="pdfPath">
/// Not used: the reader is opened on <c>pdfName</c>.
/// NOTE(review): confirm whether <c>pdfPath</c> was intended.
/// </param>
/// <param name="start">Number of the first page to render (1-based).</param>
/// <param name="len">
/// Not used: exactly two pages are rendered, which fills the four slots of <c>tem</c>.
/// </param>
private void pdftoimage(string pdfPath, int start, int len)
{
    int width = 595;
    int heigh = 842;
    int offset = 0;

    // Four slots of one page height each: two halves per source page, two pages.
    tem = new Bitmap(width, 4 * heigh);

    PdfReader pdfReader = new PdfReader(pdfName);
    try
    {
        using (Graphics g = Graphics.FromImage(tem))
        {
            PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader);

            for (int i = start; i < start + 2; i++) // page numbers start at 1
            {
                MyImageRenderListener listener = new MyImageRenderListener();
                parser.ProcessContent(i, listener);

                // Only the first image of the page is used; indexing fails if the page has none.
                using (MemoryStream ms = new MemoryStream(listener.Images[0]))
                using (Bitmap a = new Bitmap(ms))
                {
                    // NOTE(review): debug dump to a hard-coded path; kept to preserve behavior.
                    a.Save(@"C:\Users\Administrator\Desktop\桌面老师的文档\1.png");

                    // First half of the double page (top).
                    using (Bitmap b = a.Clone(new Rectangle(0, 0, a.Width, a.Height / 2), System.Drawing.Imaging.PixelFormat.Format24bppRgb))
                    {
                        b.RotateFlip(RotateFlipType.Rotate270FlipNone);
                        g.DrawImage(b, new Rectangle(0, offset, width, heigh));
                    }
                    offset += heigh;

                    // Second half of the double page (bottom).
                    using (Bitmap b = a.Clone(new Rectangle(0, a.Height / 2, a.Width, a.Height / 2), System.Drawing.Imaging.PixelFormat.Format24bppRgb))
                    {
                        b.RotateFlip(RotateFlipType.Rotate270FlipNone);
                        g.DrawImage(b, new Rectangle(0, offset, width, heigh));
                    }
                    offset += heigh;
                }
            }
        }
    }
    finally
    {
        pdfReader.Close();
    }

    tem1 = new Bitmap(tem, tem.Width, tem.Height);

    // The commented-out Acrobat COM / clipboard implementation that followed here was dead code and has been dropped.
}
/// <summary>
/// Extracts all images (of types that iTextSharp knows how to decode)
/// from a specified page of a PDF file.
/// </summary>
/// <param name="filename">Path of the PDF file to read.</param>
/// <param name="pageNumber">Number of the page to scan; passed unchanged to the content parser.</param>
/// <returns>
/// A <see cref="Dictionary{TKey, TValue}"/> whose key is a suggested file name, in the format:
/// PDF filename without extension, page number and image index in the page, followed by the
/// string the listener stored with the image (presumably a file extension; confirm against
/// ImageRenderListener). The value is the extracted image. The dictionary is empty when the
/// listener reports no images for the page.
/// </returns>
public static Dictionary<string, System.Drawing.Image> ExtractImages(string filename, int pageNumber)
{
    Dictionary<string, System.Drawing.Image> images = new Dictionary<string, System.Drawing.Image>();

    // Dispose the reader as soon as the page has been parsed so the PDF file is released.
    using (PdfReader reader = new PdfReader(filename))
    {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        ImageRenderListener listener = new ImageRenderListener();
        parser.ProcessContent(pageNumber, listener);

        if (listener.Images.Count > 0)
        {
            Console.WriteLine("Found {0} images on page {1}.", listener.Images.Count, pageNumber);

            // Image indexes in the suggested file name start at 1.
            int index = 1;
            foreach (KeyValuePair<System.Drawing.Image, string> pair in listener.Images)
            {
                images.Add(string.Format("{0}_Page_{1}_Image_{2}{3}", System.IO.Path.GetFileNameWithoutExtension(filename), pageNumber.ToString("D4"), index.ToString("D4"), pair.Value), pair.Key);
                index++;
            }
        }
    }

    return images;
}
/// <summary>
/// Opens the given PDF and creates the content parser and image render listener
/// that this instance keeps in its <c>reader</c>, <c>parser</c> and <c>listener</c> members.
/// </summary>
/// <param name="fileName">Path handed to the <c>PdfReader</c> constructor.</param>
public void Open(string fileName)
{
    PdfReader openedReader = new PdfReader(fileName);

    reader = openedReader;
    parser = new PdfReaderContentParser(openedReader);
    listener = new MyImageRenderListener();
}
/// <summary>
/// Walks every page of the PDF at <c>PdfPath</c>, classifies the text found in the
/// page-number area of each page, and writes the collected document info to a text file.
/// </summary>
/// <returns>
/// On completion, the document info followed by the elapsed time and the logger's error count.
/// If the export path check fails, an "Invalid export file path" message.
/// If any exception is thrown while processing, that exception's message.
/// </returns>
public String ExportData()
{
    DocInfo docInfo = new DocInfo();

    try
    {
        if (!ExportFilePath.isFilePathOK(".txt"))
        {
            return "Invalid export file path: " + ExportFilePath;
        }

        BeforeProcessing();

        using (var pdfReader = new PdfReader(PdfPath))
        {
            // The parser is only used for the image check on pages that match no pattern.
            var contentParser = new PdfReaderContentParser(pdfReader);

            // Does page 1 carry the "for official use only" banner at the bottom?
            ITextExtractionStrategy bannerArea = MakeRectangle(70, 1, 375, 120);
            string bannerText = ReencodeDefaultAsUtf8(PdfTextExtractor.GetTextFromPage(pdfReader, 1, bannerArea));
            bool hasOfficialUse = bannerText.ToUpper().Contains("FOROFFICIALUSEONLY");

            for (int pageNo = 1; pageNo <= pdfReader.NumberOfPages; pageNo++)
            {
                PageInfo pageInfo = new PageInfo() { PageNum = pageNo };

                float height = pdfReader.GetPageSize(pageNo).Height;
                float width = pdfReader.GetPageSize(pageNo).Width;
                bool heightInRange = height > 785 && height < 802;

                // Pick the page-number rectangle from the page dimensions; other sizes are skipped.
                ITextExtractionStrategy pageNumberArea;
                if (heightInRange && width > 1215 && width < 1230)
                {
                    pageNumberArea = MakeRectangle(450, 1, 450, 70);
                }
                else if (heightInRange && width > 608 && width < 617)
                {
                    pageNumberArea = MakeRectangle(190, 1, 255, 74);
                }
                else
                {
                    myLogger.Log("Page # " + pageNo.ToString() + " not 8.5 x 11 or 11 x 17");
                    continue;
                }

                string pageText = ReencodeDefaultAsUtf8(PdfTextExtractor.GetTextFromPage(pdfReader, pageNo, pageNumberArea));
                if (hasOfficialUse)
                {
                    pageText = OfficialUseRegex.Replace(pageText, "").Trim();
                }

                ITextExtractionStrategy wpIndexArea = MakeRectangle(60, 600, 160, 50);
                string wpIndexText = ReencodeDefaultAsUtf8(PdfTextExtractor.GetTextFromPage(pdfReader, pageNo, wpIndexArea));
                if (wpIndexText.ToUpper().Contains("WORKPACKAGEINDEX"))
                {
                    pageInfo.HasWpIndex = true;
                }

                if (NumDashNumRegex.IsMatch(pageText))
                {
                    // #-#
                    pageInfo.PageNumText = NumDashNumRegex.Match(pageText).Value.Trim();
                    pageInfo.IsWP = true;
                }
                else if (NumDashNumBlankRegex.IsMatch(pageText))
                {
                    // #-#/blank
                    pageInfo.PageNumText = NumDashNumBlankRegex.Match(pageText).Value.Trim();
                    pageInfo.IsDashBlank = true;
                    pageInfo.IsWP = true;
                }
                else if (romanNumRegex.IsMatch(pageText.ToUpper().Trim()))
                {
                    pageInfo.PageNumText = romanNumRegex.Match(pageText.ToUpper().Trim()).Value.Trim();

                    // "C" and "D" are recorded as lower-case letter pages rather than roman numerals.
                    bool isLetterPage = String.Equals(pageInfo.PageNumText.ToUpper(), "C")
                        || String.Equals(pageInfo.PageNumText.ToUpper(), "D");
                    if (isLetterPage)
                    {
                        pageInfo.PageNumText = pageInfo.PageNumText.ToLower();
                        pageInfo.IsLetter = true;
                    }
                    else
                    {
                        pageInfo.IsRoman = true;
                    }
                }
                else if (LetterRegex.IsMatch(pageText.Trim()))
                {
                    pageInfo.PageNumText = LetterRegex.Match(pageText).Value.Trim();
                    pageInfo.IsLetter = true;
                }
                else
                {
                    // No pattern matched: find out whether the whole page is empty (no images, no text).
                    var imageListener = new ImageRenderListener();
                    contentParser.ProcessContent(pageNo, imageListener);

                    ITextExtractionStrategy wholePageArea = MakeRectangle(1, 1, 1000000, 1000000);
                    String wholePageText = ReencodeDefaultAsUtf8(PdfTextExtractor.GetTextFromPage(pdfReader, pageNo, wholePageArea));

                    if ((imageListener.Images.Count <= 0) && String.IsNullOrWhiteSpace(wholePageText))
                    {
                        pageInfo.IsWholePageEmpty = true;
                        pageInfo.IsPageNumAreaBlank = true;
                    }
                    else if (String.IsNullOrWhiteSpace(pageText))
                    {
                        pageInfo.IsPageNumAreaBlank = true;
                    }
                    else if (indexRegex.IsMatch(pageText.Trim()))
                    {
                        pageInfo.PageNumText = indexRegex.Match(pageText).Value.Trim();
                        pageInfo.IsIndex = true;
                    }
                    else
                    {
                        pageInfo.PageNumText = pageText;
                        pageInfo.IsMisc = true;
                    }
                }

                // A cancelled run stops before the current page is recorded.
                if (Bw.CancellationPending)
                {
                    myLogger.Log("Processing cancelled at dwg #: " + pageNo.ToString());
                    break;
                }

                Bw.ReportProgress(Utils.GetPercentage(pageNo, pdfReader.NumberOfPages));
                docInfo.Pages.Add(pageInfo);
            }
        }

        WriteDocInfoToTextFile(docInfo);
    }
    catch (System.Exception se)
    {
        return se.Message;
    }
    finally
    {
        AfterProcessing();
    }

    return String.Concat(docInfo.ToString(), Environment.NewLine, "Processing completed in ", timer.Elapsed.TotalSeconds.PrintTimeFromSeconds(), Environment.NewLine, myLogger.ErrorCount.ToString(), " errors found.");
}

/// <summary>
/// Encodes <paramref name="text"/> with the system default encoding, converts those bytes
/// to UTF-8 and decodes the result back into a string.
/// </summary>
private static string ReencodeDefaultAsUtf8(string text)
{
    byte[] defaultBytes = Encoding.Default.GetBytes(text);
    byte[] utf8Bytes = Encoding.Convert(Encoding.Default, Encoding.UTF8, defaultBytes);
    return Encoding.UTF8.GetString(utf8Bytes);
}
/// <summary>
/// Collects the images reported by an <c>ImageRenderListener</c> for every page of the loaded PDF
/// and raises <c>NewImageRead</c> once for each image added.
/// </summary>
/// <returns>
/// The image/string pairs gathered from all pages, or <c>null</c> when no PDF is loaded
/// (<c>miPDF</c> is null).
/// </returns>
public Dictionary<System.Drawing.Image, string> Extraer_Imagenes()
{
    if (miPDF == null)
    {
        return null;
    }

    Dictionary<System.Drawing.Image, string> listaImagenes = new Dictionary<System.Drawing.Image, string>();
    PdfReaderContentParser chekeadorPDF = new PdfReaderContentParser(miPDF.PdfLeido);
    int totalPaginas = miPDF.PdfLeido.NumberOfPages;

    // The counter must be an int: a byte counter wraps from 255 back to 0 (or overflows in a
    // checked build), so the loop could never finish for documents with 255 or more pages.
    for (int pagina = 1; pagina <= totalPaginas; pagina++)
    {
        ImageRenderListener validadorImagenes = new ImageRenderListener();
        chekeadorPDF.ProcessContent(pagina, validadorImagenes);

        foreach (var imagenSeleccionada in validadorImagenes.Imagenes)
        {
            listaImagenes.Add(imagenSeleccionada.Key, imagenSeleccionada.Value);

            // Raise the "new image read" event; copy the delegate first so that having
            // no subscribers does not cause a NullReferenceException.
            GetImagesPdfEventArgs NewImageEvent = new GetImagesPdfEventArgs(pagina);
            var manejador = NewImageRead;
            if (manejador != null)
            {
                manejador(this, NewImageEvent);
            }
        }
    }

    return listaImagenes;
}
/// <summary>
/// Resets <c>ProgressPercentage</c> to 0 and opens <c>FileName</c>, creating the PDF reader
/// and the content parser bound to it.
/// </summary>
/// <exception cref="UnsupportedFileFormatException">
/// Thrown in place of any exception raised while creating the reader or the parser.
/// </exception>
public override void Open()
{
    ProgressPercentage = 0;

    try
    {
        var pdf = new iTextSharp.text.pdf.PdfReader(FileName);
        PDFReaderObj = pdf;
        PDFParserObj = new PdfReaderContentParser(pdf);
    }
    catch (Exception)
    {
        throw new UnsupportedFileFormatException();
    }
}
/// <summary>
/// Extracts text from a single page using the supplied extraction strategy.
/// </summary>
/// <param name="reader">the reader to extract text from</param>
/// <param name="pageNumber">the page to extract text from</param>
/// <param name="strategy">the strategy to use for extracting text</param>
/// <returns>the extracted text, as reported by the strategy after the page was processed</returns>
/// <exception cref="System.IO.IOException">
/// if any operation fails while reading from the provided PdfReader
/// </exception>
/// <remarks>Since 5.0.2.</remarks>
public static String GetTextFromPage(PdfReader reader, int pageNumber, ITextExtractionStrategy strategy)
{
    PdfReaderContentParser contentParser = new PdfReaderContentParser(reader);
    ITextExtractionStrategy processed = contentParser.ProcessContent(pageNumber, strategy);

    return processed.GetResultantText();
}
/// <summary>
/// Extracts the images of a PDF with a <c>PDFImageListener</c> and appends them to the caller's lists.
/// </summary>
/// <param name="inputFileorFolder">Path of the PDF to open.</param>
/// <param name="outputFolder">Not used by this method.</param>
/// <param name="imageBytes">List that receives the extracted (or merged) image data.</param>
/// <param name="imageNames">List that receives the extracted (or merged) image names.</param>
/// <param name="settings">
/// <c>CheckResult</c> enables the "one image per page" count check; when that check fails,
/// <c>JoinImages</c> selects merging the images instead of failing. <c>Result</c> is set to
/// <c>false</c> when an error occurs.
/// </param>
/// <param name="progress">Callback that receives localized progress messages.</param>
/// <returns>
/// <c>true</c> when extraction completed; <c>false</c> when any exception occurred
/// (the exception is logged through <c>LogHelper</c>).
/// </returns>
public bool Read(string inputFileorFolder, string outputFolder, List<byte[]> imageBytes, List<string> imageNames, ContractParameters settings, ProgressDelegate progress)
{
    PdfReader reader = null;
    PDFImageListener listener = null;

    try
    {
        reader = new PdfReader(inputFileorFolder);
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        listener = new PDFImageListener();

        // One listener collects the images of all pages; it is told which page is being parsed.
        for (int i = 1; i <= reader.NumberOfPages; i++)
        {
            listener.PageIndex = i;
            parser.ProcessContent(i, listener);
        }

        if (settings.CheckResult && reader.NumberOfPages != listener.ImageNames.Count)
        {
            if (settings.JoinImages)
            {
                string msg = CultureManager.Instance.GetLocalization("ByCode", "Convert.ImageCountingKO", "Extracting {0} : {1} images for {2} pages - Try to merge !");
                progress(string.Format(msg, inputFileorFolder, listener.ImageNames.Count, reader.NumberOfPages));

                ImageJoiner cp = new ImageJoiner();
                cp.Merge(listener.ImageBytes, listener.ImageNames);

                msg = CultureManager.Instance.GetLocalization("ByCode", "Convert.ImageMerge", "Merge to {0} new images...");
                progress(string.Format(msg, cp.NewImageNames.Count));

                imageBytes.AddRange(cp.NewImageBytes);
                imageNames.AddRange(cp.NewImageNames);
            }
            else
            {
                string msg = CultureManager.Instance.GetLocalization("ByCode", "Convert.ImageError", "Error extracting {0} : {1} images for {2} pages !!");
                progress(string.Format(msg, inputFileorFolder, listener.ImageNames.Count, reader.NumberOfPages));

                // Handled by the catch block below, which logs it and reports failure.
                throw new Exception("PDF check error");
            }
        }
        else
        {
            string msg = CultureManager.Instance.GetLocalization("ByCode", "Convert.ImageCountingOK", "Extracting {0} images in {1} pages");
            progress(string.Format(msg, listener.ImageNames.Count, reader.NumberOfPages));

            imageBytes.AddRange(listener.ImageBytes);
            imageNames.AddRange(listener.ImageNames);

            msg = CultureManager.Instance.GetLocalization("ByCode", "Convert.ImageExtracted", "{0} images extracted...");
            progress(string.Format(msg, listener.ImageBytes.Count));
        }
    }
    catch (Exception err)
    {
        LogHelper.Manage("PDFImageReader:Read", err);
        settings.Result = false;

        // The listener is still null when the PdfReader constructor itself failed;
        // dereferencing it here would replace the logged error with a NullReferenceException.
        if (listener != null)
        {
            listener.ImageNames.Clear();
            listener.ImageBytes.Clear();
        }

        return false;
    }
    finally
    {
        if (reader != null)
        {
            reader.Close();
        }
    }

    return true;
}
/// <summary>
/// Command-line entry point. Validates the input PDF and the output directory, then splits
/// the PDF into one document per distinct key captured by the key regex: a new document
/// starts on the page where the captured key changes.
/// </summary>
/// <param name="args">Command-line arguments, parsed into <c>Options</c>.</param>
/// <returns>
/// 0 on success; 1 when the input file does not exist; 2 when it is not a .pdf;
/// 3 when the output directory does not exist; 4 when no valid split type was selected.
/// </returns>
static int Main(string[] args)
{
    #region var declaration
    bool verbose = false;
    bool datedFileNames = false;
    bool showRegexMatches = false;
    string inputFilename = "";
    bool inputFileExists;
    string extension;
    bool isPDF = false;
    string outputDirname = "";
    bool outputDirExsists = false;
    uint splitType = 0x0;
    string keySplitRegex = "";
    #endregion

    #region arg intake
    // intake args and load values in to scope
    var result = Parser.Default.ParseArguments<Options>(args);
    result.WithParsed<Options>(o =>
    {
        inputFilename = o.inputFilename;
        outputDirname = o.outputDirname;
        if (o.verbose) { verbose = true; }
        if (o.datedFileNames) { datedFileNames = true; }
        if (o.showRegexMatches) { showRegexMatches = true; }
        if (!String.IsNullOrEmpty(o.keySplitRegex))
        {
            splitType = splitType | 0x1;
            keySplitRegex = o.keySplitRegex;
        }
    });
    #endregion

    #region verify input
    // input file location
    if (verbose) { Console.WriteLine("Input File:\t" + inputFilename); }
    inputFileExists = File.Exists(inputFilename);
    if (verbose) { Console.WriteLine(inputFileExists ? "File exists:\tTrue" : "File exists:\tFalse"); }
    if (!inputFileExists)
    {
        if (verbose) { Console.WriteLine("Input File does not exsist; Exiting with error code 1."); }
#if DEBUG
        Console.ReadKey();
#endif
        return 1;
    }

    // input file format
    extension = System.IO.Path.GetExtension(inputFilename).ToLower();
    if (verbose) { Console.WriteLine("File format:\t" + extension); }
    isPDF = string.Equals(extension, ".pdf");
    if (verbose) { Console.WriteLine(isPDF ? "Correct Format:\tTrue" : "Correct Format:\tFalse"); }
    if (!isPDF)
    {
        if (verbose) { Console.WriteLine("Input File is not a PDF; Exiting with error code 2."); }
#if DEBUG
        Console.ReadKey();
#endif
        return 2;
    }

    // output directory existence
    if (verbose) { Console.WriteLine("Output to:\t" + outputDirname); }
    outputDirExsists = Directory.Exists(outputDirname);
    if (verbose) { Console.WriteLine(outputDirExsists ? "Output valid:\tTrue" : "Output valid:\tFalse"); }
    if (!outputDirExsists)
    {
        if (verbose) { Console.WriteLine("Output dir does not exsist; Exiting with error code 3."); }
#if DEBUG
        Console.ReadKey();
#endif
        return 3;
    }
    #endregion

    // Split
    switch (splitType)
    {
        case 0x1: // key match
            if (verbose) { Console.WriteLine("split type:\tKey"); }
            if (verbose) { Console.WriteLine("Key regex:\t" + keySplitRegex); }

            Regex regex = new Regex(keySplitRegex, RegexOptions.Compiled | RegexOptions.Multiline);

            // Dispose the reader when splitting is done so the input file is released.
            using (PdfReader reader = new PdfReader(inputFilename))
            {
                PdfReaderContentParser parser = new PdfReaderContentParser(reader);
                string regexKeyMatch = "";
                int docPageStart = 1;
                string newDocName = "";

                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    if (showRegexMatches) { Console.WriteLine("Page: " + page); }

                    ITextExtractionStrategy strategy = parser.ProcessContent(page, new SimpleTextExtractionStrategy());
                    int matchCount = 0;

                    // NOTE(review): only the first match on each page is examined; confirm
                    // whether iterating over all matches was intended.
                    Match match = regex.Match(strategy.GetResultantText());
                    if (showRegexMatches) { Console.WriteLine("Match: " + (++matchCount)); }

                    for (int x = 1; x <= 2; x++)
                    {
                        Group group = match.Groups[x];
                        if (showRegexMatches) { Console.WriteLine("Group " + x + " = '" + group + "'"); }

                        CaptureCollection cc = group.Captures;
                        for (int y = 0; y < cc.Count; y++)
                        {
                            Capture capture = cc[y];
                            string captureS = capture.ToString();

                            if (!string.Equals(captureS, regexKeyMatch))
                            {
                                // if not first instance print last doc
                                // NOTE(review): when the first key appears after page 1, newDocName
                                // is still empty here and the target path is the bare output
                                // directory; confirm the intended handling of leading pages.
                                if (page > 1)
                                {
                                    ExtractPages(inputFilename, outputDirname + newDocName, docPageStart, (page - 1));
                                }

                                // reset the count
                                regexKeyMatch = captureS;
                                if (datedFileNames)
                                {
                                    newDocName = DateTime.Now.ToString("yyyyMMdd") + "_" + captureS + ".pdf";
                                }
                                else
                                {
                                    newDocName = captureS + ".pdf";
                                }
                                docPageStart = page;
                                if (verbose) { System.Console.WriteLine("New document at page:\t" + docPageStart); }
                            }

                            if (showRegexMatches)
                            {
                                System.Console.WriteLine("Capture " + y + " = '" + capture + "', Position=" + capture.Index);
                            }
                        }
                    }
                }

                // Documents are only written when the NEXT key shows up, so the document that
                // is still open after the last page has to be written here or it would be lost.
                if (newDocName.Length > 0)
                {
                    ExtractPages(inputFilename, outputDirname + newDocName, docPageStart, reader.NumberOfPages);
                }
            }
            break;

        default:
            if (verbose) { Console.WriteLine("No valid split type selected; Exiting with error code 4."); }
#if DEBUG
            Console.ReadKey();
#endif
            return 4;
    }

#if DEBUG
    Console.ReadKey();
#endif
    return 0;
}
/// <summary>
/// Closes the PDF reader if one is open, then clears the parser and reader references.
/// Any exception raised while closing the reader is ignored.
/// </summary>
public override void Close()
{
    if (PDFReaderObj != null)
    {
        try
        {
            PDFReaderObj.Close();
        }
        catch (Exception)
        {
            // Best-effort close: a failure here is deliberately ignored.
        }
    }

    PDFParserObj = null;
    PDFReaderObj = null;
}