Example #1
0
        /// <summary>
        /// Walks every page of a PDF file with an <c>ImageRenderListener</c> and gathers
        /// the images it reports into a single dictionary.
        /// </summary>
        /// <param name="filename">Path of the PDF file to read.</param>
        /// <returns>
        /// A dictionary keyed by "{file name without extension}_Page_{page}_Image_{n}{suffix}",
        /// where page and n are zero-padded to four digits and the suffix is the value the
        /// listener stored for that image.
        /// </returns>
        public static Dictionary<string, System.Drawing.Image> ExtractImages(string filename)
        {
            var result = new Dictionary<string, System.Drawing.Image>();

            using (var reader = new PdfReader(filename))
            {
                var parser = new PdfReaderContentParser(reader);

                for (var pageNumber = 1; pageNumber <= reader.NumberOfPages; pageNumber++)
                {
                    // A fresh listener per page keeps the per-page image numbering independent.
                    var pageListener = new ImageRenderListener();
                    parser.ProcessContent(pageNumber, pageListener);

                    if (pageListener.Images.Count <= 0)
                    {
                        continue;
                    }

                    Console.WriteLine("Found {0} images on page {1}.", pageListener.Images.Count, pageNumber);

                    var imageNumber = 0;
                    foreach (var pair in pageListener.Images)
                    {
                        imageNumber++;

                        // The listener maps image -> suffix, so the pair is flipped on insertion.
                        var entryName = string.Format(
                            "{0}_Page_{1}_Image_{2}{3}",
                            System.IO.Path.GetFileNameWithoutExtension(filename),
                            pageNumber.ToString("D4"),
                            imageNumber.ToString("D4"),
                            pair.Value);

                        result.Add(entryName, pair.Key);
                    }
                }
            }

            return result;
        }
 /// <summary>
 /// Parses one page of a PDF file with a <c>MyRenderListenerSimple</c> and returns
 /// the items that listener collected.
 /// </summary>
 /// <param name="reader">a PdfReader</param>
 /// <param name="page">the page number of the page that needs to be parsed</param>
 /// <param name="header_height">the height of the top margin (not read by this implementation)</param>
 /// <returns>the <c>Items</c> list of the listener after the page was processed</returns>
 public List<MyItem> GetContentItems(PdfReader reader, int page, float header_height)
 {
     var contentParser = new PdfReaderContentParser(reader);
     var simpleListener = new MyRenderListenerSimple();

     contentParser.ProcessContent(page, simpleListener);

     return simpleListener.Items;
 }
        // Attaches two region-filtered text strategies to a single MultiFilteredRenderListener,
        // processes page 1 of test.pdf once, and checks the text captured for each region.
        virtual public void TestWithMultiFilteredRenderListener() {
            PdfReader pdfReader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "test.pdf");
            PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader);
            MultiFilteredRenderListener multiListener = new MultiFilteredRenderListener();

            // Both regions use the same pair of y coordinates; only the x range differs.
            float firstY = 841.9f - 151;
            float secondY = 841.9f - 163;

            Rectangle firstRegion = new Rectangle(122f, firstY, 144f, secondY);
            ITextExtractionStrategy firstRegionText = multiListener.AttachRenderListener(
                new LocationTextExtractionStrategy(), new RegionTextRenderFilter(firstRegion));

            Rectangle secondRegion = new Rectangle(156f, firstY, 169f, secondY);
            ITextExtractionStrategy secondRegionText = multiListener.AttachRenderListener(
                new LocationTextExtractionStrategy(), new RegionTextRenderFilter(secondRegion));

            parser.ProcessContent(1, new GlyphRenderListener(multiListener));

            Assert.AreEqual("Your", firstRegionText.GetResultantText());
            Assert.AreEqual("dju", secondRegionText.GetResultantText());
        }
Example #4
0
        /// <summary>
        /// Runs every page of a PDF through a <c>PDFImageListener</c> and reports how many
        /// pages and images were seen, optionally merging the images afterwards.
        /// </summary>
        /// <param name="filename">Path of the PDF file to read.</param>
        /// <param name="dirForExtractions">Directory handed to the listener and to the image joiner.</param>
        /// <param name="divider">Total progress amount that is spread evenly over the pages.</param>
        /// <param name="checkResult">When true, the image count is compared with the page count.</param>
        /// <param name="joinImages">When true (and the counts differ), the images are merged with <c>ImageJoiner</c>.</param>
        /// <returns>Page count plus image counts before and after the optional merge.</returns>
        public static PdfExtractionResult PDF_ExportImage(string filename, string dirForExtractions, int divider, bool checkResult, bool joinImages)
        {
            var details = new PdfExtractionResult();

            DataAccess.Instance.g_curProgress = 0;
            evnt_UpdateCurBar();

            // Ask itextsharp to extract image
            var pdfReader = new PdfReader(filename);
            try
            {
                var pdfParser = new PdfReaderContentParser(pdfReader);
                var pdfListener = new PDFImageListener(dirForExtractions);

                int pageCount = pdfReader.NumberOfPages;

                // Floating-point division on purpose: each page advances the bar by a fraction of divider.
                double progressPerPage = (double)divider / pageCount;

                details.Pages = pageCount;

                for (int i = 1; i <= pageCount; i++)
                {
                    pdfListener.PageIndex = i;
                    // itextsharp send response to listener
                    pdfParser.ProcessContent(i, pdfListener);

                    DataAccess.Instance.g_curProgress += progressPerPage;
                    evnt_UpdateCurBar();
                }

                Dictionary<PageImageIndex, Image> imagesList = pdfListener.ImagesList;
                details.ImagesBeforeMerge = imagesList.Count;
                details.ImagesAfterMerge = details.ImagesBeforeMerge;

                if (checkResult && pageCount != details.ImagesBeforeMerge)
                {
                    if (joinImages)
                    {
                        ImageJoiner cp = new ImageJoiner();
                        imagesList = cp.Merge(pdfListener.ImagesList, dirForExtractions);
                    }

                    details.ImagesAfterMerge = imagesList.Count;

                    // A remaining mismatch between page count and image count is tolerated:
                    // the former "delete the directory and throw" handling was disabled.
                }
            }
            finally
            {
                // Release the reader even when parsing or merging throws.
                pdfReader.Close();
            }

            // Images are written to disk directly by the listener (memory constraints),
            // so nothing is written here.
            return details;
        }
 /// <summary>
 /// Parses one page of a PDF file with a <c>MyRenderListener</c> and returns the
 /// TextItem and ImageItem objects it collected.
 /// </summary>
 /// <param name="reader">a PdfReader</param>
 /// <param name="page">the page number of the page that needs to be parsed</param>
 /// <param name="header_height">the height of the top margin; subtracted from the top of the page size</param>
 /// <returns>a list of TextItem and ImageItem objects</returns>
 public List<MyItem> GetContentItems(PdfReader reader, int page, float header_height)
 {
     var contentParser = new PdfReaderContentParser(reader);

     // The listener is constructed with the page's top coordinate minus the header height.
     var listener = new MyRenderListener(reader.GetPageSize(page).Top - header_height);

     contentParser.ProcessContent(page, listener);

     return listener.Items;
 }
        // Builds a one-page PDF containing "ABCD" on a twice-rotated LETTER page and feeds
        // its first page to a CharacterPositionRenderListener.
        public virtual void TestCharacterRenderInfos() {
            byte[] pdfBytes = CreateSimplePdf(PageSize.LETTER.Rotate().Rotate(), "ABCD");
            // (For debugging, pdfBytes can be saved to a file and inspected.)

            PdfReader pdfReader = new PdfReader(pdfBytes);
            CharacterPositionRenderListener characterListener = new CharacterPositionRenderListener();

            new PdfReaderContentParser(pdfReader).ProcessContent(FIRST_PAGE, characterListener);
        }
Example #7
0
 /// <summary>
 /// Tells whether an <c>ImageRenderListener</c> finds any image on the given page of a PDF file.
 /// </summary>
 /// <param name="filename">Path of the PDF file to read.</param>
 /// <param name="pageNumber">Number of the page to inspect.</param>
 /// <returns>True if the page contains at least one image; false otherwise.</returns>
 public static bool PageContainsImages(string filename, int pageNumber)
 {
     using (var pdf = new PdfReader(filename))
     {
         var contentParser = new PdfReaderContentParser(pdf);
         var imageListener = new ImageRenderListener();

         contentParser.ProcessContent(pageNumber, imageListener);

         bool foundAny = imageListener.Images.Count > 0;
         return foundAny;
     }
 }
Example #8
0
        /// <summary>
        /// Feeds the content bytes of one page, together with that page's resources
        /// dictionary, to a <c>Test_iTextSharp.PdfContentStreamProcessor</c> driven by
        /// <paramref name="strategy"/>.
        /// </summary>
        /// <param name="reader">Reader of the PDF that contains the page.</param>
        /// <param name="page">Number of the page to process.</param>
        /// <param name="strategy">Strategy that receives the processor's callbacks.</param>
        public static void ProcessContentPage(PdfReader reader, int page, Test_iTextSharp.ITextExtractionStrategy strategy)
        {
            // The page's /Resources dictionary is handed to the processor as-is.
            PdfDictionary pageDic = reader.GetPageN(page);
            PdfDictionary resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);

            Test_iTextSharp.PdfContentStreamProcessor processor = new Test_iTextSharp.PdfContentStreamProcessor(strategy);
            byte[] bytes = ContentByteUtils.GetContentBytesForPage(reader, page);
            processor.ProcessContent(bytes, resourcesDic);
        }
        // Extracts the text inside a fixed rectangle on page 1 of Sample.pdf, glyph by glyph,
        // and checks it against the expected string.
        virtual public void Test2() {
            PdfReader pdfReader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "Sample.pdf");
            PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader);

            FilteredTextRenderListener regionListener = new FilteredTextRenderListener(
                new LocationTextExtractionStrategy(),
                new RegionTextRenderFilter(new Rectangle(111, 855, 136, 867)));
            GlyphTextRenderListener glyphListener = new GlyphTextRenderListener(regionListener);

            String extractedText = parser.ProcessContent(1, glyphListener).GetResultantText();

            Assert.AreEqual("Your ", extractedText);
        }
Example #10
0
// --------------------------------------------------------------------------- 
    /// <summary>
    /// Writes a zip archive to <paramref name="stream"/> that contains the PREFACE file
    /// and a RESULT entry holding the text of every page of that file, one
    /// <c>PdfTextExtractor.GetTextFromPage</c> result per line.
    /// </summary>
    /// <param name="stream">Destination the zip archive is saved to.</param>
    public void Write(Stream stream) {
      using (ZipFile zip = new ZipFile()) {
        zip.AddFile(PREFACE, "");
        StringBuilder sb = new StringBuilder();
        PdfReader reader = new PdfReader(PREFACE);
        try {
          for (int i = 1; i <= reader.NumberOfPages; i++) {
            sb.AppendLine(PdfTextExtractor.GetTextFromPage(reader, i));
          }
        }
        finally {
          // Release the reader even if text extraction fails on some page.
          reader.Close();
        }
        zip.AddEntry(RESULT, sb.ToString());
        zip.Save(stream);
      }
    }
Example #11
0
// --------------------------------------------------------------------------- 
    /// <summary>
    /// Writes a zip archive to <paramref name="stream"/> that contains the PREFACE file
    /// and a RESULT entry holding the text of every page, extracted with a
    /// <c>LocationTextExtractionStrategy</c> (one page per line).
    /// </summary>
    /// <param name="stream">Destination the zip archive is saved to.</param>
    public void Write(Stream stream) {
      using (ZipFile zip = new ZipFile()) {
        zip.AddFile(PREFACE, "");
        StringBuilder sb = new StringBuilder();
        PdfReader reader = new PdfReader(PREFACE);
        try {
          PdfReaderContentParser parser = new PdfReaderContentParser(reader);
          for (int i = 1; i <= reader.NumberOfPages; i++) {
            // A new strategy per page: ProcessContent hands back the instance it was given.
            ITextExtractionStrategy strategy = parser.ProcessContent(i, new LocationTextExtractionStrategy());
            sb.AppendLine(strategy.GetResultantText());
          }
        }
        finally {
          // Release the reader even if parsing fails on some page.
          reader.Close();
        }
        zip.AddEntry(RESULT, sb.ToString());
        zip.Save(stream);
      }
    }
        // Extracts the text inside a fixed rectangle on page 1 of test.pdf, glyph by glyph,
        // and checks it against the expected two-line string.
        virtual public void Test1() {
            PdfReader pdfReader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "test.pdf");
            PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader);

            // The y coordinates are written as offsets from 842.
            Rectangle region = new Rectangle(203f, 842f - 44f, 224f, 842f - 93f);

            FilteredTextRenderListener regionListener = new FilteredTextRenderListener(
                new LocationTextExtractionStrategy(), new RegionTextRenderFilter(region));
            GlyphTextRenderListener glyphListener = new GlyphTextRenderListener(regionListener);

            String extractedText = parser.ProcessContent(1, glyphListener).GetResultantText();

            Assert.AreEqual("1234\nt5678", extractedText);
        }
Example #13
0
// ===========================================================================
    /// <summary>
    /// Creates the ImageTypes PDF, then writes a zip archive to <paramref name="stream"/>
    /// that contains the PDF itself plus one entry per image collected by a
    /// <c>MyImageRenderListener</c> over all pages.
    /// </summary>
    /// <param name="stream">Destination the zip archive is saved to.</param>
    public void Write(Stream stream) {
      ImageTypes source = new ImageTypes();
      using (ZipFile zip = new ZipFile()) {
        byte[] pdfBytes = source.CreatePdf();
        zip.AddEntry(Utility.ResultFileName(source.ToString() + ".pdf"), pdfBytes);

        PdfReader reader = new PdfReader(pdfBytes);
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);

        // One listener is shared by all pages so it accumulates every image it is told about.
        MyImageRenderListener imageListener = new MyImageRenderListener();
        int pageNumber = 1;
        while (pageNumber <= reader.NumberOfPages) {
          parser.ProcessContent(pageNumber, imageListener);
          pageNumber++;
        }

        // ImageNames and MyImages are read by the same index.
        int imageIndex = 0;
        while (imageIndex < imageListener.MyImages.Count) {
          zip.AddEntry(imageListener.ImageNames[imageIndex], imageListener.MyImages[imageIndex]);
          ++imageIndex;
        }

        zip.Save(stream);
      }
    }
        /// <summary>
        /// Copies <paramref name="input"/> to <paramref name="output"/> and strokes, in red on
        /// top of every page, the rectangles reported by the render listener for that page.
        /// </summary>
        /// <param name="input">Path of the PDF to read.</param>
        /// <param name="output">Path of the PDF to create (overwritten if it exists).</param>
        /// <param name="singleCharacters">
        /// Selects <c>MyCharacterRenderListener</c> when true, <c>MyRenderListener</c> otherwise.
        /// </param>
        private void ParseAndHighlight(String input, String output, bool singleCharacters) {
            PdfReader reader = new PdfReader(input);
            try {
                using (FileStream fos = new FileStream(output, FileMode.Create)) {
                    PdfStamper stamper = new PdfStamper(reader, fos);
                    try {
                        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
                        // The same listener instance is reused for every page, as before.
                        MyRenderListener myRenderListener = singleCharacters ? new MyCharacterRenderListener() : new MyRenderListener();
                        for (int pageNum = 1; pageNum <= reader.NumberOfPages; pageNum++) {
                            List<Rectangle> rectangles = parser.ProcessContent(pageNum, myRenderListener).GetRectangles();
                            PdfContentByte canvas = stamper.GetOverContent(pageNum);
                            canvas.SetLineWidth(0.5f);
                            canvas.SetColorStroke(BaseColor.RED);
                            foreach (Rectangle rectangle in rectangles) {
                                canvas.Rectangle(rectangle.Left, rectangle.Bottom, rectangle.Width, rectangle.Height);
                                canvas.Stroke();
                            }
                        }
                    }
                    finally {
                        // Closed before the stream and the reader, matching the original order.
                        stamper.Close();
                    }
                }
            }
            finally {
                reader.Close();
            }
        }
Example #15
0
// --------------------------------------------------------------------------- 
    /// <summary>
    /// Writes a zip archive to <paramref name="stream"/> that contains the PREFACE file
    /// and a RESULT entry: a stamped copy of that file in which every page carries a
    /// stroked rectangle taken from a <c>TextMarginFinder</c> run on the page.
    /// </summary>
    /// <param name="stream">Destination the zip archive is saved to.</param>
    public void Write(Stream stream) {
      using (ZipFile zip = new ZipFile()) {
        zip.AddFile(PREFACE, "");
        PdfReader reader = new PdfReader(PREFACE);
        try {
          PdfReaderContentParser parser = new PdfReaderContentParser(reader);
          using (MemoryStream ms = new MemoryStream()) {
            using (PdfStamper stamper = new PdfStamper(reader, ms)) {
              for (int i = 1; i <= reader.NumberOfPages; i++) {
                // A new finder per page, so each rectangle reflects that page only.
                TextMarginFinder finder = parser.ProcessContent(i, new TextMarginFinder());
                PdfContentByte cb = stamper.GetOverContent(i);
                cb.Rectangle(
                  finder.GetLlx(), finder.GetLly(),
                  finder.GetWidth(), finder.GetHeight()
                );
                cb.Stroke();
              }
            }
            // The stamper is disposed at this point; ToArray is read afterwards, as before.
            zip.AddEntry(RESULT, ms.ToArray());
          }
        }
        finally {
          // Release the reader once the stamper no longer needs it, even on failure.
          reader.Close();
        }
        zip.Save(stream);
      }
    }
Example #16
0
        /// <summary>
        /// Loads a file into the editor, choosing the loader from the file extension
        /// (compared case-insensitively): docx and pdf are converted to text, rtf is loaded
        /// as rich text, doc is rejected with a message box, everything else is loaded as
        /// plain text. Afterwards the "text" and "punkt" menu items are disabled and unchecked.
        /// </summary>
        /// <param name="fileName">Path of the file to open.</param>
        private void openFile(string fileName)
        {
            // Path.GetExtension looks only at the final path segment, so dots in directory
            // names and dot-less file names are not mistaken for an extension.
            string extension = System.IO.Path.GetExtension(fileName).TrimStart('.').ToLowerInvariant();

            switch (extension)
            {
                case "docx":
                    DocxToText dtt = new DocxToText(fileName);
                    richTextBoxEditor.Text = dtt.ExtractText();
                    break;

                case "doc":
                    MessageBox.Show("Tyvärr stödjer inte programmet det gamla wordformatet (.doc). Prova med att spara om det till det nya formatet (.docx), eller som en textfil", "Fel filformat", MessageBoxButtons.OK, MessageBoxIcon.Stop);
                    break;

                case "rtf":
                    richTextBoxEditor.LoadFile(fileName, RichTextBoxStreamType.RichText);
                    break;

                case "pdf":
                    richTextBoxEditor.Clear();

                    PdfReader pdfread = new PdfReader(fileName);
                    try
                    {
                        PdfReaderContentParser pdfparser = new PdfReaderContentParser(pdfread);
                        System.Text.StringBuilder pdfText = new System.Text.StringBuilder();

                        for (int i = 1; i <= pdfread.NumberOfPages; i++)
                        {
                            ITextExtractionStrategy strategy = pdfparser.ProcessContent(i, new SimpleTextExtractionStrategy());
                            pdfText.Append(strategy.GetResultantText());
                        }

                        // Assign once instead of appending to the control for every page.
                        richTextBoxEditor.Text = pdfText.ToString();
                    }
                    finally
                    {
                        // Release the reader even if a page fails to parse.
                        pdfread.Close();
                    }
                    break;

                default:
                    // "txt" and every unrecognised extension are loaded as plain text.
                    richTextBoxEditor.LoadFile(fileName, RichTextBoxStreamType.PlainText);
                    break;
            }

            textToolStripMenuItem.Enabled = false;
            punktToolStripMenuItem.Enabled = false;
            textToolStripMenuItem.Checked = false;
            punktToolStripMenuItem.Checked = false;
        }
Example #17
0
        /// <summary>
        /// Copies every page of <paramref name="pdf"/> into a new document and writes the
        /// current date/time twice into the under-content of each page, at positions chosen
        /// by <paramref name="tituloModeloCodigo"/>.
        /// </summary>
        /// <param name="pdf">Stream holding the source PDF.</param>
        /// <param name="tituloModeloCodigo">
        /// Title-model code: 19 and 20 select dedicated footer positions; for any other
        /// value the text coordinates stay at (0, 0).
        /// </param>
        /// <returns>A new <see cref="MemoryStream"/> containing the generated PDF.</returns>
        public static MemoryStream AdicionarDataHoraControleAcesso(Stream pdf, int tituloModeloCodigo)
        {
            // Ignore the password of protected PDFs (static switch on PdfReader).
            PdfReader.unethicalreading = true;

            BaseColor    corTexto = BaseColor.BLACK;
            MemoryStream ms       = new MemoryStream();
            PdfReader    reader   = new PdfReader(pdf);

            try
            {
                Document  doc = new Document(reader.GetPageSizeWithRotation(1));
                PdfWriter wrt = PdfWriter.GetInstance(doc, ms);

                wrt.PageEvent = null;

                doc.Open();

                // Pages of the PDF
                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    doc.SetPageSize(reader.GetPageSizeWithRotation(i));
                    doc.NewPage();

                    PdfContentByte cb = wrt.DirectContentUnder;

                    cb.SaveState();

                    cb.SetColorFill(corTexto);
                    cb.BeginText();
                    cb.SetFontAndSize(arial16.BaseFont, 5);

                    PdfImportedPage page = wrt.GetImportedPage(reader, i);

                    float x1 = 0f, x2 = 0f, y = 0f;

                    switch (tituloModeloCodigo)
                    {
                    case 19 /* Forest Activity Registration Certificate */:
                        y  = doc.PageSize.Bottom + doc.BottomMargin * 3 - 10f;
                        x1 = doc.PageSize.Width / 4;
                        x2 = (doc.PageSize.Width / 4) * 3;
                        break;

                    case 20 /* Chainsaw Possession and Use Licence */:
                        y  = doc.PageSize.Bottom + doc.BottomMargin * 3 - 15f;
                        x1 = doc.PageSize.Width / 4;
                        x2 = (doc.PageSize.Width / 4) * 3 - 20f;
                        break;

                    default:
                        break;
                    }

                    string texto = DateTime.Now.ToString("dd/M/yyyy H:mm:ss");

                    // Matrix (a b c d e f) used to place the imported page for each handled
                    // rotation. Any other rotation gets neither text nor page, as before.
                    float a = 0f, b = 0f, c = 0f, d = 0f, e = 0f, f = 0f;
                    bool rotacaoTratada = true;

                    switch (doc.PageSize.Rotation)
                    {
                    case 0:
                        a = 1f;
                        d = 1f;
                        break;

                    case 90:
                        b = -1f;
                        c = 1f;
                        f = doc.PageSize.Height;
                        break;

                    case 180:
                        a = -1f;
                        d = -1f;
                        e = doc.PageSize.Width;
                        f = doc.PageSize.Height;
                        break;

                    case 270:
                        b = 1.0F;
                        c = -1.0F;
                        e = doc.PageSize.Width;
                        break;

                    default:
                        rotacaoTratada = false;
                        break;
                    }

                    if (rotacaoTratada)
                    {
                        cb.ShowTextAligned(Element.ALIGN_LEFT, texto, x1, y, 0);   // footer
                        cb.ShowTextAligned(Element.ALIGN_LEFT, texto, x2, y, 0);   // footer
                        cb.AddTemplate(page, a, b, c, d, e, f);
                    }

                    cb.EndText();
                    cb.RestoreState();

                    // Kept from the original so the emitted content stream stays the same.
                    cb.SaveState();
                    cb.ResetGrayFill();

                    cb.RestoreState();
                }

                doc.Close();
            }
            finally
            {
                // Closed only after doc.Close(), and also when page processing throws.
                reader.Close();
            }

            // Hand back a fresh stream over a copy of the generated bytes.
            MemoryStream msOut = new MemoryStream(ms.ToArray());
            ms.Dispose();

            return msOut;
        }
Example #18
0
        /// <summary>
        /// Runs every page of a PDF through a <c>PDFImageListener</c> and reports how many
        /// pages and images were seen, optionally merging the images afterwards.
        /// </summary>
        /// <param name="filename">Path of the PDF file to read.</param>
        /// <param name="dirForExtractions">Directory handed to the listener and to the image joiner.</param>
        /// <param name="divider">Total progress amount that is spread evenly over the pages.</param>
        /// <param name="checkResult">When true, the image count is compared with the page count.</param>
        /// <param name="joinImages">When true (and the counts differ), the images are merged with <c>ImageJoiner</c>.</param>
        /// <param name="osx">When true, a "/" is appended to <paramref name="dirForExtractions"/> before use.</param>
        /// <returns>Page count plus image counts before and after the optional merge.</returns>
        public static PdfExtractionResult PDF_ExportImage(string filename, string dirForExtractions, int divider, bool checkResult, bool joinImages, bool osx)
        {
            var details = new PdfExtractionResult();

            DataAccess.Instance.g_curProgress = 0;
            evnt_UpdateCurBar();

            if (osx)
            {
                dirForExtractions = dirForExtractions + "/";
            }

            // Ask itextsharp to extract image
            var pdfReader = new PdfReader(filename);
            try
            {
                var pdfParser   = new PdfReaderContentParser(pdfReader);
                var pdfListener = new PDFImageListener(dirForExtractions);

                int pageCount = pdfReader.NumberOfPages;

                // Floating-point division on purpose: each page advances the bar by a fraction of divider.
                double progressPerPage = (double)divider / pageCount;

                details.Pages = pageCount;

                for (int i = 1; i <= pageCount; i++)
                {
                    pdfListener.PageIndex = i;
                    // itextsharp send response to listener
                    pdfParser.ProcessContent(i, pdfListener);

                    DataAccess.Instance.g_curProgress += progressPerPage;
                    evnt_UpdateCurBar();
                }

                Dictionary<PageImageIndex, Image> imagesList = pdfListener.ImagesList;
                details.ImagesBeforeMerge = imagesList.Count;
                details.ImagesAfterMerge  = details.ImagesBeforeMerge;

                if (checkResult && pageCount != details.ImagesBeforeMerge)
                {
                    if (joinImages)
                    {
                        ImageJoiner cp = new ImageJoiner();
                        imagesList = cp.Merge(pdfListener.ImagesList, dirForExtractions);
                    }

                    details.ImagesAfterMerge = imagesList.Count;

                    // A remaining mismatch between page count and image count is tolerated:
                    // the former "delete the directory and throw" handling was disabled.
                }
            }
            finally
            {
                // Release the reader even when parsing or merging throws.
                pdfReader.Close();
            }

            // Images are written to disk directly by the listener (memory constraints),
            // so nothing is written here.
            return(details);
        }
Example #19
0
        /// <summary>
        /// Rebuilds the first page of the given PDF with the licence/copyright paragraphs
        /// added and the original page content re-positioned using the image location
        /// reported by a <c>MyImageRenderListener</c>.
        /// </summary>
        /// <param name="inputStream">Stream holding the source PDF; only page 1 is used.</param>
        /// <returns>A new stream, positioned at 0, containing the generated PDF.</returns>
        /// <exception cref="InvalidOperationException">
        /// The rotation of page 1 is neither 0 nor 90.
        /// </exception>
        public MemoryStream AddVARLicenceBlocks(MemoryStream inputStream)
        {
            PdfReader reader = new PdfReader(inputStream.ToArray());

            MemoryStream outputStream = new MemoryStream();

            try
            {
                using (Document document = new Document(reader.GetPageSizeWithRotation(1), 0, 0, 0, 0))
                {
                    using (PdfWriter writer = PdfWriter.GetInstance(document, outputStream))
                    {
                        document.Open();

                        PdfImportedPage importedPage = writer.GetImportedPage(reader, 1);

                        var pageRotation = reader.GetPageRotation(1);
                        var pageHeight   = reader.GetPageSizeWithRotation(1).Height;
                        var titlefont    = FontFactory.GetFont(BaseFont.COURIER, 7, Font.NORMAL);
                        var title1       = new Paragraph(20, "Reproduced from Admiralty digital Notices to Mariners by permission of the Controller of Her Majesty’s Stationery", titlefont);
                        var title2       = new Paragraph("Office and the UK Hydrographic Office", titlefont);
                        var title1and2   = new Paragraph(20, "Reproduced from Admiralty digital Notices to Mariners by permission of the Controller of Her Majesty’s Stationery Office and the UK Hydrographic Office", titlefont);
                        var title3       = new Paragraph("HO " + UKHOVARLicenceNumber + " © British Crown Copyright " + UKHOCopyRightYear, titlefont);

                        title1.Alignment            = Element.ALIGN_RIGHT;
                        title2.Alignment            = Element.ALIGN_RIGHT;
                        title1and2.Alignment        = Element.ALIGN_RIGHT;
                        title3.Alignment            = Element.ALIGN_RIGHT;
                        title1.IndentationRight     = 15;
                        title2.IndentationRight     = 15;
                        title1and2.IndentationRight = 15;
                        title3.IndentationRight     = 15;

                        // Locate the image on page 1; its position drives the offsets below.
                        PdfReaderContentParser parser   = new PdfReaderContentParser(reader);
                        MyImageRenderListener  listener = new MyImageRenderListener();
                        parser.ProcessContent(1, listener);
                        var imgWidth  = listener.ImgWidth;
                        var imgHeight = listener.ImgHeight;
                        var xlocation = listener.Xlocation;
                        var ylocation = listener.Ylocation;

                        switch (pageRotation)
                        {
                        case 0:
                            // Upright page: the notice is split over three lines.
                            document.Add(title1);
                            document.Add(title2);
                            document.Add(title3);
                            writer.DirectContent.AddTemplate(importedPage, 1f, 0, 0, 1f, 15 - xlocation, -ylocation + 15);
                            break;

                        case 90:
                            document.Add(title1and2);
                            document.Add(title3);
                            // The page is on its side, so the axes are swapped:
                            //   15 - ylocation
                            //       shifts the content along the X axis by the image's y location,
                            //       leaving a margin of 15.
                            //   pageHeight - (pageHeight - xlocation) + 15
                            //       (pageHeight - xlocation) is subtracted from pageHeight to shift
                            //       the content along the Y axis, again with a margin of 15.
                            writer.DirectContent.AddTemplate(importedPage, 0, -1f, 1f, 0, 15 - ylocation, pageHeight - (pageHeight - xlocation) + 15);
                            break;

                        default:
                            throw new InvalidOperationException(string.Format("Unexpected page rotation: [{0}].", pageRotation));
                        }

                        // Keep outputStream open after the document is closed so it can be returned.
                        writer.CloseStream = false;
                        document.Close();
                        Console.WriteLine(pageRotation + "\t" + imgWidth + "\t" + imgHeight + "\t" + xlocation + "\t" + ylocation);
                    }
                }
            }
            finally
            {
                // Closed after the document has been written, and also on failure.
                reader.Close();
            }

            outputStream.Position = 0;
            return(outputStream);
        }
Example #20
0
        public PageRangePdf(string src, SourceFileTypeEnum type)
        {
            this.FirstPage = -1;
            this.LastPage  = -1;
            this.Rotation  = ROTATE_ENUM.NONE;

            if (type == SourceFileTypeEnum.Cover || type == SourceFileTypeEnum.InsideCv)
            {
                using (PdfReader reader = new PdfReader(src))
                {
                    this.TotalPages = reader.NumberOfPages;
                    if (this.TotalPages == 1)
                    {
                        this.FirstPage = 1;
                        this.LastPage  = 1;
                    }
                    else if (this.TotalPages > 1)
                    {
                        this.FirstPage = 1;
                        this.LastPage  = this.TotalPages;
                    }
                    else
                    {
                        this.TotalPages = -1;
                    }
                }
            }
            else if (type == SourceFileTypeEnum.Combined_Pdf || type == SourceFileTypeEnum.Combined_Pdf_No_FOs)
            {
                using (PdfReader reader = new PdfReader(src))
                {
                    TotalPages = reader.NumberOfPages;
                    FirstPage  = null;
                    LastPage   = null;
                }
            }
            else
            {
                using (PdfReader reader = new PdfReader(src))
                {
                    this.TotalPages = reader.NumberOfPages;
                    this.Pages      = new int[this.TotalPages + 1];

                    PdfReaderContentParser parser = new PdfReaderContentParser(reader);
                    for (int i = 1; i <= reader.NumberOfPages; i++)
                    {
                        SimpleTextExtractionStrategy extract = new SimpleTextExtractionStrategy();
                        var    extractedText = parser.ProcessContent(i, extract);
                        string textFromPage  = extractedText.GetResultantText();

                        // here, check for blank page: means it's a divider page
                        if (System.Text.RegularExpressions.Regex.Matches(textFromPage, @"\S").Count == 0)
                        {
                            this.Pages[i] = -2; // -2 indicates blank page
                        }
                        else
                        {
                            int posNewLine = textFromPage.IndexOf('\n');

                            string strPageNum = "";
                            string firstLine  = "";
                            int    j          = 0;
                            while (strPageNum.Equals("") && Pages[i] == 0)
                            {
                                // test for classic page number
                                if (j == 0)
                                {
                                    firstLine  = textFromPage.Substring(0, posNewLine);
                                    strPageNum = new String(firstLine.Where(Char.IsDigit).ToArray());
                                }
                                // test for roman numeral
                                else if (j == 1)
                                {
                                    firstLine = textFromPage.Substring(0, posNewLine);
                                    char[] removeNewlineAndSpace = firstLine.Replace(" ", "").Replace("\n", "").ToArray();
                                    int    n = Roman_Parse(removeNewlineAndSpace);
                                    if (n != 0)
                                    {
                                        Pages[i] = n;
                                    }
                                }
                                // search for App. on page
                                else if (j == 2)
                                {
                                    var matches = System.Text.RegularExpressions.Regex.Matches(textFromPage, @"(App.?)( *)(\d+)");
                                    if (matches.Count > 0)
                                    {
                                        strPageNum = matches[0].Groups[3].Value;
                                    }
                                }
                                // test alternative foldout numbering style
                                else if (j == 3)
                                {
                                    var matches = System.Text.RegularExpressions.Regex.Matches(textFromPage, @"(\d+)([Aa])");
                                    if (matches.Count > 0)
                                    {
                                        strPageNum = matches[0].Groups[1].Value;
                                    }
                                }
                                else if (j == 4)
                                {
                                    if (type == SourceFileTypeEnum.App_Foldout || type == SourceFileTypeEnum.App_ZFold ||
                                        type == SourceFileTypeEnum.Brief_Foldout || type == SourceFileTypeEnum.Brief_ZFold)
                                    {
                                        while (!textFromPage.Equals("") && !Char.IsDigit(textFromPage[0]))
                                        {
                                            textFromPage = textFromPage.Substring(1, textFromPage.Length - 1);
                                        }
                                        string digits = String.Empty; int k = 0;
                                        while (!textFromPage.Equals("") && Char.IsDigit(textFromPage[k]))
                                        {
                                            digits = digits + textFromPage[k++];
                                        }
                                        strPageNum = new String(digits.Where(Char.IsDigit).ToArray());
                                    }
                                }
                                else
                                {
                                    break;
                                }
                                j++;
                            } // end while

                            if (Pages[i] == 0)
                            {
                                int intPageNum;
                                if (int.TryParse(strPageNum, out intPageNum))
                                {
                                    Pages[i] = intPageNum;
                                }
                            } // end parse number

                            // GET LOCATION OF FIRST LINE FOR FOLDOUTS
                            // THIS WILL GIVE US ROTATION THAT IS NEEDED TO GET PAGE NUMBER ON TOP
                            if (type == SourceFileTypeEnum.App_Foldout || type == SourceFileTypeEnum.App_ZFold ||
                                type == SourceFileTypeEnum.Brief_Foldout || type == SourceFileTypeEnum.Brief_ZFold)
                            {
                                // attempt to get location, if foldout, of number found
                                MyLocationTextExtractionStrategy extract_loc = new MyLocationTextExtractionStrategy();
                                var    extractedText_loc = parser.ProcessContent(i, extract_loc);
                                string textFromPage_loc  = extractedText_loc.GetResultantText();

                                var ex = PdfTextExtractor.GetTextFromPage(reader, 1, extract_loc);

                                float llx = float.NaN;
                                float urx = float.NaN;
                                float ury = float.NaN;
                                float lly = float.NaN;

                                foreach (var p in extract_loc.myPoints)
                                {
                                    var a = p.Text;
                                    if (this.Pages[i] > 0 && a.Contains(this.Pages[i].ToString()))
                                    {
                                        llx = p.Rect.Left;
                                        lly = p.Rect.Bottom;
                                        ury = p.Rect.Top;
                                        urx = p.Rect.Right;
                                    }
                                }

                                // get page dimensions
                                var page_size   = reader.GetPageSize(i);
                                var page_width  = page_size.Width;
                                var page_height = page_size.Height;

                                // find which side
                                if (page_height > page_width)
                                {
                                    float mid_point = page_width / 2;
                                    if (llx < mid_point && lly < mid_point && urx < mid_point && ury < mid_point)
                                    {
                                        this.Rotation = ROTATE_ENUM.CLOCKWISE;
                                    }
                                    else if (llx > mid_point && lly > mid_point && urx > mid_point && ury > mid_point)
                                    {
                                        this.Rotation = ROTATE_ENUM.COUNTERCLOCKWISE;
                                    }
                                    else
                                    {
                                        // do nothing
                                        this.Rotation = ROTATE_ENUM.NONE;
                                    }
                                }
                            }
                        } // end else
                    }     // end for loop for reader

                    // CAPTURE FIRST AND LAST PAGE NUMBER
                    // capture first page number (base 1)
                    this.FirstPage = Pages[1];

                    // capture first page for files with first page number blank
                    if (this.Pages[1] == 0)
                    {
                        // check second page
                        if (this.TotalPages > 1 && this.Pages[2] > 0)
                        {
                            this.Pages[1] = this.Pages[2] - 1;
                        }
                    }
                    // skip actual first page, if a divider page
                    if (this.Pages[1] == -2 && this.TotalPages > 1 && this.Pages[2] > 0)
                    {
                        this.FirstPage = this.Pages[2];
                    }
                    else
                    {
                        this.FirstPage = this.Pages[1];
                    }
                    this.LastPage = Pages[TotalPages];
                } // end using statement
            }
        }
Example #21
0
        /// <summary>
        /// Runs the image render listener over every page of a PDF and updates the
        /// book progress bar after each page.
        /// </summary>
        /// <param name="inputFileObject">
        /// Path of the PDF to process, passed as <see cref="object"/> and converted with <c>as string</c>.
        /// </param>
        private void ExtractPDFImages(object inputFileObject)
        {
            string inputFile = inputFileObject as string;

            PdfReader reader = new PdfReader(inputFile);
            try
            {
                PdfReaderContentParser parser = new PdfReaderContentParser(reader);
                MyImageRenderListener listener = new MyImageRenderListener(outputFolderLabel.Text, this.statusListView, this);

                int pageCount = reader.NumberOfPages;
                for (int i = 1; i <= pageCount; i++)
                {
                    parser.ProcessContent(i, listener);

                    // Percentage of pages done, rounded up so the last page always reports 100.
                    int progress = (int)Math.Ceiling((float)i / (float)pageCount * 100f);
                    progressBarBook.Invoke(new SetInt(SetBookProgress), progress);
                }
            }
            finally
            {
                // Release the underlying file even when a page fails to parse.
                reader.Close();
            }
        }
 /// <summary>
 /// Processes the last page of <c>inlineImages01.pdf</c> with a
 /// <see cref="LocationTextExtractionStrategy"/>. The method contains no assertions,
 /// so it only fails if processing throws.
 /// </summary>
 public void TestInlineImageWithUnsupportedDecodeFilter() {
     PdfReader pdf = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "inlineImages01.pdf");
     int lastPage = pdf.NumberOfPages;
     PdfReaderContentParser contentParser = new PdfReaderContentParser(pdf);

     // The result is not inspected; reaching the end of the method is the check.
     LocationTextExtractionStrategy extracted = contentParser.ProcessContent(lastPage, new LocationTextExtractionStrategy());
 }
Example #23
0
        /// <summary>
        /// Reads the text of a PDF or Word document page by page into <paramref name="doc"/>.
        /// </summary>
        /// <remarks>
        /// The file kind is chosen by a case-insensitive substring test on the whole path
        /// ("pdf" is tested first, then "doc"), not by the file extension.
        /// For PDFs each page is extracted with <c>SimpleTextExtractionStrategy</c>; when that
        /// yields nothing, <c>PdfTextExtractor.GetTextFromPage</c> is tried instead.
        /// For Word documents the file is opened through Word automation, with up to two attempts.
        /// Failures are logged to the console and reported through the return value; they are not rethrown.
        /// </remarks>
        /// <param name="pdfFile">Path of the file to read.</param>
        /// <param name="doc">Receives non-empty page text in <c>DocBodyDic</c>, keyed by page number starting at 1.</param>
        /// <param name="pages">Set to the page count for PDFs; left untouched for Word documents.</param>
        /// <returns><c>true</c> when the file was read to the end; otherwise <c>false</c>.</returns>
        public bool ReadPdf(string pdfFile, ref Documents doc, ref int pages)
        {
            bool success = false;

            try
            {
                if (pdfFile.ToLower().Contains("pdf"))
                {
                    // CR, LF, tab, space and non-breaking space.
                    char[] trimChars = { '\r', '\n', '\t', (char)32, (char)160 };

                    PdfReader r = new PdfReader(pdfFile);
                    try
                    {
                        pages = r.NumberOfPages;
                        PdfReaderContentParser parser = new PdfReaderContentParser(r);

                        for (int i = 1; i <= pages; i++)
                        {
                            ITextExtractionStrategy st = parser.ProcessContent<SimpleTextExtractionStrategy>(i, new SimpleTextExtractionStrategy());
                            string text = st.GetResultantText().Trim(trimChars);

                            if (string.IsNullOrEmpty(text))
                            {
                                // Second chance with the extractor's default strategy.
                                text = PdfTextExtractor.GetTextFromPage(r, i).Trim(trimChars);
                            }

                            if (!string.IsNullOrEmpty(text))
                            {
                                doc.DocBodyDic.Add(i, text);
                            }
                        }
                    }
                    finally
                    {
                        // Close the reader even when a page fails to parse.
                        r.Close();
                    }

                    success = true;
                }
                else if (pdfFile.ToLower().Contains("doc"))
                {
                    MsWord.Application newApp = null;
                    MsWord.Document    msdoc  = null;

                    try
                    {
                        int retry = 2;
                        while (retry > 0)
                        {
                            try
                            {
                                // Each attempt gets a fresh Word instance: the previous one is
                                // quit (and forgotten) in the finally block below.
                                newApp = new MsWord.Application();
                                System.Threading.Thread.Sleep(1000);
                                msdoc = newApp.Documents.Open(pdfFile);
                                System.Threading.Thread.Sleep(1000);
                                object             nothing = Missing.Value;
                                MsWord.WdStatistic stat    = MsWord.WdStatistic.wdStatisticPages;
                                int num = msdoc.ComputeStatistics(stat, ref nothing);

                                for (int i = 1; i <= num; i++)
                                {
                                    // Pages stored by an earlier, partially successful attempt are kept.
                                    if (doc.DocBodyDic.ContainsKey(i))
                                    {
                                        continue;
                                    }

                                    object objWhat  = MsWord.WdGoToItem.wdGoToPage;
                                    object objWhich = MsWord.WdGoToDirection.wdGoToAbsolute;

                                    object       objPage = (object)i;
                                    MsWord.Range range1  = msdoc.GoTo(ref objWhat, ref objWhich, ref objPage, ref nothing);
                                    MsWord.Range range2  = range1.GoToNext(MsWord.WdGoToItem.wdGoToPage);

                                    object objStart = range1.Start;
                                    object objEnd   = range2.Start;
                                    if (range1.Start == range2.Start)
                                    {
                                        // No following page: take everything up to the end of the document.
                                        objEnd = msdoc.Characters.Count;
                                    }

                                    Console.ForegroundColor = ConsoleColor.Red;
                                    Console.WriteLine("DEBUG: Path: {0}, {1}-{2}........", pdfFile, objStart, objEnd);
                                    Console.ResetColor();

                                    if ((int)objStart <= (int)objEnd)
                                    {
                                        string innerText = msdoc.Range(ref objStart, ref objEnd).Text;
                                        doc.DocBodyDic.Add(i, innerText);
                                    }
                                }

                                success = true;
                                break;
                            }
                            catch (Exception ex)
                            {
                                Console.ForegroundColor = ConsoleColor.Red;
                                Console.WriteLine("Retry to read word {0}, Exception: {1}..", pdfFile, ex.ToString());
                                Console.ResetColor();
                                System.Threading.Thread.Sleep(1000);
                                retry--;
                            }
                            finally
                            {
                                if (newApp != null)
                                {
                                    try
                                    {
                                        newApp.NormalTemplate.Saved = true;

                                        if (msdoc != null)
                                        {
                                            msdoc.Close(false);
                                        }

                                        newApp.Quit();
                                    }
                                    finally
                                    {
                                        // Drop the references so a retry never touches an
                                        // application or document that was already shut down.
                                        msdoc  = null;
                                        newApp = null;
                                    }
                                }
                            }
                        }
                    }
                    catch (Exception e)
                    {
                        // Best effort: report the failure and fall through with the current result.
                        Console.ForegroundColor = ConsoleColor.Red;
                        Console.WriteLine("Failed to read word {0}, Exception: {1}..", pdfFile, e.ToString());
                        Console.ResetColor();
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: report the failure and return false instead of throwing.
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine("Failed to read {0}, Exception: {1}..", pdfFile, ex.ToString());
                Console.ResetColor();
            }

            return(success);
        }
Example #24
0
        /*private void convertJpegToPDFUsingItextSharp(object obj)
         * {
         *  throw new NotImplementedException();
         * }*/

        /// <summary>
        /// Builds the <c>tem</c> bitmap from two consecutive PDF pages, then copies it into <c>tem1</c>.
        /// </summary>
        /// <remarks>
        /// For each of the two pages the first image reported by the render listener is taken,
        /// split into a top and a bottom half, and each half is rotated 270 degrees and drawn
        /// as one 595x842 tile. The four tiles are stacked vertically in <c>tem</c>.
        /// The method throws if a page yields no image (<c>listener.Images[0]</c> is read unconditionally).
        /// </remarks>
        /// <param name="pdfPath">
        /// Not used by the current implementation.
        /// NOTE(review): the reader is opened on the <c>pdfName</c> member instead; confirm which is intended.
        /// </param>
        /// <param name="start">Number of the first page to render.</param>
        /// <param name="len">
        /// Not used by the current implementation: exactly two pages are always rendered,
        /// which matches the four-tile height of <c>tem</c>.
        /// </param>
        private void pdftoimage(string pdfPath, int start, int len)
        {
            int width  = 595;
            int heigh  = 842;
            int offset = 0;

            tem = new Bitmap(width, 4 * heigh);

            PdfReader pdfReader = new PdfReader(pdfName);
            try
            {
                PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader);

                using (Graphics g = Graphics.FromImage(tem))
                {
                    for (int i = start; i < start + 2; i++)
                    {
                        MyImageRenderListener listener = new MyImageRenderListener();
                        parser.ProcessContent(i, listener);

                        // The stream must stay open for as long as the bitmap decoded from it is in use.
                        using (MemoryStream ms = new MemoryStream(listener.Images[0]))
                        using (Bitmap a = new Bitmap(ms))
                        {
                            // NOTE(review): hard-coded dump of the raw page image; looks like leftover debugging.
                            a.Save(@"C:\Users\Administrator\Desktop\桌面老师的文档\1.png");

                            // half 0 = top half of the scan, half 1 = bottom half.
                            int halfHeight = a.Height / 2;
                            for (int half = 0; half < 2; half++)
                            {
                                Rectangle region = new Rectangle(0, half * halfHeight, a.Width, halfHeight);
                                using (Bitmap b = a.Clone(region, System.Drawing.Imaging.PixelFormat.Format24bppRgb))
                                {
                                    b.RotateFlip(RotateFlipType.Rotate270FlipNone);
                                    g.DrawImage(b, new Rectangle(0, offset, width, heigh));
                                }
                                offset += heigh;
                            }
                        }
                    }
                }
            }
            finally
            {
                // Release the PDF file even when a page cannot be rendered.
                pdfReader.Close();
            }

            tem1 = new Bitmap(tem, tem.Width, tem.Height);
        }
Example #25
0
        /// <summary>Extracts all images (of types that iTextSharp knows how to decode)
        /// from a specified page of a PDF file.</summary>
        /// <param name="filename">Path of the PDF file to read.</param>
        /// <param name="pageNumber">Number of the page whose images are extracted.</param>
        /// <returns>Returns a generic <see cref="Dictionary{TKey, TValue}"/> of string to <see cref="System.Drawing.Image"/>,
        /// where the key is a suggested file name, in the format: PDF filename without extension,
        /// page number and image index in the page, followed by the string the listener stored for that image.
        /// The dictionary is empty when the page has no images.</returns>
        public static Dictionary<string, System.Drawing.Image> ExtractImages(string filename, int pageNumber)
        {
            Dictionary<string, System.Drawing.Image> images = new Dictionary<string, System.Drawing.Image>();

            // Dispose the reader so the PDF file is released, as the all-pages overload does.
            using (PdfReader reader = new PdfReader(filename))
            {
                PdfReaderContentParser parser = new PdfReaderContentParser(reader);
                ImageRenderListener listener = new ImageRenderListener();
                parser.ProcessContent(pageNumber, listener);

                if (listener.Images.Count > 0)
                {
                    Console.WriteLine("Found {0} images on page {1}.", listener.Images.Count, pageNumber);

                    string baseName = System.IO.Path.GetFileNameWithoutExtension(filename);
                    int index = 1;

                    foreach (KeyValuePair<System.Drawing.Image, string> pair in listener.Images)
                    {
                        images.Add(string.Format("{0}_Page_{1}_Image_{2}{3}",
                            baseName, pageNumber.ToString("D4"), index.ToString("D4"), pair.Value), pair.Key);
                        index++;
                    }
                }
            }

            return images;
        }
Example #26
0
 /// <summary>
 /// Opens a PDF and prepares the members used for image extraction:
 /// the reader, a content parser bound to it, and a fresh image listener.
 /// </summary>
 /// <param name="fileName">Path handed to the <c>PdfReader</c> constructor.</param>
 public void Open(string fileName)
 {
     PdfReader opened = new PdfReader(fileName);

     reader   = opened;
     parser   = new PdfReaderContentParser(opened);
     listener = new MyImageRenderListener();
 }
Example #27
0
        /// <summary>
        /// Classifies the page-number area of every page in the PDF at <c>PdfPath</c> and
        /// writes the collected document information to the export text file.
        /// </summary>
        /// <remarks>
        /// Pages whose size matches neither of the two accepted size windows are logged and skipped.
        /// For every other page the page-number rectangle is classified, in this order, as:
        /// number-dash-number, number-dash-number/blank, roman numeral (with "C"/"D" treated as letters),
        /// letter, whole page empty, page-number area blank, index, or miscellaneous.
        /// Processing stops early when the background worker reports a pending cancellation.
        /// </remarks>
        /// <returns>
        /// An error message when the export path is invalid or an exception is caught;
        /// otherwise the document summary followed by the elapsed time and the logger's error count.
        /// </returns>
        public String ExportData()
        {
            DocInfo docInfo = new DocInfo();

            try
            {
                if (!ExportFilePath.isFilePathOK(".txt"))
                {
                    return("Invalid export file path: " + ExportFilePath);
                }

                BeforeProcessing();

                // Every extracted string is passed through the same default-code-page -> UTF-8 conversion.
                Func<string, string> reencode = raw =>
                    Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(raw)));

                using (var pdfReader = new PdfReader(PdfPath))
                {
                    // Used only for the "is the whole page empty" image check further down.
                    var parser = new PdfReaderContentParser(pdfReader);

                    // Does page 1 carry the "for official use only" marking in the checked rectangle?
                    string officialText = reencode(PdfTextExtractor.GetTextFromPage(pdfReader, 1, MakeRectangle(70, 1, 375, 120)));
                    bool hasOfficialUse = officialText.ToString().ToUpper().Contains("FOROFFICIALUSEONLY");

                    for (Int32 currentPage = 1; currentPage <= pdfReader.NumberOfPages; currentPage++)
                    {
                        PageInfo currentPageInfo = new PageInfo();
                        currentPageInfo.PageNum = currentPage;

                        var pageSize = pdfReader.GetPageSize(currentPage);
                        float height = pageSize.Height;
                        float width  = pageSize.Width;

                        // Both accepted sheet sizes share the same height window and differ in width.
                        bool heightInRange = height > 785 && height < 802;

                        ITextExtractionStrategy rectangleStrategy;
                        if (heightInRange && width > 1215 && width < 1230)
                        {
                            rectangleStrategy = MakeRectangle(450, 1, 450, 70);
                        }
                        else if (heightInRange && width > 608 && width < 617)
                        {
                            rectangleStrategy = MakeRectangle(190, 1, 255, 74);
                        }
                        else
                        {
                            myLogger.Log("Page # " + currentPage.ToString() + " not 8.5 x 11 or 11 x 17");
                            continue;
                        }

                        string currentText = reencode(PdfTextExtractor.GetTextFromPage(pdfReader, currentPage, rectangleStrategy));
                        if (hasOfficialUse)
                        {
                            currentText = OfficialUseRegex.Replace(currentText, "").Trim();
                        }

                        string wpIndexText = reencode(PdfTextExtractor.GetTextFromPage(pdfReader, currentPage, MakeRectangle(60, 600, 160, 50)));
                        if (wpIndexText.ToUpper().Contains("WORKPACKAGEINDEX"))
                        {
                            currentPageInfo.HasWpIndex = true;
                        }

                        if (NumDashNumRegex.IsMatch(currentText))
                        {
                            // #-#
                            currentPageInfo.PageNumText = NumDashNumRegex.Match(currentText).Value.Trim();
                            currentPageInfo.IsWP        = true;
                        }
                        else if (NumDashNumBlankRegex.IsMatch(currentText))
                        {
                            // #-#/blank
                            currentPageInfo.PageNumText = NumDashNumBlankRegex.Match(currentText).Value.Trim();
                            currentPageInfo.IsDashBlank = true;
                            currentPageInfo.IsWP        = true;
                        }
                        else if (romanNumRegex.IsMatch(currentText.ToUpper().Trim()))
                        {
                            currentPageInfo.PageNumText = romanNumRegex.Match(currentText.ToUpper().Trim()).Value.Trim();

                            // A lone "C" or "D" matches the roman pattern but is recorded as a letter page.
                            string upperPageNum = currentPageInfo.PageNumText.ToUpper();
                            if (String.Equals(upperPageNum, "C") || String.Equals(upperPageNum, "D"))
                            {
                                currentPageInfo.PageNumText = currentPageInfo.PageNumText.ToLower();
                                currentPageInfo.IsLetter    = true;
                            }
                            else
                            {
                                currentPageInfo.IsRoman = true;
                            }
                        }
                        else if (LetterRegex.IsMatch(currentText.Trim()))
                        {
                            currentPageInfo.PageNumText = LetterRegex.Match(currentText).Value.Trim();
                            currentPageInfo.IsLetter    = true;
                        }
                        else
                        {
                            // Nothing recognisable: find out whether the page has any images or text at all.
                            ImageRenderListener listener = new ImageRenderListener();
                            parser.ProcessContent(currentPage, listener);

                            String checkText = reencode(PdfTextExtractor.GetTextFromPage(pdfReader, currentPage, MakeRectangle(1, 1, 1000000, 1000000)));

                            if ((listener.Images.Count <= 0) && String.IsNullOrWhiteSpace(checkText))
                            {
                                currentPageInfo.IsWholePageEmpty   = true;
                                currentPageInfo.IsPageNumAreaBlank = true;
                            }
                            else if (String.IsNullOrWhiteSpace(currentText))
                            {
                                currentPageInfo.IsPageNumAreaBlank = true;
                            }
                            else if (indexRegex.IsMatch(currentText.Trim()))
                            {
                                currentPageInfo.PageNumText = indexRegex.Match(currentText).Value.Trim();
                                currentPageInfo.IsIndex     = true;
                            }
                            else
                            {
                                currentPageInfo.PageNumText = currentText;
                                currentPageInfo.IsMisc      = true;
                            }
                        }

                        // The page being processed when cancellation is noticed is not recorded.
                        if (Bw.CancellationPending)
                        {
                            myLogger.Log("Processing cancelled at dwg #: " + currentPage.ToString());
                            break;
                        }

                        Bw.ReportProgress(Utils.GetPercentage(currentPage, pdfReader.NumberOfPages));

                        docInfo.Pages.Add(currentPageInfo);
                    }
                }

                WriteDocInfoToTextFile(docInfo);
            }
            catch (System.Exception se)
            {
                return(se.Message);
            }
            finally
            {
                AfterProcessing();
            }

            return(String.Concat(docInfo.ToString(),
                                 Environment.NewLine,
                                 "Processing completed in ",
                                 timer.Elapsed.TotalSeconds.PrintTimeFromSeconds(),
                                 Environment.NewLine,
                                 myLogger.ErrorCount.ToString(),
                                 " errors found."));
        }
        /// <summary>
        /// Extracts the images found on every page of the currently loaded PDF and raises
        /// <c>NewImageRead</c> once per extracted image.
        /// </summary>
        /// <returns>
        /// The images collected by the render listener, each mapped to the string the listener stored for it,
        /// or <c>null</c> when no PDF is loaded (<c>miPDF</c> is <c>null</c>).
        /// </returns>
        public Dictionary<System.Drawing.Image, string> Extraer_Imagenes()
        {
            if (miPDF == null)
            {
                return null;
            }

            Dictionary<System.Drawing.Image, string> listaImagenes = new Dictionary<System.Drawing.Image, string>();
            PdfReaderContentParser chekeadorPDF = new PdfReaderContentParser(miPDF.PdfLeido);

            // The page counter is an int: a byte counter wraps from 255 back to 0, so the
            // loop condition could never become false for documents with 255 or more pages.
            // NOTE(review): assumes GetImagesPdfEventArgs accepts an int page number - confirm.
            int totalPaginas = miPDF.PdfLeido.NumberOfPages;
            for (int i = 1; i <= totalPaginas; i++)
            {
                ImageRenderListener validadorImagenes = new ImageRenderListener();
                chekeadorPDF.ProcessContent(i, validadorImagenes);

                foreach (var imagenSeleccionada in validadorImagenes.Imagenes)
                {
                    listaImagenes.Add(imagenSeleccionada.Key, imagenSeleccionada.Value);

                    // Raise the "new image read" event, carrying the page number.
                    GetImagesPdfEventArgs NewImageEvent = new GetImagesPdfEventArgs(i);
                    NewImageRead(this, NewImageEvent);
                }
            }

            return listaImagenes;
        }
Example #29
0
 /// <summary>
 /// Resets the progress percentage and opens <c>FileName</c> with iTextSharp,
 /// storing the reader and a content parser bound to it.
 /// </summary>
 /// <exception cref="UnsupportedFileFormatException">
 /// Thrown in place of any exception raised while creating the reader or the parser.
 /// </exception>
 public override void Open()
 {
     ProgressPercentage = 0;

     try
     {
         var pdf = new iTextSharp.text.pdf.PdfReader(FileName);

         PDFReaderObj = pdf;
         PDFParserObj = new PdfReaderContentParser(pdf);
     }
     catch (Exception)
     {
         // Any failure to open the file is reported to callers as an unsupported format.
         throw new UnsupportedFileFormatException();
     }
 }
Example #30
0
 /// <summary>
 /// Extracts text from a specified page using an extraction strategy.
 /// </summary>
 /// <param name="reader">The reader to extract text from.</param>
 /// <param name="pageNumber">The page to extract text from.</param>
 /// <param name="strategy">The strategy to use for extracting text.</param>
 /// <returns>The extracted text, as reported by the strategy after the page was processed.</returns>
 /// <exception cref="System.IO.IOException">
 /// If any operation fails while reading from the provided PdfReader.
 /// </exception>
 /// <remarks>Since 5.0.2.</remarks>
 public static String GetTextFromPage(PdfReader reader, int pageNumber, ITextExtractionStrategy strategy)
 {
     var contentParser = new PdfReaderContentParser(reader);

     // ProcessContent hands back the listener it was given once the page has been parsed.
     ITextExtractionStrategy processedStrategy = contentParser.ProcessContent(pageNumber, strategy);

     return processedStrategy.GetResultantText();
 }
Example #31
0
        /// <summary>
        /// Runs every page of a PDF through a <c>PDFImageListener</c> and appends the collected
        /// image data and names to the supplied lists.
        /// </summary>
        /// <param name="inputFileorFolder">Path handed to the <c>PdfReader</c> constructor.</param>
        /// <param name="outputFolder">Not used by this method.</param>
        /// <param name="imageBytes">List that receives the image byte arrays.</param>
        /// <param name="imageNames">List that receives the image names.</param>
        /// <param name="settings">
        /// Supplies <c>CheckResult</c> and <c>JoinImages</c>; its <c>Result</c> is set to false on failure.
        /// </param>
        /// <param name="progress">Callback that receives localized progress messages.</param>
        /// <returns>
        /// <c>true</c> when processing completed; <c>false</c> when any exception was caught and logged.
        /// </returns>
        public bool Read(string inputFileorFolder, string outputFolder, List <byte[]> imageBytes, List <string> imageNames, ContractParameters settings, ProgressDelegate progress)
        {
            PdfReader        reader   = null;
            PDFImageListener listener = null;

            try
            {
                reader = new PdfReader(inputFileorFolder);
                PdfReaderContentParser parser = new PdfReaderContentParser(reader);

                listener = new PDFImageListener();

                // A single listener accumulates the images of all pages; PageIndex tells it
                // which page is currently being processed.
                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    listener.PageIndex = i;
                    parser.ProcessContent(i, listener);
                }

                // The check only fails when it is enabled and the image count differs from the page count.
                bool countMismatch = settings.CheckResult && reader.NumberOfPages != listener.ImageNames.Count;

                if (!countMismatch)
                {
                    string msg = CultureManager.Instance.GetLocalization("ByCode", "Convert.ImageCountingOK", "Extracting {0} images in {1} pages");
                    progress(string.Format(msg, listener.ImageNames.Count, reader.NumberOfPages));

                    imageBytes.AddRange(listener.ImageBytes);
                    imageNames.AddRange(listener.ImageNames);

                    msg = CultureManager.Instance.GetLocalization("ByCode", "Convert.ImageExtracted", "{0} images extracted...");
                    progress(string.Format(msg, listener.ImageBytes.Count));
                }
                else if (settings.JoinImages)
                {
                    string msg = CultureManager.Instance.GetLocalization("ByCode", "Convert.ImageCountingKO", "Extracting {0} : {1} images for {2} pages - Try to merge !");
                    progress(string.Format(msg, inputFileorFolder, listener.ImageNames.Count, reader.NumberOfPages));

                    ImageJoiner cp = new ImageJoiner();
                    cp.Merge(listener.ImageBytes, listener.ImageNames);

                    msg = CultureManager.Instance.GetLocalization("ByCode", "Convert.ImageMerge", "Merge to {0} new images...");
                    progress(string.Format(msg, cp.NewImageNames.Count));

                    imageBytes.AddRange(cp.NewImageBytes);
                    imageNames.AddRange(cp.NewImageNames);
                }
                else
                {
                    string msg = CultureManager.Instance.GetLocalization("ByCode", "Convert.ImageError", "Error extracting {0} : {1} images for {2} pages !!");
                    progress(string.Format(msg, inputFileorFolder, listener.ImageNames.Count, reader.NumberOfPages));

                    // Caught by the handler below, which logs it and turns it into a false return.
                    throw new Exception("PDF check error");
                }
            }
            catch (Exception err)
            {
                LogHelper.Manage("PDFImageReader:Read", err);
                settings.Result = false;

                // The listener is still null when the PdfReader constructor itself failed;
                // without this guard the handler threw a NullReferenceException of its own.
                if (listener != null)
                {
                    listener.ImageNames.Clear();
                    listener.ImageBytes.Clear();
                }

                return false;
            }
            finally
            {
                if (reader != null)
                {
                    reader.Close();
                }
            }

            return true;
        }
Example #32
0
        /// <summary>
        /// Command-line entry point: validates the input PDF and the output directory, then splits
        /// the PDF into one document per run of pages that share the same regex-captured key.
        /// </summary>
        /// <remarks>
        /// For every page the first match of the key regex is examined; each capture of groups 1 and 2
        /// that differs from the current key starts a new document named after the capture
        /// (optionally prefixed with today's date). Pages that precede the first key are not written.
        /// </remarks>
        /// <param name="args">Raw command-line arguments, parsed into <c>Options</c>.</param>
        /// <returns>
        /// 0 on success; 1 when the input file does not exist; 2 when it is not a ".pdf" file;
        /// 3 when the output directory does not exist; 4 when no split type was selected.
        /// </returns>
        static int Main(string[] args)
        {
            #region var declaration
            bool   verbose          = false;
            bool   datedFileNames   = false;
            bool   showRegexMatches = false;
            string inputFilename    = "";
            bool   inputFileExists;
            string extension;
            bool   isPDF            = false;
            string outputDirname    = "";
            bool   outputDirExsists = false;

            uint splitType = 0x0;

            string keySplitRegex = "";
            #endregion

            #region arg intake
            // intake args and load values in to scope
            var result = Parser.Default.ParseArguments <Options>(args);

            result.WithParsed <Options>(o =>
            {
                inputFilename    = o.inputFilename;
                outputDirname    = o.outputDirname;
                verbose          = o.verbose;
                datedFileNames   = o.datedFileNames;
                showRegexMatches = o.showRegexMatches;

                // A non-empty key regex selects the "key match" split type (bit 0).
                if (!String.IsNullOrEmpty(o.keySplitRegex))
                {
                    splitType    |= 0x1;
                    keySplitRegex = o.keySplitRegex;
                }
            });

            #endregion

            #region verify input

            // input file location
            if (verbose)
            {
                Console.WriteLine("Input File:\t" + inputFilename);
            }
            inputFileExists = File.Exists(inputFilename);
            if (verbose)
            {
                Console.WriteLine(inputFileExists ? "File exists:\tTrue" : "File exists:\tFalse");
            }
            if (!inputFileExists)
            {
                if (verbose)
                {
                    Console.WriteLine("Input File does not exsist; Exiting with error code 1.");
                }
                #if DEBUG
                Console.ReadKey();
                #endif
                return(1);
            }

            // input file format; lower-cased with the invariant culture so the check does not
            // depend on the machine's regional settings
            extension = System.IO.Path.GetExtension(inputFilename).ToLowerInvariant();
            if (verbose)
            {
                Console.WriteLine("File format:\t" + extension);
            }
            isPDF = string.Equals(extension, ".pdf", StringComparison.Ordinal);
            if (verbose)
            {
                Console.WriteLine(isPDF ? "Correct Format:\tTrue" : "Correct Format:\tFalse");
            }
            if (!isPDF)
            {
                if (verbose)
                {
                    Console.WriteLine("Input File is not a PDF; Exiting with error code 2.");
                }
                #if DEBUG
                Console.ReadKey();
                #endif
                return(2);
            }

            // output directory existence
            if (verbose)
            {
                Console.WriteLine("Output to:\t" + outputDirname);
            }
            outputDirExsists = Directory.Exists(outputDirname);
            if (verbose)
            {
                Console.WriteLine(outputDirExsists ? "Output valid:\tTrue" : "Output valid:\tFalse");
            }
            if (!outputDirExsists)
            {
                if (verbose)
                {
                    Console.WriteLine("Output dir does not exsist; Exiting with error code 3.");
                }
                #if DEBUG
                Console.ReadKey();
                #endif
                return(3);
            }

            #endregion

            // Split
            switch (splitType)
            {
            case 0x1:     // key match
            {
                if (verbose)
                {
                    Console.WriteLine("split type:\tKey");
                    Console.WriteLine("Key regex:\t" + keySplitRegex);
                }

                Regex regex = new Regex(keySplitRegex, RegexOptions.Compiled | RegexOptions.Multiline);

                PdfReader reader = new PdfReader(inputFilename);

                try
                {
                    PdfReaderContentParser parser = new PdfReaderContentParser(reader);

                    string regexKeyMatch = "";   // key of the document currently being collected
                    int    docPageStart  = 1;    // first page of that document
                    string newDocName    = "";   // its output file name; empty until a key is seen

                    for (int page = 1; page <= reader.NumberOfPages; page++)
                    {
                        if (showRegexMatches)
                        {
                            Console.WriteLine("Page: " + page);
                        }

                        ITextExtractionStrategy strategy = parser.ProcessContent
                                                               (page, new SimpleTextExtractionStrategy());

                        // Only the first match on each page is examined.
                        // NOTE(review): the original ended with an unused NextMatch() call, which
                        // suggests a loop over all matches may have been intended — confirm.
                        Match match = regex.Match(strategy.GetResultantText());

                        if (showRegexMatches)
                        {
                            Console.WriteLine("Match: 1");
                        }

                        for (int x = 1; x <= 2; x++)
                        {
                            Group group = match.Groups[x];
                            if (showRegexMatches)
                            {
                                Console.WriteLine("Group " + x + " = '" + group + "'");
                            }

                            CaptureCollection cc = group.Captures;
                            for (int y = 0; y < cc.Count; y++)
                            {
                                Capture capture = cc[y];

                                string captureS = capture.ToString();

                                if (!string.Equals(captureS, regexKeyMatch))
                                {
                                    // Write out the previous document, but only when one has been
                                    // started (a name exists) and it covers at least one page. This
                                    // avoids extracting to an empty file name when the first key
                                    // appears after page 1, and avoids inverted page ranges when the
                                    // key changes more than once on the same page.
                                    if (newDocName.Length > 0 && docPageStart < page)
                                    {
                                        ExtractPages(inputFilename,
                                                     System.IO.Path.Combine(outputDirname, newDocName),
                                                     docPageStart,
                                                     (page - 1));
                                    }

                                    // start collecting the next document
                                    regexKeyMatch = captureS;
                                    if (datedFileNames)
                                    {
                                        newDocName = DateTime.Now.ToString("yyyyMMdd") + "_" + captureS + ".pdf";
                                    }
                                    else
                                    {
                                        newDocName = captureS + ".pdf";
                                    }

                                    docPageStart = page;

                                    if (verbose)
                                    {
                                        System.Console.WriteLine("New document at page:\t" + docPageStart);
                                    }
                                }

                                if (showRegexMatches)
                                {
                                    System.Console.WriteLine("Capture " + y + " = '" + capture + "', Position=" + capture.Index);
                                }
                            }
                        }
                    }

                    // The document that is still open when the pages run out must be written too;
                    // previously everything from the last key to the final page was lost.
                    if (newDocName.Length > 0)
                    {
                        ExtractPages(inputFilename,
                                     System.IO.Path.Combine(outputDirname, newDocName),
                                     docPageStart,
                                     reader.NumberOfPages);
                    }
                }
                finally
                {
                    // Release the reader even when parsing or extraction throws.
                    reader.Close();
                }

                break;
            }

            default:
                if (verbose)
                {
                    Console.WriteLine("No valid split type selected; Exiting with error code 4.");
                }
                    #if DEBUG
                Console.ReadKey();
                    #endif
                return(4);
            }


            #if DEBUG
            Console.ReadKey();
            #endif

            return(0);
        }
Example #33
0
 /// <summary>
 /// Closes the PDF reader when one is open, then clears the parser and reader references.
 /// </summary>
 public override void Close()
 {
     var openReader = PDFReaderObj;

     if (openReader != null)
     {
         try
         {
             openReader.Close();
         }
         catch (Exception)
         {
             // Best-effort close: a failure here is deliberately ignored.
         }
     }

     PDFParserObj = null;
     PDFReaderObj = null;
 }