/// <summary>
/// Creates an extraction strategy bound to the metrics object of one page.
/// </summary>
/// <param name="pageMetrics">
/// Metrics of the page being processed. It is declared <c>ref</c>, but the
/// constructor only stores it and reads its <c>Height</c>; the caller's
/// variable is never reassigned here.
/// </param>
public DeepExtractionStrategy(ref PageMetrics pageMetrics)
{
    // Fresh per-page state: nothing collected yet, positioned at the start of a line.
    startOfNewline = true;
    charMetrices = new List<CharMetrics>();

    // Keep the page metrics and seed the descent with the page height.
    this.pageMetrics = pageMetrics;
    descent = pageMetrics.Height;
}
Esempio n. 2
0
        /// <summary>
        /// Appends a page to the collection, bumps the page counter and widens
        /// the recorded maximum width/height when this page exceeds them.
        /// </summary>
        /// <param name="page">Metrics of the page to register.</param>
        public void AddPage(PageMetrics page)
        {
            pages.Add(page);
            ++nbPages;

            // Track the largest page dimensions seen so far.
            var width = page.Width;
            if (pageMaxWidth < width)
            {
                pageMaxWidth = width;
            }

            var height = page.Height;
            if (pageMaxHeight < height)
            {
                pageMaxHeight = height;
            }
        }
        /// <summary>
        /// Opens a PDF, records its document-level metadata and per-page
        /// character/word metrics in <c>book</c>, then closes the document.
        /// </summary>
        /// <param name="input">Value passed to the <c>PdfReader</c> the document is read from.</param>
        /// <param name="output">Value passed to the <c>PdfWriter</c> the document is opened with.</param>
        /// <remarks>
        /// The document is closed in a <c>finally</c> block so it is released
        /// even when extraction of a page throws.
        /// </remarks>
        public static void ExtractPdf(string input, string output)
        {
            // Registration does not depend on the document, so do it up front.
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

            PdfDocument pdfDoc = new PdfDocument(
                new PdfReader(input), new PdfWriter(output));

            try
            {
                book = new BookMetrics();

                // Fetch the info object once instead of once per field.
                var info = pdfDoc.GetDocumentInfo();
                book.Author    = info.GetAuthor();
                book.Title     = info.GetTitle();
                // The publisher is filled from the document's "producer" entry.
                book.Publisher = info.GetProducer();

                int pageCount = pdfDoc.GetNumberOfPages();

                // The loop runs directly over the 1-based page numbers passed to GetPage.
                for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
                {
                    var pdfPage = pdfDoc.GetPage(pageNumber);
                    Rectangle size = pdfPage.GetPageSize();

                    PageMetrics page = new PageMetrics(pageNumber);
                    page.Width    = size.GetWidth();
                    page.Height   = size.GetHeight();
                    page.Rotation = pdfPage.GetRotation();

                    // The strategy collects character metrics while the text is extracted.
                    DeepExtractionStrategy strategy = new DeepExtractionStrategy(ref page);
                    Console.WriteLine("Processing page {0}", pageNumber);
                    PdfTextExtractor.GetTextFromPage(pdfPage, strategy);

                    page.SortLines();
                    page.WordMetrices = WordMetrics.FromChars(strategy.CharMetrices);
                    page.CharMetrices = strategy.CharMetrices;
                    book.AddPage(page);
                }
            }
            finally
            {
                // Always release the underlying reader/writer, even on failure.
                pdfDoc.Close();
            }
        }