/// <summary>
/// Creates an extraction strategy bound to the supplied page metrics.
/// </summary>
/// <param name="pageMetrics">
/// Metrics object of the page being processed. It is stored on the strategy,
/// and its <c>Height</c> is used as the initial value of <c>descent</c>.
/// </param>
public DeepExtractionStrategy(ref PageMetrics pageMetrics)
{
    // Begin with the "start of a new line" flag raised and an empty
    // list of character metrics.
    startOfNewline = true;
    charMetrices = new List<CharMetrics>();

    this.pageMetrics = pageMetrics;

    // descent is initialised to the full page height.
    descent = pageMetrics.Height;
}
/// <summary>
/// Appends a page to <c>pages</c>, increments <c>nbPages</c>, and raises
/// <c>pageMaxWidth</c> / <c>pageMaxHeight</c> when the new page exceeds them.
/// </summary>
/// <param name="page">The page metrics to add.</param>
public void AddPage(PageMetrics page)
{
    pages.Add(page);
    nbPages += 1;

    // Only a strictly larger dimension replaces the recorded maximum.
    bool isWidest = page.Width > pageMaxWidth;
    if (isWidest)
    {
        pageMaxWidth = page.Width;
    }

    bool isTallest = page.Height > pageMaxHeight;
    if (isTallest)
    {
        pageMaxHeight = page.Height;
    }
}
/// <summary>
/// Opens the PDF at <paramref name="input"/> (with a writer targeting
/// <paramref name="output"/>), runs a <c>DeepExtractionStrategy</c> over every
/// page, and stores the collected metrics in <c>book</c>.
/// </summary>
/// <param name="input">Path passed to the <c>PdfReader</c>.</param>
/// <param name="output">Path passed to the <c>PdfWriter</c>.</param>
/// <remarks>
/// The document is closed in a <c>finally</c> block so that it is released
/// even when extraction of a page throws.
/// </remarks>
public static void ExtractPdf(string input, string output)
{
    PdfDocument pdfDoc = new PdfDocument(new PdfReader(input), new PdfWriter(output));
    try
    {
        // Make the code-page encodings available before any text is extracted.
        EncodingProvider codePages = CodePagesEncodingProvider.Instance;
        Encoding.RegisterProvider(codePages);

        // Fetch the document info once instead of once per field.
        var info = pdfDoc.GetDocumentInfo();
        book = new BookMetrics();
        book.Author = info.GetAuthor();
        book.Title = info.GetTitle();
        // The Publisher field is filled from the PDF "Producer" entry.
        book.Publisher = info.GetProducer();

        int pageCount = pdfDoc.GetNumberOfPages();

        // Page numbers passed to GetPage start at 1.
        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
        {
            // Look the page up once per iteration and reuse it.
            var pdfPage = pdfDoc.GetPage(pageNumber);
            Rectangle size = pdfPage.GetPageSize();

            PageMetrics page = new PageMetrics(pageNumber);
            page.Width = size.GetWidth();
            page.Height = size.GetHeight();
            page.Rotation = pdfPage.GetRotation();

            DeepExtractionStrategy strategy = new DeepExtractionStrategy(ref page);
            Console.WriteLine("Processing page {0}", pageNumber);

            // The returned text is ignored; the strategy accumulates the
            // character metrics that are read back below.
            PdfTextExtractor.GetTextFromPage(pdfPage, strategy);

            page.SortLines();
            page.WordMetrices = WordMetrics.FromChars(strategy.CharMetrices);
            page.CharMetrices = strategy.CharMetrices;
            book.AddPage(page);
        }
    }
    finally
    {
        // Always release the underlying reader/writer, even on failure.
        pdfDoc.Close();
    }
}