Exemple #1
0
        /// <summary>
        /// Demo driver: runs the extraction on a sample PDF, writes the resulting
        /// <c>BookMetrics</c> to JSON and reads it back, then calls the GhostScript,
        /// Poppler and Svg helpers in sequence on fixed local paths.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        public static void Main(string[] args)
        {
            // All inputs/outputs are fixed developer-machine paths.
            string sourcePdf    = "/Users/ryan/RiderProjects/Sharpen Pdf Parser/Sharpen Pdf Parser/Resources/affordances.pdf";
            string extractedPdf = Path.Combine(Path.GetTempPath(), "test.pdf");
            string metricsJson  = "/Users/ryan/RiderProjects/Sharpen Pdf Parser/test.json";
            string bookPdf      = "/Users/ryan/Documents/Books/affordances.pdf";
            string workingSvg   = "/Users/ryan/RiderProjects/Sharpen Pdf Parser/Sharpen Pdf Parser/Resources/test.svg";
            string finalSvg     = "/Users/ryan/Documents/Books/parsed2.svg";

            // Extract pdf content
            ContentExtract.ExtractPdf(sourcePdf, extractedPdf);
            BookMetrics extracted = ContentExtract.Book;

            // Write the metrics to JSON, then load them back from the same file.
            extracted.ToJson(metricsJson);
            BookMetrics reloaded = BookMetrics.FromJson(metricsJson);

            // Remove text from pdf
            GhostScript.RemoveText(bookPdf);

            // Convert page to svg
            Poppler.PdfToSvg(bookPdf, "1");

            // Preparing SVG for unity - temporary until they fix issues
            Svg.RenderEmptyPathsExplicit(workingSvg);
            Svg.RenderRgbAsHex(workingSvg);
            Svg.WriteSvg(finalSvg);
        }
        /// <summary>
        /// Opens the PDF at <paramref name="input"/> (with a writer targeting
        /// <paramref name="output"/>), stores a new <c>BookMetrics</c> in <c>book</c>
        /// filled from the document info, and adds one <c>PageMetrics</c> per page
        /// populated through a <c>DeepExtractionStrategy</c>.
        /// </summary>
        /// <param name="input">Path of the PDF file to read.</param>
        /// <param name="output">Path handed to the <c>PdfWriter</c> of the opened document.</param>
        public static void ExtractPdf(string input, string output)
        {
            PdfDocument pdfDoc = new PdfDocument(
                new PdfReader(input), new PdfWriter(output));

            try
            {
                // Register the code-pages encodings before any page text is extracted.
                EncodingProvider codePages = CodePagesEncodingProvider.Instance;
                Encoding.RegisterProvider(codePages);

                book = new BookMetrics();

                var info = pdfDoc.GetDocumentInfo();
                book.Author    = info.GetAuthor();
                book.Title     = info.GetTitle();
                // NOTE(review): Publisher is filled from the PDF "Producer" entry — confirm this is intended.
                book.Publisher = info.GetProducer();

                int nbPages = pdfDoc.GetNumberOfPages();

                // The document is addressed with 1-based page numbers.
                for (int pageNumber = 1; pageNumber <= nbPages; pageNumber++)
                {
                    var pdfPage = pdfDoc.GetPage(pageNumber);
                    Rectangle size = pdfPage.GetPageSize();

                    PageMetrics page = new PageMetrics(pageNumber);
                    page.Width    = size.GetWidth();
                    page.Height   = size.GetHeight();
                    page.Rotation = pdfPage.GetRotation();

                    DeepExtractionStrategy strategy = new DeepExtractionStrategy(ref page);
                    Console.WriteLine("Processing page {0}", pageNumber);

                    // The returned text is ignored; the strategy is what is read afterwards.
                    PdfTextExtractor.GetTextFromPage(pdfPage, strategy);

                    page.SortLines();
                    page.WordMetrices = WordMetrics.FromChars(strategy.CharMetrices);
                    page.CharMetrices = strategy.CharMetrices;
                    book.AddPage(page);
                }
            }
            finally
            {
                // Close the document even when extraction throws, so the
                // underlying reader/writer are not left open.
                pdfDoc.Close();
            }
        }