Ejemplo n.º 1
0
        /// <summary>
        /// Rewrites the link annotations of a PDF whose URI action points to a local file so that they
        /// become internal "go to" actions targeting the page on which the linked input starts.
        /// The PDF at <paramref name="pdfFilePath"/> is replaced by the rewritten document.
        /// </summary>
        /// <param name="pdfFilePath">The PDF file path.</param>
        /// <param name="htmlToPdfFiles">The HTML to PDF files.</param>
        /// <param name="logger">The logger.</param>
        /// <exception cref="PdfPageNotFoundException">
        /// Thrown when the page recorded for a linked input cannot be retrieved from the PDF.
        /// </exception>
        internal static void UpdateLinks(
            string pdfFilePath,
            IReadOnlyCollection<HtmlToPdfFile> htmlToPdfFiles,
            ILogger logger)
        {
            string tempFilePath = Path.GetTempFileName();

            try
            {
                using (PdfReader pdfReader = new PdfReader(pdfFilePath))
                using (PdfWriter pdfWriter = new PdfWriter(tempFilePath))
                using (iText.Kernel.Pdf.PdfDocument pdfDocument = new iText.Kernel.Pdf.PdfDocument(pdfReader, pdfWriter))
                {
                    int pageCount = pdfDocument.GetNumberOfPages();
                    for (int i = 1; i <= pageCount; i++)
                    {
                        // get page
                        PdfPage pdfPage = pdfDocument.GetPage(i);

                        // get link annotations
                        IEnumerable<PdfLinkAnnotation> linkAnnotations = pdfPage.GetAnnotations().OfType<PdfLinkAnnotation>();
                        foreach (PdfLinkAnnotation linkAnnotation in linkAnnotations)
                        {
                            // only URI actions are candidates for rewriting
                            PdfDictionary action = linkAnnotation.GetAction();
                            if (action == null)
                            {
                                continue;
                            }

                            PdfName s = action.GetAsName(PdfName.S);
                            if (s != PdfName.URI)
                            {
                                continue;
                            }

                            // a URI action without a (string) URI entry cannot be resolved
                            PdfString uriPdfString = action.GetAsString(PdfName.URI);
                            if (uriPdfString == null)
                            {
                                continue;
                            }

                            if (!Uri.TryCreate(uriPdfString.GetValue(), UriKind.RelativeOrAbsolute, out Uri uri))
                            {
                                continue;
                            }

                            // Uri.IsFile throws InvalidOperationException for relative URIs,
                            // so relative links have to be filtered out first
                            if (!uri.IsAbsoluteUri || !uri.IsFile)
                            {
                                continue;
                            }

                            string htmlFilePath = uri.LocalPath.ToLower();

                            // match the input case-insensitively; this is the same comparison for the
                            // existence check and for the lookup
                            List<HtmlToPdfFile> candidates = htmlToPdfFiles
                                .Where(x => string.Equals(x.Input, htmlFilePath, StringComparison.OrdinalIgnoreCase))
                                .ToList();

                            if (candidates.Count == 0)
                            {
                                // ex. when printing PDF from TOC.html by itself
                                logger.LogDebug($"Could not find '{htmlFilePath}'. Referenced in '{pdfFilePath}' on page {i}.");
                                continue;
                            }

                            // when several inputs differ only by case, fall back to the exact match
                            HtmlToPdfFile linkedHtmlToPdfFile = candidates.Count == 1
                                ? candidates[0]
                                : candidates.Single(x => x.Input == htmlFilePath);

                            int linkedPageNumber = linkedHtmlToPdfFile.OutputPdfFilePageNumber;

                            PdfPage linkedPage;
                            try
                            {
                                // http://api.itextpdf.com/itext/com/itextpdf/text/pdf/PdfDestination.html
                                linkedPage = pdfDocument.GetPage(linkedPageNumber);
                            }
                            catch (Exception ex)
                            {
                                throw new PdfPageNotFoundException(linkedPageNumber, linkedHtmlToPdfFile.Input, ex);
                            }

                            // jump to the top of the linked page
                            float top = linkedPage.GetPageSize().GetTop();
                            PdfExplicitDestination destination = PdfExplicitDestination.CreateFitH(linkedPage, top);
                            PdfAction newAction = PdfAction.CreateGoTo(destination);

                            linkAnnotation.SetAction(newAction);
                        }
                    }
                }
            }
            catch
            {
                // the original PDF has not been touched yet; remove the temporary file (best effort)
                // so that a failed rewrite does not leave it behind, then surface the original error
                try
                {
                    File.Delete(tempFilePath);
                }
                catch (IOException)
                {
                }
                catch (UnauthorizedAccessException)
                {
                }

                throw;
            }

            // replace the original PDF with the rewritten one
            File.Delete(pdfFilePath);
            File.Move(tempFilePath, pdfFilePath);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Creates the table of contents entry, renders the outline to a temporary HTML file through the
        /// table of contents style sheet, prints that file as a PDF and records the result on the entry.
        /// </summary>
        /// <param name="coverIncluded">if set to <c>true</c> indicates a cover is included in the PDF.</param>
        /// <param name="tableOfContentsIncluded">if set to <c>true</c> indicates a table of contents is included in the PDF.</param>
        /// <param name="outputDottedLinesInTableOfContents">if set to <c>true</c> outputs dotted lines in the table of contents.</param>
        /// <param name="htmlToPdfFiles">The HTML to PDF files. The table of contents entry is added to this collection.</param>
        /// <param name="outlineBuilder">The outline builder.</param>
        /// <param name="defaultTableOfContentsStyleSheetBuilder">The default table of contents style sheet builder.</param>
        /// <param name="pdfPrinter">The PDF printer.</param>
        /// <param name="htmlToPdfOptions">The HTML to PDF options.</param>
        /// <param name="variables">The variables.</param>
        /// <returns>A <see cref="Task"/> representing the asynchronous operation.</returns>
        internal static async Task BuildOutlineAsync(
            bool coverIncluded,
            bool tableOfContentsIncluded,
            bool outputDottedLinesInTableOfContents,
            ConcurrentBag<HtmlToPdfFile> htmlToPdfFiles,
            Action<XmlWriter, IReadOnlyCollection<HtmlToPdfFile>, bool> outlineBuilder,
            Func<bool, string> defaultTableOfContentsStyleSheetBuilder,
            PdfPrinter pdfPrinter,
            HtmlToPdfOptions htmlToPdfOptions,
            Dictionary<string, string> variables)
        {
            // TODO: localization
            const string tocTitle = "Table of Contents";

            // the table of contents follows the cover when there is one, otherwise it comes first
            int tocIndex = 0;
            if (coverIncluded)
            {
                tocIndex = 1;
            }

            // first page of the table of contents = pages of everything placed before it, plus one
            int tocPageNumber = 1 + htmlToPdfFiles
                .Where(file => file.Index < tocIndex)
                .Select(file => file.NumberOfPages)
                .Sum();

            // make room for the table of contents by shifting every later file one position
            foreach (HtmlToPdfFile file in htmlToPdfFiles)
            {
                if (file.Index >= tocIndex)
                {
                    file.Index += 1;
                }
            }

            List<HtmlHeading> tocHeadings = new List<HtmlHeading>
            {
                new HtmlHeading { Level = 0, Page = 0, Text = tocTitle },
                new HtmlHeading { Level = 1, Page = tocPageNumber, Text = tocTitle },
            };

            HtmlToPdfFile tocEntry = new HtmlToPdfFile
            {
                Index = tocIndex,

                // TODO: extract wkhtmltopdf specific details
                Input = Path.Combine(Path.GetTempPath(), "__WKANCHOR_2").ToLower(),
                Title = tocTitle,
                TitleAndHeadings = tocHeadings,
            };

            htmlToPdfFiles.Add(tocEntry);

            using (TempHtmlFile tocHtmlFile = new TempHtmlFile())
            {
                // compile the style sheet that turns the outline XML into HTML
                string styleSheetXml = defaultTableOfContentsStyleSheetBuilder(outputDottedLinesInTableOfContents);
                XslCompiledTransform transform = new XslCompiledTransform();

                using (StringReader styleSheetText = new StringReader(styleSheetXml))
                using (XmlReader styleSheetReader = XmlReader.Create(styleSheetText))
                {
                    transform.Load(styleSheetReader);
                }

                using (MemoryStream outlineStream = new MemoryStream())
                {
                    // the writer has to be disposed before reading so that the outline is flushed to the stream
                    using (XmlWriter outlineWriter = XmlWriter.Create(outlineStream))
                    {
                        outlineBuilder(outlineWriter, htmlToPdfFiles, tableOfContentsIncluded);
                    }

                    // rewind so the outline is read from its start
                    outlineStream.Position = 0;

                    using (XmlReader outlineReader = XmlReader.Create(outlineStream))
                    using (XmlWriter htmlWriter = XmlWriter.Create(tocHtmlFile.FilePath))
                    {
                        transform.Transform(outlineReader, htmlWriter);
                    }
                }

                // print as pdf
                string tocPdfFilePath = await pdfPrinter.PrintAsPdfAsync(
                    tocHtmlFile.FilePath,
                    htmlToPdfOptions,
                    variables,
                    false);

                int tocPageCount = PdfDocument.CountNumberOfPages(tocPdfFilePath);

                tocEntry.PdfFilePath = tocPdfFilePath;
                tocEntry.NumberOfPages = tocPageCount;
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Finds and sets the page numbers of links mapped to HTML headings in the specified PDF file.
        /// </summary>
        /// <remarks>
        /// Only link annotations with a URI action pointing to a local file are considered. The heading is
        /// identified by the <c>headingLevel</c> and <c>headingText</c> query string values of that URI,
        /// and its page is set to the number of the page containing the link.
        /// </remarks>
        /// <param name="htmlToPdfFile">The HTML to PDF file.</param>
        internal static void SetHeadingPageNumbers(HtmlToPdfFile htmlToPdfFile)
        {
            using (PdfReader pdfReader = new PdfReader(htmlToPdfFile.PdfFilePath))
            using (iText.Kernel.Pdf.PdfDocument pdfDocument = new iText.Kernel.Pdf.PdfDocument(pdfReader))
            {
                int pageCount = pdfDocument.GetNumberOfPages();
                for (int i = 1; i <= pageCount; i++)
                {
                    // get page
                    PdfPage pdfPage = pdfDocument.GetPage(i);

                    // get link annotations
                    IEnumerable<PdfLinkAnnotation> linkAnnotations = pdfPage.GetAnnotations().OfType<PdfLinkAnnotation>();
                    foreach (PdfLinkAnnotation linkAnnotation in linkAnnotations)
                    {
                        // only URI actions can carry the heading markers
                        PdfDictionary action = linkAnnotation.GetAction();
                        if (action == null)
                        {
                            continue;
                        }

                        PdfName s = action.GetAsName(PdfName.S);
                        if (s != PdfName.URI)
                        {
                            continue;
                        }

                        // a URI action without a (string) URI entry cannot be resolved
                        PdfString uriPdfString = action.GetAsString(PdfName.URI);
                        if (uriPdfString == null)
                        {
                            continue;
                        }

                        if (!Uri.TryCreate(uriPdfString.GetValue(), UriKind.RelativeOrAbsolute, out Uri uri))
                        {
                            continue;
                        }

                        // Uri.IsFile (and Uri.Query) throw InvalidOperationException for relative URIs,
                        // so relative links have to be filtered out first
                        if (!uri.IsAbsoluteUri || !uri.IsFile)
                        {
                            continue;
                        }

                        // get query string
                        NameValueCollection queryString = HttpUtility.ParseQueryString(uri.Query);

                        // ex. ?headingLevel={level}&headingText
                        string headingLevel = queryString["headingLevel"];
                        if (headingLevel == null)
                        {
                            continue;
                        }

                        if (!int.TryParse(headingLevel, out int level))
                        {
                            continue;
                        }

                        string headingText = queryString["headingText"];
                        if (headingText == null)
                        {
                            continue;
                        }

                        // NOTE(review): SingleOrDefault throws when two headings share the same level and text;
                        // confirm whether duplicate headings can occur in an input
                        HtmlHeading htmlHeading = htmlToPdfFile.TitleAndHeadings.SingleOrDefault(x => (x.Level == level) && (x.Text == headingText));
                        if (htmlHeading == null)
                        {
                            continue;
                        }

                        htmlHeading.Page = i;
                    }
                }
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Converts the HTML files to a PDF.
        /// </summary>
        /// <remarks>
        /// Each input is printed once to count its pages, optionally a table of contents is built, then the
        /// inputs are printed again with their page number offsets. The resulting PDF files are merged into
        /// the output file, file links are turned into internal links, and the temporary PDF files are deleted.
        /// </remarks>
        /// <param name="options">The options.</param>
        /// <param name="logger">The logger.</param>
        /// <returns>A <see cref="Task"/> whose result is the collection of processed HTML to PDF files.</returns>
        /// <exception cref="ApplicationException">
        /// Thrown when no input or no output is specified and the default table of contents style sheet is not being dumped.
        /// </exception>
        /// <exception cref="UpdatePdfLinksException">Thrown when updating the links of the merged PDF fails.</exception>
        public static async Task<ConcurrentBag<HtmlToPdfFile>> ProcessAsync(Options options, ILogger logger)
        {
            DateTime dtNow = DateTime.Now; // local time

            if (!string.IsNullOrEmpty(options.Encoding))
            {
                // the resolved encoding is not used further in this method;
                // resolving it still rejects an unknown encoding name up front
                Encoding.GetEncoding(options.Encoding);
            }

            if (!options.Inputs.Any() &&
                !options.DumpDefaultTocXsl)
            {
                throw new ApplicationException("At least one input must be specified.");
            }

            if (string.IsNullOrEmpty(options.OutputFilePath) &&
                !options.DumpDefaultTocXsl)
            {
                throw new ApplicationException("An output must be specified.");
            }

            if (options.DumpDefaultTocXsl)
            {
                string defaultTocXsl = options.DefaultTableOfContentsStyleSheetBuilder(options.OutputDottedLinesInTableOfContents);
                logger.LogOutput(defaultTocXsl);
            }

            ConcurrentBag<HtmlToPdfFile> htmlToPdfFiles = new ConcurrentBag<HtmlToPdfFile>();

            foreach (string input in options.Inputs)
            {
                logger.LogDebug(input);
            }

            string outputFilePath = options.OutputFilePath;

            options.UserStyleSheet = options.UserStyleSheet?.Trim('"');

            string title = options.Title;

            BrowserDownloader.DownloadBrowser(logger);

            using (TempDirectory tempDirectory = new TempDirectory())
            {
                var launchOptions = new LaunchOptions
                {
                    SlowMo = 0,
                    Headless = true,
                    Timeout = 0,
                    LogProcess = false,
                    EnqueueTransportMessages = true,
                    Devtools = false,
                    WebSocketFactory = WebSocketFactory,
                    UserDataDir = tempDirectory.DirectoryPath,
                    Args = options.AdditionalArguments?.ToArray() ?? new string[] { },
                };

                MarginOptions marginOptions = new MarginOptions
                {
                    Bottom = options.BottomMargin,
                    Left = options.LeftMargin,
                    Right = options.RightMargin,
                    Top = options.TopMargin,
                };

                // declared outside of the retried delegate so that a retry does not print
                // and register the cover a second time
                bool coverAdded = false;

                await Policy
                .Handle<ProcessException>()
                .RetryForeverAsync(onRetry: ex =>
                {
                    // executed before each retry
                    // https://github.com/hardkoded/puppeteer-sharp/issues/1509
                    // ex. PuppeteerSharp.ProcessException: Failed to launch Chromium! [0909/142354.872:FATAL:feature_list.cc(282)] Check failed: !g_initialized_from_accessor.
                    //     (followed by a native backtrace: ovly_debug_event ..., ChromeMain, BaseThreadInitThunk, RtlUserThreadStart)
                    // ex. PuppeteerSharp.ProcessException: Failed to create connection ---> System.TimeoutException: Timeout of 30000 ms exceeded
                    logger.LogWarning(ex.ToString());

                    // NOTE(review): this blocks the calling thread for a second; an asynchronous
                    // retry callback with Task.Delay would avoid that if the Polly version in use offers one
                    Thread.Sleep(1000);
                })
                .ExecuteAsync(async () =>
                {
                    using (Browser browser = await Puppeteer.LaunchAsync(launchOptions))
                    {
                        try
                        {
                            PdfPrinter pdfPrinter = new PdfPrinter(browser, logger);

                            // cover options
                            HtmlToPdfOptions htmlToPdfOptions = new HtmlToPdfOptions
                            {
                                StyleSheet = options.UserStyleSheet,
                                JavascriptDelayInMilliseconds = options.JavascriptDelayInMilliseconds,
                                Landscape = options.Landscape,
                                PaperFormat = options.PaperFormat,
                                Height = options.PageHeight,
                                Width = options.PageWidth,
                                PrintBackground = options.PrintBackground,
                            };

                            if (!string.IsNullOrEmpty(options.Cover) && (!coverAdded))
                            {
                                // print cover
                                string pdfFile = await pdfPrinter.PrintAsPdfAsync(
                                    options.Cover,
                                    htmlToPdfOptions,
                                    null);

                                int numberOfPages = PdfDocument.CountNumberOfPages(pdfFile);

                                logger.LogDebug($"Cover file \"{options.Cover}\" contains number of PDF pages: {numberOfPages}.");

                                HtmlToPdfFile htmlToPdfFile = new HtmlToPdfFile
                                {
                                    Input = options.Cover,
                                    Index = 0,
                                    PdfFilePath = pdfFile,
                                    PrintHeaderAndFooter = false,
                                    NumberOfPages = numberOfPages,
                                };

                                htmlToPdfFiles.Add(htmlToPdfFile);

                                coverAdded = true;
                            }

                            // page options
                            htmlToPdfOptions.MarginOptions = marginOptions;

                            // header
                            htmlToPdfOptions.HeaderTemplateBuilder.Left = options.HeaderLeft;
                            htmlToPdfOptions.HeaderTemplateBuilder.Center = options.HeaderCenter;
                            htmlToPdfOptions.HeaderTemplateBuilder.Right = options.HeaderRight;

                            string headerFontSize = options.HeaderFontSize.AppendUnits("px");

                            htmlToPdfOptions.HeaderTemplateBuilder.FontSize = headerFontSize;
                            htmlToPdfOptions.HeaderTemplateBuilder.FontName = options.HeaderFontName;
                            htmlToPdfOptions.HeaderTemplateBuilder.Html = options.HeaderHtml;

                            // footer
                            htmlToPdfOptions.FooterTemplateBuilder.Left = options.FooterLeft;
                            htmlToPdfOptions.FooterTemplateBuilder.Center = options.FooterCenter;
                            htmlToPdfOptions.FooterTemplateBuilder.Right = options.FooterRight;

                            string footerFontSize = options.FooterFontSize.AppendUnits("px");

                            htmlToPdfOptions.FooterTemplateBuilder.FontSize = footerFontSize;
                            htmlToPdfOptions.FooterTemplateBuilder.FontName = options.FooterFontName;
                            htmlToPdfOptions.FooterTemplateBuilder.Html = options.FooterHtml;

                            // global header/footer variables
                            // https://chromedevtools.github.io/devtools-protocol/tot/Page/#method-printToPDF
                            Dictionary<string, string> variables = new Dictionary<string, string>
                            {
                                { "page", "<span class=\"pageNumber\"></span>" },
                                { "date", dtNow.ToString("d") },     // M/dd/yyyy
                                { "title", "<span class=\"title\"></span>" },
                                { "frompage", (options.PageOffset + 1).ToString() },
                                { "isodate", dtNow.ToString("yyyy-MM-dd") },  // ISO 8601 extended format
                                { "time", dtNow.ToString("h:mm:ss tt") },     // ex. 3:58:45 PM
                                { "doctitle", title },
                            };

                            // count the number of PDF pages each HTML file will be printed as
                            // (inputs already registered, e.g. by a previous attempt, are not printed again)
                            var tasks = options.Inputs
                                        .Where(x => htmlToPdfFiles.All(y => y.Input != x))
                                        .Select(async input =>
                            {
                                // print as pdf
                                // insert an empty page to avoid unexpected margins on the first page, which would affect the page count
                                // https://stackoverflow.com/a/55480268/90287
                                // https://github.com/puppeteer/puppeteer/issues/2592
                                HtmlToPdfOptions tempHtmlToPdfOptions = htmlToPdfOptions.Copy();
                                tempHtmlToPdfOptions.PageOffset = 1;

                                string pdfFile = await pdfPrinter.PrintAsPdfAsync(
                                    input,
                                    tempHtmlToPdfOptions,
                                    variables);

                                // count the number of pages
                                int numberOfPages = PdfDocument.CountNumberOfPages(pdfFile);
                                logger.LogDebug($"\"{input}\" contains number of PDF pages: {numberOfPages}.");

                                HtmlToPdfFile htmlToPdfFile = new HtmlToPdfFile
                                {
                                    Input = input,
                                    Index = options.Inputs.IndexOf(input),
                                    PdfFilePath = pdfFile,
                                    PrintHeaderAndFooter = true,
                                    NumberOfPages = numberOfPages,
                                };

                                htmlToPdfFiles.Add(htmlToPdfFile);
                            });

                            await Task.WhenAll(tasks);

                            variables.Add("topage", htmlToPdfFiles.Sum(x => x.NumberOfPages).ToString());

                            // update models with title and headings
                            List<Task> updateTitleAndHeadingsTasks = new List<Task>();

                            foreach (HtmlToPdfFile htmlToPdfFile in htmlToPdfFiles)
                            {
                                updateTitleAndHeadingsTasks.Add(Task.Run(() =>
                                {
                                    // set the title and headings
                                    HtmlFileParser htmlFileParser = new HtmlFileParser(htmlToPdfFile.Input);
                                    htmlToPdfFile.TitleAndHeadings = htmlFileParser.GetTitleAndHeadings(options.AddTableOfContents);
                                    htmlToPdfFile.Title = htmlToPdfFile.TitleAndHeadings.First().Text;
                                }));
                            }

                            await Task.WhenAll(updateTitleAndHeadingsTasks);

                            // create table of contents
                            if (options.AddTableOfContents)
                            {
                                await PdfOutlineBuilder.BuildOutlineAsync(
                                    coverAdded,
                                    options.AddTableOfContents,
                                    options.OutputDottedLinesInTableOfContents,
                                    htmlToPdfFiles,
                                    options.OutlineBuilder,
                                    options.DefaultTableOfContentsStyleSheetBuilder,
                                    pdfPrinter,
                                    htmlToPdfOptions,
                                    variables);
                            }

                            // when no title was given, fall back to the title of the first document;
                            // this is resolved before any file is re-printed so that every file sees the
                            // same "doctitle" value and the shared dictionary is not written concurrently
                            foreach (HtmlToPdfFile firstFile in htmlToPdfFiles.Where(x => x.Index == 0))
                            {
                                if (string.IsNullOrEmpty(title))
                                {
                                    // set the PDF title
                                    title = firstFile.Title;
                                    variables["doctitle"] = title;
                                }
                            }

                            // update models and re-print HTML files to include headers/footers with page numbers
                            tasks = htmlToPdfFiles.Select(async htmlToPdfFile =>
                            {
                                // sum the number of pages in previous documents to get the current page number offset
                                int currentPageNumber = htmlToPdfFiles
                                                        .Where(x => x.Index < htmlToPdfFile.Index)
                                                        .Sum(x => x.NumberOfPages) + 1;

                                if ((currentPageNumber + htmlToPdfFile.NumberOfPages) <= (options.PageOffset + 1))
                                {
                                    logger.LogDebug($"Skipping printing {htmlToPdfFile.Input}");
                                    htmlToPdfFile.Skip = true;
                                    return;
                                }

                                // print as pdf with page number offset
                                htmlToPdfFile.OutputPdfFilePageNumber = currentPageNumber;

                                logger.LogDebug($"'{htmlToPdfFile.Input}' mapped to output PDF file page number {currentPageNumber}.");

                                // the print tasks are started together, so each one works on its own copy of the
                                // options instead of overwriting the page offsets of a shared instance
                                HtmlToPdfOptions fileHtmlToPdfOptions = htmlToPdfOptions.Copy();
                                fileHtmlToPdfOptions.PageOffset = currentPageNumber - 1;
                                fileHtmlToPdfOptions.PageNumberOffset = options.PageOffset;
                                fileHtmlToPdfOptions.NumberOfPages = htmlToPdfFile.NumberOfPages;

                                // TODO: only print as PDF again if topage variable is actually used in the header/footer
                                if (htmlToPdfFile.PrintHeaderAndFooter)
                                {
                                    // delete previously created PDF file
                                    File.Delete(htmlToPdfFile.PdfFilePath);

                                    // print as pdf
                                    string pdfFile = await pdfPrinter.PrintAsPdfAsync(
                                        htmlToPdfFile.Input,
                                        fileHtmlToPdfOptions,
                                        variables);

                                    htmlToPdfFile.PdfFilePath = pdfFile;
                                }

                                // parse PDF to get heading page numbers
                                PdfDocument.SetHeadingPageNumbers(htmlToPdfFile);
                            });

                            await Task.WhenAll(tasks);
                        }
                        finally
                        {
                            await browser.CloseAsync();
                        }
                    }
                });
            }

            // merge pdf files
            List<string> pdfFilesToMerge = htmlToPdfFiles
                                            .Where(x => !x.Skip)
                                            .OrderBy(x => x.OutputPdfFilePageNumber)
                                            .Select(x => x.PdfFilePath)
                                            .ToList();

            if (!string.IsNullOrEmpty(outputFilePath))
            {
                byte[] mergedBytes = PdfMerger.Merge(pdfFilesToMerge);

                File.WriteAllBytes(outputFilePath, mergedBytes);

                try
                {
                    // update external file links to internal document links
                    PdfDocument.UpdateLinks(outputFilePath, htmlToPdfFiles, logger);
                }
                catch (Exception ex)
                {
                    throw new UpdatePdfLinksException(outputFilePath, htmlToPdfFiles, ex);
                }

                PdfDocument.SetTitle(outputFilePath, title);
            }

            // delete temporary PDF files
            var deleteTempFileTasks = htmlToPdfFiles
                                      .Where(x => !string.IsNullOrEmpty(x.PdfFilePath))
                                      .Select(async input =>
            {
                await Task.Factory.StartNew(() =>
                {
                    if (File.Exists(input.PdfFilePath))
                    {
                        File.Delete(input.PdfFilePath);
                    }
                });
            });

            await Task.WhenAll(deleteTempFileTasks);

            return htmlToPdfFiles;
        }