/// <summary>
/// Rewrites the link annotations of the specified PDF file so that links pointing at
/// one of the printed HTML files jump to the corresponding page inside the PDF instead.
/// The PDF is rewritten through a temporary file which then replaces the original.
/// </summary>
/// <param name="pdfFilePath">The PDF file path.</param>
/// <param name="htmlToPdfFiles">The HTML to PDF files.</param>
/// <param name="logger">The logger.</param>
/// <exception cref="PdfPageNotFoundException">
/// Thrown when the page number recorded for a linked HTML file cannot be retrieved from the PDF.
/// </exception>
internal static void UpdateLinks(
    string pdfFilePath,
    IReadOnlyCollection<HtmlToPdfFile> htmlToPdfFiles,
    ILogger logger)
{
    string tempFilePath = Path.GetTempFileName();

    try
    {
        using (PdfReader pdfReader = new PdfReader(pdfFilePath))
        using (PdfWriter pdfWriter = new PdfWriter(tempFilePath))
        using (iText.Kernel.Pdf.PdfDocument pdfDocument = new iText.Kernel.Pdf.PdfDocument(pdfReader, pdfWriter))
        {
            int pageCount = pdfDocument.GetNumberOfPages();
            for (int i = 1; i <= pageCount; i++)
            {
                // get page
                PdfPage pdfPage = pdfDocument.GetPage(i);

                // get link annotations
                IEnumerable<PdfLinkAnnotation> linkAnnotations = pdfPage.GetAnnotations().OfType<PdfLinkAnnotation>();
                foreach (PdfLinkAnnotation linkAnnotation in linkAnnotations)
                {
                    // get action
                    PdfDictionary action = linkAnnotation.GetAction();
                    if (action == null)
                    {
                        continue;
                    }

                    // only URI actions are of interest; Equals is used so that a name instance
                    // other than the shared PdfName.URI object still matches (also covers null)
                    PdfName s = action.GetAsName(PdfName.S);
                    if (!PdfName.URI.Equals(s))
                    {
                        continue;
                    }

                    // a URI action without a string URI entry cannot be resolved
                    PdfString uriPdfString = action.GetAsString(PdfName.URI);
                    if (uriPdfString == null)
                    {
                        continue;
                    }

                    if (!Uri.TryCreate(uriPdfString.GetValue(), UriKind.RelativeOrAbsolute, out Uri uri))
                    {
                        continue;
                    }

                    // Uri.IsFile throws for relative URIs, so check for an absolute URI first
                    if (!uri.IsAbsoluteUri || !uri.IsFile)
                    {
                        continue;
                    }

                    // lower-cased only to keep the log output unchanged; the lookup below ignores case
                    string htmlFilePath = uri.LocalPath.ToLower();

                    // one case-insensitive lookup; still throws if several inputs match the same path
                    HtmlToPdfFile linkedHtmlToPdfFile = htmlToPdfFiles.SingleOrDefault(
                        x => string.Equals(x.Input, htmlFilePath, StringComparison.OrdinalIgnoreCase));
                    if (linkedHtmlToPdfFile == null)
                    {
                        // ex. when printing PDF from TOC.html by itself
                        logger.LogDebug($"Could not find '{htmlFilePath}'. Referenced in '{pdfFilePath}' on page {i}.");
                        continue;
                    }

                    int linkedPageNumber = linkedHtmlToPdfFile.OutputPdfFilePageNumber;

                    PdfPage linkedPage;
                    try
                    {
                        // http://api.itextpdf.com/itext/com/itextpdf/text/pdf/PdfDestination.html
                        linkedPage = pdfDocument.GetPage(linkedPageNumber);
                    }
                    catch (Exception ex)
                    {
                        throw new PdfPageNotFoundException(linkedPageNumber, linkedHtmlToPdfFile.Input, ex);
                    }

                    // replace the URI action with a go-to action targeting the top of the linked page
                    float top = linkedPage.GetPageSize().GetTop();
                    PdfExplicitDestination destination = PdfExplicitDestination.CreateFitH(linkedPage, top);
                    PdfAction newAction = PdfAction.CreateGoTo(destination);
                    linkAnnotation.SetAction(newAction);
                }
            }
        }
    }
    catch
    {
        // best-effort cleanup: do not leave the partially written temporary file behind
        try
        {
            File.Delete(tempFilePath);
        }
        catch (IOException)
        {
        }
        catch (UnauthorizedAccessException)
        {
        }

        throw;
    }

    // replace the original PDF with the rewritten one
    File.Delete(pdfFilePath);
    File.Move(tempFilePath, pdfFilePath);
}
/// <summary>
/// Inserts a table of contents entry into the collection of files, renders the outline
/// to a temporary HTML file through the table of contents style sheet and prints it as a PDF.
/// </summary>
/// <param name="coverIncluded">if set to <c>true</c> indicates a cover is included in the PDF.</param>
/// <param name="tableOfContentsIncluded">if set to <c>true</c> indicates a table of contents is included in the PDF.</param>
/// <param name="outputDottedLinesInTableOfContents">if set to <c>true</c> outputs dotted lines in the table of contents.</param>
/// <param name="htmlToPdfFiles">The HTML to PDF files. The table of contents entry is added to this collection.</param>
/// <param name="outlineBuilder">The outline builder.</param>
/// <param name="defaultTableOfContentsStyleSheetBuilder">The default table of contents style sheet builder.</param>
/// <param name="pdfPrinter">The PDF printer.</param>
/// <param name="htmlToPdfOptions">The HTML to PDF options.</param>
/// <param name="variables">The variables.</param>
/// <returns>A <see cref="Task"/> representing the asynchronous operation.</returns>
internal static async Task BuildOutlineAsync(
    bool coverIncluded,
    bool tableOfContentsIncluded,
    bool outputDottedLinesInTableOfContents,
    ConcurrentBag<HtmlToPdfFile> htmlToPdfFiles,
    Action<XmlWriter, IReadOnlyCollection<HtmlToPdfFile>, bool> outlineBuilder,
    Func<bool, string> defaultTableOfContentsStyleSheetBuilder,
    PdfPrinter pdfPrinter,
    HtmlToPdfOptions htmlToPdfOptions,
    Dictionary<string, string> variables)
{
    // the table of contents goes right after the cover when there is one, otherwise first
    int tocIndex;
    if (coverIncluded)
    {
        tocIndex = 1;
    }
    else
    {
        tocIndex = 0;
    }

    // pages of everything placed before the table of contents, plus one
    int tocPageNumber = htmlToPdfFiles.Sum(x => x.Index < tocIndex ? x.NumberOfPages : 0) + 1;

    // make room for the table of contents by shifting the files at or after its index
    foreach (HtmlToPdfFile existingFile in htmlToPdfFiles)
    {
        if (existingFile.Index >= tocIndex)
        {
            existingFile.Index += 1;
        }
    }

    HtmlHeading tocTitle = new HtmlHeading
    {
        Level = 0,
        Page = 0,
        Text = "Table of Contents",
    };

    HtmlHeading tocHeading = new HtmlHeading
    {
        Level = 1,
        Page = tocPageNumber,
        Text = "Table of Contents",
    };

    HtmlToPdfFile tocHtmlToPdfFile = new HtmlToPdfFile
    {
        Index = tocIndex,

        // TODO: extract wkhtmltopdf specific details
        Input = Path.Combine(Path.GetTempPath(), "__WKANCHOR_2").ToLower(),

        // TODO: localization
        Title = "Table of Contents",
        TitleAndHeadings = new List<HtmlHeading> { tocTitle, tocHeading },
    };
    htmlToPdfFiles.Add(tocHtmlToPdfFile);

    using (TempHtmlFile tocHtmlFile = new TempHtmlFile())
    {
        string styleSheetXml = defaultTableOfContentsStyleSheetBuilder(outputDottedLinesInTableOfContents);

        using (StringReader styleSheetTextReader = new StringReader(styleSheetXml))
        using (XmlReader styleSheetXmlReader = XmlReader.Create(styleSheetTextReader))
        {
            XslCompiledTransform transform = new XslCompiledTransform();
            transform.Load(styleSheetXmlReader);

            using (MemoryStream outlineStream = new MemoryStream())
            {
                // the writer is disposed before the stream is read back so its output is complete
                using (XmlWriter outlineWriter = XmlWriter.Create(outlineStream))
                {
                    outlineBuilder(outlineWriter, htmlToPdfFiles, tableOfContentsIncluded);
                }

                // rewind to read the outline from the beginning
                outlineStream.Seek(0, SeekOrigin.Begin);

                using (XmlReader outlineReader = XmlReader.Create(outlineStream))
                using (XmlWriter htmlWriter = XmlWriter.Create(tocHtmlFile.FilePath))
                {
                    transform.Transform(outlineReader, htmlWriter);
                }
            }
        }

        // print the generated table of contents as a PDF and record the result
        string tocPdfFilePath = await pdfPrinter.PrintAsPdfAsync(
            tocHtmlFile.FilePath,
            htmlToPdfOptions,
            variables,
            false);

        tocHtmlToPdfFile.PdfFilePath = tocPdfFilePath;
        tocHtmlToPdfFile.NumberOfPages = PdfDocument.CountNumberOfPages(tocPdfFilePath);
    }
}
/// <summary>
/// Finds and sets the page numbers of links mapped to HTML headings in the specified PDF file.
/// </summary>
/// <remarks>
/// Each page's link annotations are scanned for file URIs carrying <c>headingLevel</c> and
/// <c>headingText</c> query string values. When a heading with that level and text exists in
/// <see cref="HtmlToPdfFile.TitleAndHeadings"/>, its page is set to the page the link was found on.
/// </remarks>
/// <param name="htmlToPdfFile">The HTML to PDF file.</param>
internal static void SetHeadingPageNumbers(HtmlToPdfFile htmlToPdfFile)
{
    using (PdfReader pdfReader = new PdfReader(htmlToPdfFile.PdfFilePath))
    using (iText.Kernel.Pdf.PdfDocument pdfDocument = new iText.Kernel.Pdf.PdfDocument(pdfReader))
    {
        int pageCount = pdfDocument.GetNumberOfPages();
        for (int i = 1; i <= pageCount; i++)
        {
            // get page
            PdfPage pdfPage = pdfDocument.GetPage(i);

            // get link annotations
            IEnumerable<PdfLinkAnnotation> linkAnnotations = pdfPage.GetAnnotations().OfType<PdfLinkAnnotation>();
            foreach (PdfLinkAnnotation linkAnnotation in linkAnnotations)
            {
                // get action
                PdfDictionary action = linkAnnotation.GetAction();
                if (action == null)
                {
                    continue;
                }

                // only URI actions are of interest; Equals is used so that a name instance
                // other than the shared PdfName.URI object still matches (also covers null)
                PdfName s = action.GetAsName(PdfName.S);
                if (!PdfName.URI.Equals(s))
                {
                    continue;
                }

                // a URI action without a string URI entry cannot be resolved
                PdfString uriPdfString = action.GetAsString(PdfName.URI);
                if (uriPdfString == null)
                {
                    continue;
                }

                if (!Uri.TryCreate(uriPdfString.GetValue(), UriKind.RelativeOrAbsolute, out Uri uri))
                {
                    continue;
                }

                // Uri.IsFile throws for relative URIs, so check for an absolute URI first
                if (!uri.IsAbsoluteUri || !uri.IsFile)
                {
                    continue;
                }

                // get query string
                NameValueCollection queryString = HttpUtility.ParseQueryString(uri.Query);

                // ex. ?headingLevel={level}&headingText
                string headingLevel = queryString["headingLevel"];
                if (headingLevel == null)
                {
                    continue;
                }

                if (!int.TryParse(headingLevel, out int level))
                {
                    continue;
                }

                string headingText = queryString["headingText"];
                if (headingText == null)
                {
                    continue;
                }

                // NOTE(review): SingleOrDefault throws when more than one heading shares the same
                // level and text; confirm whether duplicate headings can occur in one file.
                HtmlHeading htmlHeading = htmlToPdfFile.TitleAndHeadings
                    .SingleOrDefault(x => (x.Level == level) && (x.Text == headingText));
                if (htmlHeading == null)
                {
                    continue;
                }

                htmlHeading.Page = i;
            }
        }
    }
}
/// <summary>
/// Converts the HTML files to a PDF.
/// </summary>
/// <remarks>
/// Each input is printed once to count its pages, optionally a table of contents is built,
/// then the files are printed again with their page number offsets, merged into the output
/// file and the links between them are updated. Temporary PDF files are deleted at the end.
/// </remarks>
/// <param name="options">The options.</param>
/// <param name="logger">The logger.</param>
/// <returns>A <see cref="Task"/> representing the asynchronous operation.</returns>
/// <exception cref="ApplicationException">
/// Thrown when no input or no output is specified and the default table of contents
/// style sheet is not being dumped.
/// </exception>
/// <exception cref="UpdatePdfLinksException">Thrown when updating the links of the merged PDF fails.</exception>
public static async Task<ConcurrentBag<HtmlToPdfFile>> ProcessAsync(Options options, ILogger logger)
{
    DateTime dtNow = DateTime.Now; // local time

    // NOTE: the resolved encoding is not used further in this method; the lookup still
    // throws for an unknown encoding name.
    Encoding encoding = Encoding.Default;
    if (!string.IsNullOrEmpty(options.Encoding))
    {
        encoding = Encoding.GetEncoding(options.Encoding);
    }

    if (!options.Inputs.Any() && !options.DumpDefaultTocXsl)
    {
        throw new ApplicationException("At least one input must be specified.");
    }

    if (string.IsNullOrEmpty(options.OutputFilePath) && !options.DumpDefaultTocXsl)
    {
        throw new ApplicationException("An output must be specified.");
    }

    if (options.DumpDefaultTocXsl)
    {
        string defaultTocXsl = options.DefaultTableOfContentsStyleSheetBuilder(options.OutputDottedLinesInTableOfContents);
        logger.LogOutput(defaultTocXsl);
    }

    ConcurrentBag<HtmlToPdfFile> htmlToPdfFiles = new ConcurrentBag<HtmlToPdfFile>();

    foreach (string input in options.Inputs)
    {
        logger.LogDebug(input);
    }

    string outputFilePath = options.OutputFilePath;
    options.UserStyleSheet = options.UserStyleSheet?.Trim('"');
    string title = options.Title;

    BrowserDownloader.DownloadBrowser(logger);

    using (TempDirectory tempDirectory = new TempDirectory())
    {
        var launchOptions = new LaunchOptions
        {
            SlowMo = 0,
            Headless = true,
            Timeout = 0,
            LogProcess = false,
            EnqueueTransportMessages = true,
            Devtools = false,
            WebSocketFactory = WebSocketFactory,
            UserDataDir = tempDirectory.DirectoryPath,
            Args = options.AdditionalArguments?.ToArray() ?? new string[] { },
        };

        MarginOptions marginOptions = new MarginOptions
        {
            Bottom = options.BottomMargin,
            Left = options.LeftMargin,
            Right = options.RightMargin,
            Top = options.TopMargin,
        };

        // declared outside the retried delegate so a retry does not print and add the cover again
        bool coverAdded = false;

        await Policy
            .Handle<ProcessException>()
            .RetryForeverAsync(onRetry: ex =>
            {
                // executed before each retry
                // https://github.com/hardkoded/puppeteer-sharp/issues/1509
                // ex. PuppeteerSharp.ProcessException: Failed to launch Chromium! [0909/142354.872:FATAL:feature_list.cc(282)] Check failed: !g_initialized_from_accessor.
                // ex. PuppeteerSharp.ProcessException: Failed to create connection ---> System.TimeoutException: Timeout of 30000 ms exceeded
                logger.LogWarning(ex.ToString());
                Thread.Sleep(1000);
            })
            .ExecuteAsync(async () =>
            {
                using (Browser browser = await Puppeteer.LaunchAsync(launchOptions))
                {
                    try
                    {
                        PdfPrinter pdfPrinter = new PdfPrinter(browser, logger);

                        // cover options
                        HtmlToPdfOptions htmlToPdfOptions = new HtmlToPdfOptions
                        {
                            StyleSheet = options.UserStyleSheet,
                            JavascriptDelayInMilliseconds = options.JavascriptDelayInMilliseconds,
                            Landscape = options.Landscape,
                            PaperFormat = options.PaperFormat,
                            Height = options.PageHeight,
                            Width = options.PageWidth,
                            PrintBackground = options.PrintBackground,
                        };

                        if (!string.IsNullOrEmpty(options.Cover) && (!coverAdded))
                        {
                            // print cover
                            string pdfFile = await pdfPrinter.PrintAsPdfAsync(
                                options.Cover,
                                htmlToPdfOptions,
                                null);

                            int numberOfPages = PdfDocument.CountNumberOfPages(pdfFile);

                            logger.LogDebug($"Cover file \"{options.Cover}\" contains number of PDF pages: {numberOfPages}.");

                            HtmlToPdfFile htmlToPdfFile = new HtmlToPdfFile
                            {
                                Input = options.Cover,
                                Index = 0,
                                PdfFilePath = pdfFile,
                                PrintHeaderAndFooter = false,
                                NumberOfPages = numberOfPages,
                            };

                            htmlToPdfFiles.Add(htmlToPdfFile);

                            coverAdded = true;
                        }

                        // page options
                        htmlToPdfOptions.MarginOptions = marginOptions;

                        // header
                        htmlToPdfOptions.HeaderTemplateBuilder.Left = options.HeaderLeft;
                        htmlToPdfOptions.HeaderTemplateBuilder.Center = options.HeaderCenter;
                        htmlToPdfOptions.HeaderTemplateBuilder.Right = options.HeaderRight;
                        string headerFontSize = options.HeaderFontSize.AppendUnits("px");
                        htmlToPdfOptions.HeaderTemplateBuilder.FontSize = headerFontSize;
                        htmlToPdfOptions.HeaderTemplateBuilder.FontName = options.HeaderFontName;
                        htmlToPdfOptions.HeaderTemplateBuilder.Html = options.HeaderHtml;

                        // footer
                        htmlToPdfOptions.FooterTemplateBuilder.Left = options.FooterLeft;
                        htmlToPdfOptions.FooterTemplateBuilder.Center = options.FooterCenter;
                        htmlToPdfOptions.FooterTemplateBuilder.Right = options.FooterRight;
                        string footerFontSize = options.FooterFontSize.AppendUnits("px");
                        htmlToPdfOptions.FooterTemplateBuilder.FontSize = footerFontSize;
                        htmlToPdfOptions.FooterTemplateBuilder.FontName = options.FooterFontName;
                        htmlToPdfOptions.FooterTemplateBuilder.Html = options.FooterHtml;

                        // global header/footer variables
                        // https://chromedevtools.github.io/devtools-protocol/tot/Page/#method-printToPDF
                        Dictionary<string, string> variables = new Dictionary<string, string>
                        {
                            { "page", "<span class=\"pageNumber\"></span>" },
                            { "date", dtNow.ToString("d") }, // M/dd/yyyy
                            { "title", "<span class=\"title\"></span>" },
                            { "frompage", (options.PageOffset + 1).ToString() },
                            { "isodate", dtNow.ToString("yyyy-MM-dd") }, // ISO 8601 extended format
                            { "time", dtNow.ToString("h:mm:ss tt") }, // ex. 3:58:45 PM
                            { "doctitle", title },
                        };

                        // count the number of PDF pages each HTML file will be printed as
                        // (inputs already present in the collection are not printed again)
                        var tasks = options.Inputs
                            .Where(x => htmlToPdfFiles.All(y => y.Input != x))
                            .Select(async input =>
                            {
                                // print as pdf
                                // insert an empty page to avoid unexpected margins on the first page, which would affect the page count
                                // https://stackoverflow.com/a/55480268/90287
                                // https://github.com/puppeteer/puppeteer/issues/2592
                                HtmlToPdfOptions tempHtmlToPdfOptions = htmlToPdfOptions.Copy();
                                tempHtmlToPdfOptions.PageOffset = 1;

                                string pdfFile = await pdfPrinter.PrintAsPdfAsync(
                                    input,
                                    tempHtmlToPdfOptions,
                                    variables);

                                // count the number of pages
                                int numberOfPages = PdfDocument.CountNumberOfPages(pdfFile);

                                logger.LogDebug($"\"{input}\" contains number of PDF pages: {numberOfPages}.");

                                HtmlToPdfFile htmlToPdfFile = new HtmlToPdfFile
                                {
                                    Input = input,
                                    Index = options.Inputs.IndexOf(input),
                                    PdfFilePath = pdfFile,
                                    PrintHeaderAndFooter = true,
                                    NumberOfPages = numberOfPages,
                                };

                                htmlToPdfFiles.Add(htmlToPdfFile);
                            });

                        await Task.WhenAll(tasks);

                        variables.Add("topage", htmlToPdfFiles.Sum(x => x.NumberOfPages).ToString());

                        // update models with title and headings
                        List<Task> updateTitleAndHeadingsTasks = new List<Task>();
                        foreach (HtmlToPdfFile htmlToPdfFile in htmlToPdfFiles)
                        {
                            updateTitleAndHeadingsTasks.Add(Task.Run(() =>
                            {
                                // set the title and headings
                                HtmlFileParser htmlFileParser = new HtmlFileParser(htmlToPdfFile.Input);
                                htmlToPdfFile.TitleAndHeadings = htmlFileParser.GetTitleAndHeadings(options.AddTableOfContents);
                                htmlToPdfFile.Title = htmlToPdfFile.TitleAndHeadings.First().Text;
                            }));
                        }

                        await Task.WhenAll(updateTitleAndHeadingsTasks);

                        // create table of contents
                        if (options.AddTableOfContents)
                        {
                            await PdfOutlineBuilder.BuildOutlineAsync(
                                coverAdded,
                                options.AddTableOfContents,
                                options.OutputDottedLinesInTableOfContents,
                                htmlToPdfFiles,
                                options.OutlineBuilder,
                                options.DefaultTableOfContentsStyleSheetBuilder,
                                pdfPrinter,
                                htmlToPdfOptions,
                                variables);
                        }

                        // set the PDF title before the files are printed concurrently, so every file
                        // sees the same "doctitle" and the dictionary is not written while being read
                        if (string.IsNullOrEmpty(title))
                        {
                            HtmlToPdfFile firstHtmlToPdfFile = htmlToPdfFiles.FirstOrDefault(x => x.Index == 0);
                            if (firstHtmlToPdfFile != null)
                            {
                                title = firstHtmlToPdfFile.Title;
                                variables["doctitle"] = title;
                            }
                        }

                        // update models and re-print HTML files to include headers/footers with page numbers
                        tasks = htmlToPdfFiles.Select(async htmlToPdfFile =>
                        {
                            // sum the number of pages in previous documents to get the current page number offset
                            int currentPageNumber = htmlToPdfFiles
                                .Where(x => x.Index < htmlToPdfFile.Index)
                                .Sum(x => x.NumberOfPages) + 1;

                            if ((currentPageNumber + htmlToPdfFile.NumberOfPages) <= (options.PageOffset + 1))
                            {
                                logger.LogDebug($"Skipping printing {htmlToPdfFile.Input}");
                                htmlToPdfFile.Skip = true;
                                return;
                            }

                            // print as pdf with page number offset
                            htmlToPdfFile.OutputPdfFilePageNumber = currentPageNumber;
                            logger.LogDebug($"'{htmlToPdfFile.Input}' mapped to output PDF file page number {currentPageNumber}.");

                            // each file gets its own copy of the options: these lambdas run concurrently,
                            // so writing the per-file values to the shared instance would let one file's
                            // print observe another file's offsets
                            HtmlToPdfOptions fileHtmlToPdfOptions = htmlToPdfOptions.Copy();
                            fileHtmlToPdfOptions.PageOffset = currentPageNumber - 1;
                            fileHtmlToPdfOptions.PageNumberOffset = options.PageOffset;
                            fileHtmlToPdfOptions.NumberOfPages = htmlToPdfFile.NumberOfPages;

                            // TODO: only print as PDF again if topage variable is actually used in the header/footer
                            if (htmlToPdfFile.PrintHeaderAndFooter)
                            {
                                // delete previously created PDF file
                                File.Delete(htmlToPdfFile.PdfFilePath);

                                // print as pdf
                                string pdfFile = await pdfPrinter.PrintAsPdfAsync(
                                    htmlToPdfFile.Input,
                                    fileHtmlToPdfOptions,
                                    variables);

                                htmlToPdfFile.PdfFilePath = pdfFile;
                            }

                            // parse PDF to get heading page numbers
                            PdfDocument.SetHeadingPageNumbers(htmlToPdfFile);
                        });

                        await Task.WhenAll(tasks);
                    }
                    finally
                    {
                        await browser.CloseAsync();
                    }
                }
            });
    }

    // merge pdf files
    List<string> pdfFilesToMerge = htmlToPdfFiles
        .Where(x => !x.Skip)
        .OrderBy(x => x.OutputPdfFilePageNumber)
        .Select(x => x.PdfFilePath)
        .ToList();

    if (!string.IsNullOrEmpty(outputFilePath))
    {
        byte[] mergedBytes = PdfMerger.Merge(pdfFilesToMerge);

        File.WriteAllBytes(outputFilePath, mergedBytes);

        try
        {
            // update external file links to internal document links
            PdfDocument.UpdateLinks(outputFilePath, htmlToPdfFiles, logger);
        }
        catch (Exception ex)
        {
            throw new UpdatePdfLinksException(outputFilePath, htmlToPdfFiles, ex);
        }

        PdfDocument.SetTitle(outputFilePath, title);
    }

    // delete temporary PDF files
    var deleteTempFileTasks = htmlToPdfFiles
        .Where(x => !string.IsNullOrEmpty(x.PdfFilePath))
        .Select(async input =>
        {
            await Task.Factory.StartNew(() =>
            {
                if (File.Exists(input.PdfFilePath))
                {
                    File.Delete(input.PdfFilePath);
                }
            });
        });

    await Task.WhenAll(deleteTempFileTasks);

    return htmlToPdfFiles;
}