/// <summary>
/// Creates a metadata stream object for the document and points the document catalog's
/// "/Metadata" entry at it.
/// </summary>
/// <param name="document">The PDF document that receives the metadata object.</param>
/// <param name="compat">The compatibility mode; converted to a conformance pair via GetConformance.</param>
public static void CreateXmpMetadata(PdfDocument document, PdfCompat compat)
{
    // Dictionary entries that mark the object as an XML metadata stream
    var xmpObject = new PdfDictionary(document)
    {
        Elements =
        {
            ["/Type"] = new PdfName("/Metadata"),
            ["/Subtype"] = new PdfName("/XML")
        }
    };

    // The stream content is generated from the document info and the conformance pair
    var rawXmp = CreateRawXmpMetadata(document.Info, GetConformance(compat));
    xmpObject.CreateStream(rawXmp);

    // Register the object with the document first, then reference it from the catalog
    var internals = document.Internals;
    internals.AddObject(xmpObject);
    internals.Catalog.Elements["/Metadata"] = xmpObject.Reference;
}
/// <summary>
/// Resizes the page to the size reported by GetRealSize for the image and draws the image
/// over the whole page, starting at the origin.
/// </summary>
/// <param name="page">The page to resize and draw on.</param>
/// <param name="img">The image to draw. Its Interpolate flag is cleared for non-default compat modes.</param>
/// <param name="compat">The compatibility mode in effect.</param>
private static void DrawImageOnPage(PdfPage page, XImage img, PdfCompat compat)
{
    bool isDefaultCompat = compat == PdfCompat.Default;
    if (!isDefaultCompat)
    {
        // Only ever switched off here; the default mode leaves the flag untouched
        img.Interpolate = false;
    }

    // The page takes on exactly the dimensions computed for the image
    Size imageSize = GetRealSize(img);
    page.Width = imageSize.Width;
    page.Height = imageSize.Height;

    using (XGraphics canvas = XGraphics.FromPdfPage(page))
    {
        canvas.DrawImage(img, 0, 0, imageSize.Width, imageSize.Height);
    }
}
/// <summary>
/// Maps a compatibility mode to a pair of strings, e.g. PdfA1B maps to ("1", "B") and
/// PdfA3U maps to ("3", "U").
/// </summary>
/// <param name="compat">The compatibility mode to convert.</param>
/// <returns>The pair for the given mode, or two empty strings for any other value.</returns>
private static (string, string) GetConformance(PdfCompat compat)
{
    // Start from the fallback pair and overwrite it for the modes that are recognised
    string first = "";
    string second = "";

    if (compat == PdfCompat.PdfA1B)
    {
        first = "1";
        second = "B";
    }
    else if (compat == PdfCompat.PdfA2B)
    {
        first = "2";
        second = "B";
    }
    else if (compat == PdfCompat.PdfA3B)
    {
        first = "3";
        second = "B";
    }
    else if (compat == PdfCompat.PdfA3U)
    {
        first = "3";
        second = "U";
    }

    return (first, second);
}
/// <summary>
/// Adds one PDF page per image to the document and runs OCR on each page image, drawing the
/// OCR result onto the page. Imported PDF pages that already contain text are copied as-is
/// and are not sent to OCR.
/// </summary>
/// <param name="progressCallback">
/// Called with the number of pages whose OCR step has finished. Returning false cancels the operation.
/// </param>
/// <param name="document">The PDF document the pages are added to.</param>
/// <param name="compat">The compatibility mode passed on to DrawImageOnPage.</param>
/// <param name="images">The images to export, in page order.</param>
/// <param name="ocrLanguageCode">The language code handed to the OCR engine.</param>
/// <returns>The value returned by the final <paramref name="progressCallback"/> call.</returns>
private bool BuildDocumentWithOcr(Func<int, bool> progressCallback, PdfDocument document, PdfCompat compat, IEnumerable<ScannedImage> images, string ocrLanguageCode)
{
    // Use a pipeline so that multiple pages/images can be processed in parallel
    // Note: No locks needed on the document because the design of the pipeline ensures no two threads will work on it at once

    int progress = 0;
    Pipeline.For(images).Step(image =>
    {
        // Step 1: Load the image into memory, draw it on a new PDF page, and save a copy of the processed image to disk for OCR

        if (!progressCallback(progress))
        {
            return null;
        }

        bool importedPdfPassThrough = image.FileFormat == null && !image.RecoveryIndexImage.TransformList.Any();

        PdfPage page;
        if (importedPdfPassThrough)
        {
            page = CopyPdfPageToDoc(document, image);

            // Scan through the page looking for text
            var elements = page.Contents.Elements;
            for (int i = 0; i < elements.Count; i++)
            {
                string textAndFormatting = elements.GetDictionary(i).Stream.ToString();
                using (var reader = new StringReader(textAndFormatting))
                {
                    bool inTextBlock = false;
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        // The operators are fixed tokens, not natural-language text, so they are
                        // compared ordinally rather than with the current culture's rules.
                        if (line.EndsWith("BT", StringComparison.Ordinal))
                        {
                            inTextBlock = true;
                        }
                        else if (line.EndsWith("ET", StringComparison.Ordinal))
                        {
                            inTextBlock = false;
                        }
                        else if (inTextBlock &&
                                 (line.EndsWith("TJ", StringComparison.Ordinal)
                                  || line.EndsWith("Tj", StringComparison.Ordinal)
                                  || line.EndsWith("\"", StringComparison.Ordinal)
                                  || line.EndsWith("'", StringComparison.Ordinal)))
                        {
                            // Text-showing operators
                            // Since this page already contains text, don't use OCR
                            return null;
                        }
                    }
                }
            }
        }
        else
        {
            page = document.AddPage();
        }

        using (Stream stream = scannedImageRenderer.RenderToStream(image))
        using (var img = XImage.FromStream(stream))
        {
            if (!progressCallback(progress))
            {
                return null;
            }

            if (!importedPdfPassThrough)
            {
                DrawImageOnPage(page, img, compat);
            }

            if (!progressCallback(progress))
            {
                return null;
            }

            string tempImageFilePath = Path.Combine(Paths.Temp, Path.GetRandomFileName());
            img.GdiImage.Save(tempImageFilePath);

            return Tuple.Create(page, tempImageFilePath);
        }
    }).StepParallel((page, tempImageFilePath) =>
    {
        // Step 2: Run OCR on the processed image file
        // This step is doubly parallel since not only can it run alongside other stages of the pipeline,
        // multiple files can also be OCR'd at once (no interdependencies, it doesn't touch the document)

        OcrResult ocrResult;
        try
        {
            if (!progressCallback(progress))
            {
                return null;
            }

            // ReSharper disable once AccessToModifiedClosure
            ocrResult = ocrEngine.ProcessImage(tempImageFilePath, ocrLanguageCode, () => !progressCallback(progress));
        }
        finally
        {
            // The temp image is only needed for OCR; remove it whether OCR ran, failed, or was cancelled
            File.Delete(tempImageFilePath);
        }

        // The final pipeline step is pretty fast, so updating progress here is more accurate
        if (progressCallback(progress))
        {
            // Report the value produced by this increment. Re-reading the shared counter afterwards
            // could pick up increments made concurrently by other OCR workers.
            progressCallback(Interlocked.Increment(ref progress));
        }

        return Tuple.Create(page, ocrResult);
    }).StepBlock().Run((page, ocrResult) =>
    {
        // Step 3: Draw the OCR text on the PDF page

        if (ocrResult == null)
        {
            return;
        }
        if (!progressCallback(progress))
        {
            return;
        }
        DrawOcrTextOnPage(page, ocrResult);
    });
    return progressCallback(progress);
}
/// <summary>
/// Adds one PDF page per image to the document without running OCR. Imported PDF pages with
/// no transforms are copied directly; all other images are rendered and drawn on a new page.
/// </summary>
/// <param name="progressCallback">
/// Called with the number of pages completed so far. Returning false cancels the operation.
/// </param>
/// <param name="document">The PDF document the pages are added to.</param>
/// <param name="compat">The compatibility mode passed on to DrawImageOnPage.</param>
/// <param name="images">The images to export, in page order.</param>
/// <returns>
/// False if the callback requested cancellation; otherwise the value returned by the final
/// <paramref name="progressCallback"/> call, which reports the total page count.
/// </returns>
private bool BuildDocumentWithoutOcr(Func<int, bool> progressCallback, PdfDocument document, PdfCompat compat, IEnumerable<ScannedImage> images)
{
    int progress = 0;
    foreach (var image in images)
    {
        // Check before doing any work on this page, so that pass-through pages also report
        // progress and honor cancellation, and no image is rendered after a cancel.
        if (!progressCallback(progress))
        {
            return false;
        }

        bool importedPdfPassThrough = image.FileFormat == null && !image.RecoveryIndexImage.TransformList.Any();

        if (importedPdfPassThrough)
        {
            CopyPdfPageToDoc(document, image);
        }
        else
        {
            using (Stream stream = scannedImageRenderer.RenderToStream(image))
            using (var img = XImage.FromStream(stream))
            {
                // Check again after rendering, before a page is added to the document
                if (!progressCallback(progress))
                {
                    return false;
                }

                PdfPage page = document.AddPage();
                DrawImageOnPage(page, img, compat);
            }
        }
        progress++;
    }

    // Report the final count, matching what BuildDocumentWithOcr does on completion
    return progressCallback(progress);
}
/// <summary>
/// Adds one PDF page per snapshot to the document, queues OCR for each page image, and draws
/// the OCR results onto the pages once they are available. Imported PDF pages that already
/// contain text are copied as-is and are not sent to OCR.
/// </summary>
/// <param name="progressCallback">Called with the number of completed pages and the total page count.</param>
/// <param name="cancelToken">Token checked between steps; when cancelled, processing stops early.</param>
/// <param name="document">The PDF document the pages are added to.</param>
/// <param name="compat">The compatibility mode passed on to DrawImageOnPage.</param>
/// <param name="snapshots">The image snapshots to export, in page order.</param>
/// <param name="ocrEngine">The OCR engine passed to the OCR request queue.</param>
/// <param name="ocrParams">The OCR parameters passed to the OCR request queue.</param>
/// <returns>True if the operation ran to the end without cancellation being requested; otherwise false.</returns>
private bool BuildDocumentWithOcr(ProgressHandler progressCallback, CancellationToken cancelToken, PdfDocument document, PdfCompat compat, ICollection<ScannedImage.Snapshot> snapshots, IOcrEngine ocrEngine, OcrParams ocrParams)
{
    int progress = 0;
    progressCallback(progress, snapshots.Count);

    // Marks one page as finished. This may run on OCR continuation threads as well as on the
    // calling thread, so the value reported is the one produced by the atomic increment itself
    // rather than a later re-read of the shared counter.
    void ReportPageCompleted()
    {
        if (!cancelToken.IsCancellationRequested)
        {
            progressCallback(Interlocked.Increment(ref progress), snapshots.Count);
        }
    }

    List<(PdfPage, Task<OcrResult>)> ocrPairs = new List<(PdfPage, Task<OcrResult>)>();

    // Step 1: Create the pages, draw the images, and start OCR
    foreach (var snapshot in snapshots)
    {
        if (cancelToken.IsCancellationRequested)
        {
            break;
        }

        bool importedPdfPassThrough = snapshot.Source.FileFormat == null && !snapshot.TransformList.Any();

        PdfPage page;
        if (importedPdfPassThrough)
        {
            page = CopyPdfPageToDoc(document, snapshot.Source);
            if (PageContainsText(page))
            {
                // Since this page already contains text, don't use OCR.
                // The page is still complete, so count it; otherwise progress could never reach the total.
                ReportPageCompleted();
                continue;
            }
        }
        else
        {
            page = document.AddPage();
        }

        string tempImageFilePath = Path.Combine(Paths.Temp, Path.GetRandomFileName());

        using (Stream stream = scannedImageRenderer.RenderToStream(snapshot).Result)
        using (var img = XImage.FromStream(stream))
        {
            if (cancelToken.IsCancellationRequested)
            {
                break;
            }

            if (!importedPdfPassThrough)
            {
                DrawImageOnPage(page, img, compat);
            }

            if (cancelToken.IsCancellationRequested)
            {
                break;
            }

            // The temp image is only written when no cached OCR result exists for this snapshot
            if (!ocrRequestQueue.HasCachedResult(ocrEngine, snapshot, ocrParams))
            {
                img.GdiImage.Save(tempImageFilePath);
            }
        }

        if (cancelToken.IsCancellationRequested)
        {
            // The file will not be handed to the OCR queue, so clean it up here
            File.Delete(tempImageFilePath);
            break;
        }

        // Start OCR
        var ocrTask = ocrRequestQueue.QueueForeground(ocrEngine, snapshot, tempImageFilePath, ocrParams, cancelToken);
        ocrTask.ContinueWith(task =>
        {
            // This is the best place to put progress reporting
            // Long-running OCR is done, and drawing text on the page (step 2) is very fast
            ReportPageCompleted();
        }, TaskContinuationOptions.ExecuteSynchronously);

        // Record the page and task for step 2
        ocrPairs.Add((page, ocrTask));
    }

    // Step 2: Wait for all the OCR results, and draw the text on each page
    foreach (var (page, ocrTask) in ocrPairs)
    {
        if (cancelToken.IsCancellationRequested)
        {
            break;
        }

        // Blocks until this page's OCR task has finished
        OcrResult ocrResult = ocrTask.Result;
        if (ocrResult == null)
        {
            continue;
        }
        DrawOcrTextOnPage(page, ocrResult);
    }

    return !cancelToken.IsCancellationRequested;
}
/// <summary>
/// Adds one PDF page per snapshot to the document without running OCR. Imported PDF pages with
/// no transforms are copied directly; all other snapshots are rendered and drawn on a new page.
/// </summary>
/// <param name="progressCallback">Called with the number of completed pages and the total page count.</param>
/// <param name="cancelToken">Token checked before each page and again after rendering.</param>
/// <param name="document">The PDF document the pages are added to.</param>
/// <param name="compat">The compatibility mode passed on to DrawImageOnPage.</param>
/// <param name="snapshots">The image snapshots to export, in page order.</param>
/// <returns>False if cancellation was observed while processing; otherwise true.</returns>
private bool BuildDocumentWithoutOcr(ProgressHandler progressCallback, CancellationToken cancelToken, PdfDocument document, PdfCompat compat, ICollection<ScannedImage.Snapshot> snapshots)
{
    int progress = 0;
    progressCallback(progress, snapshots.Count);

    foreach (var snapshot in snapshots)
    {
        // Check before doing any work on this page, so that pass-through pages also honor
        // cancellation and no snapshot is rendered after a cancel.
        if (cancelToken.IsCancellationRequested)
        {
            return false;
        }

        bool importedPdfPassThrough = snapshot.Source.FileFormat == null && !snapshot.TransformList.Any();

        if (importedPdfPassThrough)
        {
            CopyPdfPageToDoc(document, snapshot.Source);
        }
        else
        {
            using (Stream stream = scannedImageRenderer.RenderToStream(snapshot).Result)
            using (var img = XImage.FromStream(stream))
            {
                // Check again after rendering, before a page is added to the document
                if (cancelToken.IsCancellationRequested)
                {
                    return false;
                }

                PdfPage page = document.AddPage();
                DrawImageOnPage(page, img, compat);
            }
        }

        progress++;
        progressCallback(progress, snapshots.Count);
    }

    return true;
}