Пример #1
0
        /// <summary>
        /// Creates an XMP metadata stream object for <paramref name="document"/> and
        /// points the document catalog's <c>/Metadata</c> entry at it.
        /// </summary>
        /// <param name="document">The PDF document that owns the new metadata object.</param>
        /// <param name="compat">Compatibility setting; converted via <c>GetConformance</c> and handed to <c>CreateRawXmpMetadata</c>.</param>
        public static void CreateXmpMetadata(PdfDocument document, PdfCompat compat)
        {
            var xmpObject = new PdfDictionary(document);
            var entries = xmpObject.Elements;
            entries["/Type"] = new PdfName("/Metadata");
            entries["/Subtype"] = new PdfName("/XML");

            // The stream payload is produced from the document info plus the conformance values.
            var info = document.Info;
            var conformance = GetConformance(compat);
            xmpObject.CreateStream(CreateRawXmpMetadata(info, conformance));

            // The object is added to the document before its Reference is read for the catalog entry.
            var internals = document.Internals;
            internals.AddObject(xmpObject);
            internals.Catalog.Elements["/Metadata"] = xmpObject.Reference;
        }
Пример #2
0
 /// <summary>
 /// Resizes <paramref name="page"/> to the size reported by <c>GetRealSize</c> for the image
 /// and draws the image over the whole page, starting at the origin.
 /// </summary>
 /// <param name="page">The page that is resized and drawn on.</param>
 /// <param name="img">The image to draw; its <c>Interpolate</c> flag is cleared for non-default compatibility.</param>
 /// <param name="compat">The requested compatibility setting.</param>
 private static void DrawImageOnPage(PdfPage page, XImage img, PdfCompat compat)
 {
     // NOTE(review): interpolation is switched off for every non-default compatibility value,
     // presumably because the PDF/A profiles require it - confirm.
     bool isDefaultCompat = compat == PdfCompat.Default;
     if (!isDefaultCompat)
     {
         img.Interpolate = false;
     }

     Size imageSize = GetRealSize(img);
     page.Width = imageSize.Width;
     page.Height = imageSize.Height;

     using (var canvas = XGraphics.FromPdfPage(page))
     {
         canvas.DrawImage(img, 0, 0, imageSize.Width, imageSize.Height);
     }
 }
Пример #3
0
        /// <summary>
        /// Maps a <see cref="PdfCompat"/> value to a pair of strings.
        /// </summary>
        /// <param name="compat">The compatibility setting to translate.</param>
        /// <returns>
        /// ("1", "B"), ("2", "B"), ("3", "B") or ("3", "U") for the matching PDF/A values;
        /// a pair of empty strings for anything else.
        /// </returns>
        private static (string, string) GetConformance(PdfCompat compat)
        {
            if (compat == PdfCompat.PdfA1B)
            {
                return ("1", "B");
            }
            if (compat == PdfCompat.PdfA2B)
            {
                return ("2", "B");
            }
            if (compat == PdfCompat.PdfA3B)
            {
                return ("3", "B");
            }
            if (compat == PdfCompat.PdfA3U)
            {
                return ("3", "U");
            }

            // Any other value yields empty strings.
            return ("", "");
        }
Пример #4
0
        /// <summary>
        /// Adds one PDF page per image to <paramref name="document"/> and overlays the text found by OCR.
        /// The work is split into three pipeline steps: render/draw the image, run OCR on a temporary
        /// copy of the image, and draw the OCR text on the page.
        /// </summary>
        /// <param name="progressCallback">
        /// Called with the number of pages whose OCR step has finished. A <c>false</c> return value is
        /// treated as a request to stop: the current step bails out without doing further work.
        /// </param>
        /// <param name="document">The PDF document that receives the pages.</param>
        /// <param name="compat">Compatibility setting forwarded to <c>DrawImageOnPage</c>.</param>
        /// <param name="images">The images to add, in order.</param>
        /// <param name="ocrLanguageCode">Language code handed to the OCR engine.</param>
        /// <returns>The value returned by a final call to <paramref name="progressCallback"/>.</returns>
        private bool BuildDocumentWithOcr(Func<int, bool> progressCallback, PdfDocument document, PdfCompat compat, IEnumerable<ScannedImage> images, string ocrLanguageCode)
        {
            // Use a pipeline so that multiple pages/images can be processed in parallel
            // Note: No locks needed on the document because the design of the pipeline ensures no two threads will work on it at once

            int progress = 0;
            Pipeline.For(images).Step(image =>
            {
                // Step 1: Load the image into memory, draw it on a new PDF page, and save a copy of the processed image to disk for OCR

                if (!progressCallback(progress))
                {
                    return null;
                }

                // An image with no file format and no transforms is copied into the document as-is
                bool importedPdfPassThrough = image.FileFormat == null && !image.RecoveryIndexImage.TransformList.Any();

                PdfPage page;
                if (importedPdfPassThrough)
                {
                    page = CopyPdfPageToDoc(document, image);

                    // Scan through the page looking for text
                    var elements = page.Contents.Elements;
                    for (int i = 0; i < elements.Count; i++)
                    {
                        string textAndFormatting = elements.GetDictionary(i).Stream.ToString();
                        using (var reader = new StringReader(textAndFormatting))
                        {
                            bool inTextBlock = false;
                            string line;
                            while ((line = reader.ReadLine()) != null)
                            {
                                // PDF operators are plain ASCII tokens, so compare ordinally rather than
                                // with the current culture's collation rules
                                if (line.EndsWith("BT", StringComparison.Ordinal))
                                {
                                    inTextBlock = true;
                                }
                                else if (line.EndsWith("ET", StringComparison.Ordinal))
                                {
                                    inTextBlock = false;
                                }
                                else if (inTextBlock &&
                                         (line.EndsWith("TJ", StringComparison.Ordinal)
                                          || line.EndsWith("Tj", StringComparison.Ordinal)
                                          || line.EndsWith("\"", StringComparison.Ordinal)
                                          || line.EndsWith("'", StringComparison.Ordinal)))
                                {
                                    // Text-showing operators
                                    // Since this page already contains text, don't use OCR
                                    return null;
                                }
                            }
                        }
                    }
                }
                else
                {
                    page = document.AddPage();
                }

                using (Stream stream = scannedImageRenderer.RenderToStream(image))
                using (var img = XImage.FromStream(stream))
                {
                    if (!progressCallback(progress))
                    {
                        return null;
                    }

                    if (!importedPdfPassThrough)
                    {
                        DrawImageOnPage(page, img, compat);
                    }

                    if (!progressCallback(progress))
                    {
                        return null;
                    }

                    // The OCR step reads the image from this temporary file and deletes it afterwards
                    string tempImageFilePath = Path.Combine(Paths.Temp, Path.GetRandomFileName());
                    img.GdiImage.Save(tempImageFilePath);

                    return Tuple.Create(page, tempImageFilePath);
                }
            }).StepParallel((page, tempImageFilePath) =>
            {
                // Step 2: Run OCR on the processed image file
                // This step is doubly parallel since not only can it run alongside other stages of the pipeline,
                // multiple files can also be OCR'd at once (no interdependencies, it doesn't touch the document)

                OcrResult ocrResult;
                try
                {
                    if (!progressCallback(progress))
                    {
                        return null;
                    }

                    // ReSharper disable once AccessToModifiedClosure
                    ocrResult = ocrEngine.ProcessImage(tempImageFilePath, ocrLanguageCode, () => !progressCallback(progress));
                }
                finally
                {
                    // The temporary image is removed whether or not OCR succeeded
                    File.Delete(tempImageFilePath);
                }

                // The final pipeline step is pretty fast, so updating progress here is more accurate
                if (progressCallback(progress))
                {
                    // Report the value produced by this thread's own increment; re-reading the shared
                    // counter could pick up an increment made concurrently by another OCR worker
                    progressCallback(Interlocked.Increment(ref progress));
                }

                return Tuple.Create(page, ocrResult);
            }).StepBlock().Run((page, ocrResult) =>
            {
                // Step 3: Draw the OCR text on the PDF page

                if (ocrResult == null)
                {
                    return;
                }
                if (!progressCallback(progress))
                {
                    return;
                }
                DrawOcrTextOnPage(page, ocrResult);
            });
            return progressCallback(progress);
        }
Пример #5
0
        /// <summary>
        /// Adds one page per image to <paramref name="document"/> without running OCR.
        /// Images with no file format and no transforms are copied via <c>CopyPdfPageToDoc</c>;
        /// all others are rendered and drawn on a fresh page.
        /// </summary>
        /// <param name="progressCallback">
        /// Called with the number of images handled so far before a rendered image is drawn;
        /// a <c>false</c> return value aborts the build.
        /// </param>
        /// <param name="document">The PDF document that receives the pages.</param>
        /// <param name="compat">Compatibility setting forwarded to <c>DrawImageOnPage</c>.</param>
        /// <param name="images">The images to add, in order.</param>
        /// <returns><c>false</c> if the callback asked to stop; otherwise <c>true</c>.</returns>
        private bool BuildDocumentWithoutOcr(Func<int, bool> progressCallback, PdfDocument document, PdfCompat compat, IEnumerable<ScannedImage> images)
        {
            int handledCount = 0;
            foreach (var scan in images)
            {
                // Rendering is needed when the image has a file format or any transforms applied
                bool needsRendering = scan.FileFormat != null || scan.RecoveryIndexImage.TransformList.Any();

                if (needsRendering)
                {
                    using (Stream rendered = scannedImageRenderer.RenderToStream(scan))
                    using (var picture = XImage.FromStream(rendered))
                    {
                        bool keepGoing = progressCallback(handledCount);
                        if (!keepGoing)
                        {
                            return false;
                        }

                        DrawImageOnPage(document.AddPage(), picture, compat);
                    }
                }
                else
                {
                    CopyPdfPageToDoc(document, scan);
                }

                handledCount += 1;
            }

            return true;
        }
Пример #6
0
        /// <summary>
        /// Adds one PDF page per snapshot to <paramref name="document"/>, queues OCR for each page,
        /// then waits for the OCR results and draws the recognized text onto the pages.
        /// </summary>
        /// <param name="progressCallback">Called with (completed, total) once up front and again each time an OCR task finishes without cancellation.</param>
        /// <param name="cancelToken">Checked between the individual steps; once cancellation is requested the loops stop.</param>
        /// <param name="document">The PDF document that receives the pages.</param>
        /// <param name="compat">Compatibility setting forwarded to <c>DrawImageOnPage</c>.</param>
        /// <param name="snapshots">The image snapshots to add, in order.</param>
        /// <param name="ocrEngine">OCR engine passed to the OCR request queue.</param>
        /// <param name="ocrParams">OCR parameters passed to the OCR request queue.</param>
        /// <returns><c>true</c> unless cancellation has been requested by the time the method finishes.</returns>
        private bool BuildDocumentWithOcr(ProgressHandler progressCallback, CancellationToken cancelToken, PdfDocument document, PdfCompat compat, ICollection <ScannedImage.Snapshot> snapshots, IOcrEngine ocrEngine, OcrParams ocrParams)
        {
            int progress = 0;

            progressCallback(progress, snapshots.Count);

            var ocrPairs = new List<(PdfPage, Task<OcrResult>)>();

            // Step 1: Create the pages, draw the images, and start OCR
            foreach (var snapshot in snapshots)
            {
                if (cancelToken.IsCancellationRequested)
                {
                    break;
                }

                // A snapshot with no source file format and no transforms is copied into the document as-is
                bool importedPdfPassThrough = snapshot.Source.FileFormat == null && !snapshot.TransformList.Any();

                PdfPage page;
                if (importedPdfPassThrough)
                {
                    page = CopyPdfPageToDoc(document, snapshot.Source);
                    if (PageContainsText(page))
                    {
                        // Since this page already contains text, don't use OCR
                        continue;
                    }
                }
                else
                {
                    page = document.AddPage();
                }

                string tempImageFilePath = Path.Combine(Paths.Temp, Path.GetRandomFileName());

                // .Result blocks the current thread until rendering has finished
                using (Stream stream = scannedImageRenderer.RenderToStream(snapshot).Result)
                using (var img = XImage.FromStream(stream))
                {
                    if (cancelToken.IsCancellationRequested)
                    {
                        break;
                    }

                    if (!importedPdfPassThrough)
                    {
                        DrawImageOnPage(page, img, compat);
                    }

                    if (cancelToken.IsCancellationRequested)
                    {
                        break;
                    }

                    // The temporary image is only written when no cached OCR result is available
                    if (!ocrRequestQueue.HasCachedResult(ocrEngine, snapshot, ocrParams))
                    {
                        img.GdiImage.Save(tempImageFilePath);
                    }
                }

                if (cancelToken.IsCancellationRequested)
                {
                    File.Delete(tempImageFilePath);
                    break;
                }

                // Start OCR
                var ocrTask = ocrRequestQueue.QueueForeground(ocrEngine, snapshot, tempImageFilePath, ocrParams, cancelToken);
                ocrTask.ContinueWith(task =>
                {
                    // This is the best place to put progress reporting
                    // Long-running OCR is done, and drawing text on the page (step 2) is very fast
                    if (!cancelToken.IsCancellationRequested)
                    {
                        // Report the value produced by this continuation's own increment; re-reading the
                        // shared counter could pick up an increment made concurrently by another continuation
                        int completed = Interlocked.Increment(ref progress);
                        progressCallback(completed, snapshots.Count);
                    }
                }, TaskContinuationOptions.ExecuteSynchronously);
                // Record the page and task for step 2
                ocrPairs.Add((page, ocrTask));
            }

            // Step 2: Wait for all the OCR results, and draw the text on each page
            foreach (var (page, ocrTask) in ocrPairs)
            {
                if (cancelToken.IsCancellationRequested)
                {
                    break;
                }

                // Reading Result blocks until the OCR task has completed
                OcrResult ocrResult = ocrTask.Result;
                if (ocrResult == null)
                {
                    continue;
                }
                DrawOcrTextOnPage(page, ocrResult);
            }

            return !cancelToken.IsCancellationRequested;
        }
Пример #7
0
        /// <summary>
        /// Adds one page per snapshot to <paramref name="document"/> without running OCR.
        /// Snapshots with no source file format and no transforms are copied via <c>CopyPdfPageToDoc</c>;
        /// all others are rendered and drawn on a fresh page.
        /// </summary>
        /// <param name="progressCallback">Called with (completed, total) once up front and after every snapshot.</param>
        /// <param name="cancelToken">Checked after a snapshot has been rendered and before it is drawn.</param>
        /// <param name="document">The PDF document that receives the pages.</param>
        /// <param name="compat">Compatibility setting forwarded to <c>DrawImageOnPage</c>.</param>
        /// <param name="snapshots">The image snapshots to add, in order.</param>
        /// <returns><c>false</c> if cancellation was observed; otherwise <c>true</c>.</returns>
        private bool BuildDocumentWithoutOcr(ProgressHandler progressCallback, CancellationToken cancelToken, PdfDocument document, PdfCompat compat, ICollection <ScannedImage.Snapshot> snapshots)
        {
            int completedPages = 0;
            progressCallback(completedPages, snapshots.Count);

            foreach (var shot in snapshots)
            {
                // Rendering is needed when the source has a file format or any transforms applied
                bool needsRendering = shot.Source.FileFormat != null || shot.TransformList.Any();

                if (needsRendering)
                {
                    // .Result blocks the current thread until rendering has finished
                    using (Stream rendered = scannedImageRenderer.RenderToStream(shot).Result)
                    using (var picture = XImage.FromStream(rendered))
                    {
                        if (cancelToken.IsCancellationRequested)
                        {
                            return false;
                        }

                        DrawImageOnPage(document.AddPage(), picture, compat);
                    }
                }
                else
                {
                    CopyPdfPageToDoc(document, shot.Source);
                }

                completedPages += 1;
                progressCallback(completedPages, snapshots.Count);
            }

            return true;
        }