Exemple #1
0
    /// <summary>
    /// Processes the input PDF files in parallel. For each file, the page count is recorded,
    /// an OCR engine is started to obtain the address block text via
    /// <c>GetAddressBlockText</c>, and that text is passed to
    /// <c>PdfUtility.OverlayOcrText</c> together with the file path and current folder.
    /// </summary>
    /// <param name="inputFiles">Paths of the PDF files to be processed.</param>
    /// <param name="currentFolder">Active input directory; forwarded unchanged to <c>PdfUtility.OverlayOcrText</c>.</param>
    /// <returns>A dictionary mapping each file path to the page count of that file.</returns>
    internal static Dictionary <string, int> Process(List <string> inputFiles, KeyValuePair <string, string> currentFolder)
    {
        // Worker threads write here concurrently, hence the concurrent collection.
        var pageCountsByPath = new ConcurrentDictionary<string, int>();

        SetupOcrWorkingDirectory();

        Parallel.ForEach(inputFiles, pdfPath =>
        {
            string addressText = null;

            using (var pdf = new PDFDocument(pdfPath))
            {
                // TryAdd never overwrites: a path that is already recorded keeps its first page count.
                pageCountsByPath.TryAdd(pdfPath, pdf.Pages.Count);

                // Every file gets its own engine instance, started and shut down within this iteration.
                using (IOcrEngine engine = OcrEngineManager.CreateEngine(OcrEngineType.Advantage, false))
                {
                    engine.Startup(null, null, ocrWorkingDir, ocrAdvantageRuntimeDir);
                    engine.SpellCheckManager.SpellCheckEngine = OcrSpellCheckEngine.None;
                    addressText = GetAddressBlockText(engine, pdf);
                    engine.Shutdown();
                }
            }

            // Both the document and the engine are disposed before the overlay step runs.
            PdfUtility.OverlayOcrText(addressText, pdfPath, currentFolder);
        });

        // Return a plain Dictionary copy rather than exposing the concurrent collection.
        return new Dictionary<string, int>(pageCountsByPath);
    }