/// <summary>
/// Matches every page image against the not-yet-used block master forms and, for each page that
/// matches, crops, saves and recognizes every field of the matched master.
/// </summary>
/// <param name="formResults">
/// Per-document result record. Page counts, timed-out pages, best confidences, matched master names,
/// per-field results, error counts and <c>Status</c> are written to it as processing proceeds.
/// <c>OriginalDirectoryName</c> is used as the output directory.
/// </param>
/// <param name="fileInfos">One entry per page; <c>Image</c> is loaded from <c>ImageFileInfo</c> when null.</param>
/// <returns>
/// The forms created so far (one per page visited). The list is returned early when more than two
/// pages have timed out and no page has matched a master yet. Returns <c>null</c> when an
/// unexpected exception escapes page processing (the exception is logged).
/// </returns>
public List<FilledForm> ProcessOcr(ResultsForPrettyJson formResults, List<ImageInfo> fileInfos)
{
    try
    {
        var outDir = formResults.OriginalDirectoryName;
        var retForms = new List<FilledForm>();
        var usedMasters = new HashSet<MasterForm>();
        Stopwatch stopWatch = new Stopwatch();
        stopWatch.Start();
        formResults.PagesInPdf = fileInfos.Count;

        foreach (var ofi in fileInfos)
        {
            FilledForm newForm = new FilledForm();
            retForms.Add(newForm);
            newForm.ImageInfoMaster.InitialImage = ofi;
            newForm.Name = Path.GetFileNameWithoutExtension(ofi.ImageFileInfo.Name);
            if (ofi.Image == null)
            {
                ofi.Image = LoadImageFile(ofi.ImageFileInfo.FullName, 1, -1);
            }

            var par = new FormThreadCallParams() { ImageInfo = ofi, StopWatch = stopWatch, Form = newForm };
            if (PageTimeoutInSeconds < 50)
            {
                // Short timeouts run the preparation on a worker thread so a stuck page can be given up on.
                // The worker is a background thread so that a page that cannot be aborted does not keep
                // the process alive after the run has finished.
                Thread worker = new Thread(this.PrepareNewFormThreader) { IsBackground = true };
                worker.Start(par);
                if (!worker.Join(TimeSpan.FromSeconds(PageTimeoutInSeconds)))
                {
                    try
                    {
                        worker.Abort();
                    }
                    catch (PlatformNotSupportedException exAbort)
                    {
                        // Thread.Abort is not available on .NET Core / .NET 5+. Without this guard a single
                        // page timeout would escape to the outer catch and discard the whole document.
                        logger.Error(exAbort, $"Could not abort timed-out page {newForm.Name}; worker left running in background");
                    }

                    formResults.TimedOutPages.Add(newForm.Name);
                    formResults.BestFormConfidence.Add(-1);
                    if (formResults.TimedOutPages.Count > 2 && formResults.PagesMappedToForm == 0)
                    {
                        formResults.Status = $"Form abandoned for timeout after {formResults.BestFormConfidence.Count} pages";
                        logger.Error(formResults.Status);
                        return retForms;
                    }

                    continue;
                }
            }
            else
            {
                PrepareNewFormThreader(par);
            }

            Debug.Assert(par.Attributes != null);
            var filledFormAttributes = par.Attributes;

            // Pick the unused master with the highest confidence above the threshold (starts at 85).
            // bestConfidence also tracks the highest score seen below the threshold, for reporting only.
            MasterForm currentMasterBlockForm = null;
            int bestConfidence = -1;
            int currentConfidence = 85;
            foreach (var master in BlockMasterForms)
            {
                if (usedMasters.Contains(master))
                {
                    continue;
                }

                var result = RecognitionEngine.CompareForm(master.Attributes, filledFormAttributes, null, null);
                if (result.Confidence > currentConfidence)
                {
                    currentMasterBlockForm = master;
                    bestConfidence = currentConfidence = result.Confidence;
                }
                else if (result.Confidence > bestConfidence)
                {
                    bestConfidence = result.Confidence;
                }
            }

            formResults.BestFormConfidence.Add(bestConfidence);

            if (currentMasterBlockForm != null)
            {
                formResults.MasterFormPages.Add(currentMasterBlockForm.Properties.Name);
                formResults.PagesMappedToForm++;
                logger.Info($"FilledForm matched {newForm.Name} {newForm.Status} {stopWatch.ElapsedMilliseconds} ");

                var centeredImage = ofi.Image.CloneAll();
                CleanupImage(centeredImage);
                newForm.ImageInfoMaster.CenteredImage = new ImageInfo() { Image = centeredImage };
                var omrImage = centeredImage.CloneAll();
                PrepareOmrImage(omrImage);
                newForm.ImageInfoMaster.OmrImage = new ImageInfo() { Image = omrImage };
                newForm.Status = "Matched";
                newForm.Master = currentMasterBlockForm;

                var alignment = RecognitionEngine.GetFormAlignment(newForm.Master.Attributes, newForm.Attributes, null);
                var fields = currentMasterBlockForm.ProcessingPages[0];

                var subDirField = Path.Combine(outDir, "fields");
                var fileNameFieldOnly = Path.Combine(subDirField, newForm.Name + "_fields.jpg");
                var googleResultsFile = Path.Combine(subDirField, newForm.Name + "_google.json");

                // The combined image and the Google results are always written under "fields", even when
                // the master has only block fields (or none), so the directory must exist up front.
                EnsurePathExists(subDirField);

                // White canvas onto which every non-block text field crop is pasted at its own position.
                using (var fieldsOnlyImage = RasterImage.Create(centeredImage.Width, centeredImage.Height, centeredImage.BitsPerPixel, 300, RasterColor.White))
                {
                    foreach (var field in fields)
                    {
                        var isBlock = field.Name.Contains("block");
                        var rect200 = alignment[0].AlignRectangle(field.Bounds);

                        // The original code read currentMasterBlockForm.Resolution and then overrode it with
                        // 300 for every field, so the 300/scaler factor below is always 1.
                        // NOTE(review): confirm whether masters at another resolution should really be rescaled.
                        int scaler = 300;
                        int fudge = isBlock ? 30 : 1;

                        // Left/top are moved out by fudge and width/height grow by fudge, so the right and
                        // bottom edges stay where the aligned rectangle put them.
                        var rect300 = new LeadRect(
                            rect200.Left * 300 / scaler - fudge,
                            rect200.Top * 300 / scaler - fudge,
                            rect200.Width * 300 / scaler + fudge,
                            rect200.Height * 300 / scaler + fudge);

                        try
                        {
                            var imageInfoToUse = newForm.ImageInfoMaster.CenteredImage;
                            var zoneType = OcrZoneType.Text;
                            if (field.GetType() == typeof(OmrFormField))
                            {
                                imageInfoToUse = newForm.ImageInfoMaster.OmrImage;
                                zoneType = OcrZoneType.Omr;
                            }
                            else if (field.GetType() == typeof(ImageFormField))
                            {
                                zoneType = OcrZoneType.Graphic;
                            }

                            var image = imageInfoToUse.Image.CloneAll();
                            var subDir = Path.Combine(outDir, isBlock ? "blocks" : "fields");
                            var fileName = Path.Combine(subDir, newForm.Name + "_" + field.Name + ".jpg");
                            var imageField = new ImageField
                            {
                                Field = field,
                                FieldResult =
                                {
                                    FieldName = field.Name,
                                    IsBlock = isBlock,
                                    ImageFile = fileName,
                                    Bounds = rect300.ToString(),
                                    FieldType = zoneType.ToString(),
                                    Error = "None"
                                }
                            };
                            imageField.Rectangle = new Rectangle(rect300.X, rect300.Y, rect300.Width, rect300.Height);

                            try
                            {
                                EnsurePathExists(subDir);
                                CropCommand command = new CropCommand { Rectangle = rect300 };
                                command.Run(image);
                                RasterCodecs.Save(image, fileName, RasterImageFormat.Jpeg, bitsPerPixel: 8);

                                if (!isBlock && zoneType == OcrZoneType.Text)
                                {
                                    try
                                    {
                                        var combiner = new CombineCommand();

                                        // The clone is only needed while the command runs; dispose it afterwards
                                        // instead of leaking one image per field.
                                        using (var combineSource = image.Clone())
                                        {
                                            combiner.SourceImage = combineSource;
                                            combiner.DestinationRectangle = rect300;
                                            var regionBounds = image.GetRegionBounds(null);
                                            combiner.SourcePoint = new LeadPoint(regionBounds.X, regionBounds.Y);
                                            combiner.Flags = CombineCommandFlags.OperationOr | CombineCommandFlags.Destination0;
                                            combiner.Run(fieldsOnlyImage);
                                        }
                                    }
                                    catch (Exception exCombine)
                                    {
                                        // A failed paste only affects the combined image; the field itself is still processed.
                                        logger.Error(exCombine, $"error combining field {field.Name} {rect300}");
                                    }
                                }

                                var imageInfo = new ImageInfo() { Image = image, ImageFileInfo = new FileInfo(fileName) };
                                imageField.ImageInfo = imageInfo;

                                if (!isBlock && zoneType != OcrZoneType.Graphic)
                                {
                                    // NOTE(review): AutoDispose presumably lets the page dispose `image` when the page is
                                    // disposed, while imageInfo.Image still references it - confirm callers do not use it.
                                    using (IOcrPage ocrPage = OcrEngine.CreatePage(image, OcrImageSharingMode.AutoDispose))
                                    {
                                        OcrZone ocrZone = new OcrZone
                                        {
                                            ZoneType = zoneType,
                                            Bounds = new LeadRect(fudge, fudge, image.ImageSize.Width - fudge, image.ImageSize.Height - fudge)
                                        };
                                        ocrPage.Zones.Add(ocrZone);
                                        ocrPage.Recognize(null);

                                        if (zoneType == OcrZoneType.Omr)
                                        {
                                            // Extra diagnostic output for one specific field name.
                                            if (field.Name.Contains("C2NGVD1929"))
                                            {
                                                logger.Info(ocrZone.Bounds);
                                            }

                                            GetOmrReading(ocrPage, field, imageField);
                                        }
                                        else if (zoneType == OcrZoneType.Text)
                                        {
                                            var resultsPage = GetPageConfidence(ocrPage);
                                            imageField.FieldResult.Confidence = resultsPage.Confidence;
                                            char[] crlf = { '\r', '\n' };
                                            imageField.FieldResult.Text = ocrPage.GetText(0).TrimEnd(crlf);
                                        }
                                    }
                                }

                                logger.Info($"field {field.Name} {rect300} [{imageField.FieldResult.Text}] confidence: {imageField.FieldResult.Confidence}");
                            }
                            catch (Exception exField)
                            {
                                // The field is still reported, with the error message recorded on its result.
                                logger.Error(exField, $"Error processing {field.Name}");
                                formResults.FieldsWithError++;
                                imageField.FieldResult.Error = exField.Message;
                            }

                            newForm.ImageFields.Add(imageField);
                            formResults.OcrFields.Add(imageField.FieldResult);
                            formResults.Status = "FormMatched";
                        }
                        catch (Exception ex)
                        {
                            // Failure before a field result could be built: record it on the form and move on.
                            logger.Error(ex, $"Error on field {field.Name} {rect300}");
                            newForm.Status = $"Error|Field {field.Name} {rect300}: [{ex.Message}]";
                        }
                    }

                    RasterCodecs.Save(PrepareOmrImage(fieldsOnlyImage), fileNameFieldOnly, RasterImageFormat.Jpeg, bitsPerPixel: 8);
                }

                var googleResults = GoogleOcr(fileNameFieldOnly);
                if (googleResults.Count > 0)
                {
                    var json = JsonConvert.SerializeObject(googleResults, Formatting.Indented);
                    File.WriteAllText(googleResultsFile, json);
                    MergeGoogleOcr(newForm, googleResults);
                }

                // Each master is matched to at most one page per document.
                usedMasters.Add(currentMasterBlockForm);
            }
            else
            {
                newForm.Status = "Unmatched|No MasterForm match";
            }

            logger.Info($"FilledForm processed {newForm.Name} {newForm.Status} {stopWatch.ElapsedMilliseconds} ");
            if (usedMasters.Count == BlockMasterForms.Count)
            {
                logger.Info("found all master forms");
                break;
            }
        }

        stopWatch.Stop();
        return retForms;
    }
    catch (Exception ex)
    {
        // Callers must be prepared for null: any unexpected failure discards the partial list.
        logger.Error(ex, "Untrapped error found");
        return null;
    }
}
/// <summary>
/// LEADOcr
/// </summary>
/// <param name="path">Either a single PDF or a directory</param>
/// <param name="timeout">Time a single page can run, in seconds</param>
/// <param name="wildcard">Windows file wildcard</param>
/// <param name="endLevel">Levels are 1: extract image, 2: detect form, 3: (not described in the original comment). Only logged by this method.</param>
static void Main(string path = CertificateDirString, int timeout = 15, string wildcard = "*.pdf", int endLevel = 3)
{
    int hadForms = 0;
    int noForms = 0;
    int hadErrors = 0;
    var stopWatch = new Stopwatch();      // per-file timer (restarted per file when there are several)
    var stopWatchBig = new Stopwatch();   // whole-run timer
    stopWatch.Start();
    stopWatchBig.Start();
    logger.Info($"path={path}, endLevel={endLevel}");

    ApplyLeadToolsLicense();
    if (RasterSupport.KernelExpired)
    {
        throw new InvalidOperationException("Invalid license");
    }

    var dataDirectory = System.IO.Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location);
    logger.Info($"dir = {dataDirectory}");

    // Collect the inputs: a single existing file is taken as-is; a directory is filtered by wildcard and extension.
    var filesToDo = new List<string>();
    if (path != null)
    {
        if (File.Exists(path))
        {
            filesToDo.Add(path);
        }
        else if (Directory.Exists(path))
        {
            foreach (var file in Directory.GetFiles(path, wildcard))
            {
                // Case-insensitive so that "SCAN.PDF" / "page.TIF" are not silently skipped.
                var extension = new FileInfo(file).Extension;
                if (string.Equals(extension, ".pdf", StringComparison.OrdinalIgnoreCase) ||
                    string.Equals(extension, ".tif", StringComparison.OrdinalIgnoreCase))
                {
                    filesToDo.Add(file);
                }
            }
        }
        else
        {
            logger.Error($"invalid pdf/dir 'path' parameter: {path}");
            return;
        }
    }

    if (filesToDo.Count > 0)
    {
        var ocrMaster = new OcrMaster() { PageTimeoutInSeconds = timeout };
        foreach (var pdfFile in filesToDo)
        {
            try
            {
                if (filesToDo.Count > 1)
                {
                    stopWatch.Restart();
                }

                // Each document gets its own output directory named after the file, next to the file.
                var fi = new FileInfo(pdfFile);
                var stem = Path.GetFileNameWithoutExtension(pdfFile);
                var dirTiff = Path.Combine(fi.DirectoryName, stem);
                Directory.CreateDirectory(dirTiff);
                logger.Info("---------------------------------------------------------------------------------------");
                logger.Info(fi.Name);
                logger.Info("---------------------------------------------------------------------------------------");

                var outFileTemplate = Path.Combine(dirTiff, stem + "_{page}.png");
                logger.Info(outFileTemplate);
                var pngFiles = ConvertDocumentToImage(pdfFile, outFileTemplate, RasterImageFormat.Png, 8, null);

                var formResults = new ResultsForPrettyJson()
                {
                    PdfFileName = fi.Name,
                    OriginalDirectoryName = dirTiff,
                };

                // ProcessOcr reports an unexpected failure by returning null; the JSON is still written
                // with whatever was collected, but the failure should not pass unnoticed.
                if (ocrMaster.ProcessOcr(formResults, pngFiles) == null)
                {
                    logger.Error($"OCR processing failed for {fi.Name}; results may be incomplete");
                }

                var baseName = Path.GetFileNameWithoutExtension(formResults.PdfFileName);
                var jsonName = Path.Combine(dirTiff, baseName + ".json");
                formResults.ElapsedMilliseconds = stopWatch.ElapsedMilliseconds;
                var json = JsonConvert.SerializeObject(formResults, Formatting.Indented);
                File.WriteAllText(jsonName, json);
                logger.Info($"Writing to {jsonName}, {stopWatch.ElapsedMilliseconds} milliseconds, {stopWatchBig.Elapsed}");

                if (formResults.PagesMappedToForm > 0)
                {
                    hadForms++;
                    if (formResults.FieldsWithError > 0)
                    {
                        hadErrors++;
                    }
                }
                else
                {
                    noForms++;
                }
            }
            catch (Exception e)
            {
                // One bad document must not stop the batch.
                logger.Error(e, $"File {pdfFile}");
            }
        }
    }

    logger.Info($"Completed f/e/n:{hadForms}/{hadErrors}/{noForms} in {stopWatchBig.Elapsed}");
}

/// <summary>
/// Applies the LEADTOOLS runtime license from the environment: LEADTOOLS_LICENSE_FILE is the path of
/// the license file and LEADTOOLS_DEVELOPER_KEY is the developer key.
/// </summary>
/// <remarks>
/// The license and key used to be embedded in this source file. Credentials must not live in source
/// control; any value that was committed should be treated as exposed and replaced. Failures are
/// logged rather than thrown; the caller decides what to do by checking RasterSupport.KernelExpired.
/// </remarks>
private static void ApplyLeadToolsLicense()
{
    const string licenseFileVariable = "LEADTOOLS_LICENSE_FILE";
    const string developerKeyVariable = "LEADTOOLS_DEVELOPER_KEY";

    try
    {
        var licenseFile = Environment.GetEnvironmentVariable(licenseFileVariable);
        var developerKey = Environment.GetEnvironmentVariable(developerKeyVariable);
        if (string.IsNullOrWhiteSpace(licenseFile) || string.IsNullOrWhiteSpace(developerKey))
        {
            logger.Error($"LEADTOOLS license not configured: set {licenseFileVariable} and {developerKeyVariable}");
            return;
        }

        // Reading the file as-is also avoids hand-building the "[License]" text in a string literal.
        byte[] licBytes = File.ReadAllBytes(licenseFile);
        RasterSupport.SetLicense(licBytes, developerKey);
    }
    catch (Exception ex)
    {
        logger.Error(ex, "Failed to apply LEADTOOLS license");
    }
}