/// <summary>
/// Copies the OMR reading of the first zone on <paramref name="ocrPage"/> into
/// <paramref name="imageField"/>'s FieldResult (Text, IsFilled, Confidence).
/// </summary>
/// <param name="ocrPage">Page that has already been recognized; only zone 0 is inspected.</param>
/// <param name="field">Form field being read; used for log messages and passed through on retry.</param>
/// <param name="imageField">Receives the reading in its FieldResult.</param>
/// <param name="retry">
/// Number of narrowing retries still allowed. When the mark reads as filled and this is
/// greater than zero, zone 0 is resized with ChangeBoundsRatio(..., 0.66), the page is
/// recognized again and the reading is repeated once with <c>retry = 0</c>.
/// </param>
private void GetOmrReading(IOcrPage ocrPage, FormField field, ImageField imageField, int retry = 1)
{
    IOcrPageCharacters pageCharacters = ocrPage.GetRecognizedCharacters();
    if (pageCharacters == null)
    {
        // Fixed: the message used "${field}", which emitted a stray literal '$'.
        logger.Warn($"could not read OMR for {field} ");
        imageField.FieldResult.Confidence = 0;
        imageField.FieldResult.Text = "";
        // NOTE(review): IsFilled is not reset here, so on the retry pass a previous
        // "filled" verdict survives a failed re-read — confirm that is intended.
        return;
    }

    IOcrZoneCharacters zoneCharacters = pageCharacters[0];
    if (zoneCharacters.Count <= 0)
    {
        // NOTE(review): only Text is cleared; IsFilled and Confidence keep whatever an
        // earlier pass stored — confirm that is intended.
        imageField.FieldResult.Text = "";
        return;
    }

    OcrCharacter omrCharacter = zoneCharacters[0];
    imageField.FieldResult.Text = omrCharacter.Code.ToString();
    imageField.FieldResult.IsFilled = omrCharacter.Code == FilledChar;
    imageField.FieldResult.Confidence = omrCharacter.Confidence;

    // often on a fill we get the line from the box, so we retry more narrowly
    if (imageField.FieldResult.IsFilled && retry > 0)
    {
        var orgZone = ocrPage.Zones[0];
        orgZone.Bounds = ChangeBoundsRatio(orgZone.Bounds, 0.66);
        // Store the modified zone back. If OcrZone is a value type the indexer returned a
        // copy and the narrowed bounds would otherwise never reach the page; if it is a
        // reference type this assignment is a harmless no-op.
        ocrPage.Zones[0] = orgZone;
        ocrPage.Recognize(null);
        GetOmrReading(ocrPage, field, imageField, 0);
        // Logged after the retry regardless of what the narrower read concluded.
        logger.Info($"FILLED {field.Name}");
    }
}
/// <summary>
/// Matches each page image against the not-yet-used block master forms and, for every
/// matched page, crops, saves and OCRs each field of the master, recording the outcome in
/// <paramref name="formResults"/>.
/// </summary>
/// <param name="formResults">
/// Run-level result object. Its page counters, status, per-page best confidences, timed-out
/// page names, matched master names and per-field results are updated in place.
/// </param>
/// <param name="fileInfos">
/// One entry per page. An entry whose Image is null is loaded from its ImageFileInfo.
/// </param>
/// <returns>
/// One FilledForm per page visited (timed-out and unmatched pages included), or
/// <c>null</c> when an unexpected exception escapes page processing. Iteration stops early
/// once every master form has been matched, or when more than two pages have timed out
/// without any page having been mapped to a form.
/// </returns>
public List<FilledForm> ProcessOcr(ResultsForPrettyJson formResults, List<ImageInfo> fileInfos)
{
    try
    {
        var outDir = formResults.OriginalDirectoryName;
        var retForms = new List<FilledForm>();
        var usedMasters = new HashSet<MasterForm>();
        Stopwatch stopWatch = new Stopwatch();
        stopWatch.Start();
        formResults.PagesInPdf = fileInfos.Count;

        foreach (var ofi in fileInfos)
        {
            FilledForm newForm = new FilledForm();
            retForms.Add(newForm);
            newForm.ImageInfoMaster.InitialImage = ofi;
            newForm.Name = Path.GetFileNameWithoutExtension(ofi.ImageFileInfo.Name);
            if (ofi.Image == null)
            {
                ofi.Image = LoadImageFile(ofi.ImageFileInfo.FullName, 1, -1);
            }

            var par = new FormThreadCallParams() { ImageInfo = ofi, StopWatch = stopWatch, Form = newForm };

            // Timeouts below 50 seconds run the preparation on a watchdog thread;
            // anything else runs it synchronously on the caller's thread.
            if (PageTimeoutInSeconds < 50)
            {
                Thread t = new Thread(this.PrepareNewFormThreader);
                t.Start(par);
                if (!t.Join(TimeSpan.FromSeconds(PageTimeoutInSeconds)))
                {
                    // NOTE(review): Thread.Abort is only supported on .NET Framework; on
                    // .NET Core / .NET 5+ it throws PlatformNotSupportedException — confirm
                    // the target runtime before porting.
                    t.Abort();
                    formResults.TimedOutPages.Add(newForm.Name);
                    formResults.BestFormConfidence.Add(-1);
                    if (formResults.TimedOutPages.Count > 2 && formResults.PagesMappedToForm == 0)
                    {
                        formResults.Status = $"Form abandoned for timeout after {formResults.BestFormConfidence.Count} pages";
                        logger.Error(formResults.Status);
                        return retForms;
                    }
                    continue;
                }
            }
            else
            {
                PrepareNewFormThreader(par);
            }

            Debug.Assert(par.Attributes != null);
            var filledFormAttributes = par.Attributes;

            // Pick the unused master with the highest confidence above 85. bestConfidence
            // tracks the highest score seen even when no master clears that threshold.
            MasterForm currentMasterBlockForm = null;
            int bestConfidence = -1;
            int currentConfidence = 85;
            foreach (var master in BlockMasterForms)
            {
                if (usedMasters.Contains(master))
                {
                    continue;
                }

                var result = RecognitionEngine.CompareForm(master.Attributes, filledFormAttributes, null, null);
                if (result.Confidence > currentConfidence)
                {
                    currentMasterBlockForm = master;
                    bestConfidence = currentConfidence = result.Confidence;
                }
                else if (result.Confidence > bestConfidence)
                {
                    bestConfidence = result.Confidence;
                }
            }
            formResults.BestFormConfidence.Add(bestConfidence);

            if (currentMasterBlockForm != null)
            {
                formResults.MasterFormPages.Add(currentMasterBlockForm.Properties.Name);
                formResults.PagesMappedToForm++;
                logger.Info($"FilledForm matched {newForm.Name} {newForm.Status} {stopWatch.ElapsedMilliseconds} ");

                // Working copies: a cleaned-up page for text fields, and a further
                // OMR-prepared copy of it for checkbox fields.
                newForm.ImageInfoMaster.InitialImage = ofi;
                var centeredImage = ofi.Image.CloneAll();
                CleanupImage(centeredImage);
                newForm.ImageInfoMaster.CenteredImage = new ImageInfo() { Image = centeredImage };
                var omrImage = centeredImage.CloneAll();
                PrepareOmrImage(omrImage);
                newForm.ImageInfoMaster.OmrImage = new ImageInfo() { Image = omrImage };
                newForm.Status = "Matched";
                newForm.Master = currentMasterBlockForm;

                var alignment = RecognitionEngine.GetFormAlignment(newForm.Master.Attributes, newForm.Attributes, null);
                var fields = currentMasterBlockForm.ProcessingPages[0];
                // NOTE(review): the master's Resolution is read here but overwritten with
                // 300 for every field below, so the "300 / scaler" factor is currently 1 —
                // confirm whether that override is still wanted.
                var scaler = currentMasterBlockForm.Resolution;

                var subDirField = Path.Combine(outDir, "fields");
                var fileNameFieldOnly = Path.Combine(subDirField, newForm.Name + "_fields.jpg");
                var googleResultsFile = Path.Combine(subDirField, newForm.Name + "_google.json");

                // White page-sized canvas onto which every non-block text field is pasted.
                // Disposed once it has been saved (previously it was never released).
                using (var fieldsOnlyImage = RasterImage.Create(centeredImage.Width, centeredImage.Height, centeredImage.BitsPerPixel, 300, RasterColor.White))
                {
                    foreach (var field in fields)
                    {
                        var isBlock = field.Name.Contains("block");
                        var rect200 = alignment[0].AlignRectangle(field.Bounds);
                        scaler = 300;
                        int fudge = isBlock ? 30 : 1;
                        var rect300 = new LeadRect(
                            rect200.Left * 300 / scaler - fudge,
                            rect200.Top * 300 / scaler - fudge,
                            rect200.Width * 300 / scaler + fudge,
                            rect200.Height * 300 / scaler + fudge);
                        try
                        {
                            // OMR fields are read from the OMR-prepared page, everything
                            // else from the cleaned-up page. The checks are exact-type
                            // matches, so subclasses fall through to Text.
                            var imageInfoToUse = newForm.ImageInfoMaster.CenteredImage;
                            var zoneType = OcrZoneType.Text;
                            if (field.GetType() == typeof(OmrFormField))
                            {
                                imageInfoToUse = newForm.ImageInfoMaster.OmrImage;
                                zoneType = OcrZoneType.Omr;
                            }
                            else if (field.GetType() == typeof(ImageFormField))
                            {
                                zoneType = OcrZoneType.Graphic;
                            }

                            var image = imageInfoToUse.Image.CloneAll();
                            var subDir = Path.Combine(outDir, isBlock ? "blocks" : "fields");
                            var fileName = Path.Combine(subDir, newForm.Name + "_" + field.Name + ".jpg");
                            var imageField = new ImageField
                            {
                                Field = field,
                                FieldResult =
                                {
                                    FieldName = field.Name,
                                    IsBlock = isBlock,
                                    ImageFile = fileName,
                                    Bounds = rect300.ToString(),
                                    FieldType = zoneType.ToString(),
                                    Error = "None"
                                }
                            };
                            imageField.Rectangle = new Rectangle(rect300.X, rect300.Y, rect300.Width, rect300.Height);

                            try
                            {
                                // Crop the page copy down to the field and save the snippet.
                                EnsurePathExists(subDir);
                                CropCommand command = new CropCommand { Rectangle = rect300 };
                                command.Run(image);
                                RasterCodecs.Save(image, fileName, RasterImageFormat.Jpeg, bitsPerPixel: 8);

                                if (!isBlock && zoneType == OcrZoneType.Text)
                                {
                                    try
                                    {
                                        // Paste the snippet onto the overview canvas at its
                                        // page position. The temporary copy is released as
                                        // soon as the combine has run (it used to leak).
                                        using (var snippetCopy = image.Clone())
                                        {
                                            var combiner = new CombineCommand();
                                            combiner.SourceImage = snippetCopy;
                                            combiner.DestinationRectangle = rect300;
                                            var regionBounds = image.GetRegionBounds(null);
                                            combiner.SourcePoint = new LeadPoint(regionBounds.X, regionBounds.Y);
                                            combiner.Flags = CombineCommandFlags.OperationOr | CombineCommandFlags.Destination0;
                                            combiner.Run(fieldsOnlyImage);
                                        }
                                    }
                                    catch (Exception exCombine)
                                    {
                                        // A failed paste only affects the overview image;
                                        // the field itself is still processed.
                                        logger.Error(exCombine, $"error combining field {field.Name} {rect300}");
                                    }
                                }

                                var imageInfo = new ImageInfo() { Image = image, ImageFileInfo = new FileInfo(fileName) };
                                imageField.ImageInfo = imageInfo;

                                if (!isBlock && zoneType != OcrZoneType.Graphic)
                                {
                                    // NOTE(review): AutoDispose presumably lets the page
                                    // dispose `image` when the page is disposed, while
                                    // imageField.ImageInfo still references it — confirm
                                    // nothing reads that image afterwards.
                                    using (IOcrPage ocrPage = OcrEngine.CreatePage(image, OcrImageSharingMode.AutoDispose))
                                    {
                                        OcrZone ocrZone = new OcrZone
                                        {
                                            ZoneType = zoneType,
                                            Bounds = new LeadRect(fudge, fudge, image.ImageSize.Width - fudge, image.ImageSize.Height - fudge)
                                        };
                                        ocrPage.Zones.Add(ocrZone);
                                        ocrPage.Recognize(null);

                                        if (zoneType == OcrZoneType.Omr)
                                        {
                                            // Extra diagnostic output for one specific field name.
                                            if (field.Name.Contains("C2NGVD1929"))
                                            {
                                                logger.Info(ocrZone.Bounds);
                                            }
                                            GetOmrReading(ocrPage, field, imageField);
                                        }
                                        else if (zoneType == OcrZoneType.Text)
                                        {
                                            var resultsPage = GetPageConfidence(ocrPage);
                                            imageField.FieldResult.Confidence = resultsPage.Confidence;
                                            char[] crlf = { '\r', '\n' };
                                            imageField.FieldResult.Text = ocrPage.GetText(0).TrimEnd(crlf);
                                        }
                                    }
                                }

                                logger.Info(
                                    $"field {field.Name} {rect300} [{imageField.FieldResult.Text}] confidence: {imageField.FieldResult.Confidence}");
                            }
                            catch (Exception exField)
                            {
                                // The field is still reported, carrying the error message.
                                logger.Error(exField, $"Error processing {field.Name}");
                                formResults.FieldsWithError++;
                                imageField.FieldResult.Error = exField.Message;
                            }

                            newForm.ImageFields.Add(imageField);
                            formResults.OcrFields.Add(imageField.FieldResult);
                            formResults.Status = "FormMatched";
                        }
                        catch (Exception ex)
                        {
                            // Failure before the field could be set up: the field is
                            // skipped and the page is marked with the error.
                            logger.Error(ex, $"Error on field {field.Name} {rect300}");
                            newForm.Status = $"Error|Field {field.Name} {rect300}: [{ex.Message}]";
                        }
                    }

                    // The "fields" directory is otherwise only created while processing a
                    // non-block field; make sure it exists before writing the overview.
                    EnsurePathExists(subDirField);
                    RasterCodecs.Save(PrepareOmrImage(fieldsOnlyImage), fileNameFieldOnly, RasterImageFormat.Jpeg, bitsPerPixel: 8);
                }

                var googleResults = GoogleOcr(fileNameFieldOnly);
                if (googleResults.Count > 0)
                {
                    var json = JsonConvert.SerializeObject(googleResults, Formatting.Indented);
                    File.WriteAllText(googleResultsFile, json);
                    MergeGoogleOcr(newForm, googleResults);
                }

                // Each master form is matched to at most one page per call.
                usedMasters.Add(currentMasterBlockForm);
            }
            else
            {
                newForm.Status = "Unmatched|No MasterForm match";
            }

            logger.Info($"FilledForm processed {newForm.Name} {newForm.Status} {stopWatch.ElapsedMilliseconds} ");
            if (usedMasters.Count == BlockMasterForms.Count)
            {
                logger.Info("found all master forms");
                break;
            }
        }

        stopWatch.Stop();
        return retForms;
    }
    catch (Exception ex)
    {
        // Callers receive null (not an empty list) when processing fails unexpectedly.
        logger.Error(ex, "Untrapped error found");
        return null;
    }
}