/// <summary>
/// Collapses text areas whose text consists of the same string written twice in a row
/// (first half equal to second half) down to a single copy.
/// </summary>
/// <remarks>
/// For each such area the corresponding region of the page image is copied into a
/// temporary image and passed to Tesseract OCR (single-block context, two passes,
/// dictionary name "nld"). The area text is replaced by one half when either
/// the OCR result is exactly that half, or the OCR result has a different length and
/// the half matches the amount pattern below. <c>FillCharParams</c> is called after
/// every replacement.
/// </remarks>
/// <param name="Page">Page whose <c>TextAreas</c> are inspected and updated in place.</param>
/// <param name="PageImageID">GdPicture image identifier of the page image the areas are cut from.</param>
private void CheckTextWithRecognition(Page Page, int PageImageID) {
    using (GdPictureImaging Imaging = new GdPictureImaging()) {
        Imaging.OCRTesseractSetOCRContext(OCRContext.OCRContextSingleBlock);
        Imaging.OCRTesseractSetPassCount(2);

        for (int i = 0; i < Page.TextAreas.Count; i++) {
            var Area = Page.TextAreas[i];

            // Only a text of even length can be two copies of the same string.
            if (!IsEven(Area.Text.Length)) {
                continue;
            }

            var Left = Area.Text.Substring(0, Area.Text.Length / 2);
            var Right = Area.Text.Substring(Area.Text.Length / 2);
            if (Left != Right) {
                continue;
            }

            // The temporary image is 2 px larger than the area in each dimension and the
            // page fragment is drawn at (1, 1).
            int Width = (int)Area.Width + 2;
            int Height = (int)Area.Height + 2;
            int TempId = Imaging.CreateNewGdPictureImage(Width, Height, 1, System.Drawing.Color.Transparent);

            string RecognizedValue;
            try {
                Imaging.DrawGdPictureImageRect(PageImageID, TempId, 1, 1, Width, Height,
                    (int)Area.Bound.Left, (int)Area.Bound.Top, Width, Height,
                    System.Drawing.Drawing2D.InterpolationMode.NearestNeighbor);
                RecognizedValue = Imaging.OCRTesseractDoOCR(TempId, "nld", ATAPY.Common.Application.Path, string.Empty);
                Imaging.OCRTesseractClear();
            } finally {
                // Release the temporary image even when drawing or OCR throws, so it is not leaked.
                Imaging.ReleaseGdPictureImage(TempId);
            }

            // Check for word duplication.
            if (RecognizedValue.Length * 2 == Area.Text.Length) {
                // OCR produced a string of half the length: accept it only if it is the repeated half.
                // When the length matches but the text does not, the area is deliberately left untouched.
                if (Area.Text == RecognizedValue + RecognizedValue) {
                    Area.Text = RecognizedValue;
                    FillCharParams(Area);
                }
            } else if (ATAPY.Common.String.MatchRegularExpression(Left, @"^EUR|€|$\d+\s?[.,]?\s?\d*\s?[.,]?\s?\d{2}$")) {
                // NOTE(review): if MatchRegularExpression uses standard .NET regex syntax, this pattern
                // reads as three alternatives ("^EUR", "€", and "$" followed by digits, where the
                // unescaped "$" is an end-of-input anchor). It looks like a currency prefix followed
                // by an amount was intended - confirm before changing the pattern.
                Area.Text = Left;
                FillCharParams(Area);
            }
        }
    }
}
/// <summary>
/// Shrinks the bounding box of every text area on the page by the offsets computed from
/// the page image, then replaces each area's font size with the average height recorded
/// for all areas that shared its original font size.
/// </summary>
/// <remarks>
/// Areas whose image fragment has the average color "ffffffff" are not resized; their
/// current height still contributes to the average. After the font sizes are replaced,
/// areas whose text is "-" are grown vertically around their current position, and
/// <c>FillCharParams</c> is called for every area.
/// </remarks>
/// <param name="Page">Page whose <c>TextAreas</c> are updated in place.</param>
/// <param name="PageImageID">GdPicture image identifier of the page image the areas are cut from.</param>
private void CorrectTextCoordinates(Page Page, int PageImageID) {
    using (GdPictureImaging Imaging = new GdPictureImaging()) {
        // Sum of heights and number of areas, both keyed by the areas' original font size.
        Dictionary<double, double> AverageNewHeight = new Dictionary<double, double>();
        Dictionary<double, int> AverageNewHeightAreas = new Dictionary<double, int>();

        foreach (var Area in Page.TextAreas) {
            int Width = (int)Area.Width + 1;
            int Height = (int)Area.Height + 1;
            int TempId = Imaging.CreateNewGdPictureImage(Width, Height, 1, System.Drawing.Color.Transparent);
            try {
                Imaging.DrawGdPictureImageRect(PageImageID, TempId, 0, 0, Width, Height,
                    (int)Area.Bound.Left, (int)Area.Bound.Top, Width, Height,
                    System.Drawing.Drawing2D.InterpolationMode.NearestNeighbor);

                if (Imaging.GetAverageColor(TempId).Name == "ffffffff") {
                    // Transparent or white text printed: nothing to measure, keep the bound as is.
                    AddHeightSample(AverageNewHeight, AverageNewHeightAreas, Area.FontSize, Area.Height);
                    continue;
                }

                bool[] VerticalLines;
                var LinesSpread = GetAreaLines(TempId, Imaging, Width, Height, out VerticalLines);

                int TopOffset;
                int BottomOffset;
                int LeftOffset;
                int RightOffset;
                bool Is2Dots = (Area.Text == ":" || Area.Text == ";");
                AnalyzeLines(LinesSpread, out TopOffset, out BottomOffset, Is2Dots);
                AnalyzeVerticalLines(VerticalLines, out LeftOffset, out RightOffset);

                Area.Bound = new Rect(Area.Bound.Left + LeftOffset, Area.Bound.Top + TopOffset,
                    Area.Width - LeftOffset - RightOffset, Area.Height - TopOffset - BottomOffset);

                // NOTE(review): Area.Height is read here after Area.Bound has been replaced. If Height
                // is derived from Bound, the offsets are subtracted a second time - confirm against the
                // text-area type. The expression is kept exactly as it was.
                AddHeightSample(AverageNewHeight, AverageNewHeightAreas, Area.FontSize,
                    Area.Height - TopOffset - BottomOffset);
            } finally {
                // Runs on the early "continue" as well, so the temporary image is always released.
                Imaging.ReleaseGdPictureImage(TempId);
            }
        }

        // Average recorded height per original font size.
        Dictionary<double, double> AverageNewFontSize = new Dictionary<double, double>();
        foreach (var item in AverageNewHeight) {
            AverageNewFontSize.Add(item.Key, item.Value / AverageNewHeightAreas[item.Key]);
        }

        foreach (var Area in Page.TextAreas) {
            Area.FontSize = AverageNewFontSize[Area.FontSize];

            if (Area.Text == "-") {
                // Grow the box of a dash vertically, based on the measured height of "W" in Arial
                // at the new font size.
                int TempId = Imaging.CreateNewGdPictureImage((int)Area.Width, (int)Area.Width, 1, System.Drawing.Color.Transparent);
                int Offset;
                try {
                    var ExpectedHeight = Imaging.GetTextHeight(TempId, "W", "Arial", (float)Area.FontSize, GdPicture9.FontStyle.FontStyleRegular);
                    Offset = (int)((ExpectedHeight - Area.Height) / 2.0) + 1;
                } finally {
                    Imaging.ReleaseGdPictureImage(TempId);
                }
                Area.Bound = new Rect(Area.Bound.Left, Area.Bound.Top - Offset, Area.Width, Area.Height + 2 * Offset);
            }

            FillCharParams(Area);
        }
    }
}

/// <summary>
/// Adds one height sample for the given font size to the running sum and sample count.
/// </summary>
private static void AddHeightSample(Dictionary<double, double> HeightSums, Dictionary<double, int> HeightCounts, double FontSize, double Height) {
    double Sum;
    if (HeightSums.TryGetValue(FontSize, out Sum)) {
        HeightSums[FontSize] = Sum + Height;
        HeightCounts[FontSize]++;
    } else {
        HeightSums.Add(FontSize, Height);
        HeightCounts.Add(FontSize, 1);
    }
}