/// <summary>
/// Looks for text areas whose text is one string written twice in a row
/// (first half equals second half) and collapses them to a single copy.
/// The area is copied out of the page image and run through OCR; the text is
/// replaced either when the OCR result doubled equals the current text, or when
/// the OCR result has a different length and the first half matches the
/// currency/amount pattern.
/// </summary>
/// <param name="Page">Page whose <c>TextAreas</c> are inspected and updated in place.</param>
/// <param name="PageImageID">
/// GdPicture image identifier used as the source of the per-area copies
/// (presumably the rendered page image; confirm against the caller).
/// </param>
private void CheckTextWithRecognition(Page Page, int PageImageID) {
    using (GdPictureImaging Imaging = new GdPictureImaging()) {
        Imaging.OCRTesseractSetOCRContext(OCRContext.OCRContextSingleBlock);
        Imaging.OCRTesseractSetPassCount(2);

        for (int i = 0; i < Page.TextAreas.Count; i++) {
            var Area = Page.TextAreas[i];

            // Only an even-length text can consist of two identical halves.
            if (!IsEven(Area.Text.Length)) {
                continue;
            }
            var Left = Area.Text.Substring(0, Area.Text.Length / 2);
            var Right = Area.Text.Substring(Area.Text.Length / 2);
            if (Left != Right) {
                continue;
            }

            // Scratch image is two pixels larger than the area; the copy is drawn at offset (1, 1).
            int Width = (int)Area.Width + 2;
            int Height = (int)Area.Height + 2;
            int TempId = Imaging.CreateNewGdPictureImage(Width, Height, 1, System.Drawing.Color.Transparent);
            string RecognizedValue;
            try {
                Imaging.DrawGdPictureImageRect(PageImageID, TempId, 1, 1, Width, Height,
                    (int)Area.Bound.Left, (int)Area.Bound.Top, Width, Height,
                    System.Drawing.Drawing2D.InterpolationMode.NearestNeighbor);
                RecognizedValue = Imaging.OCRTesseractDoOCR(TempId, "nld", ATAPY.Common.Application.Path, string.Empty);
                Imaging.OCRTesseractClear();
            } finally {
                // Release the scratch image even when drawing or OCR throws, so it is not leaked.
                Imaging.ReleaseGdPictureImage(TempId);
            }

            // Check for word duplication.
            if (RecognizedValue.Length * 2 == Area.Text.Length) {
                if (Area.Text == RecognizedValue + RecognizedValue) {
                    Area.Text = RecognizedValue;
                    FillCharParams(Area);
                }
            }
            // NOTE(review): if MatchRegularExpression uses .NET regex semantics, the unescaped '$'
            // is an end anchor and the alternation is ungrouped, so this matches "starts with EUR"
            // or "contains €" rather than an amount. Presumably "^(EUR|€|\$)..." was intended;
            // confirm before changing the pattern.
            else if (ATAPY.Common.String.MatchRegularExpression(Left, @"^EUR|€|$\d+\s?[.,]?\s?\d*\s?[.,]?\s?\d{2}$")) {
                Area.Text = Left;
                FillCharParams(Area);
            }
        }
    }
}
/// <summary>
/// Parses the extracted page text into <c>TextArea</c> objects and adds the
/// left-to-right ones to <paramref name="Page"/>, then runs <c>GetFontSize</c>
/// for every text area on the page.
/// </summary>
/// <param name="PageText">
/// Extracted text, one record per line (split on CR/LF). Each record is split on
/// <c>SEPARATOR[0]</c>; the field at index 8 is used as the word and the fields are
/// passed to <c>GetRect</c> to obtain the bounds and orientation.
/// </param>
/// <param name="SourcePDF">Source PDF; its origin is set to top-left and its unit to points.</param>
/// <param name="Page">Page that receives the text areas.</param>
/// <remarks>
/// Exceptions raised while parsing a record (for example a record with fewer than
/// nine fields) propagate to the caller with their original stack trace.
/// </remarks>
private void FillPageText(string PageText, GdPicturePDF SourcePDF, Page Page) {
    // Dispose the imaging object deterministically, as the sibling methods do.
    using (GdPictureImaging Imaging = new GdPictureImaging()) {
        var Words = PageText.Split(new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
        SourcePDF.SetOrigin(PdfOrigin.PdfOriginTopLeft);
        SourcePDF.SetMeasurementUnit(PdfMeasurementUnit.PdfMeasurementUnitPoint);

        foreach (string WordSet in Words) {
            var Coords = WordSet.Split(SEPARATOR[0]);
            var Word = CorrectWord(Coords[8]);
            // IsNullOrWhiteSpace already covers null and empty strings.
            if (string.IsNullOrWhiteSpace(Word)) {
                continue;
            }

            E_TextOrientation Orientation;
            System.Windows.Rect Rect = GetRect(Coords, out Orientation);
            if (Rect.IsEmpty) {
                continue;
            }

            TextArea Area = new TextArea(Rect, Word, Page);
            Area.Orientation = Orientation;
            // Only left-to-right text is kept on the page.
            if (Orientation == E_TextOrientation.LeftRight) {
                Page.TextAreas.Add(Area);
                FillCharParams(Area);
            }
        }

        // Page-sized 24-bit white scratch image handed to GetFontSize.
        int ID = Imaging.CreateNewGdPictureImage((int)Page.Width, (int)Page.Height, 24, System.Drawing.Color.White);
        try {
            foreach (var item in Page.TextAreas) {
                // The return value is not used here; presumably GetFontSize stores the
                // computed size on the area (confirm against its definition).
                GetFontSize(Imaging, ID, item, "Arial");
            }
        } finally {
            // Release the scratch image even if GetFontSize throws.
            Imaging.ReleaseGdPictureImage(ID);
        }
    }
}
/// <summary>
/// Tightens the bounds of every text area on the page to the pixels found in the
/// page image, then replaces each area's <c>FontSize</c> with the mean height of
/// all areas that shared its original <c>FontSize</c>, and refreshes the character
/// parameters of every area.
/// </summary>
/// <param name="Page">Page whose <c>TextAreas</c> are updated in place.</param>
/// <param name="PageImageID">
/// GdPicture image identifier used as the source of the per-area copies
/// (presumably the rendered page image; confirm against the caller).
/// </param>
private void CorrectTextCoordinates(Page Page, int PageImageID) {
    using (GdPictureImaging Imaging = new GdPictureImaging()) {
        // Per original font size: sum of heights and number of areas contributing to it.
        Dictionary<double, double> HeightSums = new Dictionary<double, double>();
        Dictionary<double, int> HeightCounts = new Dictionary<double, int>();

        for (int i = 0; i < Page.TextAreas.Count; i++) {
            var Area = Page.TextAreas[i];
            int Width = (int)Area.Width + 1;
            int Height = (int)Area.Height + 1;
            int TopOffset;
            int BottomOffset;
            int LeftOffset;
            int RightOffset;

            int TempId = Imaging.CreateNewGdPictureImage(Width, Height, 1, System.Drawing.Color.Transparent);
            try {
                Imaging.DrawGdPictureImageRect(PageImageID, TempId, 0, 0, Width, Height,
                    (int)Area.Bound.Left, (int)Area.Bound.Top, Width, Height,
                    System.Drawing.Drawing2D.InterpolationMode.NearestNeighbor);

                if (Imaging.GetAverageColor(TempId).Name == "ffffffff") {
                    // Transparent or white text printed: nothing to measure, keep the bounds
                    // and count the area with its current height.
                    AddHeightSample(HeightSums, HeightCounts, Area.FontSize, Area.Height);
                    continue;
                }

                bool[] VerticalLines;
                var LinesSpread = GetAreaLines(TempId, Imaging, Width, Height, out VerticalLines);
                bool Is2Dots = (Area.Text == ":" || Area.Text == ";");
                AnalyzeLines(LinesSpread, out TopOffset, out BottomOffset, Is2Dots);
                AnalyzeVerticalLines(VerticalLines, out LeftOffset, out RightOffset);
            } finally {
                // Always release the scratch image, including on the blank-area 'continue'
                // path (which previously leaked it) and when an exception is thrown.
                Imaging.ReleaseGdPictureImage(TempId);
            }

            Area.Bound = new Rect(Area.Bound.Left + LeftOffset, Area.Bound.Top + TopOffset,
                Area.Width - LeftOffset - RightOffset, Area.Height - TopOffset - BottomOffset);

            // NOTE(review): Area.Height is read after Bound was reassigned. If Height is derived
            // from Bound, the offsets are subtracted twice here; confirm against TextArea.
            AddHeightSample(HeightSums, HeightCounts, Area.FontSize, Area.Height - TopOffset - BottomOffset);
        }

        // Mean height per original font size.
        Dictionary<double, double> AverageNewFontSize = new Dictionary<double, double>();
        foreach (var item in HeightSums) {
            AverageNewFontSize.Add(item.Key, item.Value / HeightCounts[item.Key]);
        }

        foreach (var Area in Page.TextAreas) {
            Area.FontSize = AverageNewFontSize[Area.FontSize];
            if (Area.Text == "-") {
                // Grow the dash's bounds vertically, symmetrically, based on the measured
                // height of "W" in Arial at the area's new font size.
                int Offset;
                int TempId = Imaging.CreateNewGdPictureImage((int)Area.Width, (int)Area.Width, 1, System.Drawing.Color.Transparent);
                try {
                    var ExpectedHeight = Imaging.GetTextHeight(TempId, "W", "Arial", (float)Area.FontSize, GdPicture9.FontStyle.FontStyleRegular);
                    Offset = (int)((ExpectedHeight - Area.Height) / 2.0) + 1;
                } finally {
                    Imaging.ReleaseGdPictureImage(TempId);
                }
                Area.Bound = new Rect(Area.Bound.Left, Area.Bound.Top - Offset, Area.Width, Area.Height + 2 * Offset);
            }
            FillCharParams(Area);
        }
    }
}

/// <summary>
/// Adds one height sample for the given font size to the running sum and count.
/// </summary>
private static void AddHeightSample(Dictionary<double, double> HeightSums, Dictionary<double, int> HeightCounts, double FontSize, double Height) {
    double Sum;
    if (HeightSums.TryGetValue(FontSize, out Sum)) {
        HeightSums[FontSize] = Sum + Height;
        HeightCounts[FontSize]++;
    } else {
        HeightSums.Add(FontSize, Height);
        HeightCounts.Add(FontSize, 1);
    }
}