/// <summary>
/// Parses a tab-separated OCR text dump and fills <paramref name="data"/> with the words it contains
/// and <paramref name="lines"/> with the lines those words are grouped into.
/// </summary>
/// <remarks>
/// The file is read lazily with File.ReadLines. Each text line is split on '\t' with empty entries removed:
/// - fewer than three fields: the line is not a word record. If it contains the substring "Line", the
///   OcrLine being built is closed: when it has at least one word it is added to <paramref name="lines"/>
///   and its Rectangle becomes the union of its words' rectangles, then a fresh OcrLine is started.
///   Any other short line is ignored.
/// - three or more fields: an OcrWord is built from field 0 (Contents), field 1 (Rectangle, via GetRect)
///   and field 2 (Confidence, parsed with int.Parse / NumberStyles.Integer) and added to <paramref name="data"/>.
/// While skipChars is positive, input lines are skipped without being inspected.
///
/// NOTE(review): in this collapsed one-line form the "//" that follows data.Add(wrd) turns the rest of the
/// line into a comment, including the statements that add the word to the current line, set wrd.Line,
/// reload skipChars from wrd.Contents.Length, and the method's closing braces. Presumably only part of
/// that tail was meant to be commented out - confirm against the original formatting before relying on it.
/// NOTE(review): an OcrLine that is still open when the file ends is never added to <paramref name="lines"/>
/// by the code visible here; verify whether the input always ends with a "Line" marker.
/// </remarks>
/// <param name="data">Collection that receives every parsed OcrWord.</param>
/// <param name="lines">Collection that receives every completed, non-empty OcrLine.</param>
/// <param name="ocrFile">Path of the text file passed to File.ReadLines.</param>
private static void LoadOcrDataOriginal(ICollection <OcrWord> data, ICollection <OcrLine> lines, string ocrFile) { int skipChars = 0; var ocrLine = new OcrLine(); foreach (var line in File.ReadLines(ocrFile)) { if (skipChars-- > 0) { continue; } var wordLine = line.Split(new char[] { '\t' }, StringSplitOptions.RemoveEmptyEntries); if (wordLine.Length < 3) { if (line.Contains("Line") == true) { if (ocrLine.Words.Count > 0) { lines.Add(ocrLine); var x = from c in ocrLine.Words select c.Rectangle; ocrLine.Rectangle = x.Aggregate((accumlator, tested) => { if (accumlator.IsEmpty == true) { accumlator = tested; } else { accumlator.Union(tested); } return(accumlator); }); } ocrLine = new OcrLine(); } continue; } var wrd = new OcrWord() { Contents = wordLine[0], Rectangle = GetRect(wordLine[1]), Confidence = int.Parse(wordLine[2], NumberStyles.Integer) }; data.Add(wrd); // ocrLine.Words.Add(wrd); wrd.Line = ocrLine; skipChars = wrd.Contents.Length; } }
/// <summary>
/// Loads the OCR page stored in <paramref name="ocrFile"/> and appends its words and lines
/// to <paramref name="doc"/>, then numbers all lines and words of the document.
/// </summary>
/// <remarks>
/// A word whose rectangle (Left, Top, Right, Bottom) matches one already accepted on this page is skipped.
/// Every other word is passed to SplitWord; the resulting pieces are chained through SplitLeft / SplitRight,
/// attached to their line and added to doc.Words. A line is added to doc.Lines only if it ends up with at
/// least one word. Afterwards every line in doc.Lines receives an ID in ascending Rectangle.Top order and
/// every word in doc.Words receives an ID in (Line.ID, Rectangle.X) order; the collections themselves are
/// not reordered.
/// </remarks>
/// <param name="doc">Document whose Words and Lines collections are filled and whose IDs are reassigned.</param>
/// <param name="ocrFile">Path handed to TPage.LoadFromPRD (*.prd).</param>
/// <param name="deskew">
/// When true, each piece's Rectangle is taken from page.Deskew; OriginRectangle always keeps the raw rectangle.
/// </param>
private static void LoadOcrData(DocumentData doc, string ocrFile, bool deskew)
{
    var page = TPage.LoadFromPRD(ocrFile); // *.prd
    // var pageNavigation = new TPageNavigation(page);

    var allWords = doc.Words;
    var allLines = doc.Lines;

    /* Disabled: deskewing of the field rectangles.
     * if (deskew)
     * {
     *     foreach(FieldData field in doc.Fields)
     *     {
     *         field.OriginRectangle = field.Rectangle;
     *         TOCRRect origin = new TOCRRect((int)field.Rectangle.Left,(int)field.Rectangle.Top,(int)field.Rectangle.Width,(int)field.Rectangle.Height);
     *         var originSkew = page.Deskew( origin);
     *         field.Rectangle = new Rect(originSkew.Left, originSkew.Top, originSkew.Width, originSkew.Height);
     *     }
     * }
     */

    // Rectangles of the words accepted so far; a word with an identical rectangle is treated as a repeat.
    var seenRects = new List<TOCRRect>();

    foreach (TLine pageLine in page.Lines)
    {
        var targetLine = new OcrLine
        {
            Rectangle = new Rect(pageLine.Rect.Left, pageLine.Rect.Top, pageLine.Rect.Width, pageLine.Rect.Height)
        };

        foreach (TWord pageWord in pageLine.Words)
        {
            bool isRepeated = seenRects.Any(seen =>
                seen.Left == pageWord.Rect.Left &&
                seen.Top == pageWord.Rect.Top &&
                seen.Right == pageWord.Rect.Right &&
                seen.Bottom == pageWord.Rect.Bottom);
            if (isRepeated)
            {
                continue;
            }

            seenRects.Add(pageWord.Rect);

            // Break the word into its pieces and map each one to a MappedWord.
            IList<MappedWord> pieces = SplitWord(pageWord)
                .Select(part =>
                {
                    var piece = new MappedWord()
                    {
                        Confidence = part.Confidance,
                        Contents = string.Concat(part.Chars.Select(ch => ch.CharData).ToArray())
                    };

                    TOCRRect placed;
                    if (deskew)
                    {
                        placed = page.Deskew(part.Rect);
                    }
                    else
                    {
                        placed = part.Rect;
                    }

                    piece.Rectangle = new Rect(placed.Left, placed.Top, placed.Width, placed.Height);
                    piece.OriginRectangle = new Rect(part.Rect.Left, part.Rect.Top, part.Rect.Width, part.Rect.Height);
                    return piece;
                })
                .ToList();

            // Chain neighbouring pieces together and register each one with the document and its line.
            int lastPos = pieces.Count - 1;
            for (int pos = 0; pos <= lastPos; pos++)
            {
                MappedWord current = pieces[pos];

                if (pos != 0)
                {
                    pieces[pos - 1].SplitRight = current;
                }

                if (pos < lastPos)
                {
                    pieces[pos + 1].SplitLeft = current;
                }

                doc.Words.Add(current);
                current.Line = targetLine;
                targetLine.Words.Add(current);
            }
        }

        // Lines that received no words are dropped.
        if (targetLine.Words.Count > 0)
        {
            allLines.Add(targetLine);
        }
    }

    // Number the lines from top to bottom.
    var linesTopDown = allLines.OrderBy(l => l.Rectangle.Top).ToList();
    for (int lineId = 0; lineId < linesTopDown.Count; lineId++)
    {
        linesTopDown[lineId].ID = lineId;
    }

    // Number the words line by line, left to right within a line (relies on the line IDs set above).
    var wordsInReadingOrder = allWords.OrderBy(w => w.Line.ID).ThenBy(w => w.Rectangle.X).ToList();
    for (int wordId = 0; wordId < wordsInReadingOrder.Count; wordId++)
    {
        wordsInReadingOrder[wordId].ID = wordId;
    }
}