/// <summary>
/// Creates a navigation object over the supplied document data.
/// </summary>
/// <param name="pageOcr">Document data stored in <c>PageOcr</c> before <c>Init</c> is run.</param>
public DocumentCandidateNavigation(DocumentData pageOcr)
{
    // PageOcr is assigned first, so it is already set when Init executes.
    PageOcr = pageOcr;

    Init();
}
/// <summary>
/// Creates a navigation object over the supplied document data.
/// </summary>
/// <param name="pageOcr">Document data stored in <c>PageOcr</c> before <c>Init</c> is run.</param>
public DocumentDataNavigation(DocumentData pageOcr)
{
    // PageOcr is assigned first, so it is already set when Init executes.
    PageOcr = pageOcr;

    Init();
}
/// <summary>
/// Loads an OCR page from a PRD file and appends its words and lines to <paramref name="doc"/>.
/// </summary>
/// <remarks>
/// Every OCR word is passed through <c>SplitWord</c>; the resulting pieces are linked to their
/// neighbours through <c>SplitLeft</c>/<c>SplitRight</c>, added to <c>doc.Words</c> and attached
/// to a new <c>OcrLine</c>. A word whose rectangle matches one already imported is skipped, and a
/// line that ends up without words is not added. Afterwards all lines in <c>doc.Lines</c> receive
/// IDs in order of their top edge, and all words in <c>doc.Words</c> receive IDs ordered by line ID
/// and then by X. The collections themselves are not reordered; only the IDs are assigned.
/// </remarks>
/// <param name="doc">Document that receives the words and lines.</param>
/// <param name="ocrFile">Path of the PRD file passed to <c>TPage.LoadFromPRD</c>.</param>
/// <param name="deskew">
/// When true, each word's <c>Rectangle</c> is taken from <c>page.Deskew</c>; <c>OriginRectangle</c>
/// always keeps the rectangle as it was read.
/// </param>
private static void LoadOcrData(DocumentData doc, string ocrFile, bool deskew)
{
    var page = TPage.LoadFromPRD(ocrFile); // *.prd
    var allWords = doc.Words;
    var allLines = doc.Lines;

    // Rectangles of the OCR words imported so far, used to skip duplicates.
    var importedRects = new List<TOCRRect>();

    foreach (TLine sourceLine in page.Lines)
    {
        var targetLine = new OcrLine
        {
            Rectangle = new Rect(sourceLine.Rect.Left, sourceLine.Rect.Top, sourceLine.Rect.Width, sourceLine.Rect.Height)
        };

        foreach (TWord sourceWord in sourceLine.Words)
        {
            bool alreadyImported = importedRects.Any(r => r.Left == sourceWord.Rect.Left
                                                          && r.Top == sourceWord.Rect.Top
                                                          && r.Right == sourceWord.Rect.Right
                                                          && r.Bottom == sourceWord.Rect.Bottom);
            if (alreadyImported)
            {
                continue;
            }

            importedRects.Add(sourceWord.Rect);

            // Convert every piece of the split word into a MappedWord.
            var pieces = new List<MappedWord>();
            foreach (var part in SplitWord(sourceWord))
            {
                var mapped = new MappedWord()
                {
                    Confidence = part.Confidance,
                    Contents = string.Concat(part.Chars.Select(c => c.CharData).ToArray())
                };

                TOCRRect placement;
                if (deskew)
                {
                    placement = page.Deskew(part.Rect);
                }
                else
                {
                    placement = part.Rect;
                }

                mapped.Rectangle = new Rect(placement.Left, placement.Top, placement.Width, placement.Height);
                mapped.OriginRectangle = new Rect(part.Rect.Left, part.Rect.Top, part.Rect.Width, part.Rect.Height);
                pieces.Add(mapped);
            }

            // Link each piece to its neighbours and register it with the document and the line.
            for (int position = 0; position < pieces.Count; position++)
            {
                MappedWord current = pieces[position];

                if (position > 0)
                {
                    pieces[position - 1].SplitRight = current;
                }

                if (position + 1 < pieces.Count)
                {
                    pieces[position + 1].SplitLeft = current;
                }

                doc.Words.Add(current);
                current.Line = targetLine;
                targetLine.Words.Add(current);
            }
        }

        if (targetLine.Words.Any())
        {
            allLines.Add(targetLine);
        }
    }

    // Number the lines from top to bottom.
    var linesTopDown = allLines.OrderBy(l => l.Rectangle.Top).ToList();
    for (int lineId = 0; lineId < linesTopDown.Count; lineId++)
    {
        linesTopDown[lineId].ID = lineId;
    }

    // Number the words by line, then by horizontal position. This needs the line IDs set above.
    var wordsInOrder = allWords.OrderBy(w => w.Line.ID).ThenBy(w => w.Rectangle.X).ToList();
    int nextWordId = 0;
    foreach (OcrWord numbered in wordsInOrder)
    {
        numbered.ID = nextWordId++;
    }
}
/// <summary>
/// Groups the words of <paramref name="cluster"/> into lines and assigns each word its
/// <c>Clusterline</c>.
/// </summary>
/// <remarks>
/// For every word of the cluster, the words found by the document navigator in a horizontal band
/// (cluster width, word height) are collected, restricted to the same cluster, to words without a
/// <c>Clusterline</c>, and to words whose upper part lies above the reference word's bottom edge.
/// Lines are then built repeatedly from the first remaining word whose height is below twice the
/// value returned by <c>CalcWordHeight</c> (or simply the first remaining word if there is none).
/// Each line's words are sorted by their left edge.
/// </remarks>
/// <param name="doc">Document whose <c>WordsNavigator</c> is used to look up words by rectangle.</param>
/// <param name="cluster">Cluster whose <c>Fields</c> are split into lines; the list itself is not modified.</param>
/// <returns>The created lines, with IDs assigned in creation order starting at 0.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when no candidate words are found for a word, when the chosen seed word has no
/// unassigned candidates left, or when a composed line contains no words.
/// </exception>
public static List<FieldClusterLine> CreateLines(DocumentData doc, fieldClusterModel cluster)
{
    DocumentDataNavigation docNavigate = doc.WordsNavigator;
    var wordHeight = CalcWordHeight(cluster.Fields);
    var result = new List<FieldClusterLine>();

    // Working copy of the cluster words, top-most first.
    var pending = new List<MappedWord>(cluster.Fields.OrderBy(a => a.Rectangle.Top));

    // For each word: the unassigned words of the same cluster that share its horizontal band.
    var wordLines = new Dictionary<MappedWord, List<MappedWord>>();
    foreach (var word in pending)
    {
        var band = new System.Windows.Rect(cluster.Area.Left, word.Rectangle.Top, cluster.Area.Width, word.Rectangle.Height);
        var inTheSameLine = docNavigate.GetWords(band)
            .Where(x => x.Cluster.ID == word.Cluster.ID && x.Clusterline == null)
            .Where(x => x == word || x.Rectangle.Top + (x.Rectangle.Height / 2) * 1.2 < word.Rectangle.Bottom)
            .ToList();

        if (inTheSameLine.Count == 0)
        {
            throw new InvalidOperationException("A) No words in word line");
        }

        wordLines.Add(word, inTheSameLine);
    }

    int index = 0;
    while (pending.Count > 0)
    {
        // Prefer a seed of ordinary height; fall back to the first remaining word.
        MappedWord seed = pending.FirstOrDefault(x => x.Rectangle.Height < wordHeight * 2)
                          ?? pending.First();

        var baseWords = wordLines[seed].Where(x => x.Clusterline == null).ToList();
        if (baseWords.Count == 0)
        {
            // Bad word
            throw new InvalidOperationException("B) No words in word line");
        }

        var fieldLine = new FieldClusterLine();
        fieldLine.ID = index++;
        fieldLine.Fields = wordLines.Where(x => baseWords.Contains(x.Key))
            .SelectMany(x => x.Value)
            .Where(x => x.Clusterline == null)
            .Distinct()
            .OrderBy(x => x.Rectangle.Left)
            .ToList();

        // An empty line would remove nothing from the work list and change no word state,
        // so the loop would never terminate. Fail loudly instead.
        if (fieldLine.Fields.Count == 0)
        {
            throw new InvalidOperationException("C) No words in cluster line");
        }

        pending.RemoveAll(a => fieldLine.Fields.Contains(a));
        fieldLine.Fields.ForEach(a => a.Clusterline = fieldLine);
        result.Add(fieldLine);
    }

    return result;
}
/// <summary>
/// Builds the clusters of a document: one cluster per word to begin with, then merged in two
/// passes (first with <c>AddIfIntersect</c>, then with <c>AddIfInside</c>) until neither pass
/// changes anything.
/// </summary>
/// <remarks>
/// After merging, clusters receive consecutive IDs starting at 0, every word in <c>doc.Words</c>
/// gets its <c>Cluster</c> set to the first cluster whose <c>Fields</c> contain it (or null if
/// none does), and each cluster's <c>lines</c> is filled by <c>LineEngine.CreateLines</c>.
/// </remarks>
/// <param name="doc">Document whose words are clustered.</param>
/// <returns>The remaining clusters after both merge passes.</returns>
public static List<fieldClusterModel> ClusterListFromDoc(DocumentData doc)
{
    // Start with one cluster per document word.
    List<fieldClusterModel> result = doc.Words
        .Select(word => new fieldClusterModel((MappedWord)word))
        .ToList();

    MergeUntilStable(result, (target, source) => target.AddIfIntersect(source));
    MergeUntilStable(result, (target, source) => target.AddIfInside(source));

    for (int i = 0; i < result.Count; i++)
    {
        result[i].ID = i;
    }

    foreach (MappedWord word in doc.Words)
    {
        word.Cluster = result.FirstOrDefault(c => c.Fields.Contains(word));
    }

    // Lines are created only after every word knows its cluster.
    foreach (var cluster in result)
    {
        cluster.lines = LineEngine.CreateLines(doc, cluster);
    }

    return result;
}

/// <summary>
/// Repeatedly offers every non-empty cluster to every other non-empty cluster through
/// <paramref name="tryAbsorb"/> until a full sweep makes no change.
/// </summary>
/// <param name="clusters">Cluster list, modified in place; empty clusters are removed after each sweep.</param>
/// <param name="tryAbsorb">
/// Called as (target, source); returns true when the target took over the source, after which
/// the source is cleared.
/// </param>
private static void MergeUntilStable(
    List<fieldClusterModel> clusters,
    System.Func<fieldClusterModel, fieldClusterModel, bool> tryAbsorb)
{
    bool changed;
    do
    {
        changed = false;

        for (int i = 0; i < clusters.Count; i++)
        {
            var source = clusters[i];
            if (source.isEmpty)
            {
                continue;
            }

            for (int j = 0; j < clusters.Count; j++)
            {
                if (i == j)
                {
                    continue;
                }

                var target = clusters[j];
                if (target.isEmpty)
                {
                    continue;
                }

                if (tryAbsorb(target, source))
                {
                    // The source was taken over by the target; presumably Clear() makes isEmpty true
                    // so that it is dropped by the RemoveAll below.
                    source.Clear();
                    changed = true;
                    break;
                }
            }
        }

        clusters.RemoveAll(c => c.isEmpty);
    }
    while (changed);
}