/// <summary>
/// Gathers every line of every paragraph (one entry per distinct Id), groups lines whose
/// top edges are within 2 units of each other, and returns the combined lines ordered by
/// their top coordinate.
/// </summary>
/// <remarks>
/// As a side effect, CleanText() is called on each gathered line, LinesInSameSentence is
/// assigned on lines that share a row with at least one line of a different Id, and
/// AverageWordCountPerLine is set to the rounded-up mean word count of the results
/// (0 when there are no lines at all).
/// </remarks>
/// <returns>The combined lines, sorted by BBox.Top.</returns>
public IList<HLine> CombineSameRowLines()
{
    // Collect each line once. The set gives an O(1) "already seen this Id?" check
    // instead of rescanning the whole list for every line.
    IList<HLine> lines = new List<HLine>();
    HashSet<string> seenIds = new HashSet<string>();
    foreach (HParagraph p in Paragraphs)
    {
        foreach (HLine l in p.Lines)
        {
            if (seenIds.Add(l.Id))
            {
                lines.Add(l);
            }
        }
    }

    IList<HLine> results = new List<HLine>();
    IOrderedEnumerable<HLine> sortedLines = lines.OrderBy(x => x.BBox.Top);

    foreach (HLine l in sortedLines)
    {
        l.CleanText();

        // Every line (including l itself) whose top edge is within 2 units of l's,
        // ordered left to right.
        List<HLine> linesOnThisLine = lines
            .Where(x => Math.Abs(x.BBox.DefaultPointBBox.Top - l.BBox.DefaultPointBBox.Top) <= 2)
            .OrderBy(x => x.BBox.Left)
            .Distinct()
            .ToList();

        // Only record the group when it holds more than one distinct (trimmed) Id.
        if (linesOnThisLine.Select(x => x.Id.Trim()).Distinct().Count() > 1)
        {
            l.LinesInSameSentence = linesOnThisLine;
        }

        HLine c = l.CombineLinesInSentence();

        // Compared against the live Ids of the results gathered so far.
        if (results.All(x => x.Id != c.Id))
        {
            results.Add(c);
        }
    }

    // Average() throws InvalidOperationException on an empty sequence, so a document
    // without lines reports an average of 0 instead of failing.
    AverageWordCountPerLine = results.Count == 0
        ? 0
        : Convert.ToInt32(Math.Ceiling(results.Select(x => x.Words.Count).Average()));

    return results.OrderBy(x => x.BBox.Top).Distinct().ToList();
}
/// <summary>
/// Loads the hOCR file at <paramref name="hOcrFile"/> as UTF-8 and passes the div elements
/// that are direct children of its body to ParseNodes.
/// </summary>
/// <param name="hOrcDoc">Document stored in _hDoc before parsing starts.</param>
/// <param name="hOcrFile">Path of the hOCR file to load.</param>
/// <param name="append">Not read by this method.</param>
/// <returns>The document held in _hDoc once ParseNodes has run.</returns>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
/// <exception cref="InvalidDataException">The loaded markup has no body element.</exception>
public HDocument ParseHocr(HDocument hOrcDoc, string hOcrFile, bool append)
{
    _hDoc = hOrcDoc;
    if (_doc == null)
    {
        _doc = new HtmlDocument();
    }

    _hOcrFilePath = hOcrFile;
    if (!File.Exists(hOcrFile))
    {
        throw new FileNotFoundException("hocr file not found", hOcrFile);
    }

    // Start every parse with no current page/paragraph/line.
    _currentPage = null;
    _currentPara = null;
    _currentLine = null;

    _doc.Load(hOcrFile, Encoding.UTF8);

    // SelectNodes returns null (not an empty collection) when nothing matches, so check
    // before indexing to fail with a meaningful error instead of a NullReferenceException.
    HtmlNodeCollection bodies = _doc.DocumentNode.SelectNodes("//body");
    if (bodies == null || bodies.Count == 0)
    {
        throw new InvalidDataException("hocr file has no body element");
    }

    HtmlNode body = bodies[0];

    // Issue #1 reported by Ryan-George: only the divs that are direct children of the
    // body are used as parse roots.
    HtmlNodeCollection nodes = new HtmlNodeCollection(null);
    foreach (HtmlNode child in body.ChildNodes)
    {
        // Ordinal comparison keeps the tag match independent of the current culture.
        if (string.Equals(child.Name, "div", StringComparison.OrdinalIgnoreCase))
        {
            nodes.Add(child);
        }
    }

    _hDoc.ClassName = "body";
    ParseNodes(nodes);
    return _hDoc;
}
/// <summary>
/// Recursively walks <paramref name="nodes"/> and, based on each node's class attribute,
/// creates pages, paragraphs, lines and words and attaches them to the current parent.
/// </summary>
/// <remarks>
/// A paragraph, line or word that appears before its parent element has been seen gets an
/// implicitly created parent, so it is attached to the document instead of causing a
/// NullReferenceException.
/// </remarks>
/// <param name="nodes">Nodes to process; a null collection is ignored.</param>
private void ParseNodes(HtmlNodeCollection nodes)
{
    if (nodes == null)
    {
        return;
    }

    // Enumerate a copy of the collection rather than the collection itself.
    foreach (HtmlNode node in nodes.ToList())
    {
        if (node.HasAttributes)
        {
            // Missing attributes are treated as empty strings.
            HtmlAttribute classAttribute = node.Attributes["class"];
            HtmlAttribute titleAttribute = node.Attributes["title"];
            HtmlAttribute idAttribute = node.Attributes["Id"];

            string className = classAttribute != null ? classAttribute.Value : string.Empty;
            string title = titleAttribute != null ? titleAttribute.Value : string.Empty;
            string id = idAttribute != null ? idAttribute.Value : string.Empty;

            switch (className)
            {
                case "ocr_page":
                    _currentPage = new HPage();
                    _currentPage.ClassName = className;
                    _currentPage.Id = id;
                    ParseTitle(title, _currentPage);
                    _currentPage.Text = node.InnerText;
                    _hDoc.Pages.Add(_currentPage);
                    break;

                case "ocr_par":
                    _currentPara = new HParagraph();
                    _currentPara.ClassName = className;
                    _currentPara.Id = id;
                    ParseTitle(title, _currentPara);
                    _currentPara.Text = node.InnerText;
                    EnsureCurrentPage();
                    _currentPage.Paragraphs.Add(_currentPara);
                    break;

                case "ocr_line":
                    _currentLine = new HLine(_dpi);
                    _currentLine.ClassName = className;
                    _currentLine.Id = id;
                    ParseTitle(title, _currentLine);
                    _currentLine.Text = node.InnerText;
                    EnsureCurrentParagraph();
                    _currentPara.Lines.Add(_currentLine);
                    break;

                // Both word class names are handled identically.
                case "ocrx_word":
                case "ocr_word":
                    HWord word = new HWord();
                    word.ClassName = className;
                    word.Id = id;
                    ParseTitle(title, word);
                    word.Text = node.InnerText;
                    EnsureCurrentLine();
                    _currentLine.Words.Add(word);
                    break;

                case "ocr_cinfo":
                    // cuneiform only
                    ParseCharactersForLine(title);
                    break;
            }
        }

        // Children are visited regardless of whether this node had attributes.
        ParseNodes(node.ChildNodes);
    }
}

// Makes sure _currentPage is set; a page created here is also added to the document's
// Pages so that content attached to it is reachable from the returned document.
private void EnsureCurrentPage()
{
    if (_currentPage == null)
    {
        _currentPage = new HPage();
        _hDoc.Pages.Add(_currentPage);
    }
}

// Makes sure _currentPara is set; a paragraph created here is added to the current page.
private void EnsureCurrentParagraph()
{
    if (_currentPara == null)
    {
        EnsureCurrentPage();
        _currentPara = new HParagraph();
        _currentPage.Paragraphs.Add(_currentPara);
    }
}

// Makes sure _currentLine is set; a line created here is added to the current paragraph.
private void EnsureCurrentLine()
{
    if (_currentLine == null)
    {
        EnsureCurrentParagraph();
        _currentLine = new HLine(_dpi);
        _currentPara.Lines.Add(_currentLine);
    }
}