/// <summary>
/// Loads an hOCR file from disk and fills <paramref name="hOrcDoc"/> with the
/// structure found in its markup.
/// </summary>
/// <param name="hOrcDoc">Document model to populate; the same instance is returned.</param>
/// <param name="hOcrFile">Path of the hOCR file; it is read as UTF-8.</param>
/// <param name="append">Not used by this method; kept so existing callers keep compiling.</param>
/// <returns>The <paramref name="hOrcDoc"/> instance after parsing.</returns>
/// <exception cref="ArgumentNullException"><paramref name="hOrcDoc"/> is null.</exception>
/// <exception cref="FileNotFoundException"><paramref name="hOcrFile"/> does not exist.</exception>
/// <exception cref="InvalidDataException">The file contains no body element.</exception>
public HDocument ParseHocr(HDocument hOrcDoc, string hOcrFile, bool append)
{
    if (hOrcDoc == null)
    {
        throw new ArgumentNullException("hOrcDoc");
    }

    _hDoc = hOrcDoc;
    if (_doc == null)
    {
        _doc = new HtmlDocument();
    }

    _hOcrFilePath = hOcrFile;
    if (!File.Exists(hOcrFile))
    {
        // Same message as before, but a specific exception type that also carries the path.
        throw new FileNotFoundException("hocr file not found", hOcrFile);
    }

    // Start every parse from a clean slate so no container from a previous run is reused.
    _currentPage = null;
    _currentPara = null;
    _currentLine = null;

    _doc.Load(hOcrFile, Encoding.UTF8);

    // SelectNodes yields null when nothing matches, so check before indexing.
    HtmlNodeCollection bodies = _doc.DocumentNode.SelectNodes("//body");
    if (bodies == null || bodies.Count == 0)
    {
        throw new InvalidDataException("hocr file has no body element: " + hOcrFile);
    }

    HtmlNode body = bodies[0];

    // Issue #1 (reported by Ryan-George): only the div elements that are direct
    // children of body are used as roots. ParseNodes recurses into child nodes
    // itself, so passing every descendant div would visit nested divs twice.
    IEnumerable<HtmlNode> divs = body.ChildNodes.Where(
        node => string.Equals(node.Name, "div", StringComparison.OrdinalIgnoreCase));

    HtmlNodeCollection nodes = new HtmlNodeCollection(null);
    foreach (HtmlNode div in divs)
    {
        nodes.Add(div);
    }

    _hDoc.ClassName = "body";
    ParseNodes(nodes);

    return _hDoc;
}
/// <summary>
/// Walks <paramref name="nodes"/> depth-first and maps every element whose
/// class attribute is a recognised hOCR class (ocr_page, ocr_par, ocr_line,
/// ocrx_word / ocr_word, ocr_cinfo) onto the document model held in the
/// current page / paragraph / line fields.
/// </summary>
/// <param name="nodes">Nodes to inspect; a null collection is ignored.</param>
private void ParseNodes(HtmlNodeCollection nodes)
{
    if (nodes == null)
    {
        return;
    }

    // Enumerate a snapshot of the collection, as the original code did.
    foreach (HtmlNode node in nodes.ToList())
    {
        if (node.HasAttributes)
        {
            HtmlAttribute classAttribute = node.Attributes["class"];
            HtmlAttribute titleAttribute = node.Attributes["title"];
            HtmlAttribute idAttribute = node.Attributes["Id"];

            string className = classAttribute != null ? classAttribute.Value : string.Empty;
            string title = titleAttribute != null ? titleAttribute.Value : string.Empty;
            string id = idAttribute != null ? idAttribute.Value : string.Empty;

            switch (className)
            {
                case "ocr_page":
                    _currentPage = new HPage();
                    _currentPage.ClassName = className;
                    _currentPage.Id = id;
                    ParseTitle(title, _currentPage);
                    _currentPage.Text = node.InnerText;
                    _hDoc.Pages.Add(_currentPage);

                    // A new page starts a new hierarchy: do not let lines or words
                    // of this page attach to containers of the previous page.
                    _currentPara = null;
                    _currentLine = null;
                    break;

                case "ocr_par":
                    _currentPara = new HParagraph();
                    _currentPara.ClassName = className;
                    _currentPara.Id = id;
                    ParseTitle(title, _currentPara);
                    _currentPara.Text = node.InnerText;

                    // Paragraph outside any ocr_page: create the page instead of
                    // failing with a NullReferenceException.
                    EnsureCurrentPage();
                    _currentPage.Paragraphs.Add(_currentPara);

                    // Words of this paragraph must not land in a line of the previous one.
                    _currentLine = null;
                    break;

                case "ocr_line":
                    _currentLine = new HLine(_dpi);
                    _currentLine.ClassName = className;
                    _currentLine.Id = id;
                    ParseTitle(title, _currentLine);
                    _currentLine.Text = node.InnerText;

                    // Line without an enclosing paragraph (or page): synthesise them.
                    EnsureCurrentParagraph();
                    _currentPara.Lines.Add(_currentLine);
                    break;

                case "ocrx_word":
                case "ocr_word":
                    HWord word = new HWord();
                    word.ClassName = className;
                    word.Id = id;
                    ParseTitle(title, word);
                    word.Text = node.InnerText;

                    // Word without an enclosing line: synthesise one rather than crash.
                    EnsureCurrentLine();
                    _currentLine.Words.Add(word);
                    break;

                case "ocr_cinfo":
                    // cuneiform only
                    ParseCharactersForLine(title);
                    break;
            }
        }

        ParseNodes(node.ChildNodes);
    }
}

/// <summary>Makes sure a current page exists and is registered with the document.</summary>
private void EnsureCurrentPage()
{
    if (_currentPage == null)
    {
        _currentPage = new HPage();

        // Register the implicit page; otherwise its content is unreachable from the document.
        _hDoc.Pages.Add(_currentPage);
    }
}

/// <summary>Makes sure a current paragraph exists on the current page.</summary>
private void EnsureCurrentParagraph()
{
    EnsureCurrentPage();
    if (_currentPara == null)
    {
        _currentPara = new HParagraph();
        _currentPage.Paragraphs.Add(_currentPara);
    }
}

/// <summary>Makes sure a current line exists in the current paragraph.</summary>
private void EnsureCurrentLine()
{
    EnsureCurrentParagraph();
    if (_currentLine == null)
    {
        _currentLine = new HLine(_dpi);
        _currentPara.Lines.Add(_currentLine);
    }
}