Beispiel #1
0
        /// <summary>
        /// Loads the hOCR file at <paramref name="hOcrFile"/> and fills <paramref name="hOrcDoc"/>
        /// with the content found in it.
        /// </summary>
        /// <param name="hOrcDoc">Document that receives the parsed content; the same instance is returned.</param>
        /// <param name="hOcrFile">Path of the hOCR file. It is read as UTF-8.</param>
        /// <param name="append">
        /// Not read by this method. Kept so existing callers keep compiling.
        /// NOTE(review): the intended append semantics are not visible here - confirm before relying on it.
        /// </param>
        /// <returns><paramref name="hOrcDoc"/>, populated from the file.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="hOrcDoc"/> is null.</exception>
        /// <exception cref="FileNotFoundException"><paramref name="hOcrFile"/> does not exist.</exception>
        /// <exception cref="InvalidDataException">The file contains no body element.</exception>
        public HDocument ParseHocr(HDocument hOrcDoc, string hOcrFile, bool append)
        {
            if (hOrcDoc == null)
            {
                throw new ArgumentNullException("hOrcDoc");
            }

            _hDoc = hOrcDoc;

            if (_doc == null)
            {
                _doc = new HtmlDocument();
            }

            _hOcrFilePath = hOcrFile;
            if (!File.Exists(hOcrFile))
            {
                // Same message as before; the more specific type is still caught by catch (Exception).
                throw new FileNotFoundException("hocr file not found", hOcrFile);
            }

            // Start from a clean cursor so nothing from a previous parse leaks into this one.
            _currentPage = null;
            _currentPara = null;
            _currentLine = null;

            _doc.Load(hOcrFile, Encoding.UTF8);

            // The query result is checked for null as well as empty before indexing,
            // so a file without a body fails with a clear error instead of a NullReferenceException.
            HtmlNodeCollection bodies = _doc.DocumentNode.SelectNodes("//body");
            if (bodies == null || bodies.Count == 0)
            {
                throw new InvalidDataException("hocr file has no body element: " + hOcrFile);
            }

            HtmlNode body = bodies[0];

            // Issue #1 (reported by Ryan-George): only the body's direct div children are used as
            // roots. ParseNodes recurses into child nodes itself, so collecting every div in the
            // document here would visit nested divs more than once.
            HtmlNodeCollection nodes = new HtmlNodeCollection(null);
            foreach (HtmlNode child in body.ChildNodes)
            {
                // Ordinal comparison: tag names are identifiers, not culture-sensitive text.
                if (string.Equals(child.Name, "div", StringComparison.OrdinalIgnoreCase))
                {
                    nodes.Add(child);
                }
            }

            _hDoc.ClassName = "body";

            ParseNodes(nodes);
            return _hDoc;
        }
Beispiel #2
0
        /// <summary>
        /// Walks <paramref name="nodes"/> depth-first and maps every element whose <c>class</c>
        /// attribute is a recognised hOCR class onto the page / paragraph / line / word objects
        /// of the document currently being built.
        /// </summary>
        /// <param name="nodes">
        /// Nodes to visit. The children of each node are visited recursively.
        /// A null collection is ignored.
        /// </param>
        private void ParseNodes(HtmlNodeCollection nodes)
        {
            if (nodes == null)
            {
                return;
            }

            // Iterate over a snapshot rather than the live collection.
            foreach (HtmlNode node in nodes.ToList())
            {
                if (node.HasAttributes)
                {
                    string className = AttributeValueOrEmpty(node, "class");
                    string title     = AttributeValueOrEmpty(node, "title");
                    string id        = AttributeValueOrEmpty(node, "Id");

                    switch (className)
                    {
                    case "ocr_page":
                        _currentPage           = new HPage();
                        _currentPage.ClassName = className;
                        _currentPage.Id        = id;
                        ParseTitle(title, _currentPage);
                        _currentPage.Text = node.InnerText;
                        _hDoc.Pages.Add(_currentPage);

                        // A new page starts a new scope: without this reset, a line or word that
                        // sits directly under the page would be attached to the previous page's
                        // paragraph or line.
                        _currentPara = null;
                        _currentLine = null;
                        break;

                    case "ocr_par":
                        _currentPara           = new HParagraph();
                        _currentPara.ClassName = className;
                        _currentPara.Id        = id;
                        ParseTitle(title, _currentPara);
                        _currentPara.Text = node.InnerText;

                        // Paragraph without an enclosing ocr_page: create the page on demand.
                        EnsureCurrentPage();
                        _currentPage.Paragraphs.Add(_currentPara);

                        // Words directly under this paragraph must not land in an earlier line.
                        _currentLine = null;
                        break;

                    case "ocr_line":
                        _currentLine           = new HLine(_dpi);
                        _currentLine.ClassName = className;
                        _currentLine.Id        = id;
                        ParseTitle(title, _currentLine);
                        _currentLine.Text = node.InnerText;

                        // Line without an enclosing page and/or paragraph: create them on demand.
                        EnsureCurrentParagraph();
                        _currentPara.Lines.Add(_currentLine);
                        break;

                    case "ocrx_word":
                    case "ocr_word":
                        HWord word = new HWord();
                        word.ClassName = className;
                        word.Id        = id;
                        ParseTitle(title, word);
                        word.Text = node.InnerText;

                        // Word without an enclosing ocr_line: create the line on demand.
                        EnsureCurrentLine();
                        _currentLine.Words.Add(word);
                        break;

                    case "ocr_cinfo":     // cuneiform only
                        ParseCharactersForLine(title);
                        break;
                    }
                }

                ParseNodes(node.ChildNodes);
            }
        }

        /// <summary>Returns the value of the named attribute, or an empty string when the node does not have it.</summary>
        private static string AttributeValueOrEmpty(HtmlNode node, string name)
        {
            HtmlAttribute attribute = node.Attributes[name];
            return attribute != null ? attribute.Value : string.Empty;
        }

        /// <summary>Makes sure a current page exists, creating one and registering it with the document if needed.</summary>
        private void EnsureCurrentPage()
        {
            if (_currentPage == null)
            {
                _currentPage = new HPage();

                // Register the implicit page with the document; otherwise everything parsed
                // beneath it would be unreachable from the returned document.
                _hDoc.Pages.Add(_currentPage);
            }
        }

        /// <summary>Makes sure a current paragraph exists, creating one on the current page if needed.</summary>
        private void EnsureCurrentParagraph()
        {
            EnsureCurrentPage();

            if (_currentPara == null)
            {
                _currentPara = new HParagraph();
                _currentPage.Paragraphs.Add(_currentPara);
            }
        }

        /// <summary>Makes sure a current line exists, creating one in the current paragraph if needed.</summary>
        private void EnsureCurrentLine()
        {
            if (_currentLine == null)
            {
                EnsureCurrentParagraph();

                _currentLine = new HLine(_dpi);
                _currentPara.Lines.Add(_currentLine);
            }
        }