Exemple #1
0
        /// <summary>
        /// Gathers every line from <c>Paragraphs</c> (one entry per distinct <c>Id</c>), merges lines
        /// whose top coordinates are within a small tolerance of each other, and returns the merged
        /// lines ordered by <c>BBox.Top</c>.
        /// </summary>
        /// <remarks>
        /// Side effects visible in this method: <c>CleanText()</c> is called on every collected line,
        /// <c>LinesInSameSentence</c> is assigned on lines that share a row with at least one line of a
        /// different Id, and <c>AverageWordCountPerLine</c> is updated (set to 0 when there are no lines).
        /// </remarks>
        /// <returns>The combined lines, ordered by <c>BBox.Top</c>, without duplicate instances.</returns>
        public IList<HLine> CombineSameRowLines()
        {
            // Maximum difference in DefaultPointBBox.Top for two lines to count as the same row.
            const int sameRowTolerance = 2;

            // Collect each line once, keyed by Id. A hash set replaces the original linear
            // "no existing line has this Id" scan, which was quadratic in the number of lines.
            IList<HLine> lines = new List<HLine>();
            HashSet<string> seenIds = new HashSet<string>();

            foreach (HParagraph paragraph in Paragraphs)
            {
                foreach (HLine line in paragraph.Lines)
                {
                    if (seenIds.Add(line.Id))
                    {
                        lines.Add(line);
                    }
                }
            }

            IList<HLine> results = new List<HLine>();

            // NOTE(review): ordering uses BBox.Top while row matching below uses
            // BBox.DefaultPointBBox.Top - confirm that the two different keys are intentional.
            foreach (HLine line in lines.OrderBy(x => x.BBox.Top))
            {
                line.CleanText();

                List<HLine> linesOnThisRow = lines
                    .Where(x => Math.Abs(x.BBox.DefaultPointBBox.Top - line.BBox.DefaultPointBBox.Top) <= sameRowTolerance)
                    .OrderBy(x => x.BBox.Left)
                    .Distinct()
                    .ToList();

                // Only attach the row when it contains more than one distinct (trimmed) Id,
                // i.e. there is something besides this line itself to merge.
                if (linesOnThisRow.Select(x => x.Id.Trim()).Distinct().Count() > 1)
                {
                    line.LinesInSameSentence = linesOnThisRow;
                }

                HLine combined = line.CombineLinesInSentence();

                // Checked against the live result list (not a cached set) because the Id of a
                // combined line is produced by CombineLinesInSentence, which is not visible here.
                if (results.All(x => x.Id != combined.Id))
                {
                    results.Add(combined);
                }
            }

            // Enumerable.Average throws InvalidOperationException on an empty sequence,
            // so a document without any lines reports an average of 0 instead of crashing.
            AverageWordCountPerLine = results.Count == 0
                ? 0
                : Convert.ToInt32(Math.Ceiling(results.Select(x => x.Words.Count).Average()));

            return results.OrderBy(x => x.BBox.Top).Distinct().ToList();
        }
Exemple #2
0
        /// <summary>
        /// Loads the hOCR file at <paramref name="hOcrFile"/> (read as UTF-8) and parses the
        /// <c>div</c> elements that are direct children of its <c>body</c> into
        /// <paramref name="hOrcDoc"/>.
        /// </summary>
        /// <param name="hOrcDoc">Document that receives the parsed content; its <c>ClassName</c> is set to "body".</param>
        /// <param name="hOcrFile">Path of the hOCR file to load.</param>
        /// <param name="append">Not read by this method; kept so existing callers continue to compile.</param>
        /// <returns>The same instance that was passed as <paramref name="hOrcDoc"/>.</returns>
        /// <exception cref="FileNotFoundException">The file at <paramref name="hOcrFile"/> does not exist.</exception>
        /// <exception cref="InvalidDataException">The loaded document contains no <c>body</c> element.</exception>
        public HDocument ParseHocr(HDocument hOrcDoc, string hOcrFile, bool append)
        {
            _hDoc = hOrcDoc;

            if (_doc == null)
            {
                _doc = new HtmlDocument();
            }

            _hOcrFilePath = hOcrFile;
            if (!File.Exists(hOcrFile))
            {
                // FileNotFoundException derives from Exception, so callers that catch the
                // previously thrown bare Exception keep working; the message is unchanged.
                throw new FileNotFoundException("hocr file not found", hOcrFile);
            }

            // Start every parse from a clean slate so state from an earlier file is not reused.
            _currentPage = null;
            _currentPara = null;
            _currentLine = null;

            _doc.Load(hOcrFile, Encoding.UTF8);

            // SelectNodes yields null when nothing matches; fail with a descriptive error
            // instead of a NullReferenceException on the indexer.
            HtmlNodeCollection bodies = _doc.DocumentNode.SelectNodes("//body");
            if (bodies == null || bodies.Count == 0)
            {
                throw new InvalidDataException("hocr file has no body element");
            }

            HtmlNode body = bodies[0];

            //#Issue #1 reported by Ryan-George
            // Only the body's direct child divs are used as parse roots; ParseNodes recurses
            // into their children itself. The comparison is ordinal so the result does not
            // depend on the current culture.
            IEnumerable<HtmlNode> divs = body.ChildNodes
                .Where(node => string.Equals(node.Name, "div", StringComparison.OrdinalIgnoreCase));

            HtmlNodeCollection nodes = new HtmlNodeCollection(null);
            foreach (HtmlNode div in divs)
            {
                nodes.Add(div);
            }

            _hDoc.ClassName = "body";

            ParseNodes(nodes);
            return _hDoc;
        }
Exemple #3
0
        /// <summary>
        /// Walks <paramref name="nodes"/> depth-first and converts elements whose <c>class</c>
        /// attribute is one of the handled hOCR classes (<c>ocr_page</c>, <c>ocr_par</c>,
        /// <c>ocr_line</c>, <c>ocrx_word</c>, <c>ocr_word</c>, <c>ocr_cinfo</c>) into the
        /// corresponding model objects, attaching each one to the current enclosing container.
        /// </summary>
        /// <remarks>
        /// When an element arrives without its enclosing container (for example a line with no
        /// preceding paragraph), the missing containers are created implicitly and attached to
        /// the document so that no parsed content is dropped.
        /// </remarks>
        /// <param name="nodes">Nodes to process; the children of every node are processed recursively.</param>
        private void ParseNodes(HtmlNodeCollection nodes)
        {
            // Enumerate a snapshot of the collection rather than the collection itself
            // (presumably so the traversal is unaffected if the collection changes - as in the original).
            foreach (HtmlNode node in nodes.ToList())
            {
                if (node.HasAttributes)
                {
                    string className = GetAttributeValueOrEmpty(node, "class");
                    string title     = GetAttributeValueOrEmpty(node, "title");
                    string id        = GetAttributeValueOrEmpty(node, "Id");

                    switch (className)
                    {
                        case "ocr_page":
                        {
                            _currentPage           = new HPage();
                            _currentPage.ClassName = className;
                            _currentPage.Id        = id;
                            ParseTitle(title, _currentPage);
                            _currentPage.Text = node.InnerText;
                            _hDoc.Pages.Add(_currentPage);

                            // A new page starts with no open paragraph or line; otherwise content
                            // without its own paragraph would be attached to the previous page.
                            _currentPara = null;
                            _currentLine = null;
                            break;
                        }

                        case "ocr_par":
                        {
                            _currentPara           = new HParagraph();
                            _currentPara.ClassName = className;
                            _currentPara.Id        = id;
                            ParseTitle(title, _currentPara);
                            _currentPara.Text = node.InnerText;

                            // Guard against a paragraph that appears before any page element.
                            EnsureCurrentPage();
                            _currentPage.Paragraphs.Add(_currentPara);

                            // A new paragraph starts with no open line.
                            _currentLine = null;
                            break;
                        }

                        case "ocr_line":
                        {
                            _currentLine           = new HLine(_dpi);
                            _currentLine.ClassName = className;
                            _currentLine.Id        = id;
                            ParseTitle(title, _currentLine);
                            _currentLine.Text = node.InnerText;

                            EnsureCurrentParagraph();
                            _currentPara.Lines.Add(_currentLine);
                            break;
                        }

                        // Both word class names are handled identically.
                        case "ocrx_word":
                        case "ocr_word":
                        {
                            HWord word = new HWord();
                            word.ClassName = className;
                            word.Id        = id;
                            ParseTitle(title, word);
                            word.Text = node.InnerText;

                            // Guard against a word that appears before any line element.
                            EnsureCurrentLine();
                            _currentLine.Words.Add(word);
                            break;
                        }

                        case "ocr_cinfo":     //cuneiform only
                        {
                            ParseCharactersForLine(title);
                            break;
                        }
                    }
                }

                ParseNodes(node.ChildNodes);
            }
        }

        // Returns the value of the named attribute, or an empty string when the node has no such attribute.
        private static string GetAttributeValueOrEmpty(HtmlNode node, string name)
        {
            var attribute = node.Attributes[name];
            return attribute == null ? string.Empty : attribute.Value;
        }

        // Makes sure a current page exists; an implicitly created page is registered with the document.
        private void EnsureCurrentPage()
        {
            if (_currentPage == null)
            {
                _currentPage = new HPage();
                _hDoc.Pages.Add(_currentPage);
            }
        }

        // Makes sure a current paragraph exists (creating the page first if needed) and is attached to the page.
        private void EnsureCurrentParagraph()
        {
            EnsureCurrentPage();

            if (_currentPara == null)
            {
                _currentPara = new HParagraph();
                _currentPage.Paragraphs.Add(_currentPara);
            }
        }

        // Makes sure a current line exists (creating paragraph/page first if needed) and is attached to the paragraph.
        private void EnsureCurrentLine()
        {
            EnsureCurrentParagraph();

            if (_currentLine == null)
            {
                _currentLine = new HLine(_dpi);
                _currentPara.Lines.Add(_currentLine);
            }
        }