Beispiel #1
0
        /// <summary>
        /// Parses an hOCR document on a thread-pool thread and returns one
        /// <see cref="TextPage"/> per <c>div</c> element whose class is <c>ocr_page</c>.
        /// </summary>
        /// <param name="text">The hOCR markup; it is loaded with <c>XDocument.Parse</c>, so it must be well-formed XML.</param>
        /// <returns>The decoded pages in document order; an empty list when no page element is found.</returns>
        public async Task <List <TextPage> > DecodeHocr(string text)
        {
            // One Task.Run is sufficient: everything below is synchronous, so the work is
            // offloaded once instead of being re-queued to the thread pool for every page.
            return(await Task.Run(() =>
            {
                // NOTE(review): a name without a namespace only matches elements that are in
                // no XML namespace - confirm the input is not XHTML-namespaced.
                XName divName = XName.Get("div");

                List <TextPage> textPages = new List <TextPage>();

                XDocument textXml = XDocument.Parse(text);

                IEnumerable <XElement> pages = textXml.Descendants(divName)
                                               .Where(x => (string)x.Attribute("class") == "ocr_page");

                foreach (XElement page in pages)
                {
                    TextPage textPage = new TextPage();

                    XAttribute coords = page.Attribute("title");

                    if (coords != null)
                    {
                        string[] coordlist = coords.Value.Split(' ');

                        // A page title typically reads: image "scan.png"; bbox x0 y0 x1 y1; ppageno 0
                        // Locate the "bbox" keyword instead of assuming fixed token positions, so an
                        // image path containing spaces or a missing image property cannot shift
                        // the coordinates.
                        int bbox = Array.IndexOf(coordlist, "bbox");

                        // Only read the box when all four numbers are present; otherwise the
                        // page keeps its default geometry.
                        if (bbox >= 0 && bbox + 4 < coordlist.Length)
                        {
                            // HelperOcr.GetNumbers presumably strips non-numeric characters such
                            // as a trailing ';' - confirm against HelperOcr.
                            int left   = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 1]));
                            int top    = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 2]));
                            int right  = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 3]));
                            int bottom = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 4]));

                            textPage.X      = left;
                            textPage.Y      = top;
                            textPage.Width  = right - left;
                            textPage.Height = bottom - top;
                        }
                    }

                    XAttribute id = page.Attribute("id");

                    if (id != null)
                    {
                        textPage.Id = id.Value;
                    }

                    // Paragraph decoding is synchronous and already runs on the worker thread.
                    List <TextParagraph> textParagraphs = new DecodeParagraphs().Decode(page);

                    textPage.Paragraphs.AddRange(textParagraphs);

                    textPages.Add(textPage);
                }

                return textPages;
            }));
        }
Beispiel #2
0
        /// <summary>
        /// Parses hOCR-style markup and builds one <see cref="TextPage"/> for every element
        /// named by <c>_divName</c> whose class is <c>ocr_page</c>. The resulting list is
        /// passed to <c>SerializePagesList.Serialize</c> before it is returned.
        /// </summary>
        /// <param name="text">The markup to decode; it is loaded with <c>XDocument.Parse</c>, so it must be well-formed XML.</param>
        /// <returns>The decoded pages in document order; an empty list when no page element is found.</returns>
        public List <TextPage> Decode(string text)
        {
            List <TextPage> textPages = new List <TextPage>();

            XDocument textXml = XDocument.Parse(text);

            IEnumerable <XElement> pages = textXml.Descendants(_divName)
                                           .Where(x => (string)x.Attribute("class") == "ocr_page");

            foreach (XElement page in pages)
            {
                TextPage textPage = new TextPage();

                XAttribute coords = page.Attribute("title");

                if (coords != null)
                {
                    string[] coordlist = coords.Value.Split(' ');

                    // A page title typically reads: image "scan.png"; bbox x0 y0 x1 y1; ppageno 0
                    // Locate the "bbox" keyword instead of assuming fixed token positions, so an
                    // image path containing spaces or a missing image property cannot shift
                    // the coordinates.
                    int bbox = Array.IndexOf(coordlist, "bbox");

                    // Only read the box when all four numbers are present; otherwise the
                    // page keeps its default geometry.
                    if (bbox >= 0 && bbox + 4 < coordlist.Length)
                    {
                        // HelperOcr.GetNumbers presumably strips non-numeric characters such
                        // as a trailing ';' - confirm against HelperOcr.
                        int left   = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 1]));
                        int top    = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 2]));
                        int right  = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 3]));
                        int bottom = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 4]));

                        textPage.X      = left;
                        textPage.Y      = top;
                        textPage.Width  = right - left;
                        textPage.Height = bottom - top;
                    }
                }

                XAttribute id = page.Attribute("id");

                if (id != null)
                {
                    textPage.Id = id.Value;
                }

                List <TextParagraph> textParagraphs = new DecodeParagraphs().Decode(page);

                textPage.Paragraphs.AddRange(textParagraphs);

                textPages.Add(textPage);
            }

            // Side effect kept from the original: the full page list is handed to the serializer.
            SerializePagesList.Serialize(textPages);

            return(textPages);
        }
Beispiel #3
0
        /// <summary>
        /// Builds one <see cref="TextWord"/> for every descendant of <paramref name="line"/>
        /// whose name is <c>_spanName</c>. No class filter is applied, so every such
        /// descendant is treated as a word.
        /// </summary>
        /// <param name="line">The line element whose descendants are decoded.</param>
        /// <returns>The decoded words in document order; an empty list when there are none.</returns>
        public List <TextWord> Decode(XElement line)
        {
            List <TextWord> textWords = new List <TextWord>();

            IEnumerable <XElement> words = line.Descendants(_spanName);

            foreach (XElement word in words)
            {
                TextWord textWord = new TextWord();

                XAttribute coords = word.Attribute("title");

                if (coords != null)
                {
                    string[] coordlist = coords.Value.Split(' ');

                    // A word title typically reads: bbox x0 y0 x1 y1; x_wconf 95
                    // Locate the "bbox" keyword rather than assuming it is the first token.
                    int bbox = Array.IndexOf(coordlist, "bbox");

                    // Only read the box when all four numbers are present; an empty or
                    // truncated title leaves the word with its default geometry instead of
                    // throwing IndexOutOfRangeException.
                    if (bbox >= 0 && bbox + 4 < coordlist.Length)
                    {
                        // HelperOcr.GetNumbers presumably strips non-numeric characters such
                        // as a trailing ';' - confirm against HelperOcr.
                        int left   = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 1]));
                        int top    = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 2]));
                        int right  = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 3]));
                        int bottom = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 4]));

                        textWord.X      = left;
                        textWord.Y      = top;
                        textWord.Width  = right - left;
                        textWord.Height = bottom - top;
                    }
                }

                // A word counts as bold when it has at least one direct <strong> child
                // that is not an empty element.
                if (word.Elements(XName.Get("strong")).Any(d => !d.IsEmpty))
                {
                    textWord.Bold = true;
                }

                XAttribute id = word.Attribute("id");

                if (id != null)
                {
                    textWord.id = id.Value;
                }

                // XElement.Value is the concatenated text of the element and its descendants.
                textWord.Word = word.Value;

                textWords.Add(textWord);
            }

            return(textWords);
        }
Beispiel #4
0
        /// <summary>
        /// Builds one <see cref="TextParagraph"/> for every descendant of
        /// <paramref name="element"/> that is named by <c>_paraName</c> and has the class
        /// <c>ocr_par</c>. The lines of each paragraph are decoded with <c>DecodeLines</c>.
        /// </summary>
        /// <param name="element">The element (a page, when called from the page decoder) to search.</param>
        /// <returns>The decoded paragraphs in document order; an empty list when there are none.</returns>
        public List <TextParagraph> Decode(XElement element)
        {
            List <TextParagraph> textParagraphs = new List <TextParagraph>();

            IEnumerable <XElement> paragraphs = element.Descendants(_paraName)
                                                .Where(x => (string)x.Attribute("class") == "ocr_par");

            foreach (XElement paragraph in paragraphs)
            {
                TextParagraph textParagraph = new TextParagraph();

                XAttribute coords = paragraph.Attribute("title");

                if (coords != null)
                {
                    string[] coordlist = coords.Value.Split(' ');

                    // A paragraph title typically reads: bbox x0 y0 x1 y1
                    // Locate the "bbox" keyword rather than assuming it is the first token.
                    int bbox = Array.IndexOf(coordlist, "bbox");

                    // Only read the box when all four numbers are present; an empty or
                    // truncated title leaves the paragraph with its default geometry instead
                    // of throwing IndexOutOfRangeException.
                    if (bbox >= 0 && bbox + 4 < coordlist.Length)
                    {
                        // HelperOcr.GetNumbers presumably strips non-numeric characters such
                        // as a trailing ';' - confirm against HelperOcr.
                        int left   = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 1]));
                        int top    = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 2]));
                        int right  = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 3]));
                        int bottom = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 4]));

                        textParagraph.X      = left;
                        textParagraph.Y      = top;
                        textParagraph.Width  = right - left;
                        textParagraph.Height = bottom - top;
                    }
                }

                XAttribute id = paragraph.Attribute("id");

                if (id != null)
                {
                    textParagraph.id = id.Value;
                }

                List <TextLine> textLines = new DecodeLines().Decode(paragraph);

                textParagraph.Lines.AddRange(textLines);

                textParagraphs.Add(textParagraph);
            }

            return(textParagraphs);
        }
Beispiel #5
0
        /// <summary>
        /// Builds one <see cref="TextLine"/> for every descendant of
        /// <paramref name="paragraph"/> that is named by <c>_spanName</c> and has the class
        /// <c>ocr_line</c>. The words of each line are decoded with <c>DecodeWords</c>.
        /// </summary>
        /// <param name="paragraph">The paragraph element whose lines are decoded.</param>
        /// <returns>The decoded lines in document order; an empty list when there are none.</returns>
        public List <TextLine> Decode(XElement paragraph)
        {
            List <TextLine> textLines = new List <TextLine>();

            IEnumerable <XElement> lines = paragraph.Descendants(_spanName)
                                           .Where(x => (string)x.Attribute("class") == "ocr_line");

            foreach (XElement line in lines)
            {
                TextLine textLine = new TextLine();

                XAttribute coords = line.Attribute("title");

                if (coords != null)
                {
                    string[] coordlist = coords.Value.Split(' ');

                    // A line title typically reads: bbox x0 y0 x1 y1; baseline ...
                    // Locate the "bbox" keyword rather than assuming it is the first token.
                    int bbox = Array.IndexOf(coordlist, "bbox");

                    // Only read the box when all four numbers are present; an empty or
                    // truncated title leaves the line with its default geometry instead of
                    // throwing IndexOutOfRangeException.
                    if (bbox >= 0 && bbox + 4 < coordlist.Length)
                    {
                        // HelperOcr.GetNumbers presumably strips non-numeric characters such
                        // as a trailing ';' - confirm against HelperOcr.
                        int left   = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 1]));
                        int top    = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 2]));
                        int right  = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 3]));
                        int bottom = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 4]));

                        textLine.X      = left;
                        textLine.Y      = top;
                        textLine.Width  = right - left;
                        textLine.Height = bottom - top;
                    }
                }

                XAttribute id = line.Attribute("id");

                if (id != null)
                {
                    textLine.id = id.Value;
                }

                List <TextWord> textWords = new DecodeWords().Decode(line);

                textLine.Words.AddRange(textWords);

                textLines.Add(textLine);
            }

            return(textLines);
        }