/// <summary>
/// Decodes an hOCR document on a thread-pool thread into one <see cref="TextPage"/> per
/// <c>div</c> element whose <c>class</c> attribute equals <c>ocr_page</c>.
/// </summary>
/// <param name="text">XML text of the hOCR document; it is parsed with <c>XDocument.Parse</c>.</param>
/// <returns>
/// A task producing the decoded pages. Each page carries the paragraphs returned by
/// <c>DecodeParagraphs.Decode</c> for that page element.
/// </returns>
/// <remarks>
/// Geometry comes from the four tokens that follow the <c>bbox</c> token in the page's
/// <c>title</c> attribute: X and Y are the first two values, Width and Height are the
/// third and fourth minus the first and second. When the attribute is missing, or holds
/// no <c>bbox</c> token followed by four more tokens, the geometry properties are left
/// untouched. Exceptions raised while parsing surface when the returned task is awaited.
/// </remarks>
public async Task<List<TextPage>> DecodeHocr(string text)
{
    // One Task.Run moves the whole decode off the caller's thread. The per-page work is
    // plain synchronous code, so it runs inline on that worker instead of being
    // re-queued to the pool once per page.
    return await Task.Run(() =>
    {
        XName divName = XName.Get("div");
        List<TextPage> textPages = new List<TextPage>();
        XDocument textXml = XDocument.Parse(text);
        IEnumerable<XElement> pages = textXml.Descendants(divName)
            .Where(x => (string)x.Attribute("class") == "ocr_page");

        foreach (XElement page in pages)
        {
            TextPage textPage = new TextPage();

            XAttribute coords = page.Attribute("title");
            if (coords != null)
            {
                string[] coordlist = coords.Value.Split(' ');

                // Find the bbox property instead of assuming it is the third token: the
                // title may start with an image path (which can itself contain spaces)
                // or may carry no image property at all.
                int bbox = Array.IndexOf(coordlist, "bbox");
                if (bbox >= 0 && bbox + 4 < coordlist.Length)
                {
                    int left = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 1]));
                    int top = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 2]));
                    int right = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 3]));
                    int bottom = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 4]));

                    textPage.X = left;
                    textPage.Y = top;
                    textPage.Width = right - left;
                    textPage.Height = bottom - top;
                }
            }

            XAttribute id = page.Attribute("id");
            if (id != null)
            {
                textPage.Id = id.Value;
            }

            List<TextParagraph> textParagraphs = new DecodeParagraphs().Decode(page);
            textPage.Paragraphs.AddRange(textParagraphs);
            textPages.Add(textPage);
        }

        return textPages;
    });
}
/// <summary>
/// Decodes an hOCR document into one <see cref="TextPage"/> per element matched by
/// <c>_divName</c> whose <c>class</c> attribute equals <c>ocr_page</c>, then passes the
/// resulting list to <c>SerializePagesList.Serialize</c> before returning it.
/// </summary>
/// <param name="text">XML text of the hOCR document; it is parsed with <c>XDocument.Parse</c>.</param>
/// <returns>
/// The decoded pages. Each page carries the paragraphs returned by
/// <c>DecodeParagraphs.Decode</c> for that page element.
/// </returns>
/// <remarks>
/// Geometry comes from the four tokens that follow the <c>bbox</c> token in the page's
/// <c>title</c> attribute: X and Y are the first two values, Width and Height are the
/// third and fourth minus the first and second. When the attribute is missing, or holds
/// no <c>bbox</c> token followed by four more tokens, the geometry properties are left
/// untouched.
/// </remarks>
public List<TextPage> Decode(string text)
{
    List<TextPage> textPages = new List<TextPage>();
    XDocument textXml = XDocument.Parse(text);
    IEnumerable<XElement> pages = textXml.Descendants(_divName)
        .Where(x => (string)x.Attribute("class") == "ocr_page");

    foreach (XElement page in pages)
    {
        TextPage textPage = new TextPage();

        XAttribute coords = page.Attribute("title");
        if (coords != null)
        {
            string[] coordlist = coords.Value.Split(' ');

            // Find the bbox property instead of assuming it is the third token: the
            // title may start with an image path (which can itself contain spaces) or
            // may carry no image property at all.
            int bbox = Array.IndexOf(coordlist, "bbox");
            if (bbox >= 0 && bbox + 4 < coordlist.Length)
            {
                int left = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 1]));
                int top = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 2]));
                int right = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 3]));
                int bottom = Convert.ToInt32(HelperOcr.GetNumbers(coordlist[bbox + 4]));

                textPage.X = left;
                textPage.Y = top;
                textPage.Width = right - left;
                textPage.Height = bottom - top;
            }
        }

        XAttribute id = page.Attribute("id");
        if (id != null)
        {
            textPage.Id = id.Value;
        }

        List<TextParagraph> textParagraphs = new DecodeParagraphs().Decode(page);
        textPage.Paragraphs.AddRange(textParagraphs);
        textPages.Add(textPage);
    }

    // Side effect kept from the original: the full page list is serialized on every call.
    SerializePagesList.Serialize(textPages);
    return textPages;
}
/// <summary>
/// Builds one <see cref="TextWord"/> for every descendant of <paramref name="line"/>
/// matched by <c>_spanName</c>.
/// </summary>
/// <param name="line">Element whose descendants are scanned for word elements.</param>
/// <returns>The words in the order the descendants are enumerated.</returns>
/// <remarks>
/// When a word element has a <c>title</c> attribute, its space-separated tokens 1..4 are
/// read as left, top, right and bottom; Width and Height are stored as right - left and
/// bottom - top. A word is flagged bold when it has a direct <c>strong</c> child that is
/// not an empty element.
/// </remarks>
public List<TextWord> Decode(XElement line)
{
    XName strongName = XName.Get("strong");
    var result = new List<TextWord>();

    foreach (XElement span in line.Descendants(_spanName))
    {
        var current = new TextWord();

        XAttribute title = span.Attribute("title");
        if (title != null)
        {
            string[] parts = title.Value.Split(' ');
            int left = Convert.ToInt32(HelperOcr.GetNumbers(parts[1]));
            int top = Convert.ToInt32(HelperOcr.GetNumbers(parts[2]));
            int right = Convert.ToInt32(HelperOcr.GetNumbers(parts[3]));
            int bottom = Convert.ToInt32(HelperOcr.GetNumbers(parts[4]));

            current.X = left;
            current.Y = top;
            current.Width = right - left;
            current.Height = bottom - top;
        }

        // Bold is only ever switched on here; otherwise the TextWord default stays.
        if (span.Elements(strongName).Any(child => !child.IsEmpty))
        {
            current.Bold = true;
        }

        XAttribute idAttribute = span.Attribute("id");
        if (idAttribute != null)
        {
            current.id = idAttribute.Value;
        }

        current.Word = span.Value;
        result.Add(current);
    }

    return result;
}
/// <summary>
/// Builds one <see cref="TextParagraph"/> for every descendant of
/// <paramref name="element"/> matched by <c>_paraName</c> whose <c>class</c> attribute
/// equals <c>ocr_par</c>.
/// </summary>
/// <param name="element">Element whose descendants are scanned for paragraphs.</param>
/// <returns>
/// The paragraphs in enumeration order, each filled with the lines returned by
/// <c>DecodeLines.Decode</c> for that paragraph element.
/// </returns>
/// <remarks>
/// When a paragraph element has a <c>title</c> attribute, its space-separated tokens
/// 1..4 are read as left, top, right and bottom; Width and Height are stored as
/// right - left and bottom - top.
/// </remarks>
public List<TextParagraph> Decode(XElement element)
{
    IEnumerable<XElement> matches =
        from node in element.Descendants(_paraName)
        where (string)node.Attribute("class") == "ocr_par"
        select node;

    var result = new List<TextParagraph>();

    foreach (XElement node in matches)
    {
        var current = new TextParagraph();

        XAttribute title = node.Attribute("title");
        if (title != null)
        {
            string[] parts = title.Value.Split(' ');
            int left = Convert.ToInt32(HelperOcr.GetNumbers(parts[1]));
            int top = Convert.ToInt32(HelperOcr.GetNumbers(parts[2]));
            int right = Convert.ToInt32(HelperOcr.GetNumbers(parts[3]));
            int bottom = Convert.ToInt32(HelperOcr.GetNumbers(parts[4]));

            current.X = left;
            current.Y = top;
            current.Width = right - left;
            current.Height = bottom - top;
        }

        XAttribute idAttribute = node.Attribute("id");
        if (idAttribute != null)
        {
            current.id = idAttribute.Value;
        }

        // Lines are decoded by a fresh DecodeLines instance per paragraph.
        current.Lines.AddRange(new DecodeLines().Decode(node));
        result.Add(current);
    }

    return result;
}
/// <summary>
/// Builds one <see cref="TextLine"/> for every descendant of
/// <paramref name="paragraph"/> matched by <c>_spanName</c> whose <c>class</c> attribute
/// equals <c>ocr_line</c>.
/// </summary>
/// <param name="paragraph">Element whose descendants are scanned for lines.</param>
/// <returns>
/// The lines in enumeration order, each filled with the words returned by
/// <c>DecodeWords.Decode</c> for that line element.
/// </returns>
/// <remarks>
/// When a line element has a <c>title</c> attribute, its space-separated tokens 1..4 are
/// read as left, top, right and bottom; Width and Height are stored as right - left and
/// bottom - top.
/// </remarks>
public List<TextLine> Decode(XElement paragraph)
{
    var decoded = new List<TextLine>();

    IEnumerable<XElement> lineElements = paragraph
        .Descendants(_spanName)
        .Where(candidate => (string)candidate.Attribute("class") == "ocr_line");

    foreach (XElement lineElement in lineElements)
    {
        var entry = new TextLine();

        XAttribute title = lineElement.Attribute("title");
        if (title != null)
        {
            string[] tokens = title.Value.Split(' ');
            int x0 = Convert.ToInt32(HelperOcr.GetNumbers(tokens[1]));
            int y0 = Convert.ToInt32(HelperOcr.GetNumbers(tokens[2]));
            int x1 = Convert.ToInt32(HelperOcr.GetNumbers(tokens[3]));
            int y1 = Convert.ToInt32(HelperOcr.GetNumbers(tokens[4]));

            entry.X = x0;
            entry.Y = y0;
            entry.Width = x1 - x0;
            entry.Height = y1 - y0;
        }

        XAttribute idAttribute = lineElement.Attribute("id");
        if (idAttribute != null)
        {
            entry.id = idAttribute.Value;
        }

        // Words are decoded by a fresh DecodeWords instance per line.
        entry.Words.AddRange(new DecodeWords().Decode(lineElement));
        decoded.Add(entry);
    }

    return decoded;
}