/// <summary>
/// Parses every table element found under the page element and adds the result to the model.
/// </summary>
/// <param name="pageElement">The page element whose table elements are enumerated</param>
/// <param name="model">The model that receives the parsed tables</param>
private static void ParseTables(XElement pageElement, TextGeometryModel model)
{
    foreach (var element in pageElement.TableElements())
    {
        model.AddTable(element.ParseTable());
    }
}
/// <summary>
/// Parses every separator element found under the page element and adds the result to the model.
/// </summary>
/// <param name="pageElement">The page element whose separator elements are enumerated</param>
/// <param name="model">The model that receives the parsed separators</param>
private static void ParseSeparators(XElement pageElement, TextGeometryModel model)
{
    foreach (var element in pageElement.SeparatorElements())
    {
        model.AddSeparator(element.ParseSeparator());
    }
}
/// <summary>
/// Collects the words of the FineReader model that are also found in the Tesseract model.
/// </summary>
/// <remarks>
/// Each model is first scaled by 150 divided by its own grid-unit divisor, so both are
/// compared on the same scale. The returned words belong to the scaled FineReader model.
/// </remarks>
/// <param name="fineReaderModel">The model whose words are tested</param>
/// <param name="tesseractModel">The model whose words are looked up</param>
/// <returns>The set of scaled FineReader words for which the Tesseract lookup succeeded</returns>
public HashSet<GMWord> GetSameWords([NotNull] TextGeometryModel fineReaderModel, [NotNull] TextGeometryModel tesseractModel)
{
    const double targetResolution = 150;

    var fineReaderFactor = targetResolution / fineReaderModel.GridUnit.Divisor;
    var fineReaderScaled = fineReaderModel.ScaleModel(fineReaderFactor);

    var tesseractFactor = targetResolution / tesseractModel.GridUnit.Divisor;
    var tesseractScaled = tesseractModel.ScaleModel(tesseractFactor);

    // Lookup structure over the Tesseract words, using the class-level tolerances.
    var lookup = new LinesHashInGeometryModel(tesseractScaled.Words(), yEpsilon, xEpsilon, tesseractScaled.PageBox);

    var matches = fineReaderScaled.Words().Where(candidate => lookup.Contains(candidate, String.Compare));
    return new HashSet<GMWord>(matches);
}
/// <summary>
/// Builds a new model in the requested grid unit that merges the words of both source models.
/// </summary>
/// <remarks>
/// Both models are scaled to <paramref name="gridUnit"/>. The FineReader structure is copied
/// into the new model first; Tesseract words that do not match any of the FineReader words
/// confirmed by the Tesseract lookup are then appended.
/// </remarks>
/// <param name="fineReaderModel">The model whose structure is copied into the result</param>
/// <param name="tesseractModel">The model that supplies the lookup and the additional words</param>
/// <param name="gridUnit">The grid unit of the resulting model</param>
/// <returns>The newly created model, using the page box of the scaled FineReader model</returns>
public TextGeometryModel GetSampleModel(TextGeometryModel fineReaderModel, TextGeometryModel tesseractModel, [NotNull] GridUnit gridUnit)
{
    var fineReaderFactor = gridUnit.Divisor / (double)fineReaderModel.GridUnit.Divisor;
    var fineReaderScaled = fineReaderModel.ScaleModel(fineReaderFactor);

    var tesseractFactor = gridUnit.Divisor / (double)tesseractModel.GridUnit.Divisor;
    var tesseractScaled = tesseractModel.ScaleModel(tesseractFactor);

    var lookup = new LinesHashInGeometryModel(tesseractScaled.Words(), yEpsilon, xEpsilon, tesseractScaled.PageBox);
    var result = new TextGeometryModel(fineReaderScaled.PageBox, gridUnit);

    // Materialising the iterator is what actually copies the FineReader content into the result.
    var confirmedWords = new List<GMWord>(EnumerateWordsFromFineReader(fineReaderScaled, result, lookup));
    AddWordsFromTesseract(tesseractScaled, result, confirmedWords);

    return result;
}
/// <summary>
/// Parses every text block element found under the page element and adds the non-empty ones to the model.
/// </summary>
/// <param name="pageElement">The page element whose text block elements are enumerated</param>
/// <param name="model">The model that receives the parsed text blocks</param>
private static void ParseTextBlocks(XElement pageElement, TextGeometryModel model)
{
    foreach (var element in pageElement.TextBlockElements())
    {
        var parsedBlock = element.ParseTextBlock();
        if (parsedBlock.IsEmpty())
        {
            // Blocks reported as empty are not stored.
            continue;
        }

        model.AddTextBlock(parsedBlock);
    }
}
/// <summary>
/// Copies the block / paragraph / line / word structure of the FineReader model into the sample
/// model and lazily yields every FineReader word that is also found in the Tesseract lookup.
/// </summary>
/// <remarks>
/// Copied words get accuracy 100 when the lookup contains them and 50 otherwise.
/// This is an iterator: nothing is copied until the result is enumerated, and each matched
/// word is yielded before its copy is added to the sample model.
/// </remarks>
/// <param name="fineReaderModel">The model whose structure is copied</param>
/// <param name="sampleModel">The model that receives the copied structure</param>
/// <param name="tesseractCache">The lookup used to decide whether a word is confirmed</param>
/// <returns>The words of <paramref name="fineReaderModel"/> that the lookup contains</returns>
private static IEnumerable<GMWord> EnumerateWordsFromFineReader(TextGeometryModel fineReaderModel, TextGeometryModel sampleModel, LinesHashInGeometryModel tesseractCache)
{
    foreach (var sourceBlock in fineReaderModel.TextBlocks())
    {
        var sourceBlockBox = sourceBlock.BoundingBox;
        var targetBlock = sampleModel.AddTextBlock(
            new BoundingBox(sourceBlockBox.XMin, sourceBlockBox.YMin, sourceBlockBox.XMax, sourceBlockBox.YMax));

        foreach (var sourceParagraph in sourceBlock.Paragraphs())
        {
            var sourceParagraphBox = sourceParagraph.BoundingBox;
            var targetParagraph = targetBlock.AddParagraph(
                new BoundingBox(sourceParagraphBox.XMin, sourceParagraphBox.YMin, sourceParagraphBox.XMax, sourceParagraphBox.YMax));

            foreach (var sourceLine in sourceParagraph.Lines())
            {
                var sourceLineBox = sourceLine.BoundingBox;
                var targetLine = targetParagraph.AddLine(
                    new BoundingBox(sourceLineBox.XMin, sourceLineBox.YMin, sourceLineBox.XMax, sourceLineBox.YMax));

                foreach (var sourceWord in sourceLine.Words())
                {
                    var sourceWordBox = sourceWord.BoundingBox;
                    var confirmed = tesseractCache.Contains(sourceWord, String.Compare);
                    if (confirmed)
                    {
                        yield return sourceWord;
                    }

                    targetLine.AddWord(new GMWord(sourceWordBox, sourceWord.Text, confirmed ? 100 : 50));
                }
            }
        }

        // Words attached directly to the block are handled the same way as line words.
        foreach (var sourceWord in sourceBlock.StandaloneWords())
        {
            var sourceWordBox = sourceWord.BoundingBox;
            var confirmed = tesseractCache.Contains(sourceWord, String.Compare);
            if (confirmed)
            {
                yield return sourceWord;
            }

            targetBlock.AddStandaloneWord(new GMWord(sourceWordBox, sourceWord.Text, confirmed ? 100 : 50));
        }
    }
}
/// <summary>
/// Writes the model as a document into a file, together with the visualizer scripts.
/// </summary>
/// <param name="geometryModel">The model to store</param>
/// <param name="targetPath">The target directory; the current directory (".") is used when null or empty</param>
/// <param name="targetFileName">The name of the file to create inside the target directory</param>
public static void StoreModel(TextGeometryModel geometryModel, string targetPath, string targetFileName)
{
    var directory = string.IsNullOrEmpty(targetPath) ? "." : targetPath;

    StoreHOCRVisualizerScripts(directory);
    var document = ProcessModel(geometryModel, "./");

    // FileMode.Create overwrites an existing file; other processes may still read it while it is written.
    var filePath = Path.Combine(directory, targetFileName);
    using (var output = File.Open(filePath, FileMode.Create, FileAccess.Write, FileShare.Read))
    {
        StoreXmlDoc(document, output);
    }
}
/// <summary>
/// Builds a text geometry model from a page element.
/// </summary>
/// <param name="pageElement">The page element to parse</param>
/// <param name="defaultGridUnit">
/// The grid unit used when the element's own grid unit equals <c>GridUnit.UNKNOWN_UNITS</c>;
/// ignored when null
/// </param>
/// <returns>The model filled with the page's text blocks, separators and tables</returns>
private static TextGeometryModel ParsePage([NotNull] this XElement pageElement, [CanBeNull] GridUnit defaultGridUnit)
{
    var pageBox = pageElement.ParseBBoxAttribute();
    var pageGridUnit = pageElement.ParseGridUnitAttribute();

    var useDefault = GridUnit.UNKNOWN_UNITS.Equals(pageGridUnit) && defaultGridUnit != null;
    if (useDefault)
    {
        pageGridUnit = defaultGridUnit;
    }

    var result = new TextGeometryModel(pageBox, pageGridUnit);

    ParseTextBlocks(pageElement, result);
    ParseSeparators(pageElement, result);
    ParseTables(pageElement, result);

    return result;
}
/// <summary>
/// Appends to the sample model every Tesseract word that has no counterpart in the given word list.
/// </summary>
/// <remarks>
/// All appended words go into one new block / paragraph / line that share a single bounding
/// box copied from the Tesseract page box. Appended words get accuracy 50.
/// </remarks>
/// <param name="tesseractModel">The model whose words are considered</param>
/// <param name="sampleModel">The model that receives the additional words</param>
/// <param name="set">The words that are already present; matching words are skipped</param>
private static void AddWordsFromTesseract(TextGeometryModel tesseractModel, TextGeometryModel sampleModel, List<GMWord> set)
{
    var pageBox = new BoundingBox(
        tesseractModel.PageBox.XMin,
        tesseractModel.PageBox.YMin,
        tesseractModel.PageBox.XMax,
        tesseractModel.PageBox.YMax);

    var targetLine = sampleModel.AddTextBlock(pageBox).AddParagraph(pageBox).AddLine(pageBox);

    foreach (var candidate in tesseractModel.Words())
    {
        // The comparer matches bounding boxes with some inaccuracy, so near-identical words count as present.
        if (set.Contains(candidate, GMWordComparerWithInaccuracy.Instance))
        {
            continue;
        }

        var candidateBox = candidate.BoundingBox;
        var copiedBox = new BoundingBox(candidateBox.XMin, candidateBox.YMin, candidateBox.XMax, candidateBox.YMax);
        targetLine.AddWord(new GMWord(copiedBox, candidate.Text, 50));
    }
}
/// <summary>
/// Initializes recognition results that are marked as not deskewed and carry no deskew parameters
/// </summary>
/// <param name="geometryModel">The model to store</param>
public RecognitionResult(TextGeometryModel geometryModel)
{
    isDeskewed = false;
    deskewParameters = null;
    this.geometryModel = geometryModel;
}
/// <summary>
/// Initializes recognition results that are marked as deskewed with the given transformation
/// </summary>
/// <param name="geometryModel">The model to store</param>
/// <param name="deskewParameters">The deskew transformation parameters to store alongside the model</param>
public RecognitionResult([NotNull] TextGeometryModel geometryModel, [NotNull] DeskewParameters deskewParameters)
{
    isDeskewed = true;
    this.deskewParameters = deskewParameters;
    this.geometryModel = geometryModel;
}
/// <summary>
/// Builds an hOCR document that describes the given text geometry model.
/// </summary>
/// <remarks>
/// The document has a head (title, stylesheet link, two script references and meta elements)
/// and a body holding one page element with id "page_1". Text blocks, separators and tables
/// of the model are added to the page element in that order.
/// Attribute values built from model values are formatted with the invariant culture, so the
/// generated document does not depend on the culture of the calling thread.
/// </remarks>
/// <param name="model">The model to convert</param>
/// <param name="pathToScripts">Prefix prepended to the "hocr.css", "jquery.js" and "hocr.js" references</param>
/// <returns>The generated document</returns>
public static XDocument ProcessModel([NotNull] TextGeometryModel model, string pathToScripts = "")
{
    // Machine-readable output: never format with the current thread culture.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    var headElement = new XElement(HocrFormatConsts.XNameHead,
        new XElement(HocrFormatConsts.XNameTitle, "text geometric model"),
        new XElement(HocrFormatConsts.XNameLink,
            new XAttribute("href", pathToScripts + "hocr.css"),
            new XAttribute("rel", "stylesheet")),
        // The single-space content keeps the script elements from being written as empty elements.
        new XElement(HocrFormatConsts.XNameScript, new XAttribute("src", pathToScripts + "jquery.js"), " "),
        new XElement(HocrFormatConsts.XNameScript, new XAttribute("src", pathToScripts + "hocr.js"), " "),
        MakeXElementWithAttributes(HocrFormatConsts.XNameMeta, new Dictionary<string, string>()
        {
            { "http-equiv", "Content-Type" },
            { "content", "text/html; charset=utf-8" }
        }),
        MakeMetaElement("ocr-system", "kontur-recognition"),
        MakeMetaElement("ocr-capabilities", "ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf ocr_separator ocr_table"));

    var bodyElement = new XElement(HocrFormatConsts.XNameBody);
    var document = new XDocument(new XElement(HocrFormatConsts.XNameHtml, headElement, bodyElement));

    var pageElement = MakeXElementWithAttributes(
        HocrFormatConsts.XNameDiv,
        new Dictionary<string, string>()
        {
            { "class", HocrFormatConsts.ClassPage },
            { "id", "page_1" },
            { "title", string.Format(invariant, "{0}; ppageno 0", FormatTitleAttribute(model.PageBox)) },
            { "gridunit", string.Format(invariant, "{0}", model.GridUnit.Divisor) }
        });
    bodyElement.Add(pageElement);

    // One counter is shared by all element kinds so ids are handed out from a single source.
    var idCounter = new IdCounter();

    foreach (var textBlock in model.TextBlocks())
    {
        pageElement.Add(MakeTextBlockElement(textBlock, idCounter));
    }

    foreach (var separator in model.Separators())
    {
        var separatorElement = MakeXElementWithAttributes(
            HocrFormatConsts.XNameDiv,
            new Dictionary<string, string>()
            {
                { "class", HocrFormatConsts.ClassSeparator },
                { "id", idCounter.NextSeparatorId },
                { "title", FormatTitleAttribute(separator.BoundingBox) },
                {
                    "separator",
                    string.Format(
                        invariant,
                        "{0} {1} {2} {3} {4}",
                        separator.StartPointX,
                        separator.StartPointY,
                        separator.EndPointX,
                        separator.EndPointY,
                        separator.Width)
                },
            });
        pageElement.Add(separatorElement);
    }

    foreach (GMTable table in model.Tables())
    {
        var tableElement = MakeXElementWithAttributes(
            HocrFormatConsts.XNameDiv,
            new Dictionary<string, string>()
            {
                { "class", HocrFormatConsts.ClassTable },
                { "id", idCounter.NextTableId },
                { "title", FormatTitleAttribute(table.BoundingBox) },
                { HocrFormatConsts.AttrTableRowsCount, string.Format(invariant, "{0}", table.RowsCount) },
                { HocrFormatConsts.AttrTableColsCount, string.Format(invariant, "{0}", table.ColsCount) }
            });

        foreach (var cell in table.Cells())
        {
            tableElement.Add(MakeCellElement(cell, idCounter));
        }

        pageElement.Add(tableElement);
    }

    return document;
}
/// <summary>
/// Converts the model to a document (script references prefixed with "./") and writes it to the stream.
/// </summary>
/// <param name="geometryModel">The model to store</param>
/// <param name="target">The stream that receives the document</param>
public static void StoreModel(TextGeometryModel geometryModel, Stream target)
{
    StoreXmlDoc(ProcessModel(geometryModel, "./"), target);
}