/// <summary>
/// Builds a copy of <paramref name="textBlock"/> in which every bounding box (block, paragraphs,
/// lines, words and standalone words) has been passed through <paramref name="boxConverter"/>.
/// Word text and accuracy are copied unchanged.
/// </summary>
/// <param name="textBlock">Source text block; it is only read.</param>
/// <param name="boxConverter">Function mapping a source bounding box to the target bounding box.</param>
/// <returns>A newly created text block with converted bounding boxes.</returns>
private static GMTextBlock TransformTextBlock(GMTextBlock textBlock, Func<BoundingBox, BoundingBox> boxConverter)
{
    var convertedBlock = new GMTextBlock(boxConverter(textBlock.BoundingBox));

    // Mirror the paragraph -> line -> word hierarchy of the source block.
    foreach (var sourceParagraph in textBlock.Paragraphs())
    {
        var convertedParagraph = convertedBlock.AddParagraph(boxConverter(sourceParagraph.BoundingBox));

        foreach (var sourceLine in sourceParagraph.Lines())
        {
            var convertedLine = convertedParagraph.AddLine(boxConverter(sourceLine.BoundingBox));

            foreach (var sourceWord in sourceLine.Words())
            {
                var wordBox = boxConverter(sourceWord.BoundingBox);
                convertedLine.AddWord(new GMWord(wordBox, sourceWord.Text, sourceWord.Accuracy));
            }
        }
    }

    // Words attached directly to the block (outside of any paragraph).
    foreach (var standaloneWord in textBlock.StandaloneWords())
    {
        var wordBox = boxConverter(standaloneWord.BoundingBox);
        convertedBlock.AddStandaloneWord(new GMWord(wordBox, standaloneWord.Text, standaloneWord.Accuracy));
    }

    return convertedBlock;
}
/// <summary>
/// Tells whether a word may be used for angle detection, i.e. whether its text
/// matches <c>validWordPattern</c> (declared elsewhere in this file).
/// </summary>
/// <param name="word">Word whose <c>Text</c> is tested.</param>
/// <returns><c>true</c> when the word's text matches the pattern; otherwise <c>false</c>.</returns>
private static bool IsWordValidForAngleDetection(GMWord word)
{
    return validWordPattern.IsMatch(word.Text);
}
/// <summary>
/// Assuming that firstWord and secondWord carry bounding boxes of two words of the same line
/// (edges of bounding boxes are aligned to the X and Y axes), this method detects the angle between
/// the line and the positive direction of the X-axis.
/// The angle is taken from the direction of the segment connecting the centers of the two bounding boxes.
/// </summary>
/// <param name="firstWord">Word at the start of the direction vector.</param>
/// <param name="secondWord">Word at the end of the direction vector.</param>
/// <returns>
/// Angle in radians in the range [-PI, PI), measured with the Y-axis pointing up
/// (page Y is inverted before the angle is computed). Returns 0 when both centers coincide.
/// </returns>
private static double DetectAngle(GMWord firstWord, GMWord secondWord)
{
    var firstBBox = firstWord.BoundingBox;
    var secondBBox = secondWord.BoundingBox;

    // Page coordinates have their origin in the top-left corner,
    // the positive direction of the Y-axis points down,
    // the positive direction of the X-axis points right.
    // XMin + XMax is twice the center coordinate; the common factor 2 does not change the direction.
    int dirX = (secondBBox.XMin + secondBBox.XMax) - (firstBBox.XMin + firstBBox.XMax);
    int dirY = (secondBBox.YMin + secondBBox.YMax) - (firstBBox.YMin + firstBBox.YMax);
    dirY = -dirY; // Invert the Y-axis direction so that "up" is positive

    // An exactly leftward vector is reported as -PI (Math.Atan2 alone would give +PI);
    // this keeps the value callers have always received for that case.
    if (dirX < 0 && dirY == 0)
    {
        return -Math.PI;
    }

    // Math.Atan2 handles every quadrant, including steep vectors, for which the former
    // hand-written branch used the wrong sign, and the zero vector, which used to yield NaN
    // through 0/0 and now yields 0.
    return Math.Atan2(dirY, dirX);
}
/// <summary>
/// Creates a copy of <paramref name="textBlock"/> without empty elements: words whose text is
/// empty or whitespace-only are dropped, as are lines left without words and paragraphs left
/// without lines. Standalone words are filtered by the same text rule.
/// </summary>
/// <param name="textBlock">Source text block; it is only read.</param>
/// <returns>A newly created text block containing only the non-empty elements.</returns>
private static GMTextBlock RemoveEmptyElements(this GMTextBlock textBlock)
{
    var textBlockNew = new GMTextBlock(textBlock.BoundingBox);

    foreach (var paragraph in textBlock.Paragraphs())
    {
        var paragraphNew = new GMParagraph(paragraph.BoundingBox);

        foreach (var line in paragraph.Lines())
        {
            var lineNew = new GMLine(line.BoundingBox);

            foreach (var word in line.Words())
            {
                if (word.Text.Trim().Length > 0)
                {
                    lineNew.AddWord(new GMWord(word.BoundingBox, word.Text, word.Accuracy));
                }
            }

            // Keep the line only if at least one word survived the filter.
            if (lineNew.Words().Any())
            {
                paragraphNew.AddLine(lineNew);
            }
        }

        // Keep the paragraph only if at least one line survived the filter.
        if (paragraphNew.Lines().Any())
        {
            textBlockNew.AddParagraph(paragraphNew);
        }
    }

    // Standalone words belong to the block itself, so they are copied exactly once, outside the
    // paragraph loop. Copying them inside that loop duplicated them once per paragraph and
    // lost them altogether for a block without paragraphs.
    foreach (var word in textBlock.StandaloneWords())
    {
        if (word.Text.Trim().Length > 0)
        {
            textBlockNew.AddStandaloneWord(new GMWord(word.BoundingBox, word.Text, word.Accuracy));
        }
    }

    return textBlockNew;
}
/// <summary>
/// Appends the given word to this object's <c>words</c> collection.
/// </summary>
/// <param name="word">Word to append; it is stored as passed, without validation.</param>
public void AddWord(GMWord word)
{
    words.Add(word);
}
/// <summary>
/// Alternative approach to angle detection: this method uses the segment connecting those vertices
/// of the words' bounding rectangles which are the closest to the base line of the text.
/// </summary>
/// <param name="firstWord">Word at the start of the direction vector.</param>
/// <param name="secondWord">Word at the end of the direction vector.</param>
/// <returns>
/// Angle in radians in the range [-PI, PI), measured with the Y-axis pointing up
/// (page Y is inverted before the angle is computed). Returns 0 when the selected vertices coincide.
/// </returns>
private static double DetectAngle3(GMWord firstWord, GMWord secondWord)
{
    var firstBBox = firstWord.BoundingBox;
    var secondBBox = secondWord.BoundingBox;

    Func<BoundingBox, int> selectorX;
    Func<BoundingBox, int> selectorY;

    // Page coordinates have their origin in the top-left corner,
    // the positive direction of the Y-axis points down,
    // the positive direction of the X-axis points right.
    if (secondBBox.XMax < firstBBox.XMax)
    {
        // Page is rotated upside-down (words in the line go from right to left)
        selectorY = (bbox => bbox.YMin);
        if (firstBBox.YMin >= secondBBox.YMin)
        {
            // Text goes from the down-right corner to the upper-left corner
            selectorX = (bbox => bbox.XMin);
        }
        else
        {
            // Text goes from the upper-right corner to the down-left corner
            selectorX = (bbox => bbox.XMax);
        }
    }
    else
    {
        // Page is properly rotated (words in the line go from left to right)
        selectorY = (bbox => bbox.YMax);
        if (firstBBox.YMax >= secondBBox.YMax)
        {
            // Text goes from the down-left corner to the upper-right corner
            selectorX = (bbox => bbox.XMin);
        }
        else
        {
            // Text goes from the upper-left corner to the down-right corner
            selectorX = (bbox => bbox.XMax);
        }
    }

    // Coordinates of the text direction vector
    int dirX = selectorX(secondBBox) - selectorX(firstBBox);
    int dirY = selectorY(secondBBox) - selectorY(firstBBox);
    dirY = -dirY; // Invert the Y-axis direction so that "up" is positive

    // An exactly leftward vector is reported as -PI (Math.Atan2 alone would give +PI);
    // this keeps the value callers have always received for that case.
    if (dirX < 0 && dirY == 0)
    {
        return -Math.PI;
    }

    // Math.Atan2 handles every quadrant, including steep vectors, for which the former
    // hand-written branch used the wrong sign, and the zero vector, which used to yield NaN
    // through 0/0 and now yields 0.
    return Math.Atan2(dirY, dirX);
}
/// <summary>
/// Appends the given standalone word to this object's <c>words</c> collection.
/// </summary>
/// <param name="word">Word to append; it is stored as passed, without validation.</param>
public void AddStandaloneWord(GMWord word)
{
    words.Add(word);
}