/// <summary>
/// Looks for the word nearest to <paramref name="pivot"/> among the words whose angle
/// relative to the pivot lies in [<paramref name="angleStart"/>, <paramref name="angleEnd"/>].
/// </summary>
/// <param name="words">The words to search in.</param>
/// <param name="pivot">The reference word; it is removed from the candidates after the angle filter.</param>
/// <param name="funcPivotDist">Selects, from the pivot's bounding box, the point used for distance measurements.</param>
/// <param name="funcPivotAngle">Selects, from the pivot's bounding box, the point used for angle measurements.</param>
/// <param name="funcPointsDist">Selects, from a candidate's bounding box, the point used for distance measurements.</param>
/// <param name="funcPointsAngle">Selects, from a candidate's bounding box, the point used for angle measurements.</param>
/// <param name="angleStart">Inclusive lower bound of the accepted angle, as returned by <c>Distances.Angle</c>.</param>
/// <param name="angleEnd">Inclusive upper bound of the accepted angle, as returned by <c>Distances.Angle</c>.</param>
/// <param name="finalDistMEasure">Distance function used to compute the reported distance between the pivot point and the nearest candidate point.</param>
/// <returns>
/// A two-element array: [0] the result of <c>Mode()</c> over the pivot letters' font sizes,
/// [1] the distance given by <paramref name="finalDistMEasure"/>; or <c>null</c> when no candidate
/// passes the angle filter or the nearest-point search returns a negative index.
/// </returns>
private double[] GetNearestPointData(Word[] words, Word pivot,
    Func<PdfRectangle, PdfPoint> funcPivotDist,
    Func<PdfRectangle, PdfPoint> funcPivotAngle,
    Func<PdfRectangle, PdfPoint> funcPointsDist,
    Func<PdfRectangle, PdfPoint> funcPointsAngle,
    double angleStart, double angleEnd,
    Func<PdfPoint, PdfPoint, double> finalDistMEasure)
{
    var pivotDistancePoint = funcPivotDist(pivot.BoundingBox);

    // Keep only the words whose angle to the pivot falls inside the inclusive range.
    Func<Word, bool> isWithinAngleRange = candidate =>
    {
        var angle = Distances.Angle(funcPivotAngle(pivot.BoundingBox), funcPointsAngle(candidate.BoundingBox));
        return angle >= angleStart && angle <= angleEnd;
    };

    var candidates = words.Where(isWithinAngleRange).ToList();
    candidates.Remove(pivot); // the pivot must not be matched with itself

    if (candidates.Count == 0)
    {
        return null;
    }

    var candidatePoints = candidates.Select(candidate => funcPointsDist(candidate.BoundingBox)).ToList();
    int nearestIndex = pivotDistancePoint.FindIndexNearest(candidatePoints, Distances.Euclidean, out double _);

    if (nearestIndex < 0)
    {
        return null;
    }

    var nearestWord = candidates[nearestIndex];
    var fontSizeMode = (double)pivot.Letters.Select(letter => letter.FontSize).Mode();
    var distance = finalDistMEasure(pivotDistancePoint, funcPointsDist(nearestWord.BoundingBox));

    return new double[] { fontSizeMode, distance };
}
/// <summary>
/// Groups words into text lines using <c>ClusteringAlgorithms.ClusterNearestNeighbours</c>.
/// The bottom-right corner of a pivot word is compared with the bottom-left corner of each
/// candidate word, using the Euclidean distance and an angle filter.
/// </summary>
/// <param name="words">The words to group. An empty list yields no lines.</param>
/// <param name="maxDist">The maximum distance passed to the clustering for every (pivot, candidate) pair.</param>
/// <param name="withinLine">Inclusive lower/upper bounds of the angle between the pivot's bottom-right and the candidate's bottom-left corners.</param>
/// <returns>One <see cref="TextLine"/> per cluster, its words ordered according to the text direction of the first word.</returns>
private static IEnumerable<TextLine> GetLines(List<Word> words, double maxDist, AngleBounds withinLine)
{
    // Nothing to cluster; also avoids indexing into an empty list below.
    if (words.Count == 0)
    {
        yield break;
    }

    // The text direction of the first word decides the reading order for every line.
    TextDirection textDirection = words[0].TextDirection;

    var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(words,
        Distances.Euclidean,
        (pivot, candidate) => maxDist,
        pivot => pivot.BoundingBox.BottomRight,
        candidate => candidate.BoundingBox.BottomLeft,
        pivot => true,
        (pivot, candidate) =>
        {
            // Compare bottom right with bottom left for angle
            var withinLineAngle = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft);
            return withinLineAngle >= withinLine.Lower && withinLineAngle <= withinLine.Upper;
        }).ToList();

    // Default ordering: left to right. Overridden below for rotated text.
    Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
    if (textDirection == TextDirection.Rotate180)
    {
        orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Right).ToList();
    }
    else if (textDirection == TextDirection.Rotate90)
    {
        orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Top).ToList();
    }
    else if (textDirection == TextDirection.Rotate270)
    {
        orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList();
    }

    // Each group holds indexes into 'words'.
    foreach (var group in groupedIndexes)
    {
        yield return new TextLine(orderFunc(group.Select(i => words[i])));
    }
}
/// <summary>
/// Picks the obstacle whose centroid is nearest (Euclidean distance) to the centroid of <c>Bound</c>.
/// </summary>
/// <returns>
/// The obstacle at the index reported by <c>Distances.FindIndexNearest</c>,
/// or the first obstacle when that index is -1.
/// </returns>
public PdfRectangle GetPivot()
{
    var boundCentroid = Bound.Centroid;
    var obstacleCentroids = Obstacles.Select(obstacle => obstacle.Centroid).ToList();

    int nearestIndex = Distances.FindIndexNearest(boundCentroid,
        obstacleCentroids,
        p => p,
        p => p,
        Distances.Euclidean,
        out double _);

    // -1 means no nearest obstacle was reported: fall back to the first one.
    if (nearestIndex == -1)
    {
        return Obstacles.First();
    }

    return Obstacles.ElementAt(nearestIndex);
}
/// <summary>
/// Computes the oriented bounding box of a set of lines: the minimum area rectangle of all
/// the lines' bounding box corners, with its corners permuted so that its rotation is the
/// closest to the baseline angle of the last line.
/// </summary>
/// <param name="lines">The lines to bound. The last line is used to measure the baseline angle.</param>
/// <returns>The candidate rectangle whose rotation differs the least from the baseline angle.</returns>
private PdfRectangle GetBoundingBoxOther(IReadOnlyList<TextLine> lines)
{
    var corners = lines.SelectMany(line => new[]
    {
        line.BoundingBox.BottomLeft,
        line.BoundingBox.BottomRight,
        line.BoundingBox.TopLeft,
        line.BoundingBox.TopRight
    });

    // The minimum area rectangle is the initial best candidate; the other
    // candidates are the same rectangle with its corners permuted.
    var best = Geometry.GeometryExtensions.MinimumAreaRectangle(corners);
    var alternatives = new[]
    {
        new PdfRectangle(best.BottomLeft, best.TopLeft, best.BottomRight, best.TopRight),
        new PdfRectangle(best.BottomRight, best.BottomLeft, best.TopRight, best.TopLeft),
        new PdfRectangle(best.TopRight, best.BottomRight, best.TopLeft, best.BottomLeft)
    };

    // Find the orientation of the OBB, using the baseline angle.
    // Assumes line order is correct.
    var lastLineBox = lines[lines.Count - 1].BoundingBox;
    var baseLineAngle = Distances.BoundAngle180(Distances.Angle(lastLineBox.BottomLeft, lastLineBox.BottomRight));

    double smallestDelta = Math.Abs(Distances.BoundAngle180(best.Rotation - baseLineAngle));
    foreach (var alternative in alternatives)
    {
        double delta = Math.Abs(Distances.BoundAngle180(alternative.Rotation - baseLineAngle));
        if (delta < smallestDelta)
        {
            smallestDelta = delta;
            best = alternative;
        }
    }

    return best;
}
/// <summary>
/// Measures the distance from the pivot to its nearest word, considering only
/// the words whose angle relative to the pivot is accepted by <paramref name="angleBounds"/>.
/// </summary>
/// <param name="words">The words to search in. The pivot itself (same reference) is ignored.</param>
/// <param name="pivot">The reference word.</param>
/// <param name="funcPivotDist">Selects, from the pivot's bounding box, the point used for distance measurements.</param>
/// <param name="funcPivotAngle">Selects, from the pivot's bounding box, the point used for angle measurements.</param>
/// <param name="funcPointsDist">Selects, from a candidate's bounding box, the point used for distance measurements.</param>
/// <param name="funcPointsAngle">Selects, from a candidate's bounding box, the point used for angle measurements.</param>
/// <param name="angleBounds">The angle range a candidate must fall in, tested with <c>Contains</c>.</param>
/// <param name="finalDistanceMeasure">Distance function used to compute the returned value.</param>
/// <returns>
/// The distance between the pivot point and the nearest candidate point, or <c>null</c> when
/// no candidate is within the angle bounds or no valid nearest index is found.
/// </returns>
private double? GetNearestPointDistance(List<Word> words, Word pivot,
    Func<PdfRectangle, PdfPoint> funcPivotDist,
    Func<PdfRectangle, PdfPoint> funcPivotAngle,
    Func<PdfRectangle, PdfPoint> funcPointsDist,
    Func<PdfRectangle, PdfPoint> funcPointsAngle,
    AngleBounds angleBounds,
    Func<PdfPoint, PdfPoint, double> finalDistanceMeasure)
{
    var pivotDistancePoint = funcPivotDist(pivot.BoundingBox);
    var pivotAnglePoint = funcPivotAngle(pivot.BoundingBox);

    // Collect the distance points of every word (other than the pivot) within the angle range.
    var candidatePoints = new List<PdfPoint>();
    foreach (var candidate in words)
    {
        if (!ReferenceEquals(candidate, pivot)
            && angleBounds.Contains(Distances.Angle(pivotAnglePoint, funcPointsAngle(candidate.BoundingBox))))
        {
            candidatePoints.Add(funcPointsDist(candidate.BoundingBox));
        }
    }

    if (candidatePoints.Count > 0)
    {
        var nearestIndex = pivotDistancePoint.FindIndexNearest(candidatePoints,
            p => p,
            p => p,
            Distances.Euclidean,
            out _);

        // Only trust an index that actually points inside the candidate list.
        if (nearestIndex >= 0 && nearestIndex < candidatePoints.Count)
        {
            return finalDistanceMeasure(pivotDistancePoint, candidatePoints[nearestIndex]);
        }
    }

    return null;
}
/// <summary>
/// Build lines via transitive closure.
/// The bottom-right corner of a pivot word is compared with the bottom-left corner of each
/// candidate word, using the Euclidean distance and an angle filter.
/// </summary>
/// <param name="words">The words to group. An empty list yields no lines.</param>
/// <param name="maxDist">The maximum distance passed to the clustering for every (pivot, candidate) pair.</param>
/// <param name="withinLine">Inclusive lower/upper bounds of the angle between the pivot's bottom-right and the candidate's bottom-left corners.</param>
/// <returns>One <see cref="TextLine"/> per group, its words ordered according to the text direction of the first word.</returns>
private static IEnumerable<TextLine> GetLines(List<Word> words, double maxDist, AngleBounds withinLine)
{
    /***************************************************************************************************
     * /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'word Width', the algo might not
     * work as the FindIndexNearest() function might pair the pivot with itself (the pivot's right point
     * (distance = width) is closer than other words' left point).
     * -> Solution would be to find more than one nearest neighbours. Use KDTree?
     ***************************************************************************************************/

    // Nothing to group; also avoids indexing into an empty list below.
    if (words.Count == 0)
    {
        yield break;
    }

    // The text direction of the first word decides the reading order for every line.
    TextDirection textDirection = words[0].TextDirection;

    var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(words,
        Distances.Euclidean,
        (pivot, candidate) => maxDist,
        pivot => pivot.BoundingBox.BottomRight,
        candidate => candidate.BoundingBox.BottomLeft,
        pivot => true,
        (pivot, candidate) =>
        {
            // Compare bottom right with bottom left for angle
            var withinLineAngle = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft);
            return withinLineAngle >= withinLine.Lower && withinLineAngle <= withinLine.Upper;
        }).ToList();

    // Default ordering: left to right. Overridden below for rotated text.
    Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
    if (textDirection == TextDirection.Rotate180)
    {
        orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Right).ToList();
    }
    else if (textDirection == TextDirection.Rotate90)
    {
        orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Top).ToList();
    }
    else if (textDirection == TextDirection.Rotate270)
    {
        orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList();
    }

    // Each group holds indexes into 'words'.
    foreach (var group in groupedIndexes)
    {
        yield return new TextLine(orderFunc(group.Select(i => words[i])));
    }
}
/// <summary>
/// Build blocks via transitive closure.
/// </summary>
/// <param name="lines">The lines to group into blocks.</param>
/// <param name="maxDist">The maximum distance passed to the clustering for every (pivot, candidate) pair.</param>
/// <returns>One <see cref="TextBlock"/> per group of line indexes returned by the clustering.</returns>
private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist)
{
    /**************************************************************************************************
     * Distance between two lines:
     *  - Check whether the two lines overlap horizontally.
     *  - If they do, take the middle point (new X coordinate) of the overlapping area and compute
     *    the Euclidean distance between these two middle points.
     *  - If they do not, the distance is set to the max distance.
     *
     * /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'line Height', the algo won't
     * work as the FindIndexNearest() function will always pair the pivot with itself (the pivot's top
     * point (distance = height) is closer than other lines' top point).
     * -> Solution would be to find more than one nearest neighbours. Use KDTree?
     **************************************************************************************************/
    Func<PdfLine, PdfLine, double> overlappingMiddleDistance = (first, second) =>
    {
        var overlapStart = Math.Max(first.Point1.X, second.Point1.X);
        var overlapEnd = Math.Min(first.Point2.X, second.Point2.X);
        var overlapWidth = overlapEnd - overlapStart;

        if (overlapWidth < 0)
        {
            // not overlapping -> max distance
            return double.MaxValue;
        }

        var middleX = overlapStart + overlapWidth / 2;
        return Distances.Euclidean(
            new PdfPoint(middleX, first.Point1.Y),
            new PdfPoint(middleX, second.Point1.Y));
    };

    // The pivot's bottom edge is compared with the candidate's top edge.
    var groups = ClusteringAlgorithms.SimpleTransitiveClosure(lines,
        overlappingMiddleDistance,
        (pivot, candidate) => maxDist,
        pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
        candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
        pivot => true,
        (pivot, candidate) => true).ToList();

    // Each group holds indexes into 'lines'.
    foreach (var group in groups)
    {
        yield return new TextBlock(group.Select(index => lines[index]).ToList());
    }
}
/// <summary>
/// Groups lines into blocks using <c>ClusteringAlgorithms.ClusterNearestNeighbours</c>.
/// </summary>
/// <param name="lines">The lines to group into blocks.</param>
/// <param name="maxDist">The maximum distance passed to the clustering for every (pivot, candidate) pair.</param>
/// <param name="maxDegreeOfParallelism">Forwarded unchanged to the clustering algorithm.</param>
/// <returns>One <see cref="TextBlock"/> per group of line indexes returned by the clustering.</returns>
private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist, int maxDegreeOfParallelism)
{
    /**************************************************************************************************
     * Distance between two lines:
     *  - Check whether the two lines overlap horizontally.
     *  - If they do, take the middle point (new X coordinate) of the overlapping area and compute
     *    the Euclidean distance between these two middle points.
     *  - If they do not, the distance is set to the max distance.
     **************************************************************************************************/
    Func<PdfLine, PdfLine, double> overlappingMiddleDistance = (first, second) =>
    {
        var overlapStart = Math.Max(first.Point1.X, second.Point1.X);
        var overlapEnd = Math.Min(first.Point2.X, second.Point2.X);
        var overlapWidth = overlapEnd - overlapStart;

        if (overlapWidth < 0)
        {
            // not overlapping -> max distance
            return double.MaxValue;
        }

        var middleX = overlapStart + overlapWidth / 2;
        return Distances.Euclidean(
            new PdfPoint(middleX, first.Point1.Y),
            new PdfPoint(middleX, second.Point1.Y));
    };

    // The pivot's bottom edge is compared with the candidate's top edge.
    var groups = ClusteringAlgorithms.ClusterNearestNeighbours(lines,
        overlappingMiddleDistance,
        (pivot, candidate) => maxDist,
        pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
        candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
        pivot => true,
        (pivot, candidate) => true,
        maxDegreeOfParallelism).ToList();

    // Each group holds indexes into 'lines'.
    foreach (var group in groups)
    {
        yield return new TextBlock(group.Select(index => lines[index]).ToList());
    }
}
/// <summary>
/// Computes an oriented bounding box (OBB) around the given words.
/// <para>
/// A line is fitted through the bottom-left and bottom-right corners of the words to estimate
/// the slope; the corners are rotated by the opposite angle, an axis-aligned box is built, and the
/// box is rotated back. Among the four corner permutations of that box, the one whose rotation is
/// closest to the angle between the first word's bottom-left and the last word's bottom-right is returned.
/// </para>
/// </summary>
/// <param name="words">The words to bound. Word order is assumed to be correct (first and last words define the baseline angle).</param>
/// <returns>The oriented bounding box of the words.</returns>
private static PdfRectangle GetBoundingBoxOther(IReadOnlyList<Word> words)
{
    var baseLinePoints = words.SelectMany(r => new[]
    {
        r.BoundingBox.BottomLeft,
        r.BoundingBox.BottomRight,
    }).ToList();

    // Fitting a line through the base lines points
    // to find the orientation (slope)
    double x0 = baseLinePoints.Average(p => p.X);
    double y0 = baseLinePoints.Average(p => p.Y);
    double sumProduct = 0;
    double sumDiffSquaredX = 0;

    foreach (var point in baseLinePoints)
    {
        var x_diff = point.X - x0;
        var y_diff = point.Y - y0;
        sumProduct += x_diff * y_diff;
        sumDiffSquaredX += x_diff * x_diff;
    }

    // Defaults describe a vertical line (no spread along X).
    double cos = 0;
    double sin = 1;
    if (sumDiffSquaredX > 1e-3)
    {
        // not a vertical line
        double angleRad = Math.Atan(sumProduct / sumDiffSquaredX); // -π/2 ≤ θ ≤ π/2
        cos = Math.Cos(angleRad);
        sin = Math.Sin(angleRad);
    }

    // Rotate the points to build the axis-aligned bounding box (AABB)
    var inverseRotation = new TransformationMatrix(
        cos, -sin, 0,
        sin, cos, 0,
        0, 0, 1);

    // Materialised once: the sequence is scanned four times below (min/max of X and Y),
    // and a deferred query would redo the SelectMany/Distinct/Transform work on every scan.
    var transformedPoints = words.SelectMany(r => new[]
    {
        r.BoundingBox.BottomLeft,
        r.BoundingBox.BottomRight,
        r.BoundingBox.TopLeft,
        r.BoundingBox.TopRight
    }).Distinct().Select(p => inverseRotation.Transform(p)).ToList();

    var aabb = new PdfRectangle(transformedPoints.Min(p => p.X),
                                transformedPoints.Min(p => p.Y),
                                transformedPoints.Max(p => p.X),
                                transformedPoints.Max(p => p.Y));

    // Rotate back the AABB to obtain to oriented bounding box (OBB)
    var rotateBack = new TransformationMatrix(
        cos, sin, 0,
        -sin, cos, 0,
        0, 0, 1);

    // Candidates bounding boxes
    var obb = rotateBack.Transform(aabb);
    var obb1 = new PdfRectangle(obb.BottomLeft, obb.TopLeft, obb.BottomRight, obb.TopRight);
    var obb2 = new PdfRectangle(obb.BottomRight, obb.BottomLeft, obb.TopRight, obb.TopLeft);
    var obb3 = new PdfRectangle(obb.TopRight, obb.BottomRight, obb.TopLeft, obb.BottomLeft);

    // Find the orientation of the OBB, using the baseline angle
    // Assumes word order is correct
    var firstWord = words[0];
    var lastWord = words[words.Count - 1];

    var baseLineAngle = Distances.Angle(firstWord.BoundingBox.BottomLeft, lastWord.BoundingBox.BottomRight);

    // Keep the candidate whose rotation is strictly closer to the baseline angle.
    double deltaAngle = Math.Abs(Distances.BoundAngle180(obb.Rotation - baseLineAngle));
    double deltaAngle1 = Math.Abs(Distances.BoundAngle180(obb1.Rotation - baseLineAngle));
    if (deltaAngle1 < deltaAngle)
    {
        deltaAngle = deltaAngle1;
        obb = obb1;
    }

    double deltaAngle2 = Math.Abs(Distances.BoundAngle180(obb2.Rotation - baseLineAngle));
    if (deltaAngle2 < deltaAngle)
    {
        deltaAngle = deltaAngle2;
        obb = obb2;
    }

    double deltaAngle3 = Math.Abs(Distances.BoundAngle180(obb3.Rotation - baseLineAngle));
    if (deltaAngle3 < deltaAngle)
    {
        obb = obb3;
    }

    return obb;
}