/// <summary>
/// Build blocks via transitive closure.
/// </summary>
/// <param name="lines">The text lines to group into blocks.</param>
/// <param name="maxDist">The maximum distance handed to the clustering step for every pivot/candidate pair.</param>
/// <returns>
/// One <see cref="TextBlock"/> per group of line indexes returned by the clustering step.
/// This is an iterator: nothing is computed until the returned sequence is enumerated.
/// </returns>
private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist)
{
    /**************************************************************************************************
     * We want to measure the distance between two lines using the following method:
     *  We check if two lines are overlapping horizontally.
     *  If they are overlapping, we compute the middle point (new X coordinate) of the overlapping area.
     *  We finally compute the Euclidean distance between these two middle points.
     *  If the two lines are not overlapping, the distance is set to the max distance.
     *
     * /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'line Height', the algo won't
     * work as the FindIndexNearest() function will always pair the pivot with itself (the pivot's top
     * point (distance = height) is closer than other lines' top point).
     * -> Solution would be to find more than one nearest neighbours. Use KDTree?
     **************************************************************************************************/
    Func<PdfLine, PdfLine, double> euclideanOverlappingMiddleDistance = (l1, l2) =>
    {
        // Horizontal overlap: rightmost start to leftmost end.
        var left = Math.Max(l1.Point1.X, l2.Point1.X);
        var overlap = Math.Min(l1.Point2.X, l2.Point2.X) - left;
        if (overlap < 0)
        {
            return double.MaxValue; // not overlapping -> max distance
        }

        // Both points share the same X (middle of the overlap); only their Y differs.
        var middleX = left + overlap / 2;
        return Distances.Euclidean(
            new PdfPoint(middleX, l1.Point1.Y),
            new PdfPoint(middleX, l2.Point1.Y));
    };

    var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(
        lines,
        euclideanOverlappingMiddleDistance,
        (pivot, candidate) => maxDist, // same max distance for every pair
        pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight), // pivot: bottom edge
        candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight), // candidate: top edge
        pivot => true, // no pivot is filtered out
        (pivot, candidate) => true).ToList(); // no pair is filtered out

    // Each group is a set of indexes into 'lines'; map them back to the lines themselves.
    foreach (var group in groupedIndexes)
    {
        yield return new TextBlock(group.Select(i => lines[i]).ToList());
    }
}
/// <summary>
/// Build blocks by clustering the lines with <c>ClusteringAlgorithms.ClusterNearestNeighbours</c>.
/// </summary>
/// <param name="lines">The text lines to group into blocks.</param>
/// <param name="maxDist">The maximum distance handed to the clustering step for every pivot/candidate pair.</param>
/// <param name="maxDegreeOfParallelism">Forwarded unchanged to the clustering step.</param>
/// <returns>
/// One <see cref="TextBlock"/> per group of line indexes returned by the clustering step.
/// This is an iterator: nothing is computed until the returned sequence is enumerated.
/// </returns>
private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist, int maxDegreeOfParallelism)
{
    /**************************************************************************************************
     * We want to measure the distance between two lines using the following method:
     *  We check if two lines are overlapping horizontally.
     *  If they are overlapping, we compute the middle point (new X coordinate) of the overlapping area.
     *  We finally compute the Euclidean distance between these two middle points.
     *  If the two lines are not overlapping, the distance is set to the max distance.
     **************************************************************************************************/
    Func<PdfLine, PdfLine, double> euclideanOverlappingMiddleDistance = (l1, l2) =>
    {
        // Horizontal overlap: rightmost start to leftmost end.
        var left = Math.Max(l1.Point1.X, l2.Point1.X);
        var overlap = Math.Min(l1.Point2.X, l2.Point2.X) - left;
        if (overlap < 0)
        {
            return double.MaxValue; // not overlapping -> max distance
        }

        // Both points share the same X (middle of the overlap); only their Y differs.
        var middleX = left + overlap / 2;
        return Distances.Euclidean(
            new PdfPoint(middleX, l1.Point1.Y),
            new PdfPoint(middleX, l2.Point1.Y));
    };

    var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(
        lines,
        euclideanOverlappingMiddleDistance,
        (pivot, candidate) => maxDist, // same max distance for every pair
        pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight), // pivot: bottom edge
        candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight), // candidate: top edge
        pivot => true, // no pivot is filtered out
        (pivot, candidate) => true, // no pair is filtered out
        maxDegreeOfParallelism).ToList();

    // Each group is a set of indexes into 'lines'; map them back to the lines themselves.
    foreach (var group in groupedIndexes)
    {
        yield return new TextBlock(group.Select(i => lines[i]).ToList());
    }
}