コード例 #1
0
        /// <summary>
        /// Groups text lines into blocks by clustering them with a simple transitive closure.
        /// </summary>
        /// <param name="lines">The text lines to group; cluster members are looked up in this array by index.</param>
        /// <param name="maxDist">Constant value returned by the max-distance callback for every pivot/candidate pair.</param>
        /// <returns>
        /// One <see cref="TextBlock"/> per cluster. This method is an iterator, so nothing runs until the
        /// result is enumerated.
        /// </returns>
        private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist)
        {
            /*
             * Distance between two lines:
             *  - Take the horizontal overlap of the two lines (largest Point1.X to smallest Point2.X).
             *  - If the overlap is negative the lines do not overlap horizontally and the distance is
             *    double.MaxValue.
             *  - Otherwise project both lines onto the middle X of the overlap and return the Euclidean
             *    distance between those two points.
             *
             * WARNING: given how FindIndexNearest() works, if 'maxDist' > 'line height' the algorithm won't
             * work, because FindIndexNearest() will always pair the pivot with itself (the pivot's own top
             * point, at distance = height, is closer than any other line's top point).
             * -> A solution would be to look for more than one nearest neighbour. Use a KDTree?
             */
            Func<PdfLine, PdfLine, double> overlapMidpointDistance = (first, second) =>
            {
                var overlapStart = Math.Max(first.Point1.X, second.Point1.X);
                var overlapWidth = Math.Min(first.Point2.X, second.Point2.X) - overlapStart;

                if (overlapWidth < 0)
                {
                    // No horizontal overlap -> max distance.
                    return double.MaxValue;
                }

                var midX = overlapStart + overlapWidth / 2;
                var firstMiddle = new PdfPoint(midX, first.Point1.Y);
                var secondMiddle = new PdfPoint(midX, second.Point1.Y);
                return Distances.Euclidean(firstMiddle, secondMiddle);
            };

            // The pivot is represented by the bottom edge of its bounding box and each candidate by the
            // top edge of its bounding box; both filter callbacks accept everything.
            var clusters = ClusteringAlgorithms.SimpleTransitiveClosure(
                lines,
                overlapMidpointDistance,
                (pivot, candidate) => maxDist,
                pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
                candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
                pivot => true,
                (pivot, candidate) => true).ToList();

            foreach (var cluster in clusters)
            {
                // Each cluster holds indexes into 'lines'; map them back to the lines themselves.
                var members = cluster.Select(index => lines[index]).ToList();
                yield return new TextBlock(members);
            }
        }
コード例 #2
0
        /// <summary>
        /// Groups text lines into blocks by clustering them with
        /// <c>ClusteringAlgorithms.ClusterNearestNeighbours</c>.
        /// </summary>
        /// <param name="lines">The text lines to group; cluster members are looked up in this array by index.</param>
        /// <param name="maxDist">Constant value returned by the max-distance callback for every pivot/candidate pair.</param>
        /// <param name="maxDegreeOfParallelism">
        /// Forwarded unchanged to the clustering call (presumably caps the number of concurrent tasks;
        /// confirm against <c>ClusterNearestNeighbours</c>).
        /// </param>
        /// <returns>
        /// One <see cref="TextBlock"/> per cluster. This method is an iterator, so nothing runs until the
        /// result is enumerated.
        /// </returns>
        private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist, int maxDegreeOfParallelism)
        {
            /*
             * Distance between two lines:
             *  - Take the horizontal overlap of the two lines (largest Point1.X to smallest Point2.X).
             *  - If the overlap is negative the lines do not overlap horizontally and the distance is
             *    double.MaxValue.
             *  - Otherwise project both lines onto the middle X of the overlap and return the Euclidean
             *    distance between those two points.
             */
            Func<PdfLine, PdfLine, double> overlapMidpointDistance = (first, second) =>
            {
                var overlapStart = Math.Max(first.Point1.X, second.Point1.X);
                var overlapWidth = Math.Min(first.Point2.X, second.Point2.X) - overlapStart;

                if (overlapWidth < 0)
                {
                    // No horizontal overlap -> max distance.
                    return double.MaxValue;
                }

                var midX = overlapStart + overlapWidth / 2;
                var firstMiddle = new PdfPoint(midX, first.Point1.Y);
                var secondMiddle = new PdfPoint(midX, second.Point1.Y);
                return Distances.Euclidean(firstMiddle, secondMiddle);
            };

            // The pivot is represented by the bottom edge of its bounding box and each candidate by the
            // top edge of its bounding box; both filter callbacks accept everything.
            var clusters = ClusteringAlgorithms.ClusterNearestNeighbours(
                lines,
                overlapMidpointDistance,
                (pivot, candidate) => maxDist,
                pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
                candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
                pivot => true,
                (pivot, candidate) => true,
                maxDegreeOfParallelism).ToList();

            foreach (var cluster in clusters)
            {
                // Each cluster holds indexes into 'lines'; map them back to the lines themselves.
                var members = cluster.Select(index => lines[index]).ToList();
                yield return new TextBlock(members);
            }
        }