/// <summary>
        /// Groups words into text lines using nearest-neighbour clustering.
        /// <para>Each pivot word's bounding box bottom-right point is compared with a candidate word's
        /// bounding box bottom-left point, using the Euclidean distance and a fixed maximum distance.
        /// A candidate is only accepted when the angle between these two points lies within
        /// [<c>withinLine.Lower</c>, <c>withinLine.Upper</c>] (bounds inclusive).</para>
        /// <para>The words of each line are ordered according to the text direction of the first word:
        /// descending <c>Right</c> for Rotate180, descending <c>Top</c> for Rotate90, ascending
        /// <c>Bottom</c> for Rotate270, and ascending <c>Left</c> otherwise.</para>
        /// </summary>
        /// <param name="words">The words to group. The text direction of the first word drives the ordering.</param>
        /// <param name="maxDist">The maximum distance allowed between a pivot and a candidate.</param>
        /// <param name="withinLine">The inclusive angle bounds for two words to belong to the same line.</param>
        /// <returns>A lazily evaluated sequence of lines; empty when <paramref name="words"/> is null or empty.</returns>
        private static IEnumerable<TextLine> GetLines(List<Word> words, double maxDist, AngleBounds withinLine)
        {
            // Nothing to cluster. This also protects the words[0] access below, which would
            // otherwise throw on an empty list.
            if (words == null || words.Count == 0)
            {
                yield break;
            }

            TextDirection textDirection = words[0].TextDirection;

            var clusters = ClusteringAlgorithms.ClusterNearestNeighbours(words, Distances.Euclidean,
                (pivot, candidate) => maxDist,
                pivot => pivot.BoundingBox.BottomRight,
                candidate => candidate.BoundingBox.BottomLeft,
                pivot => true,
                (pivot, candidate) =>
                {
                    // Compare bottom right with bottom left for angle
                    var angle = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft);
                    return angle >= withinLine.Lower && angle <= withinLine.Upper;
                }).ToList();

            // Reading order inside a line depends on how the text is rotated.
            Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc;
            switch (textDirection)
            {
                case TextDirection.Rotate180:
                    orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Right).ToList();
                    break;

                case TextDirection.Rotate90:
                    orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Top).ToList();
                    break;

                case TextDirection.Rotate270:
                    orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList();
                    break;

                default:
                    orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
                    break;
            }

            foreach (var cluster in clusters)
            {
                yield return new TextLine(orderFunc(cluster.Select(i => words[i])));
            }
        }
Example #2
0
        /// <summary>
        /// Private method to get the words by clustering letters with their nearest neighbour.
        /// <para>A pivot letter's <c>EndBaseLine</c> is compared with a candidate letter's <c>StartBaseLine</c>.
        /// Letters whose value is null or white space are rejected by both the pivot and the candidate filters.</para>
        /// <para>The letters of each word are ordered according to the common text direction:
        /// descending <c>Right</c> for Rotate180, descending <c>Top</c> for Rotate90, ascending
        /// <c>Bottom</c> for Rotate270, and ascending <c>Left</c> otherwise.</para>
        /// </summary>
        /// <param name="pageLetters">The letters in the page, they must have
        /// the same text directions.</param>
        /// <param name="maxDistanceFunction">The function that determines the maximum distance between two Letters,
        /// e.g. Max(GlyphRectangle.Width) x 20%.</param>
        /// <param name="distMeasure">The distance measure between two start and end base line points,
        /// e.g. the Manhattan distance.</param>
        /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
        /// <para>A positive property value limits the number of concurrent operations to the set value.
        /// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
        /// <returns>The words found; an empty list when <paramref name="pageLetters"/> is null or empty.</returns>
        /// <exception cref="ArgumentException">Thrown when the letters do not all share the same text direction.</exception>
        private List<Word> GetWords(IEnumerable<Letter> pageLetters,
                                    Func<Letter, Letter, double> maxDistanceFunction, Func<PdfPoint, PdfPoint, double> distMeasure,
                                    int maxDegreeOfParallelism)
        {
            if (pageLetters == null)
            {
                return new List<Word>();
            }

            // Materialise the sequence exactly once: the input may be lazily evaluated, and the
            // checks below plus the clustering would otherwise enumerate it several times.
            Letter[] letters = pageLetters.ToArray();

            if (letters.Length == 0)
            {
                return new List<Word>();
            }

            TextDirection textDirection = letters[0].TextDirection;

            if (letters.Any(x => textDirection != x.TextDirection))
            {
                throw new ArgumentException("NearestNeighbourWordExtractor.GetWords(): Mixed Text Direction.");
            }

            // Reading order inside a word depends on how the text is rotated.
            Func<IEnumerable<Letter>, IReadOnlyList<Letter>> orderFunc;
            switch (textDirection)
            {
                case TextDirection.Rotate180:
                    orderFunc = l => l.OrderByDescending(x => x.GlyphRectangle.Right).ToList();
                    break;

                case TextDirection.Rotate90:
                    orderFunc = l => l.OrderByDescending(x => x.GlyphRectangle.Top).ToList();
                    break;

                case TextDirection.Rotate270:
                    orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Bottom).ToList();
                    break;

                default:
                    orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Left).ToList();
                    break;
            }

            var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(letters,
                distMeasure, maxDistanceFunction,
                l => l.EndBaseLine, l => l.StartBaseLine,
                l => !string.IsNullOrWhiteSpace(l.Value),
                (l1, l2) => !string.IsNullOrWhiteSpace(l2.Value),
                maxDegreeOfParallelism).ToList();

            List<Word> words = new List<Word>(groupedIndexes.Count);

            foreach (var group in groupedIndexes)
            {
                words.Add(new Word(orderFunc(group.Select(i => letters[i]))));
            }

            return words;
        }
Example #3
0
        /// <summary>
        /// Private method to get the words by clustering letters with a simple transitive closure.
        /// <para>A pivot letter's <c>EndBaseLine</c> is compared with a candidate letter's <c>StartBaseLine</c>.
        /// The maximum distance between two letters is 60% of the larger of their two metric values.
        /// Pivots whose value is null or white space are rejected; a candidate is rejected when its value is
        /// null or white space, or when its font name differs from the pivot's (case-insensitive comparison).</para>
        /// <para>The letters of each word are ordered according to the common text direction:
        /// descending <c>Right</c> for Rotate180, descending <c>Top</c> for Rotate90, ascending
        /// <c>Bottom</c> for Rotate270, and ascending <c>Left</c> otherwise.</para>
        /// </summary>
        /// <param name="pageLetters">The letters in the page, they must have
        /// the same text directions.</param>
        /// <param name="metric">The letter's metric to use in the minimum distance
        /// between 2 letters, e.g. GlyphRectangle.Width or GlyphRectangle.Height.</param>
        /// <param name="distMeasure">The distance measure between two start and end base line points,
        /// e.g. the Manhattan distance.</param>
        /// <returns>The words found; an empty list when <paramref name="pageLetters"/> is null or empty.</returns>
        /// <exception cref="ArgumentException">Thrown when the letters do not all share the same text direction.</exception>
        private List<Word> GetWords(IEnumerable<Letter> pageLetters,
                                    Func<Letter, decimal> metric, Func<PdfPoint, PdfPoint, double> distMeasure)
        {
            if (pageLetters == null)
            {
                return new List<Word>();
            }

            // Materialise the sequence exactly once: the input may be lazily evaluated, and the
            // checks below plus the clustering would otherwise enumerate it several times.
            Letter[] letters = pageLetters.ToArray();

            if (letters.Length == 0)
            {
                return new List<Word>();
            }

            TextDirection textDirection = letters[0].TextDirection;

            if (letters.Any(x => textDirection != x.TextDirection))
            {
                throw new ArgumentException("NNWordExtractor.GetWords(): Mixed Text Direction.");
            }

            // Reading order inside a word depends on how the text is rotated.
            Func<IEnumerable<Letter>, IReadOnlyList<Letter>> orderFunc;
            switch (textDirection)
            {
                case TextDirection.Rotate180:
                    orderFunc = l => l.OrderByDescending(x => x.GlyphRectangle.Right).ToList();
                    break;

                case TextDirection.Rotate90:
                    orderFunc = l => l.OrderByDescending(x => x.GlyphRectangle.Top).ToList();
                    break;

                case TextDirection.Rotate270:
                    orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Bottom).ToList();
                    break;

                default:
                    orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Left).ToList();
                    break;
            }

            var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(letters,
                distMeasure,
                // Maximum distance: 60% of the larger of the two letters' metric.
                (l1, l2) => Math.Max((double)metric(l1), (double)metric(l2)) * 0.60,
                l => l.EndBaseLine, l => l.StartBaseLine,
                l => !string.IsNullOrWhiteSpace(l.Value),
                (l1, l2) => string.Equals(l1.FontName, l2.FontName, StringComparison.OrdinalIgnoreCase) && !string.IsNullOrWhiteSpace(l2.Value)).ToList();

            List<Word> words = new List<Word>(groupedIndexes.Count);

            foreach (var group in groupedIndexes)
            {
                words.Add(new Word(orderFunc(group.Select(i => letters[i]))));
            }

            return words;
        }
        /// <summary>
        /// Build lines via transitive closure.
        /// <para>Each pivot word's bounding box bottom-right point is compared with a candidate word's
        /// bounding box bottom-left point, using the Euclidean distance and a fixed maximum distance.
        /// A candidate is only accepted when the angle between these two points lies within
        /// [<c>withinLine.Lower</c>, <c>withinLine.Upper</c>] (bounds inclusive).</para>
        /// <para>The words of each line are ordered according to the text direction of the first word:
        /// descending <c>Right</c> for Rotate180, descending <c>Top</c> for Rotate90, ascending
        /// <c>Bottom</c> for Rotate270, and ascending <c>Left</c> otherwise.</para>
        /// </summary>
        /// <param name="words">The words to group. The text direction of the first word drives the ordering.</param>
        /// <param name="maxDist">The maximum distance allowed between a pivot and a candidate.</param>
        /// <param name="withinLine">The inclusive angle bounds for two words to belong to the same line.</param>
        /// <returns>A lazily evaluated sequence of lines; empty when <paramref name="words"/> is null or empty.</returns>
        private static IEnumerable<TextLine> GetLines(List<Word> words, double maxDist, AngleBounds withinLine)
        {
            /***************************************************************************************************
            * /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'word Width', the algo might not
            * work as the FindIndexNearest() function might pair the pivot with itself (the pivot's right point
            * (distance = width) is closer than other words' left point).
            * -> Solution would be to find more than one nearest neighbours. Use KDTree?
            ***************************************************************************************************/

            // Nothing to cluster. This also protects the words[0] access below, which would
            // otherwise throw on an empty list.
            if (words == null || words.Count == 0)
            {
                yield break;
            }

            TextDirection textDirection = words[0].TextDirection;

            var clusters = ClusteringAlgorithms.SimpleTransitiveClosure(words, Distances.Euclidean,
                (pivot, candidate) => maxDist,
                pivot => pivot.BoundingBox.BottomRight,
                candidate => candidate.BoundingBox.BottomLeft,
                pivot => true,
                (pivot, candidate) =>
                {
                    // Compare bottom right with bottom left for angle
                    var angle = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft);
                    return angle >= withinLine.Lower && angle <= withinLine.Upper;
                }).ToList();

            // Reading order inside a line depends on how the text is rotated.
            Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc;
            switch (textDirection)
            {
                case TextDirection.Rotate180:
                    orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Right).ToList();
                    break;

                case TextDirection.Rotate90:
                    orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Top).ToList();
                    break;

                case TextDirection.Rotate270:
                    orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList();
                    break;

                default:
                    orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
                    break;
            }

            foreach (var cluster in clusters)
            {
                yield return new TextLine(orderFunc(cluster.Select(i => words[i])));
            }
        }
        /// <summary>
        /// Groups text lines into blocks using a simple transitive closure.
        /// <para>The bottom edge of a pivot line's bounding box is compared with the top edge of a
        /// candidate line's bounding box; every pivot and every candidate is eligible.</para>
        /// </summary>
        /// <param name="lines">The lines to group.</param>
        /// <param name="maxDist">The maximum distance allowed between a pivot and a candidate.</param>
        /// <returns>A lazily evaluated sequence of blocks, one per group of line indexes.</returns>
        private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist)
        {
            /**************************************************************************************************
            * Distance between two edges:
            *  - Work out the horizontal overlap of the two edges.
            *  - If they overlap, take the X coordinate at the middle of the overlapping area and measure
            *    the Euclidean distance between the two points located at that X on each edge.
            *  - If they do not overlap, the distance is the maximum possible value.
            *
            * /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'line Height', the algo won't
            * work as the FindIndexNearest() function will always pair the pivot with itself (the pivot's top
            * point (distance = height) is closer than other lines' top point).
            * -> Solution would be to find more than one nearest neighbours. Use KDTree?
            **************************************************************************************************/

            Func<PdfLine, PdfLine, double> overlapMiddleDistance = (first, second) =>
            {
                var overlapStart = Math.Max(first.Point1.X, second.Point1.X);
                var overlapWidth = Math.Min(first.Point2.X, second.Point2.X) - overlapStart;

                if (overlapWidth < 0)
                {
                    // not overlapping -> max distance
                    return double.MaxValue;
                }

                var middleX = overlapStart + overlapWidth / 2;

                return Distances.Euclidean(
                    new PdfPoint(middleX, first.Point1.Y),
                    new PdfPoint(middleX, second.Point1.Y));
            };

            var clusters = ClusteringAlgorithms.SimpleTransitiveClosure(
                lines,
                overlapMiddleDistance,
                (pivot, candidate) => maxDist,
                pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
                candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
                pivot => true,
                (pivot, candidate) => true).ToList();

            foreach (var cluster in clusters)
            {
                var blockLines = cluster.Select(i => lines[i]).ToList();
                yield return new TextBlock(blockLines);
            }
        }
Example #6
0
        /// <summary>
        /// Groups text lines into blocks using nearest-neighbour clustering.
        /// <para>The bottom edge of a pivot line's bounding box is compared with the top edge of a
        /// candidate line's bounding box; every pivot and every candidate is eligible.</para>
        /// </summary>
        /// <param name="lines">The lines to group.</param>
        /// <param name="maxDist">The maximum distance allowed between a pivot and a candidate.</param>
        /// <param name="maxDegreeOfParallelism">Forwarded unchanged to the clustering algorithm.</param>
        /// <returns>A lazily evaluated sequence of blocks, one per group of line indexes.</returns>
        private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist, int maxDegreeOfParallelism)
        {
            /**************************************************************************************************
            * Distance between two edges:
            *  - Work out the horizontal overlap of the two edges.
            *  - If they overlap, take the X coordinate at the middle of the overlapping area and measure
            *    the Euclidean distance between the two points located at that X on each edge.
            *  - If they do not overlap, the distance is the maximum possible value.
            **************************************************************************************************/

            Func<PdfLine, PdfLine, double> overlapMiddleDistance = (first, second) =>
            {
                var overlapStart = Math.Max(first.Point1.X, second.Point1.X);
                var overlapWidth = Math.Min(first.Point2.X, second.Point2.X) - overlapStart;

                if (overlapWidth < 0)
                {
                    // not overlapping -> max distance
                    return double.MaxValue;
                }

                var middleX = overlapStart + overlapWidth / 2;

                return Distances.Euclidean(
                    new PdfPoint(middleX, first.Point1.Y),
                    new PdfPoint(middleX, second.Point1.Y));
            };

            var clusters = ClusteringAlgorithms.ClusterNearestNeighbours(
                lines,
                overlapMiddleDistance,
                (pivot, candidate) => maxDist,
                pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
                candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
                pivot => true,
                (pivot, candidate) => true,
                maxDegreeOfParallelism).ToList();

            foreach (var cluster in clusters)
            {
                var blockLines = cluster.Select(i => lines[i]).ToList();
                yield return new TextBlock(blockLines);
            }
        }