Пример #1
0
        /// <summary>
        /// Groups words into <see cref="TextLine"/>s.
        /// <para>Words are clustered with <c>ClusteringAlgorithms.ClusterNearestNeighbours</c>, measuring the
        /// Euclidean distance between a pivot's bottom right point and a candidate's bottom left point.
        /// A candidate is only accepted when the angle between these two points is contained in
        /// <paramref name="withinLine"/>.</para>
        /// <para>The words of each line are then ordered according to the text direction of the
        /// first word in <paramref name="words"/>.</para>
        /// </summary>
        /// <param name="words">The words to group into lines. A null or empty list yields no line.</param>
        /// <param name="maxDist">The maximum distance handed to the clustering algorithm for every pivot/candidate pair.</param>
        /// <param name="withinLine">Angle bounds for words to be considered on the same line.</param>
        /// <param name="maxDegreeOfParallelism">Forwarded as is to the clustering algorithm.</param>
        /// <returns>One <see cref="TextLine"/> per cluster, lazily evaluated.</returns>
        private static IEnumerable<TextLine> GetLines(List<Word> words, double maxDist, AngleBounds withinLine, int maxDegreeOfParallelism)
        {
            // Nothing to cluster. Without this guard, words[0] below throws on an empty list.
            if (words == null || words.Count == 0)
            {
                yield break;
            }

            // The first word decides the reading direction used to order every line.
            TextDirection textDirection = words[0].TextDirection;

            // NOTE(review): the literal 2 is presumably the number of neighbours considered - confirm against ClusterNearestNeighbours.
            var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(words, 2, Distances.Euclidean,
                    (pivot, candidate) => maxDist,
                    pivot => pivot.BoundingBox.BottomRight, candidate => candidate.BoundingBox.BottomLeft,
                    pivot => true,
                    (pivot, candidate) => withinLine.Contains(Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft)),
                    maxDegreeOfParallelism).ToList();

            // Default ordering (any direction not handled below): ascending left edge.
            Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
            if (textDirection == TextDirection.Rotate180)
            {
                orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Right).ToList();
            }
            else if (textDirection == TextDirection.Rotate90)
            {
                orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Top).ToList();
            }
            else if (textDirection == TextDirection.Rotate270)
            {
                orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList();
            }

            // Each cluster is a set of indexes into 'words'.
            foreach (var indexes in groupedIndexes)
            {
                yield return new TextLine(orderFunc(indexes.Select(i => words[i])));
            }
        }
Пример #2
0
        // TODO: rounding still needs to be implemented.
        /// <summary>
        /// Builds an oriented bounding box for a letter from its base line end points and the
        /// top corners of its glyph rectangle, then picks, among four corner labellings of that
        /// box, the one whose rotation is closest to the angle of the glyph rectangle's bottom edge.
        /// </summary>
        /// <param name="letter">The letter to compute the bounding box for.</param>
        /// <returns>The candidate rectangle with the smallest absolute rotation difference.
        /// On ties, the earliest candidate wins.</returns>
        private static PdfRectangle GetBoundingBoxOther(Letter letter)
        {
            // Not very useful on its own, as an axis aligned bbox is needed anyway
            // -> rotate back? or normalise?
            var corners = new[]
            {
                letter.StartBaseLine,
                letter.EndBaseLine,
                letter.GlyphRectangle.TopLeft,
                letter.GlyphRectangle.TopRight
            };

            var minimumAreaBox = GeometryExtensions.MinimumAreaRectangle(corners);

            // Candidate bounding boxes: the minimum area rectangle itself, followed by
            // three rectangles built from the same corners passed in a different order.
            var candidates = new[]
            {
                minimumAreaBox,
                new PdfRectangle(minimumAreaBox.BottomLeft, minimumAreaBox.TopLeft, minimumAreaBox.BottomRight, minimumAreaBox.TopRight),
                new PdfRectangle(minimumAreaBox.BottomRight, minimumAreaBox.BottomLeft, minimumAreaBox.TopRight, minimumAreaBox.TopLeft),
                new PdfRectangle(minimumAreaBox.TopRight, minimumAreaBox.BottomRight, minimumAreaBox.TopLeft, minimumAreaBox.BottomLeft)
            };

            // Reference orientation, taken from the glyph rectangle's bottom edge.
            // Assumes line order is correct.
            var referenceAngle = Distances.BoundAngle180(Distances.Angle(letter.GlyphRectangle.BottomLeft, letter.GlyphRectangle.BottomRight));

            var best = candidates[0];
            double smallestDelta = Math.Abs(Distances.BoundAngle180(best.Rotation - referenceAngle));

            for (var k = 1; k < candidates.Length; k++)
            {
                double delta = Math.Abs(Distances.BoundAngle180(candidates[k].Rotation - referenceAngle));

                // Strict comparison: an earlier candidate is kept when deltas are equal.
                if (delta < smallestDelta)
                {
                    smallestDelta = delta;
                    best = candidates[k];
                }
            }

            return best;
        }
Пример #3
0
        /// <summary>
        /// Computes the within line angle from the pivot's bottom right point to the
        /// candidate's bottom left point, relative to the pivot's rotation.
        /// <para>-90 ≤ θ ≤ 90.</para>
        /// </summary>
        /// <param name="pivot">The word the angle is measured from.</param>
        /// <param name="candidate">The word the angle is measured to.</param>
        /// <returns>The angle, folded by ±180 when it lies outside [-90;90].</returns>
        private static double AngleWL(Word pivot, Word candidate)
        {
            var rawAngle = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft);
            var relativeAngle = Distances.BoundAngle180(rawAngle - pivot.BoundingBox.Rotation);

            // Fold the angle back into [-90;90] degree to handle overlapping words
            if (relativeAngle > 90)
            {
                return relativeAngle - 180;
            }

            if (relativeAngle < -90)
            {
                return relativeAngle + 180;
            }

            return relativeAngle;
        }
Пример #4
0
        /// <summary>
        /// Gets the distance from the pivot to its nearest word, only considering
        /// words whose angle to the pivot is contained in the angle bounds.
        /// </summary>
        /// <param name="words">The words to search. The pivot itself (same reference) is ignored.</param>
        /// <param name="pivot">The word to measure from.</param>
        /// <param name="funcPivotDist">Selects the pivot's point used for distance measurement.</param>
        /// <param name="funcPivotAngle">Selects the pivot's point used for angle measurement.</param>
        /// <param name="funcPointsDist">Selects the other words' point used for distance measurement.</param>
        /// <param name="funcPointsAngle">Selects the other words' point used for angle measurement.</param>
        /// <param name="angleBounds">The bounds the angle must be contained in for a word to be kept.</param>
        /// <param name="finalDistanceMeasure">The measure applied between the pivot's distance point
        /// and the nearest word's distance point to produce the returned value.</param>
        /// <returns>The measured distance, or <c>null</c> if no word passes the angle filter
        /// or the nearest index found is out of range.</returns>
        private double? GetNearestPointDistance(List<Word> words, Word pivot, Func<PdfRectangle, PdfPoint> funcPivotDist,
            Func<PdfRectangle, PdfPoint> funcPivotAngle,
            Func<PdfRectangle, PdfPoint> funcPointsDist, Func<PdfRectangle, PdfPoint> funcPointsAngle,
            AngleBounds angleBounds,
            Func<PdfPoint, PdfPoint, double> finalDistanceMeasure)
        {
            var distanceOrigin = funcPivotDist(pivot.BoundingBox);
            var angleOrigin = funcPivotAngle(pivot.BoundingBox);

            // Distance points of every word (other than the pivot) within the angle range.
            var candidatePoints = new List<PdfPoint>();
            foreach (var other in words)
            {
                if (!ReferenceEquals(other, pivot)
                    && angleBounds.Contains(Distances.Angle(angleOrigin, funcPointsAngle(other.BoundingBox))))
                {
                    candidatePoints.Add(funcPointsDist(other.BoundingBox));
                }
            }

            if (candidatePoints.Count == 0)
            {
                return null;
            }

            // The nearest candidate is always searched with the Euclidean distance;
            // finalDistanceMeasure is only applied to the winner.
            var nearestIndex = Distances.FindIndexNearest(distanceOrigin, candidatePoints, p => p,
                p => p, Distances.Euclidean, out _);

            bool indexIsValid = nearestIndex >= 0 && nearestIndex < candidatePoints.Count;
            if (!indexIsValid)
            {
                return null;
            }

            return finalDistanceMeasure(distanceOrigin, candidatePoints[nearestIndex]);
        }
Пример #5
0
        /// <summary>
        /// Segments words into <see cref="TextBlock"/>s. See original paper for more information.
        /// <para>Steps: estimate the within line and between line spacing, build the lines,
        /// group the lines into blocks, then merge blocks whose bounding boxes intersect.</para>
        /// </summary>
        /// <param name="words">The words to segment into <see cref="TextBlock"/>s. Words whose text is
        /// null or white space are ignored.</param>
        /// <param name="withinLine">Angle bounds for words to be considered on the same line.</param>
        /// <param name="betweenLine">Angle bounds for words to be considered on separate lines.</param>
        /// <param name="betweenLineMultiplier">Multiplier that gives the maximum perpendicular distance between
        /// text lines for blocking. Maximum distance will be this number times the between-line
        /// distance found by the analysis.</param>
        /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
        /// <para>A positive property value limits the number of concurrent operations to the set value.
        /// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
        /// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method. Empty when
        /// <paramref name="words"/> is null or holds no usable word; a single block holding a single line
        /// when either spacing cannot be estimated.</returns>
        public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, AngleBounds withinLine,
            AngleBounds betweenLine, double betweenLineMultiplier, int maxDegreeOfParallelism)
        {
            if (words == null)
            {
                return EmptyArray<TextBlock>.Instance;
            }

            // Drop words without visible text.
            var wordsList = words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).ToList();
            if (wordsList.Count == 0)
            {
                return EmptyArray<TextBlock>.Instance;
            }

            // Thread-safe collections, as samples are added from the parallel loop below.
            var withinLineSamples = new ConcurrentBag<double>();
            var betweenLineSamples = new ConcurrentBag<double>();

            var parallelOptions = new ParallelOptions { MaxDegreeOfParallelism = maxDegreeOfParallelism };

            // 1. Estimate within line and between line spacing
            var withinLineTree = new KdTree<Word>(wordsList, w => w.BoundingBox.BottomLeft);
            var betweenLineTree = new KdTree<Word>(wordsList, w => w.BoundingBox.TopLeft);

            Parallel.For(0, wordsList.Count, parallelOptions, index =>
            {
                var current = wordsList[index];

                // Within-line distance: from this word's bottom right to its neighbours' bottom left
                foreach (var neighbour in withinLineTree.FindNearestNeighbours(current, 2, w => w.BoundingBox.BottomRight, (p1, p2) => Distances.WeightedEuclidean(p1, p2, 0.5)))
                {
                    if (withinLine.Contains(Distances.Angle(current.BoundingBox.BottomRight, neighbour.Item1.BoundingBox.BottomLeft)))
                    {
                        withinLineSamples.Add(Distances.Horizontal(current.BoundingBox.BottomRight, neighbour.Item1.BoundingBox.BottomLeft));
                    }
                }

                // Between-line distance: from this word's bottom left to its neighbours' top left,
                // with the angle filter applied on the centroids
                foreach (var neighbour in betweenLineTree.FindNearestNeighbours(current, 2, w => w.BoundingBox.BottomLeft, (p1, p2) => Distances.WeightedEuclidean(p1, p2, 50)))
                {
                    if (betweenLine.Contains(Distances.Angle(current.BoundingBox.Centroid, neighbour.Item1.BoundingBox.Centroid)))
                    {
                        betweenLineSamples.Add(Distances.Vertical(current.BoundingBox.BottomLeft, neighbour.Item1.BoundingBox.TopLeft));
                    }
                }
            });

            double? withinLineDistance = GetPeakAverageDistance(withinLineSamples);
            double? betweenLineDistance = GetPeakAverageDistance(betweenLineSamples);

            if (!(withinLineDistance.HasValue && betweenLineDistance.HasValue))
            {
                // Spacing could not be estimated: return all the words as one line in one block.
                return new[] { new TextBlock(new[] { new TextLine(wordsList) }) };
            }

            // 2. Find lines of text
            double maxDistanceWithinLine = Math.Min(3 * withinLineDistance.Value, Math.Sqrt(2) * betweenLineDistance.Value);
            var lines = GetLines(wordsList, maxDistanceWithinLine, withinLine, maxDegreeOfParallelism).ToArray();

            // 3. Find blocks of text
            double maxDistanceBetweenLine = betweenLineMultiplier * betweenLineDistance.Value;
            var blocks = GetLinesGroups(lines, maxDistanceBetweenLine, maxDegreeOfParallelism).ToList();

            // 4. Merge overlapping blocks - might happen in certain conditions, e.g. justified text.
            // A block absorbed by another one is set to null and filtered out at the end.
            for (var current = 0; current < blocks.Count; current++)
            {
                if (blocks[current] == null)
                {
                    continue;
                }

                // Merge all lines (words) of the current block
                var currentWords = blocks[current].TextLines.SelectMany(line => line.Words).ToList();
                blocks[current] = new TextBlock(GetLines(currentWords, double.MaxValue, withinLine, maxDegreeOfParallelism).ToList());

                for (var other = 0; other < blocks.Count; other++)
                {
                    // blocks[current] is read again on each pass as it may have grown in a previous merge.
                    if (current != other
                        && blocks[other] != null
                        && blocks[current].BoundingBox.IntersectsWith(blocks[other].BoundingBox))
                    {
                        // Merge
                        // 1. Merge all words, current block's words first
                        var mergedWords = blocks[current].TextLines.SelectMany(line => line.Words)
                            .Concat(blocks[other].TextLines.SelectMany(line => line.Words))
                            .ToList();

                        // 2. Rebuild lines, using max distance = +Inf as we know all words will be in the
                        // same block. Filtering will still be done based on angle.
                        // Merge all lines (words) sharing same bottom (baseline)
                        var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine, maxDegreeOfParallelism).ToList();
                        blocks[current] = new TextBlock(mergedLines.OrderByDescending(line => line.BoundingBox.Bottom).ToList());

                        // Remove the absorbed block
                        blocks[other] = null;
                    }
                }
            }

            return blocks.Where(block => block != null).ToList();
        }
Пример #6
0
 /// <summary>
 /// Computes the angle between <c>P1</c> and <c>P2</c>, bounded with <c>Distances.BoundAngle0to360</c>.
 /// <para>0 ≤ θ ≤ 360</para>
 /// </summary>
 /// <returns>The bounded angle.</returns>
 public double GetAngle()
 {
     var rawAngle = Distances.Angle(P1, P2);
     return Distances.BoundAngle0to360(rawAngle);
 }