コード例 #1
0
        /// <summary>
        /// Reads an <see cref="AngleBounds"/> from the stream as two consecutive
        /// single-precision values: the lower bound first, then the upper bound.
        /// </summary>
        /// <param name="reader">The reader to consume the two values from.</param>
        /// <returns>A new <see cref="AngleBounds"/> populated with the values read.</returns>
        public static AngleBounds ReadAngleBounds(this BinaryReader reader)
        {
            // Read order matters: the lower bound is consumed before the upper bound.
            var lowerValue = reader.ReadSingle();
            var upperValue = reader.ReadSingle();

            return new AngleBounds { lower = lowerValue, upper = upperValue };
        }
コード例 #2
0
        /// <summary>
        /// Measures the distance from the pivot word to its nearest neighbour, considering only
        /// the words whose angle to the pivot falls inside <paramref name="angleBounds"/>.
        /// </summary>
        /// <param name="words">The candidate words. The pivot instance itself is skipped.</param>
        /// <param name="pivot">The word to measure from.</param>
        /// <param name="funcPivotDist">Selects the pivot point used for the nearest-point search and the final measure.</param>
        /// <param name="funcPivotAngle">Selects the pivot point used for the angle test.</param>
        /// <param name="funcPointsDist">Selects the candidate point used for the nearest-point search and the final measure.</param>
        /// <param name="funcPointsAngle">Selects the candidate point used for the angle test.</param>
        /// <param name="angleBounds">The bounds the pivot-to-candidate angle must satisfy.</param>
        /// <param name="finalDistanceMeasure">The measure applied between the pivot point and the selected candidate point.</param>
        /// <returns>The measured distance, or <c>null</c> when no candidate qualifies or no valid index is found.</returns>
        private double? GetNearestPointDistance(List<Word> words, Word pivot,
                                                Func<PdfRectangle, PdfPoint> funcPivotDist, Func<PdfRectangle, PdfPoint> funcPivotAngle,
                                                Func<PdfRectangle, PdfPoint> funcPointsDist, Func<PdfRectangle, PdfPoint> funcPointsAngle,
                                                AngleBounds angleBounds,
                                                Func<PdfPoint, PdfPoint, double> finalDistanceMeasure)
        {
            var distanceOrigin = funcPivotDist(pivot.BoundingBox);
            var angleOrigin = funcPivotAngle(pivot.BoundingBox);

            // Gather the distance points of every non-pivot word that lies inside the angle range.
            var candidatePoints = new List<PdfPoint>();
            foreach (var candidate in words)
            {
                if (!ReferenceEquals(candidate, pivot)
                    && angleBounds.Contains(Distances.Angle(angleOrigin, funcPointsAngle(candidate.BoundingBox))))
                {
                    candidatePoints.Add(funcPointsDist(candidate.BoundingBox));
                }
            }

            if (candidatePoints.Count == 0)
            {
                return null;
            }

            // The nearest candidate is always selected with the Euclidean distance;
            // only the value reported to the caller uses finalDistanceMeasure.
            var nearestIndex = distanceOrigin.FindIndexNearest(candidatePoints, p => p, p => p,
                                                               Distances.Euclidean, out _);

            bool indexIsValid = nearestIndex >= 0 && nearestIndex < candidatePoints.Count;
            if (!indexIsValid)
            {
                return null;
            }

            return finalDistanceMeasure(distanceOrigin, candidatePoints[nearestIndex]);
        }
コード例 #3
0
        /// <summary>
        /// Segments the words into <see cref="TextBlock"/>s in three steps: spacing estimation,
        /// text line determination and structural block determination.
        /// See original paper for more information.
        /// </summary>
        /// <param name="words">The words to segment into <see cref="TextBlock"/>s. Words made only of white space are ignored.</param>
        /// <param name="wlBounds">Angle bounds for words to be considered as neighbours on the same line.</param>
        /// <param name="wlMultiplier">Multiplier that gives the maximum euclidian distance between words for building lines.
        /// Maximum distance will be this number times the within-line distance found by the analysis.</param>
        /// <param name="wlBinSize">The bin size used when building the within-line distances distribution.</param>
        /// <param name="blBounds">Angle bounds for words to be considered as neighbours on separate lines.</param>
        /// <param name="blMultiplier">Multiplier that gives the maximum perpendicular distance between
        /// text lines for blocking. Maximum distance will be this number times the between-line
        /// distance found by the analysis.</param>
        /// <param name="blBinSize">The bin size used when building the between-line distances distribution.</param>
        /// <param name="angularDifferenceBounds">The angular difference bounds between two lines to be considered
        /// in the same block. This defines if two lines are parallel enough.</param>
        /// <param name="epsilon">Precision when testing equalities.</param>
        /// <param name="wordSeparator">Separator used between words when building lines.</param>
        /// <param name="lineSeparator">Separator used between lines when building paragraphs.</param>
        /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
        /// <para>A positive property value limits the number of concurrent operations to the set value.
        /// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
        /// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method,
        /// or an empty list when no non-blank word remains.</returns>
        private IReadOnlyList<TextBlock> GetBlocks(IReadOnlyList<Word> words,
                                                   AngleBounds wlBounds, double wlMultiplier, int wlBinSize,
                                                   AngleBounds blBounds, double blMultiplier, int blBinSize,
                                                   AngleBounds angularDifferenceBounds,
                                                   double epsilon,
                                                   string wordSeparator, string lineSeparator,
                                                   int maxDegreeOfParallelism)
        {
            // Words made only of white space are dropped before any analysis.
            IReadOnlyList<Word> nonBlankWords = words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).ToList();
            if (nonBlankWords.Count == 0)
            {
                return EmptyArray<TextBlock>.Instance;
            }

            // 1. Estimate within line and between line spacing
            bool estimated = GetSpacingEstimation(nonBlankWords, wlBounds, wlBinSize, blBounds, blBinSize,
                                                  maxDegreeOfParallelism,
                                                  out double withinLineDistance, out double betweenLineDistance);
            if (!estimated)
            {
                // A failed estimation may leave NaN in either value; NaN is replaced by 0.
                withinLineDistance = double.IsNaN(withinLineDistance) ? 0 : withinLineDistance;
                betweenLineDistance = double.IsNaN(betweenLineDistance) ? 0 : betweenLineDistance;
            }

            // 2. Determination of Text Lines
            var textLines = GetLines(nonBlankWords, wlMultiplier * withinLineDistance, wlBounds,
                                     wordSeparator, maxDegreeOfParallelism).ToArray();

            // 3. Structural Block Determination
            var structuralBlocks = GetStructuralBlocks(textLines, blMultiplier * betweenLineDistance,
                                                       angularDifferenceBounds, epsilon, lineSeparator,
                                                       maxDegreeOfParallelism);

            return structuralBlocks.ToList();
        }
コード例 #4
0
 /// <summary>
 /// Writes an <see cref="AngleBounds"/> to the stream: the lower bound first,
 /// immediately followed by the upper bound.
 /// </summary>
 /// <param name="writer">The writer receiving the two values.</param>
 /// <param name="bounds">The bounds to serialize.</param>
 public static void Write(this BinaryWriter writer, AngleBounds bounds)
 {
     var lowerValue = bounds.lower;
     var upperValue = bounds.upper;

     // Emission order is lower, then upper.
     writer.Write(lowerValue);
     writer.Write(upperValue);
 }
コード例 #5
0
        /// <summary>
        /// Segments the words into <see cref="TextBlock"/>s: estimates the within-line and between-line
        /// spacing, groups words into lines, groups lines into blocks and finally merges blocks whose
        /// bounding boxes intersect. See original paper for more information.
        /// </summary>
        /// <param name="words">The words to segment into <see cref="TextBlock"/>s. A <c>null</c> sequence
        /// yields an empty result; words made only of white space are ignored.</param>
        /// <param name="withinLine">Angle bounds for words to be considered on the same line.</param>
        /// <param name="betweenLine">Angle bounds for words to be considered on separate lines.</param>
        /// <param name="betweenLineMultiplier">Multiplier that gives the maximum perpendicular distance between
        /// text lines for blocking. Maximum distance will be this number times the between-line
        /// distance found by the analysis.</param>
        /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
        /// <para>A positive property value limits the number of concurrent operations to the set value.
        /// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
        /// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method. When either spacing
        /// cannot be estimated, a single block holding a single line with all the words is returned.</returns>
        public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, AngleBounds withinLine,
                                                  AngleBounds betweenLine, double betweenLineMultiplier, int maxDegreeOfParallelism)
        {
            if (words == null)
            {
                return EmptyArray<TextBlock>.Instance;
            }

            // Keep only the words that contain something other than white space.
            var wordsList = words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).ToList();
            if (wordsList.Count == 0)
            {
                return EmptyArray<TextBlock>.Instance;
            }

            var withinLineSamples = new ConcurrentBag<double>();
            var betweenLineSamples = new ConcurrentBag<double>();
            var parallelOptions = new ParallelOptions { MaxDegreeOfParallelism = maxDegreeOfParallelism };

            // 1. Estimate within line and between line spacing
            // One tree indexes the words by bottom-left corner, the other by top-left corner.
            var bottomLeftTree = new KdTree<Word>(wordsList, w => w.BoundingBox.BottomLeft);
            var topLeftTree = new KdTree<Word>(wordsList, w => w.BoundingBox.TopLeft);

            Parallel.For(0, wordsList.Count, parallelOptions, index =>
            {
                var current = wordsList[index];

                // Within-line distance: search from the word's bottom-right corner, keep the horizontal
                // gap to each neighbour whose bottom-left corner lies inside the within-line angle bounds.
                var sameLineHits = bottomLeftTree.FindNearestNeighbours(current, 2, w => w.BoundingBox.BottomRight,
                                                                        (p1, p2) => Distances.WeightedEuclidean(p1, p2, 0.5));
                foreach (var hit in sameLineHits)
                {
                    var neighbourBottomLeft = hit.Item1.BoundingBox.BottomLeft;
                    if (withinLine.Contains(Distances.Angle(current.BoundingBox.BottomRight, neighbourBottomLeft)))
                    {
                        withinLineSamples.Add(Distances.Horizontal(current.BoundingBox.BottomRight, neighbourBottomLeft));
                    }
                }

                // Between-line distance: search from the word's bottom-left corner, test the angle between
                // centroids, keep the vertical gap between this bottom-left and the neighbour's top-left.
                var otherLineHits = topLeftTree.FindNearestNeighbours(current, 2, w => w.BoundingBox.BottomLeft,
                                                                      (p1, p2) => Distances.WeightedEuclidean(p1, p2, 50));
                foreach (var hit in otherLineHits)
                {
                    if (betweenLine.Contains(Distances.Angle(current.BoundingBox.Centroid, hit.Item1.BoundingBox.Centroid)))
                    {
                        betweenLineSamples.Add(Distances.Vertical(current.BoundingBox.BottomLeft, hit.Item1.BoundingBox.TopLeft));
                    }
                }
            });

            double? withinLineDistance = GetPeakAverageDistance(withinLineSamples);
            double? betweenLineDistance = GetPeakAverageDistance(betweenLineSamples);

            // Without both estimates no segmentation is attempted: everything goes in one line of one block.
            if (withinLineDistance == null || betweenLineDistance == null)
            {
                return new[] { new TextBlock(new[] { new TextLine(wordsList) }) };
            }

            // 2. Find lines of text
            double maxDistanceWithinLine = Math.Min(3 * withinLineDistance.Value, Math.Sqrt(2) * betweenLineDistance.Value);
            var lines = GetLines(wordsList, maxDistanceWithinLine, withinLine, maxDegreeOfParallelism).ToArray();

            // 3. Find blocks of text
            double maxDistanceBetweenLine = betweenLineMultiplier * betweenLineDistance.Value;
            var blocks = GetLinesGroups(lines, maxDistanceBetweenLine, maxDegreeOfParallelism).ToList();

            // 4. Merge overlapping blocks - might happen in certain conditions, e.g. justified text.
            // A merged-away block is marked by setting its slot to null; null slots are dropped at the end.
            for (var current = 0; current < blocks.Count; current++)
            {
                if (blocks[current] == null)
                {
                    continue;
                }

                // Rebuild the lines of the current block from all of its words.
                var currentWords = blocks[current].TextLines.SelectMany(l => l.Words).ToList();
                blocks[current] = new TextBlock(GetLines(currentWords, double.MaxValue, withinLine, maxDegreeOfParallelism).ToList());

                // NOTE(review): after a merge the enlarged block is only tested against the remaining
                // indices of this inner pass; indices already visited in the pass are not re-tested -
                // confirm this is intended.
                for (var other = 0; other < blocks.Count; other++)
                {
                    if (other == current
                        || blocks[other] == null
                        || !blocks[current].BoundingBox.IntersectsWith(blocks[other].BoundingBox))
                    {
                        continue;
                    }

                    // Merge: pool the words of both blocks, current block's words first.
                    var mergedWords = blocks[current].TextLines.SelectMany(l => l.Words)
                                                     .Concat(blocks[other].TextLines.SelectMany(l => l.Words))
                                                     .ToList();

                    // Rebuild lines, using max distance = +Inf as we know all words will be in the
                    // same block. Filtering will still be done based on angle.
                    var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine, maxDegreeOfParallelism).ToList();
                    blocks[current] = new TextBlock(mergedLines.OrderByDescending(l => l.BoundingBox.Bottom).ToList());

                    // The absorbed block is removed.
                    blocks[other] = null;
                }
            }

            return blocks.Where(block => block != null).ToList();
        }
コード例 #6
0
 /// <summary>
 /// Segments the words into <see cref="TextBlock"/>s by delegating to the five-argument overload
 /// with -1 as its last argument. See original paper for more information.
 /// </summary>
 /// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
 /// <param name="withinLine">Angle bounds for words to be considered on the same line.</param>
 /// <param name="betweenLine">Angle bounds for words to be considered on separate lines.</param>
 /// <param name="betweenLineMultiplier">Multiplier that gives the maximum perpendicular distance between
 /// text lines for blocking. Maximum distance will be this number times the between-line
 /// distance found by the analysis.</param>
 /// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
 public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, AngleBounds withinLine,
                                           AngleBounds betweenLine, double betweenLineMultiplier)
 {
     // Presumably -1 means "no limit on concurrency" for the overload - verify against its documentation.
     const int defaultLastArgument = -1;

     return GetBlocks(words, withinLine, betweenLine, betweenLineMultiplier, defaultLastArgument);
 }
コード例 #7
0
        /// <summary>
        /// Groups words into <see cref="TextLine"/>s by clustering each word's bottom-right corner with
        /// candidate words' bottom-left corners, then orders the words of every line according to the
        /// text direction.
        /// </summary>
        /// <param name="words">The words to group. An empty list yields no line.</param>
        /// <param name="maxDist">The maximum Euclidean distance allowed between a pivot and a candidate.</param>
        /// <param name="withinLine">Angle bounds the pivot-to-candidate angle must satisfy.</param>
        /// <param name="maxDegreeOfParallelism">Forwarded unchanged to the clustering call.</param>
        /// <returns>The lines built, produced lazily on enumeration.</returns>
        private static IEnumerable<TextLine> GetLines(List<Word> words, double maxDist, AngleBounds withinLine, int maxDegreeOfParallelism)
        {
            // Nothing to group. This also prevents words[0] below from throwing
            // ArgumentOutOfRangeException on an empty list.
            if (words.Count == 0)
            {
                yield break;
            }

            var groupedIndexes = Clustering.NearestNeighbours(words, 2, Distances.Euclidean,
                                                              (pivot, candidate) => maxDist,
                                                              pivot => pivot.BoundingBox.BottomRight, candidate => candidate.BoundingBox.BottomLeft,
                                                              pivot => true,
                                                              (pivot, candidate) => withinLine.Contains(Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft)),
                                                              maxDegreeOfParallelism).ToList();

            // The direction of the first word decides the ordering for every line.
            Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc;
            switch (words[0].TextDirection)
            {
                case TextDirection.Rotate180:
                    orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Right).ToList();
                    break;

                case TextDirection.Rotate90:
                    orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Top).ToList();
                    break;

                case TextDirection.Rotate270:
                    orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList();
                    break;

                default:
                    // Any other direction: left to right.
                    orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
                    break;
            }

            foreach (var indexes in groupedIndexes)
            {
                yield return new TextLine(orderFunc(indexes.Select(i => words[i])));
            }
        }
コード例 #8
0
        /// <summary>
        /// Segments the words into <see cref="TextBlock"/>s: estimates the within-line and between-line
        /// spacing, groups words into lines, groups lines into blocks and finally merges blocks whose
        /// bounding boxes intersect. See original paper for more information.
        /// </summary>
        /// <param name="words">The words to segment into <see cref="TextBlock"/>s. A <c>null</c> sequence
        /// yields an empty result; words made only of white space are ignored.</param>
        /// <param name="withinLine">Angle bounds for words to be considered on the same line.</param>
        /// <param name="betweenLine">Angle bounds for words to be considered on separate lines.</param>
        /// <param name="betweenLineMultiplier">Multiplier that gives the maximum perpendicular distance between
        /// text lines for blocking. Maximum distance will be this number times the between-line
        /// distance found by the analysis.</param>
        /// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method. When either spacing
        /// cannot be estimated, a single block holding a single line with all the words is returned.</returns>
        public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, AngleBounds withinLine,
                                                  AngleBounds betweenLine,
                                                  double betweenLineMultiplier)
        {
            if (words == null)
            {
                return EmptyArray<TextBlock>.Instance;
            }

            // Keep only the words that contain something other than white space.
            var wordsList = words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).ToList();
            if (wordsList.Count == 0)
            {
                return EmptyArray<TextBlock>.Instance;
            }

            var withinLineSamples = new ConcurrentBag<double>();
            var betweenLineSamples = new ConcurrentBag<double>();

            // 1. Estimate in line and between line spacing
            Parallel.For(0, wordsList.Count, index =>
            {
                var current = wordsList[index];

                // Within-line distance: bottom-right of the pivot against bottom-left of the candidates,
                // for both the distance search and the angle test; horizontal distance is reported.
                var sameLineDistance = GetNearestPointDistance(wordsList, current,
                                                               bb => bb.BottomRight, bb => bb.BottomRight,
                                                               bb => bb.BottomLeft, bb => bb.BottomLeft,
                                                               withinLine, Distances.Horizontal);
                if (sameLineDistance != null)
                {
                    withinLineSamples.Add(sameLineDistance.Value);
                }

                // Between-line distance: bottom-left of the pivot against top-left of the candidates for
                // the distance search, centroids for the angle test; vertical distance is reported.
                var otherLineDistance = GetNearestPointDistance(wordsList, current,
                                                                bb => bb.BottomLeft, bb => bb.Centroid,
                                                                bb => bb.TopLeft, bb => bb.Centroid,
                                                                betweenLine, Distances.Vertical);
                if (otherLineDistance != null)
                {
                    betweenLineSamples.Add(otherLineDistance.Value);
                }
            });

            double? withinLineDistance = GetPeakAverageDistance(withinLineSamples);
            double? betweenLineDistance = GetPeakAverageDistance(betweenLineSamples);

            // Without both estimates no segmentation is attempted: everything goes in one line of one block.
            if (!withinLineDistance.HasValue || !betweenLineDistance.HasValue)
            {
                return new[] { new TextBlock(new[] { new TextLine(wordsList) }) };
            }

            // 2. Find lines of text
            double maxDistanceWithinLine = Math.Min(3 * withinLineDistance.Value, Math.Sqrt(2) * betweenLineDistance.Value);
            var lines = GetLines(wordsList, maxDistanceWithinLine, withinLine).ToArray();

            // 3. Find blocks of text
            double maxDistanceBetweenLine = betweenLineMultiplier * betweenLineDistance.Value;
            var blocks = GetLinesGroups(lines, maxDistanceBetweenLine).ToList();

            // 4. Merge overlapping blocks - might happen in certain conditions, e.g. justified text.
            // A merged-away block is marked by setting its slot to null; null slots are dropped at the end.
            for (var current = 0; current < blocks.Count; current++)
            {
                if (blocks[current] == null)
                {
                    continue;
                }

                // Rebuild the lines of the current block from all of its words.
                var currentWords = blocks[current].TextLines.SelectMany(l => l.Words).ToList();
                blocks[current] = new TextBlock(GetLines(currentWords, double.MaxValue, withinLine).ToList());

                // NOTE(review): after a merge the enlarged block is only tested against the remaining
                // indices of this inner pass; indices already visited in the pass are not re-tested -
                // confirm this is intended.
                for (var other = 0; other < blocks.Count; other++)
                {
                    if (other == current
                        || blocks[other] == null
                        || !blocks[current].BoundingBox.IntersectsWith(blocks[other].BoundingBox))
                    {
                        continue;
                    }

                    // Merge: pool the words of both blocks, current block's words first.
                    var mergedWords = blocks[current].TextLines.SelectMany(l => l.Words)
                                                     .Concat(blocks[other].TextLines.SelectMany(l => l.Words))
                                                     .ToList();

                    // Rebuild lines, using max distance = +Inf as we know all words will be in the
                    // same block. Filtering will still be done based on angle.
                    var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine).ToList();
                    blocks[current] = new TextBlock(mergedLines.OrderByDescending(l => l.BoundingBox.Bottom).ToList());

                    // The absorbed block is removed.
                    blocks[other] = null;
                }
            }

            return blocks.Where(block => block != null).ToList();
        }
コード例 #9
0
        /// <summary>
        /// Groups words into <see cref="TextLine"/>s by clustering each word's bottom-right corner with
        /// candidate words' bottom-left corners, then orders the words of every line according to the
        /// text direction.
        /// </summary>
        /// <param name="words">The words to group. An empty list yields no line.</param>
        /// <param name="maxDist">The maximum Euclidean distance allowed between a pivot and a candidate.</param>
        /// <param name="withinLine">Angle bounds; a candidate is kept when the angle lies between
        /// <c>Lower</c> and <c>Upper</c>, both inclusive.</param>
        /// <returns>The lines built, produced lazily on enumeration.</returns>
        private static IEnumerable<TextLine> GetLines(List<Word> words, double maxDist, AngleBounds withinLine)
        {
            // Nothing to group. This also prevents words[0] below from throwing
            // ArgumentOutOfRangeException on an empty list.
            if (words.Count == 0)
            {
                yield break;
            }

            var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(words, Distances.Euclidean,
                                                                               (pivot, candidate) => maxDist,
                                                                               pivot => pivot.BoundingBox.BottomRight, candidate => candidate.BoundingBox.BottomLeft,
                                                                               pivot => true,
                                                                               (pivot, candidate) =>
            {
                // Compare bottom right with bottom left for angle
                var angle = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft);

                return angle >= withinLine.Lower && angle <= withinLine.Upper;
            }).ToList();

            // The direction of the first word decides the ordering for every line.
            Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc;
            switch (words[0].TextDirection)
            {
                case TextDirection.Rotate180:
                    orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Right).ToList();
                    break;

                case TextDirection.Rotate90:
                    orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Top).ToList();
                    break;

                case TextDirection.Rotate270:
                    orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList();
                    break;

                default:
                    // Any other direction: left to right.
                    orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
                    break;
            }

            foreach (var indexes in groupedIndexes)
            {
                yield return new TextLine(orderFunc(indexes.Select(i => words[i])));
            }
        }
コード例 #10
0
        /// <summary>
        /// Perpendicular overlapping distance.
        /// </summary>
        /// <param name="line1"></param>
        /// <param name="line2"></param>
        /// <param name="angularDifferenceBounds"></param>
        /// <param name="epsilon"></param>
        private static double PerpendicularOverlappingDistance(PdfLine line1, PdfLine line2, AngleBounds angularDifferenceBounds, double epsilon)
        {
            if (GetStructuralBlockingParameters(line1, line2, epsilon, out double theta, out _, out double ed))
            {
                // Angle is kept within [-90;90]
                if (theta > 90)
                {
                    theta -= 180;
                }
                else if (theta < -90)
                {
                    theta += 180;
                }

                if (!angularDifferenceBounds.Contains(theta))
                {
                    // exclude because not parallel enough
                    return(double.PositiveInfinity);
                }

                return(Math.Abs(ed));
            }
コード例 #11
0
        /// <summary>
        /// Get the <see cref="TextBlock"/>s.
        /// <para>This is the Docstrum algorithm's 3rd and final step.</para>
        /// <para>
        /// Method: the distance between two lines is measured as follows. We check if the two lines are
        /// overlapping horizontally and compute the perpendicular distance. We check if the angle between
        /// the two lines is within 'angularDifference'. If the two lines are not overlapping or the angle
        /// is too wide, the distance is set to infinity.
        /// </para>
        /// <para>If two text lines are approximately parallel, close in perpendicular distance, and they
        /// either overlap to some specified degree or are separated by only a small distance in parallel
        /// distance, then they are said to meet the criteria to belong to the same structural block.</para>
        /// </summary>
        /// <param name="lines">The lines to segment into <see cref="TextBlock"/>s.</param>
        /// <param name="maxBLDistance">The maximum between-line distance. Computed as the estimated between-line spacing times the between-line multiplier in the default implementation.</param>
        /// <param name="angularDifferenceBounds">The angular difference bounds between two lines to be considered in the same block. This defines if two lines are parallel enough.</param>
        /// <param name="epsilon">Precision when testing equalities.</param>
        /// <param name="lineSeparator">Separator used between lines when building paragraphs.</param>
        /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
        /// <para>A positive property value limits the number of concurrent operations to the set value.
        /// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
        /// <returns>The <see cref="TextBlock"/>s built, produced lazily on enumeration.</returns>
        public static IEnumerable<TextBlock> GetStructuralBlocks(IReadOnlyList<TextLine> lines,
                                                                 double maxBLDistance, AngleBounds angularDifferenceBounds, double epsilon, string lineSeparator, int maxDegreeOfParallelism)
        {
            // Each pivot is represented by the bottom edge of its bounding box and each candidate by the
            // top edge of its bounding box. PerpendicularOverlappingDistance measures the distance between
            // those two edges, and maxBLDistance is the constant maximum distance for every pair.
            // No pivot or pair is filtered out beforehand (both filters always return true).
            var clusters = Clustering.NearestNeighbours(
                lines,
                (first, second) => PerpendicularOverlappingDistance(first, second, angularDifferenceBounds, epsilon),
                (pivot, candidate) => maxBLDistance,
                pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
                candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
                pivot => true,
                (pivot, candidate) => true,
                maxDegreeOfParallelism).ToList();

            // One block per cluster, with its lines put in reading order.
            for (var index = 0; index < clusters.Count; index++)
            {
                yield return new TextBlock(clusters[index].OrderByReadingOrder(), lineSeparator);
            }
        }
コード例 #12
0
        /// <summary>
        /// Builds the <see cref="TextLine"/>s by clustering words with a nearest neighbours search.
        /// <para>This is the Docstrum algorithm's 2nd step.</para>
        /// </summary>
        /// <param name="words">The words to segment into <see cref="TextLine"/>s.</param>
        /// <param name="maxWLDistance">The maximum within-line distance. Computed as the estimated within-line spacing times the within-line multiplier in the default implementation.</param>
        /// <param name="wlBounds">Angle bounds for words to be considered as neighbours on the same line.</param>
        /// <param name="wordSeparator">Separator used between words when building lines.</param>
        /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
        /// <para>A positive property value limits the number of concurrent operations to the set value.
        /// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
        /// <returns>The <see cref="TextLine"/>s built, produced lazily on enumeration.</returns>
        public static IEnumerable<TextLine> GetLines(IReadOnlyList<Word> words, double maxWLDistance, AngleBounds wlBounds,
                                                     string wordSeparator, int maxDegreeOfParallelism)
        {
            // A pivot's bottom-right corner is matched against candidates' bottom-left corners using the
            // Euclidean distance, capped at maxWLDistance. Every pivot is eligible; a pair is kept only
            // when its within-line angle falls inside wlBounds.
            var clusters = Clustering.NearestNeighbours(words,
                                                        2,
                                                        Distances.Euclidean,
                                                        (pivot, candidate) => maxWLDistance,
                                                        pivot => pivot.BoundingBox.BottomRight,
                                                        candidate => candidate.BoundingBox.BottomLeft,
                                                        pivot => true,
                                                        (pivot, candidate) => wlBounds.Contains(AngleWL(pivot, candidate)),
                                                        maxDegreeOfParallelism).ToList();

            // One line per cluster, with its words put in reading order.
            for (var index = 0; index < clusters.Count; index++)
            {
                yield return new TextLine(clusters[index].OrderByReadingOrder(), wordSeparator);
            }
        }
コード例 #13
0
        /// <summary>
        /// Estimates the within-line and the between-line spacing of a set of words.
        /// <para>This is the Docstrum algorithm's 1st step.</para>
        /// </summary>
        /// <param name="words">The words to analyse.</param>
        /// <param name="wlBounds">Angle bounds a neighbour has to satisfy to be counted as lying on the same line.</param>
        /// <param name="wlBinSize">Bin size used when building the distribution of within-line distances.</param>
        /// <param name="blBounds">Angle bounds a neighbour has to satisfy to be counted as lying on a separate line.</param>
        /// <param name="blBinSize">Bin size used when building the distribution of between-line distances.</param>
        /// <param name="maxDegreeOfParallelism">Maximum number of concurrent tasks enabled.
        /// <para>A positive value caps the number of concurrent operations at that value.
        /// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
        /// <param name="withinLineDistance">Receives the estimated within-line distance (average peak value of the
        /// distribution), or <see cref="double.NaN"/> when no peak value is returned.</param>
        /// <param name="betweenLineDistance">Receives the estimated between-line distance (average peak value of the
        /// distribution), or <see cref="double.NaN"/> when no peak value is returned.</param>
        /// <returns>True when both estimates are available; false if either 'withinLineDistance' or
        /// 'betweenLineDistance' is <see cref="double.NaN"/>.</returns>
        public static bool GetSpacingEstimation(IReadOnlyList<Word> words,
                                                AngleBounds wlBounds, int wlBinSize,
                                                AngleBounds blBounds, int blBinSize,
                                                int maxDegreeOfParallelism,
                                                out double withinLineDistance, out double betweenLineDistance)
        {
            var options = new ParallelOptions { MaxDegreeOfParallelism = maxDegreeOfParallelism };

            // Samples are collected from several workers at once, hence the concurrent collections.
            var withinLineSamples = new ConcurrentBag<double>();
            var betweenLineSamples = new ConcurrentBag<double>();

            // Step 1: gather distance samples. Every word is indexed by the bottom-left corner of its bounding box.
            var bottomLeftTree = new KdTree<Word>(words, w => w.BoundingBox.BottomLeft);

            Parallel.For(0, words.Count, options, index =>
            {
                var pivot = words[index];

                // 1.1 Within-line: query the 2 nearest neighbours (euclidean distance), using the
                // pivot's bottom-right corner as the query point.
                var sameLineHits = bottomLeftTree.FindNearestNeighbours(pivot, 2, w => w.BoundingBox.BottomRight, Distances.Euclidean);
                foreach (var hit in sameLineHits)
                {
                    var neighbour = hit.Item1;

                    // Only neighbours inside the within-line angle bounds contribute a sample.
                    if (!wlBounds.Contains(AngleWL(pivot, neighbour)))
                    {
                        continue;
                    }

                    withinLineSamples.Add(Distances.Euclidean(pivot.BoundingBox.BottomRight, neighbour.BoundingBox.BottomLeft));
                }

                // 1.2 Between-line: query the 2 nearest neighbours (euclidean distance), using the
                // pivot's top-left corner as the query point.
                var otherLineHits = bottomLeftTree.FindNearestNeighbours(pivot, 2, w => w.BoundingBox.TopLeft, Distances.Euclidean);
                foreach (var hit in otherLineHits)
                {
                    var neighbour = hit.Item1;
                    var angle = AngleBL(pivot, neighbour);

                    // Only neighbours inside the between-line angle bounds contribute a sample.
                    if (!blBounds.Contains(angle))
                    {
                        continue;
                    }

                    // Centroid-to-centroid distance acts as the hypotenuse of the projection below.
                    double hypotenuse = Distances.Euclidean(pivot.BoundingBox.Centroid, neighbour.BoundingBox.Centroid);

                    // Angles above 90 degrees are shifted down by 180 degrees before projecting.
                    var shiftedAngle = angle > 90 ? angle - 180 : angle;

                    // Project the hypotenuse, then remove half the height of each of the two boxes.
                    double projected = Math.Abs(hypotenuse * Math.Cos((90 - shiftedAngle) * Math.PI / 180));
                    double gap = projected - pivot.BoundingBox.Height / 2.0 - neighbour.BoundingBox.Height / 2.0;

                    // The subtractions can push the value below zero (e.g. overlapping words);
                    // such samples are ignored.
                    if (gap >= 0)
                    {
                        betweenLineSamples.Add(gap);
                    }
                }
            });

            // Step 2: reduce each sample set to the average peak value of its distribution.
            double? withinPeak = GetPeakAverageDistance(withinLineSamples, wlBinSize);
            double? betweenPeak = GetPeakAverageDistance(betweenLineSamples, blBinSize);

            withinLineDistance = withinPeak.HasValue ? withinPeak.Value : double.NaN;
            betweenLineDistance = betweenPeak.HasValue ? betweenPeak.Value : double.NaN;

            return withinPeak.HasValue && betweenPeak.HasValue;
        }
コード例 #14
0
        /// <summary>
        /// Build lines via transitive closure.
        /// <para>Candidate pairs are compared using the pivot's bottom-right corner and the candidate's
        /// bottom-left corner. The words of every resulting group are ordered according to the text
        /// direction of the first word in <paramref name="words"/>.</para>
        /// </summary>
        /// <param name="words">The words to group into lines. An empty list yields no lines.</param>
        /// <param name="maxDist">Value handed to the clustering step as the maximum distance for every
        /// pivot/candidate pair.</param>
        /// <param name="withinLine">Angle bounds (both ends inclusive) that the angle between the pivot's
        /// bottom-right corner and the candidate's bottom-left corner has to satisfy.</param>
        /// <returns>One <see cref="TextLine"/> per group of words; evaluated lazily.</returns>
        private static IEnumerable<TextLine> GetLines(List<Word> words, double maxDist, AngleBounds withinLine)
        {
            /***************************************************************************************************
            * /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'word Width', the algorithm might
            * not work, as FindIndexNearest() might pair the pivot with itself (the pivot's right point
            * (distance = width) is closer than the other words' left points).
            * -> A solution would be to find more than one nearest neighbour. Use a KDTree?
            ***************************************************************************************************/

            // Nothing to group. Without this guard 'words[0]' below throws on an empty list.
            if (words.Count == 0)
            {
                yield break;
            }

            // The direction of the first word is applied to the whole input.
            TextDirection textDirection = words[0].TextDirection;

            var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(words, Distances.Euclidean,
                (pivot, candidate) => maxDist,
                pivot => pivot.BoundingBox.BottomRight, candidate => candidate.BoundingBox.BottomLeft,
                pivot => true,
                (pivot, candidate) =>
                {
                    // Compare bottom right with bottom left for angle
                    var withinLineAngle = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft);

                    return withinLineAngle >= withinLine.Lower && withinLineAngle <= withinLine.Upper;
                }).ToList();

            // Default ordering: ascending by the left edge of the bounding box.
            Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();

            if (textDirection == TextDirection.Rotate180)
            {
                // Descending by the right edge.
                orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Right).ToList();
            }
            else if (textDirection == TextDirection.Rotate90)
            {
                // Descending by the top edge.
                orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Top).ToList();
            }
            else if (textDirection == TextDirection.Rotate270)
            {
                // Ascending by the bottom edge.
                orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList();
            }

            // Each group holds indexes into 'words'; map them back to words and order them.
            foreach (var indexes in groupedIndexes)
            {
                yield return new TextLine(orderFunc(indexes.Select(i => words[i])));
            }
        }