/// <summary>
/// Groups words into text lines by clustering each word with its neighbours, then orders
/// the words of every cluster according to the text direction of the first word.
/// </summary>
/// <param name="words">The words to group. An empty list produces no lines.</param>
/// <param name="maxDist">Maximum distance handed to the clustering for every pivot/candidate pair.</param>
/// <param name="withinLine">Bounds that the angle between the pivot's bottom right point and the
/// candidate's bottom left point must satisfy for the two words to be linked.</param>
/// <param name="maxDegreeOfParallelism">Forwarded as is to the clustering algorithm.</param>
/// <returns>One <see cref="TextLine"/> per cluster, produced lazily.</returns>
private static IEnumerable<TextLine> GetLines(List<Word> words, double maxDist, AngleBounds withinLine, int maxDegreeOfParallelism)
{
    // Guard: the text direction is read from the first word, which does not exist for an empty list.
    if (words.Count == 0)
    {
        yield break;
    }

    // The direction of the first word is used for the whole set of words.
    TextDirection textDirection = words[0].TextDirection;

    var groupedIndexes = ClusteringAlgorithms.ClusterNearestNeighbours(words,
        2,
        Distances.Euclidean,
        (pivot, candidate) => maxDist,
        pivot => pivot.BoundingBox.BottomRight,
        candidate => candidate.BoundingBox.BottomLeft,
        pivot => true,
        (pivot, candidate) => withinLine.Contains(Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft)),
        maxDegreeOfParallelism).ToList();

    // Pick the ordering that matches the text direction.
    Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc;
    switch (textDirection)
    {
        case TextDirection.Rotate180:
            orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Right).ToList();
            break;

        case TextDirection.Rotate90:
            orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Top).ToList();
            break;

        case TextDirection.Rotate270:
            orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList();
            break;

        default:
            // Every other direction is ordered left to right.
            orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
            break;
    }

    // Each cluster holds indexes into 'words'.
    foreach (var group in groupedIndexes)
    {
        yield return new TextLine(orderFunc(group.Select(i => words[i])));
    }
}
/// <summary>
/// Builds an oriented bounding box for a letter from its baseline end points and the top
/// corners of its glyph rectangle, then picks the corner ordering whose rotation is closest
/// to the angle of the glyph rectangle's bottom edge.
/// </summary>
/// <remarks>
/// TODO: rounding still needs to be implemented.
/// Of limited use as is, since an axis aligned bounding box is needed anyway
/// (rotate back? or normalise?).
/// </remarks>
/// <param name="letter">The letter to compute the bounding box for.</param>
/// <returns>The candidate rectangle with the smallest absolute rotation difference.</returns>
private static PdfRectangle GetBoundingBoxOther(Letter letter)
{
    var corners = new[]
    {
        letter.StartBaseLine,
        letter.EndBaseLine,
        letter.GlyphRectangle.TopLeft,
        letter.GlyphRectangle.TopRight
    };

    // The minimum area rectangle is the first candidate; the others reuse its corners
    // in three different orders.
    var best = GeometryExtensions.MinimumAreaRectangle(corners);
    var alternatives = new[]
    {
        new PdfRectangle(best.BottomLeft, best.TopLeft, best.BottomRight, best.TopRight),
        new PdfRectangle(best.BottomRight, best.BottomLeft, best.TopRight, best.TopLeft),
        new PdfRectangle(best.TopRight, best.BottomRight, best.TopLeft, best.BottomLeft)
    };

    // Reference orientation, taken from the bottom edge of the glyph rectangle.
    // Assumes line order is correct.
    var baseLineAngle = Distances.BoundAngle180(
        Distances.Angle(letter.GlyphRectangle.BottomLeft, letter.GlyphRectangle.BottomRight));

    double smallestDelta = Math.Abs(Distances.BoundAngle180(best.Rotation - baseLineAngle));

    foreach (var alternative in alternatives)
    {
        double delta = Math.Abs(Distances.BoundAngle180(alternative.Rotation - baseLineAngle));

        // Strict comparison: on a tie the earlier candidate is kept.
        if (delta < smallestDelta)
        {
            smallestDelta = delta;
            best = alternative;
        }
    }

    return best;
}
/// <summary>
/// Computes the within line angle from the pivot's bottom right point to the candidate's
/// bottom left point, relative to the rotation of the pivot's bounding box.
/// <para>-90 ≤ θ ≤ 90.</para>
/// </summary>
private static double AngleWL(Word pivot, Word candidate)
{
    var absoluteAngle = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft);
    var relativeAngle = Distances.BoundAngle180(absoluteAngle - pivot.BoundingBox.Rotation);

    // Fold the angle back into [-90;90] degrees so that overlapping words are handled.
    if (relativeAngle > 90)
    {
        return relativeAngle - 180;
    }

    if (relativeAngle < -90)
    {
        return relativeAngle + 180;
    }

    return relativeAngle;
}
/// <summary>
/// Measures the distance from the pivot to the nearest other word whose angle to the pivot
/// lies within the given bounds.
/// </summary>
/// <param name="words">The words to search. The pivot itself is skipped (by reference).</param>
/// <param name="pivot">The word the search starts from.</param>
/// <param name="funcPivotDist">Selects the pivot point used for distance measurements.</param>
/// <param name="funcPivotAngle">Selects the pivot point used for angle measurements.</param>
/// <param name="funcPointsDist">Selects the point of each other word used for distance measurements.</param>
/// <param name="funcPointsAngle">Selects the point of each other word used for angle measurements.</param>
/// <param name="angleBounds">Bounds the angle must fall within for a word to be considered.</param>
/// <param name="finalDistanceMeasure">Distance applied between the pivot point and the nearest point found.</param>
/// <returns>The measured distance, or <c>null</c> when no suitable word is found.</returns>
private double? GetNearestPointDistance(List<Word> words, Word pivot,
    Func<PdfRectangle, PdfPoint> funcPivotDist,
    Func<PdfRectangle, PdfPoint> funcPivotAngle,
    Func<PdfRectangle, PdfPoint> funcPointsDist,
    Func<PdfRectangle, PdfPoint> funcPointsAngle,
    AngleBounds angleBounds,
    Func<PdfPoint, PdfPoint, double> finalDistanceMeasure)
{
    var distanceOrigin = funcPivotDist(pivot.BoundingBox);
    var angleOrigin = funcPivotAngle(pivot.BoundingBox);

    // Collect the distance points of every other word that passes the angle filter.
    var candidatePoints = new List<PdfPoint>();
    for (var i = 0; i < words.Count; i++)
    {
        var current = words[i];

        if (!ReferenceEquals(current, pivot)
            && angleBounds.Contains(Distances.Angle(angleOrigin, funcPointsAngle(current.BoundingBox))))
        {
            candidatePoints.Add(funcPointsDist(current.BoundingBox));
        }
    }

    if (candidatePoints.Count == 0)
    {
        return null;
    }

    // The nearest candidate is always searched with the Euclidean distance.
    var nearestIndex = Distances.FindIndexNearest(distanceOrigin, candidatePoints, p => p, p => p, Distances.Euclidean, out _);

    if (nearestIndex >= 0 && nearestIndex < candidatePoints.Count)
    {
        return finalDistanceMeasure(distanceOrigin, candidatePoints[nearestIndex]);
    }

    // The search returned an index outside the candidate list.
    return null;
}
/// <summary>
/// Segments words into text blocks using the document spectrum method.
/// See original paper for more information.
/// </summary>
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="withinLine">Angle bounds for words to be considered on the same line.</param>
/// <param name="betweenLine">Angle bounds for words to be considered on separate lines.</param>
/// <param name="betweenLineMultiplier">Multiplier that gives the maximum perpendicular distance between
/// text lines for blocking. Maximum distance will be this number times the between-line
/// distance found by the analysis.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, AngleBounds withinLine,
    AngleBounds betweenLine, double betweenLineMultiplier, int maxDegreeOfParallelism)
{
    if (words == null)
    {
        return EmptyArray<TextBlock>.Instance;
    }

    // Words without any visible text are left out of the analysis.
    var wordsList = words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).ToList();

    if (wordsList.Count == 0)
    {
        return EmptyArray<TextBlock>.Instance;
    }

    var withinLineSamples = new ConcurrentBag<double>();
    var betweenLineSamples = new ConcurrentBag<double>();

    var options = new ParallelOptions { MaxDegreeOfParallelism = maxDegreeOfParallelism };

    // 1. Estimate within line and between line spacing
    var withinLineTree = new KdTree<Word>(wordsList, w => w.BoundingBox.BottomLeft);
    var betweenLineTree = new KdTree<Word>(wordsList, w => w.BoundingBox.TopLeft);

    Parallel.For(0, wordsList.Count, options, index =>
    {
        var pivot = wordsList[index];

        // Within-line distance
        var sameLineNeighbours = withinLineTree.FindNearestNeighbours(pivot, 2,
            w => w.BoundingBox.BottomRight,
            (p1, p2) => Distances.WeightedEuclidean(p1, p2, 0.5));

        foreach (var neighbour in sameLineNeighbours)
        {
            var from = pivot.BoundingBox.BottomRight;
            var to = neighbour.Item1.BoundingBox.BottomLeft;

            if (withinLine.Contains(Distances.Angle(from, to)))
            {
                withinLineSamples.Add(Distances.Horizontal(from, to));
            }
        }

        // Between-line distance
        var otherLineNeighbours = betweenLineTree.FindNearestNeighbours(pivot, 2,
            w => w.BoundingBox.BottomLeft,
            (p1, p2) => Distances.WeightedEuclidean(p1, p2, 50));

        foreach (var neighbour in otherLineNeighbours)
        {
            // The angle is measured between centroids, the distance between left corners.
            var angle = Distances.Angle(pivot.BoundingBox.Centroid, neighbour.Item1.BoundingBox.Centroid);

            if (betweenLine.Contains(angle))
            {
                betweenLineSamples.Add(Distances.Vertical(pivot.BoundingBox.BottomLeft, neighbour.Item1.BoundingBox.TopLeft));
            }
        }
    });

    double? withinLineDistance = GetPeakAverageDistance(withinLineSamples);
    double? betweenLineDistance = GetPeakAverageDistance(betweenLineSamples);

    // Without both estimates, everything is returned as a single block made of a single line.
    if (!(withinLineDistance.HasValue && betweenLineDistance.HasValue))
    {
        var singleLine = new TextLine(wordsList);
        return new[] { new TextBlock(new[] { singleLine }) };
    }

    // 2. Find lines of text
    double maxDistanceWithinLine = Math.Min(3 * withinLineDistance.Value, Math.Sqrt(2) * betweenLineDistance.Value);
    var lines = GetLines(wordsList, maxDistanceWithinLine, withinLine, maxDegreeOfParallelism).ToArray();

    // 3. Find blocks of text
    double maxDistanceBetweenLine = betweenLineMultiplier * betweenLineDistance.Value;
    var blocks = GetLinesGroups(lines, maxDistanceBetweenLine, maxDegreeOfParallelism).ToList();

    // 4. Merge overlapping blocks - might happen in certain conditions, e.g. justified text.
    for (var current = 0; current < blocks.Count; current++)
    {
        if (blocks[current] == null)
        {
            continue;
        }

        // Merge all lines (words)
        var currentWords = blocks[current].TextLines.SelectMany(l => l.Words).ToList();
        blocks[current] = new TextBlock(GetLines(currentWords, double.MaxValue, withinLine, maxDegreeOfParallelism).ToList());

        for (var other = 0; other < blocks.Count; other++)
        {
            if (current != other
                && blocks[other] != null
                && blocks[current].BoundingBox.IntersectsWith(blocks[other].BoundingBox))
            {
                // Merge
                // 1. Merge all words
                var mergedWords = blocks[current].TextLines.SelectMany(l => l.Words)
                    .Concat(blocks[other].TextLines.SelectMany(l => l.Words))
                    .ToList();

                // 2. Rebuild lines, using max distance = +Inf as we know all words will be in the
                // same block. Filtering will still be done based on angle.
                // Merge all lines (words) sharing same bottom (baseline)
                var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine, maxDegreeOfParallelism).ToList();
                blocks[current] = new TextBlock(mergedLines.OrderByDescending(l => l.BoundingBox.Bottom).ToList());

                // Remove: the slot is cleared and filtered out at the end.
                blocks[other] = null;
            }
        }
    }

    return blocks.Where(block => block != null).ToList();
}
/// <summary>
/// Computes the angle from the first point to the second point, bounded by
/// <c>Distances.BoundAngle0to360</c>.
/// <para>0 ≤ θ ≤ 360</para>
/// </summary>
public double GetAngle()
{
    var rawAngle = Distances.Angle(P1, P2);
    return Distances.BoundAngle0to360(rawAngle);
}