/// <summary>
/// Private method to get the words.
/// <para>
/// Letters are clustered with <c>ClusteringAlgorithms.SimpleTransitiveClosure</c>; each resulting
/// group of letter indexes becomes one <c>Word</c>, with its letters ordered according to the
/// shared text direction.
/// </para>
/// </summary>
/// <param name="pageLetters">The letters in the page, they must have
/// the same text directions.</param>
/// <param name="metric">The letter's metric to use in the minimum distance
/// between 2 letters, e.g. GlyphRectangle.Width or GlyphRectangle.Height.</param>
/// <param name="distMeasure">The distance measure between two start and end base line points,
/// e.g. the Manhattan distance.</param>
/// <returns>The words built from the grouped letters; an empty list when
/// <paramref name="pageLetters"/> is null or contains no letters.</returns>
/// <exception cref="ArgumentException">Thrown when the letters do not all share the text
/// direction of the first letter.</exception>
private List<Word> GetWords(IEnumerable<Letter> pageLetters,
    Func<Letter, decimal> metric,
    Func<PdfPoint, PdfPoint, double> distMeasure)
{
    if (pageLetters == null)
    {
        return new List<Word>();
    }

    // Materialize the sequence exactly once. Every check below works on this array, so a
    // deferred or expensive source is not re-enumerated for the emptiness test, the first
    // element, the direction validation and the clustering input.
    Letter[] letters = pageLetters.ToArray();
    if (letters.Length == 0)
    {
        return new List<Word>();
    }

    TextDirection textDirection = letters[0].TextDirection;
    if (letters.Any(x => textDirection != x.TextDirection))
    {
        throw new ArgumentException("NNWordExtractor.GetWords(): Mixed Text Direction.");
    }

    // Reading order of the letters inside a word depends on the text direction.
    // Default (any direction not handled below): ascending by the glyph's left edge.
    Func<IEnumerable<Letter>, IReadOnlyList<Letter>> orderFunc =
        l => l.OrderBy(x => x.GlyphRectangle.Left).ToList();

    if (textDirection == TextDirection.Rotate180)
    {
        orderFunc = l => l.OrderByDescending(x => x.GlyphRectangle.Right).ToList();
    }
    else if (textDirection == TextDirection.Rotate90)
    {
        orderFunc = l => l.OrderByDescending(x => x.GlyphRectangle.Top).ToList();
    }
    else if (textDirection == TextDirection.Rotate270)
    {
        orderFunc = l => l.OrderBy(x => x.GlyphRectangle.Bottom).ToList();
    }

    var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(
        letters,
        distMeasure,
        // Distance threshold for a pair: 60% of the larger of the two letters' metrics.
        (l1, l2) => Math.Max((double)metric(l1), (double)metric(l2)) * 0.60,
        // Points handed to the distance measure: one letter's end base line point
        // and the other letter's start base line point.
        l => l.EndBaseLine,
        l => l.StartBaseLine,
        // Letters whose value is null/whitespace are filtered out here...
        l => !string.IsNullOrWhiteSpace(l.Value),
        // ...and a pair must share the font name (case-insensitive), with a
        // non-whitespace second letter.
        (l1, l2) => string.Equals(l1.FontName, l2.FontName, StringComparison.OrdinalIgnoreCase)
                    && !string.IsNullOrWhiteSpace(l2.Value));

    List<Word> words = new List<Word>();
    foreach (var group in groupedIndexes)
    {
        // Each group holds indexes into 'letters'.
        words.Add(new Word(orderFunc(group.Select(i => letters[i]))));
    }

    return words;
}
/// <summary>
/// Build lines via transitive closure.
/// <para>
/// Words are clustered with <c>ClusteringAlgorithms.SimpleTransitiveClosure</c>, comparing a pivot
/// word's bottom-right corner with a candidate word's bottom-left corner. Each resulting group of
/// word indexes becomes one <c>TextLine</c>, with its words ordered according to the text
/// direction of the first word.
/// </para>
/// </summary>
/// <param name="words">The words to group into lines. The text direction of the first word
/// selects the ordering applied inside every line.</param>
/// <param name="maxDist">The maximum distance passed to the clustering for every
/// pivot/candidate pair.</param>
/// <param name="withinLine">The inclusive angle bounds that the angle between the pivot's
/// bottom-right and the candidate's bottom-left must fall in.</param>
/// <returns>A lazily evaluated sequence of text lines; empty when <paramref name="words"/>
/// is null or empty.</returns>
private static IEnumerable<TextLine> GetLines(List<Word> words, double maxDist, AngleBounds withinLine)
{
    /***************************************************************************************************
     * /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'word Width', the algo might not
     * work as the FindIndexNearest() function might pair the pivot with itself (the pivot's right point
     * (distance = width) is closer than other words' left point).
     * -> Solution would be to find more than one nearest neighbours. Use KDTree?
     ***************************************************************************************************/

    // Nothing to group: bail out before indexing the first word, which would otherwise
    // throw on a null or empty list.
    if (words == null || words.Count == 0)
    {
        yield break;
    }

    TextDirection textDirection = words[0].TextDirection;

    // Reading order of the words inside a line depends on the text direction.
    // Default (any direction not handled below): ascending by the bounding box's left edge.
    Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc =
        l => l.OrderBy(x => x.BoundingBox.Left).ToList();

    if (textDirection == TextDirection.Rotate180)
    {
        orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Right).ToList();
    }
    else if (textDirection == TextDirection.Rotate90)
    {
        orderFunc = l => l.OrderByDescending(x => x.BoundingBox.Top).ToList();
    }
    else if (textDirection == TextDirection.Rotate270)
    {
        orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList();
    }

    var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(
        words,
        Distances.Euclidean,
        (pivot, candidate) => maxDist,
        pivot => pivot.BoundingBox.BottomRight,
        candidate => candidate.BoundingBox.BottomLeft,
        pivot => true, // every word may act as a pivot
        (pivot, candidate) =>
        {
            // Compare bottom right with bottom left for angle
            var withinLineAngle = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft);
            return withinLineAngle >= withinLine.Lower && withinLineAngle <= withinLine.Upper;
        }).ToList();

    foreach (var group in groupedIndexes)
    {
        // Each group holds indexes into 'words'.
        yield return new TextLine(orderFunc(group.Select(i => words[i])));
    }
}
/// <summary>
/// Build blocks via transitive closure.
/// <para>
/// Lines are clustered with <c>ClusteringAlgorithms.SimpleTransitiveClosure</c>, comparing the
/// bottom edge of a pivot line's bounding box with the top edge of a candidate line's bounding
/// box. Each resulting group of line indexes becomes one <c>TextBlock</c>.
/// </para>
/// </summary>
/// <param name="lines">The text lines to group into blocks.</param>
/// <param name="maxDist">The maximum distance passed to the clustering for every
/// pivot/candidate pair.</param>
/// <returns>A lazily evaluated sequence of text blocks, one per group of lines.</returns>
private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist)
{
    /**************************************************************************************************
     * Distance between two lines:
     *  - Check whether the two edges overlap horizontally.
     *  - If they do, take the middle of the overlapping area as the X coordinate for both edges
     *    and return the Euclidean distance between these two middle points.
     *  - If they do not overlap, the distance is the maximum possible value.
     *
     * /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'line Height', the algo won't
     * work as the FindIndexNearest() function will always pair the pivot with itself (the pivot's top
     * point (distance = height) is closer than other lines' top point).
     * -> Solution would be to find more than one nearest neighbours. Use KDTree?
     **************************************************************************************************/
    Func<PdfLine, PdfLine, double> overlapMidpointDistance = (first, second) =>
    {
        var overlapStart = Math.Max(first.Point1.X, second.Point1.X);
        var overlapWidth = Math.Min(first.Point2.X, second.Point2.X) - overlapStart;
        var middleX = overlapStart + overlapWidth / 2;

        // A negative width means the two edges do not overlap -> max distance.
        return overlapWidth < 0
            ? double.MaxValue
            : Distances.Euclidean(
                new PdfPoint(middleX, first.Point1.Y),
                new PdfPoint(middleX, second.Point1.Y));
    };

    // Edge of the pivot line used for the comparison: bottom of its bounding box.
    Func<TextLine, PdfLine> bottomEdge =
        pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight);

    // Edge of the candidate line used for the comparison: top of its bounding box.
    Func<TextLine, PdfLine> topEdge =
        candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight);

    var lineGroups = ClusteringAlgorithms.SimpleTransitiveClosure(
        lines,
        overlapMidpointDistance,
        (pivot, candidate) => maxDist,
        bottomEdge,
        topEdge,
        pivot => true,               // no pivot is filtered out
        (pivot, candidate) => true)  // no pair is filtered out
        .ToList();

    foreach (var indexes in lineGroups)
    {
        // Each group holds indexes into 'lines'.
        yield return new TextBlock(indexes.Select(i => lines[i]).ToList());
    }
}