/// <summary>
/// Segments the first page of the named document with <c>DocstrumBoundingBoxes.GetBlocks</c>
/// and checks both the number of blocks and the text of each block.
/// </summary>
/// <param name="name">File name of the test document; resolved through <c>DlaHelper.GetDocumentPath</c>.</param>
/// <param name="expected">
/// Expected block texts, listed in the order obtained by sorting the blocks on their bounding box
/// bottom-left X (ascending) and then bottom-left Y (descending).
/// </param>
public void GetBlocks(string name, string[] expected)
{
    if (name == "90 180 270 rotated.pdf")
    {
        // This document uses the 'TimesNewRomanPSMT' font. Results cannot be trusted on platforms
        // where that font isn't generally available (e.g. OSX, Linux, etc.), so the case is skipped there.
        var timesNewRoman = SystemFontFinder.Instance.GetTrueTypeFont("TimesNewRomanPSMT");
        Skip.If(timesNewRoman == null, "Skipped because the font TimesNewRomanPSMT could not be found in the execution environment.");
    }

    var segmenterOptions = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions()
    {
        LineSeparator = " "
    };

    using (var pdf = PdfDocument.Open(DlaHelper.GetDocumentPath(name)))
    {
        var firstPage = pdf.GetPage(1);
        var pageWords = NearestNeighbourWordExtractor.Instance.GetWords(firstPage.Letters);

        var segmenter = new DocstrumBoundingBoxes(segmenterOptions);
        var blocks = segmenter.GetBlocks(pageWords);

        Assert.Equal(expected.Length, blocks.Count);

        // Sort into a deterministic order so the blocks line up with the expected texts.
        var sortedBlocks = blocks
            .OrderBy(block => block.BoundingBox.BottomLeft.X)
            .ThenByDescending(block => block.BoundingBox.BottomLeft.Y)
            .ToList();

        int position = 0;
        foreach (var block in sortedBlocks)
        {
            Assert.Equal(expected[position], block.Text);
            position++;
        }
    }
}
/// <summary>
/// Reproduces the Docstrum pipeline by calling the static <c>DocstrumBoundingBoxes</c> methods
/// (<c>GetSpacingEstimation</c>, <c>GetLines</c>, <c>GetStructuralBlocks</c>) one after another on the
/// first page of the named document, then checks the number of blocks and the text of each block.
/// </summary>
/// <param name="name">File name of the test document; resolved through <c>DlaHelper.GetDocumentPath</c>.</param>
/// <param name="expected">
/// Expected block texts, listed in the order obtained by sorting the blocks on their bounding box
/// bottom-left X (ascending) and then bottom-left Y (descending).
/// </param>
public void GetBlocksStatic(string name, string[] expected)
{
    if (name == "90 180 270 rotated.pdf")
    {
        // This document uses the 'TimesNewRomanPSMT' font. Results cannot be trusted on platforms
        // where that font isn't generally available (e.g. OSX, Linux, etc.), so the case is skipped there.
        var timesNewRoman = SystemFontFinder.Instance.GetTrueTypeFont("TimesNewRomanPSMT");
        Skip.If(timesNewRoman == null, "Skipped because the font TimesNewRomanPSMT could not be found in the execution environment.");
    }

    var options = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions()
    {
        LineSeparator = " "
    };

    using (var pdf = PdfDocument.Open(DlaHelper.GetDocumentPath(name)))
    {
        var firstPage = pdf.GetPage(1);

        // Docstrum using static methods.
        // White-space-only words are filtered out before any estimation is done.
        var words = NearestNeighbourWordExtractor.Instance.GetWords(firstPage.Letters)
            .Where(word => !string.IsNullOrWhiteSpace(word.Text))
            .ToList();

        // 1. Estimate within line and between line spacing
        bool estimated = DocstrumBoundingBoxes.GetSpacingEstimation(
            words,
            options.WithinLineBounds,
            options.WithinLineBinSize,
            options.BetweenLineBounds,
            options.BetweenLineBinSize,
            options.MaxDegreeOfParallelism,
            out double withinLineDistance,
            out double betweenLineDistance);

        if (!estimated)
        {
            // When estimation reports failure, a NaN distance is replaced by 0.
            withinLineDistance = double.IsNaN(withinLineDistance) ? 0 : withinLineDistance;
            betweenLineDistance = double.IsNaN(betweenLineDistance) ? 0 : betweenLineDistance;
        }

        // 2. Determination of Text Lines
        double maxWithinLineDistance = options.WithinLineMultiplier * withinLineDistance;
        var lines = DocstrumBoundingBoxes.GetLines(
            words,
            maxWithinLineDistance,
            options.WithinLineBounds,
            options.WordSeparator,
            options.MaxDegreeOfParallelism).ToArray();

        // 3. Structural Block Determination
        double maxBetweenLineDistance = options.BetweenLineMultiplier * betweenLineDistance;
        var blocks = DocstrumBoundingBoxes.GetStructuralBlocks(
            lines,
            maxBetweenLineDistance,
            options.AngularDifferenceBounds,
            options.Epsilon,
            options.LineSeparator,
            options.MaxDegreeOfParallelism).ToList();

        Assert.Equal(expected.Length, blocks.Count);

        // Sort into a deterministic order so the blocks line up with the expected texts.
        var sortedBlocks = blocks
            .OrderBy(block => block.BoundingBox.BottomLeft.X)
            .ThenByDescending(block => block.BoundingBox.BottomLeft.Y)
            .ToList();

        int position = 0;
        foreach (var block in sortedBlocks)
        {
            Assert.Equal(expected[position], block.Text);
            position++;
        }
    }
}
//https://github.com/pdfminer/pdfminer.six/blob/f389b97923c7a847bc9c6f4c3374951e1a7ff764/pdfminer/layout.py#L593
/// <summary>
/// group_objects: group text object to textlines.
/// <para>
/// Work-in-progress port of the pdfminer routine linked above. Each letter is compared with the
/// letter immediately before it in <paramref name="objs"/>. Creating and extending lines is not
/// implemented yet, so every branch that would need to do so throws <see cref="NotImplementedException"/>.
/// </para>
/// </summary>
/// <param name="laparams">
/// Layout parameters. This method reads <c>line_overlap</c>, <c>char_margin</c> and <c>detect_vertical</c>.
/// </param>
/// <param name="objs">The letters to group, in the order in which neighbours should be compared.</param>
/// <returns>
/// A lazily evaluated sequence of text lines. While line construction is unimplemented, the only
/// element produced (for an input of zero or one letter) is <c>null</c>.
/// </returns>
/// <exception cref="NotImplementedException">
/// Thrown during enumeration as soon as a second letter is reached, because the code that builds
/// or extends a line has not been written yet.
/// </exception>
public IEnumerable<TextLine> group_objects(LAParams laparams, IEnumerable<Letter> objs)
{
    Letter obj0 = null;
    TextLine line = null;

    foreach (var obj1 in objs)
    {
        if (obj0 != null)
        {
            // halign: obj0 and obj1 is horizontally aligned.
            //
            //   +------+ - - -
            //   | obj0 | - - +------+   -
            //   |      |     | obj1 |   | (line_overlap)
            //   +------+ - - |      |   -
            //          - - - +------+
            //
            //          |<--->|
            //        (char_margin)
            var halign = obj0.is_compatible(obj1)
                         && obj0.is_voverlap(obj1)
                         && Math.Min(obj0.GlyphRectangle.Height, obj1.GlyphRectangle.Height) * laparams.line_overlap < obj0.voverlap(obj1)
                         && obj0.hdistance(obj1) < Math.Max(obj0.GlyphRectangle.Width, obj1.GlyphRectangle.Width) * laparams.char_margin;

            // NOTE(review): neither the return value nor the out values are used below. This looks
            // like an experiment with a baseline-based overlap test; confirm whether it can be removed.
            var is_hoverlap = DocstrumBoundingBoxes.GetStructuralBlockingParameters(
                new PdfLine(obj0.StartBaseLine, obj0.EndBaseLine),
                new PdfLine(obj1.StartBaseLine, obj1.EndBaseLine),
                1e-3,
                out double angularDifference,
                out double normalisedOverlap,
                out double perpendicularDistance);

            // valign: obj0 and obj1 is vertically aligned.
            //
            //   +------+
            //   | obj0 |
            //   |      |
            //   +------+ - - -
            //     |    |     | (char_margin)
            //   +------+ - -
            //   | obj1 |
            //   |      |
            //   +------+
            //
            //     |<-->|
            //   (line_overlap)
            var valign = laparams.detect_vertical
                         && obj0.is_compatible(obj1)
                         && obj0.is_hoverlap(obj1)
                         && Math.Min(obj0.GlyphRectangle.Width, obj1.GlyphRectangle.Width) * laparams.line_overlap < obj0.hoverlap(obj1)
                         && obj0.vdistance(obj1) < Math.Max(obj0.GlyphRectangle.Height, obj1.GlyphRectangle.Height) * laparams.char_margin;

            // 'line' is null whenever no line is in progress, so it must be checked before asking
            // for its orientation; a missing line never counts as a continuation.
            if (line != null && ((halign && line.isHorizontal()) || (valign && line.isVertical())))
            {
                // Continue the current line with obj1 (not implemented yet).
                //line.Add(obj1);
                throw new NotImplementedException();
            }
            else if (line != null)
            {
                // obj1 does not continue the current line: emit it and start over.
                yield return line;
                line = null;
            }
            else
            {
                // No line in progress: a new one has to be started. The three cases are kept
                // separate as placeholders; none of them is implemented yet.
                if (valign && !halign)
                {
                    throw new NotImplementedException();
                }
                else if (halign && !valign)
                {
                    throw new NotImplementedException();
                }
                else
                {
                    throw new NotImplementedException();
                }
            }
        }

        // Advance the "previous letter" so that the next iteration compares neighbours.
        // Without this the comparison block above is never entered.
        obj0 = obj1;
    }

    if (line == null)
    {
        //line = LTTextLineHorizontal(laparams.word_margin)
        //line.add(obj0)
    }

    // NOTE(review): until the block above is implemented, 'line' is still null here.
    yield return line;
}