public void GetBlocks(string name, string[] expected)
        {
            // Checks that the instance-based Docstrum segmentation of page 1 produces exactly the
            // expected block texts, compared in left-to-right, then top-to-bottom order.
            if (name == "90 180 270 rotated.pdf")
            {
                // This document uses the 'TimesNewRomanPSMT' font, so its results cannot be trusted on
                // platforms where that font isn't generally available (e.g. OSX, Linux, etc.): skip there.
                Skip.If(SystemFontFinder.Instance.GetTrueTypeFont("TimesNewRomanPSMT") == null,
                        "Skipped because the font TimesNewRomanPSMT could not be found in the execution environment.");
            }

            var options = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions() { LineSeparator = " " };

            using (var document = PdfDocument.Open(DlaHelper.GetDocumentPath(name)))
            {
                var firstPage  = document.GetPage(1);
                var pageWords  = NearestNeighbourWordExtractor.Instance.GetWords(firstPage.Letters);
                var segmenter  = new DocstrumBoundingBoxes(options);
                var blocks     = segmenter.GetBlocks(pageWords);

                Assert.Equal(expected.Length, blocks.Count);

                // Sort by the bottom-left corner: ascending X first, then descending Y.
                var sortedBlocks = blocks
                    .OrderBy(block => block.BoundingBox.BottomLeft.X)
                    .ThenByDescending(block => block.BoundingBox.BottomLeft.Y)
                    .ToList();

                var index = 0;
                foreach (var block in sortedBlocks)
                {
                    Assert.Equal(expected[index], block.Text);
                    index++;
                }
            }
        }
        public void GetBlocksStatic(string name, string[] expected)
        {
            // Same expectation as the instance-based test, but the Docstrum steps are driven one by one
            // through the static methods of DocstrumBoundingBoxes.
            if (name == "90 180 270 rotated.pdf")
            {
                // This document uses the 'TimesNewRomanPSMT' font, so its results cannot be trusted on
                // platforms where that font isn't generally available (e.g. OSX, Linux, etc.): skip there.
                Skip.If(SystemFontFinder.Instance.GetTrueTypeFont("TimesNewRomanPSMT") == null,
                        "Skipped because the font TimesNewRomanPSMT could not be found in the execution environment.");
            }

            var options = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions() { LineSeparator = " " };

            using (var document = PdfDocument.Open(DlaHelper.GetDocumentPath(name)))
            {
                var firstPage = document.GetPage(1);

                // Extract the words and drop the ones made only of white space.
                var words = NearestNeighbourWordExtractor.Instance.GetWords(firstPage.Letters)
                    .Where(word => !string.IsNullOrWhiteSpace(word.Text))
                    .ToList();

                // Step 1: estimate the within-line and between-line spacing.
                bool estimated = DocstrumBoundingBoxes.GetSpacingEstimation(
                    words,
                    options.WithinLineBounds,
                    options.WithinLineBinSize,
                    options.BetweenLineBounds,
                    options.BetweenLineBinSize,
                    options.MaxDegreeOfParallelism,
                    out double withinLineDistance,
                    out double betweenLineDistance);

                if (!estimated)
                {
                    // The estimation failed: replace any NaN distance with 0.
                    withinLineDistance  = double.IsNaN(withinLineDistance) ? 0 : withinLineDistance;
                    betweenLineDistance = double.IsNaN(betweenLineDistance) ? 0 : betweenLineDistance;
                }

                // Step 2: build the text lines.
                var lines = DocstrumBoundingBoxes.GetLines(
                    words,
                    options.WithinLineMultiplier * withinLineDistance,
                    options.WithinLineBounds,
                    options.WordSeparator,
                    options.MaxDegreeOfParallelism).ToArray();

                // Step 3: build the structural blocks.
                var blocks = DocstrumBoundingBoxes.GetStructuralBlocks(
                    lines,
                    options.BetweenLineMultiplier * betweenLineDistance,
                    options.AngularDifferenceBounds,
                    options.Epsilon,
                    options.LineSeparator,
                    options.MaxDegreeOfParallelism).ToList();

                Assert.Equal(expected.Length, blocks.Count);

                // Sort by the bottom-left corner: ascending X first, then descending Y.
                var sortedBlocks = blocks
                    .OrderBy(block => block.BoundingBox.BottomLeft.X)
                    .ThenByDescending(block => block.BoundingBox.BottomLeft.Y)
                    .ToList();

                var index = 0;
                foreach (var block in sortedBlocks)
                {
                    Assert.Equal(expected[index], block.Text);
                    index++;
                }
            }
        }
Beispiel #3
0
        //https://github.com/pdfminer/pdfminer.six/blob/f389b97923c7a847bc9c6f4c3374951e1a7ff764/pdfminer/layout.py#L593
        /// <summary>
        /// group_objects: group text objects into text lines.
        /// <para>
        /// Unfinished port of the pdfminer.six function linked above: every branch that has to create a
        /// line, or append a letter to one, is not ported yet and throws <see cref="NotImplementedException"/>.
        /// </para>
        /// </summary>
        /// <param name="laparams">Layout parameters; <c>line_overlap</c>, <c>char_margin</c> and
        /// <c>detect_vertical</c> are read.</param>
        /// <param name="objs">The letters to group. They are visited in enumeration order and each letter
        /// is compared with the one immediately before it.</param>
        /// <returns>A lazily evaluated sequence of text lines; never contains <c>null</c>.</returns>
        /// <exception cref="NotImplementedException">Thrown while enumerating the result when a branch
        /// that is not ported yet is reached.</exception>
        public IEnumerable<TextLine> group_objects(LAParams laparams, IEnumerable<Letter> objs)
        {
            Letter   obj0 = null;
            TextLine line = null;

            foreach (var obj1 in objs)
            {
                if (obj0 != null)
                {
                    // halign: obj0 and obj1 is horizontally aligned.
                    //
                    //   +------+ - - -
                    //   | obj0 | - - +------+   -
                    //   |      |     | obj1 |   | (line_overlap)
                    //   +------+ - - |      |   -
                    //          - - - +------+
                    //
                    //          |<--->|
                    //        (char_margin)
                    var halign = obj0.is_compatible(obj1) && obj0.is_voverlap(obj1) &&
                                 Math.Min(obj0.GlyphRectangle.Height, obj1.GlyphRectangle.Height) * laparams.line_overlap < obj0.voverlap(obj1) &&
                                 obj0.hdistance(obj1) < Math.Max(obj0.GlyphRectangle.Width, obj1.GlyphRectangle.Width) * laparams.char_margin;

                    // valign: obj0 and obj1 is vertically aligned.
                    //
                    //   +------+
                    //   | obj0 |
                    //   |      |
                    //   +------+ - - -
                    //     |    |     | (char_margin)
                    //     +------+ - -
                    //     | obj1 |
                    //     |      |
                    //     +------+
                    //
                    //     |<-->|
                    //   (line_overlap)
                    var valign = laparams.detect_vertical && obj0.is_compatible(obj1) && obj0.is_hoverlap(obj1) &&
                                 Math.Min(obj0.GlyphRectangle.Width, obj1.GlyphRectangle.Width) * laparams.line_overlap < obj0.hoverlap(obj1) &&
                                 obj0.vdistance(obj1) < Math.Max(obj0.GlyphRectangle.Height, obj1.GlyphRectangle.Height) * laparams.char_margin;

                    // The orientation checks only make sense on an existing line: with no current line
                    // neither alignment can extend it (the Python original relies on isinstance(None, ...)
                    // being false here).
                    if (line != null && ((halign && line.isHorizontal()) || (valign && line.isVertical())))
                    {
                        // TODO: append obj1 to the current line (line.Add(obj1)).
                        throw new NotImplementedException();
                    }
                    else if (line != null)
                    {
                        // obj1 does not continue the current line: emit it and start over.
                        yield return line;

                        line = null;
                    }
                    else
                    {
                        if (valign && !halign)
                        {
                            // TODO: start a vertical line holding obj0 and obj1.
                            throw new NotImplementedException();
                        }
                        else if (halign && !valign)
                        {
                            // TODO: start a horizontal line holding obj0 and obj1.
                            throw new NotImplementedException();
                        }
                        else
                        {
                            // TODO: emit obj0 on its own as a horizontal line.
                            throw new NotImplementedException();
                        }
                    }
                }

                // Advance the window so the next letter is compared with this one.
                obj0 = obj1;
            }

            if (line != null)
            {
                yield return line;
            }
            else if (obj0 != null)
            {
                // TODO: the trailing letter needs its own horizontal line
                // (LTTextLineHorizontal(laparams.word_margin) followed by line.add(obj0) in pdfminer).
                throw new NotImplementedException();
            }
        }