public void GetBlocks(string name, string[] expected)
        {
            // Runs Docstrum block segmentation on page 1 of the named document and verifies that the
            // block texts, read in a fixed geometric order, equal the expected texts.
            if (name == "90 180 270 rotated.pdf")
            {
                // This document uses the 'TimesNewRomanPSMT' font. On platforms where that font is not
                // generally available (e.g. OSX, Linux) the results cannot be trusted, so the case is skipped.
                Skip.If(SystemFontFinder.Instance.GetTrueTypeFont("TimesNewRomanPSMT") == null,
                        "Skipped because the font TimesNewRomanPSMT could not be found in the execution environment.");
            }

            // Lines inside a block are joined with a single space.
            var docstrumOptions = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions() { LineSeparator = " " };

            using (var pdf = PdfDocument.Open(DlaHelper.GetDocumentPath(name)))
            {
                var firstPage = pdf.GetPage(1);
                var segmented = DocstrumBoundingBoxes.Instance.GetBlocks(
                    NearestNeighbourWordExtractor.Instance.GetWords(firstPage.Letters),
                    docstrumOptions);

                Assert.Equal(expected.Length, segmented.Count);

                // Order by bottom-left X ascending, then bottom-left Y descending, so that the blocks
                // line up with the order of the expected texts.
                var sorted = (from b in segmented
                              orderby b.BoundingBox.BottomLeft.X, b.BoundingBox.BottomLeft.Y descending
                              select b).ToList();

                int position = 0;
                foreach (var textBlock in sorted)
                {
                    Assert.Equal(expected[position], textBlock.Text);
                    position++;
                }
            }
        }
Example #2
0
        public void GetBlocks(string name, string[] expected)
        {
            // Runs Recursive XY Cut block segmentation on page 1 of the named document and verifies that
            // the block texts, read in a fixed geometric order, equal the expected texts.
            using (var pdf = PdfDocument.Open(DlaHelper.GetDocumentPath(name)))
            {
                var firstPage      = pdf.GetPage(1);
                var extractedWords = NearestNeighbourWordExtractor.Instance.GetWords(firstPage.Letters);

                // Minimum width is one third of the page width; lines inside a block are joined with a space.
                var cutOptions = new RecursiveXYCut.RecursiveXYCutOptions()
                {
                    MinimumWidth  = firstPage.Width / 3.0,
                    LineSeparator = " "
                };

                var segmented = RecursiveXYCut.Instance.GetBlocks(extractedWords, cutOptions);
                Assert.Equal(expected.Length, segmented.Count);

                // Bottom-left X ascending first, ties broken by bottom-left Y descending.
                var byLeftEdge = segmented.OrderBy(block => block.BoundingBox.BottomLeft.X);
                var sorted     = byLeftEdge.ThenByDescending(block => block.BoundingBox.BottomLeft.Y).ToArray();

                int index = 0;
                while (index < sorted.Length)
                {
                    Assert.Equal(expected[index], sorted[index].Text);
                    index++;
                }
            }
        }
        public void GetBlocksStatic(string name, string[] expected)
        {
            // Reproduces Docstrum block segmentation step by step through the static helper methods
            // (spacing estimation, line building, structural block building) on page 1 of the named
            // document, then verifies the block texts against the expected texts.
            if (name == "90 180 270 rotated.pdf")
            {
                // This document uses the 'TimesNewRomanPSMT' font. On platforms where that font is not
                // generally available (e.g. OSX, Linux) the results cannot be trusted, so the case is skipped.
                Skip.If(SystemFontFinder.Instance.GetTrueTypeFont("TimesNewRomanPSMT") == null,
                        "Skipped because the font TimesNewRomanPSMT could not be found in the execution environment.");
            }

            // Lines inside a block are joined with a single space; every other option keeps its default.
            var docstrumOptions = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions() { LineSeparator = " " };

            using (var pdf = PdfDocument.Open(DlaHelper.GetDocumentPath(name)))
            {
                var firstPage = pdf.GetPage(1);
                var allWords  = NearestNeighbourWordExtractor.Instance.GetWords(firstPage.Letters).ToList();

                // Drop words whose text is null, empty or only white space.
                var words = allWords.FindAll(w => !string.IsNullOrWhiteSpace(w.Text));

                // Read every option once into a local before driving the static pipeline.
                var withinBounds      = docstrumOptions.WithinLineBounds;
                var withinBinSize     = docstrumOptions.WithinLineBinSize;
                var withinMultiplier  = docstrumOptions.WithinLineMultiplier;
                var betweenBounds     = docstrumOptions.BetweenLineBounds;
                var betweenBinSize    = docstrumOptions.BetweenLineBinSize;
                var betweenMultiplier = docstrumOptions.BetweenLineMultiplier;
                var parallelism       = docstrumOptions.MaxDegreeOfParallelism;
                var angularBounds     = docstrumOptions.AngularDifferenceBounds;
                var wordJoiner        = docstrumOptions.WordSeparator;
                var lineJoiner        = docstrumOptions.LineSeparator;
                var tolerance         = docstrumOptions.Epsilon;

                // Step 1: estimate the within-line and between-line spacing.
                bool estimated = DocstrumBoundingBoxes.GetSpacingEstimation(
                    words, withinBounds, withinBinSize, betweenBounds, betweenBinSize, parallelism,
                    out double withinSpacing, out double betweenSpacing);

                if (!estimated)
                {
                    // When the estimation reports failure, any NaN distance is replaced by zero.
                    withinSpacing  = double.IsNaN(withinSpacing) ? 0 : withinSpacing;
                    betweenSpacing = double.IsNaN(betweenSpacing) ? 0 : betweenSpacing;
                }

                // Step 2: build the text lines.
                double withinLimit = withinMultiplier * withinSpacing;
                var textLines = DocstrumBoundingBoxes
                    .GetLines(words, withinLimit, withinBounds, wordJoiner, parallelism)
                    .ToArray();

                // Step 3: build the structural blocks from the lines.
                double betweenLimit = betweenMultiplier * betweenSpacing;
                var segmented = DocstrumBoundingBoxes
                    .GetStructuralBlocks(textLines, betweenLimit, angularBounds, tolerance, lineJoiner, parallelism)
                    .ToList();

                Assert.Equal(expected.Length, segmented.Count);

                // Bottom-left X ascending first, ties broken by bottom-left Y descending.
                var sorted = (from b in segmented
                              orderby b.BoundingBox.BottomLeft.X, b.BoundingBox.BottomLeft.Y descending
                              select b).ToList();

                int position = 0;
                foreach (var textBlock in sorted)
                {
                    Assert.Equal(expected[position], textBlock.Text);
                    position++;
                }
            }
        }