// Segments page 1 of the named document with Docstrum (lines joined by a single space) and checks
// that the resulting blocks, sorted by ascending bottom-left X and then descending bottom-left Y,
// carry exactly the expected texts.
public void GetBlocks(string name, string[] expected)
        {
            const string rotatedDocument = "90 180 270 rotated.pdf";

            if (name == rotatedDocument)
            {
                // This document relies on the 'TimesNewRomanPSMT' font. Where the font is not installed
                // (e.g. OSX, Linux, etc.) the output is not reliable, so the case is skipped.
                var timesNewRoman = SystemFontFinder.Instance.GetTrueTypeFont("TimesNewRomanPSMT");
                bool fontMissing = timesNewRoman == null;
                Skip.If(fontMissing, "Skipped because the font TimesNewRomanPSMT could not be found in the execution environment.");
            }

            var segmenterOptions = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions { LineSeparator = " " };

            using (var pdf = PdfDocument.Open(DlaHelper.GetDocumentPath(name)))
            {
                var firstPage      = pdf.GetPage(1);
                var extractedWords = NearestNeighbourWordExtractor.Instance.GetWords(firstPage.Letters);
                var segmented      = DocstrumBoundingBoxes.Instance.GetBlocks(extractedWords, segmenterOptions);

                // The block count must match before the texts are compared one by one.
                Assert.Equal(expected.Length, segmented.Count);

                var sorted = (from item in segmented
                              orderby item.BoundingBox.BottomLeft.X, item.BoundingBox.BottomLeft.Y descending
                              select item).ToList();

                var position = 0;
                foreach (var block in sorted)
                {
                    Assert.Equal(expected[position], block.Text);
                    position++;
                }
            }
        }
        // Example #2
        // 0
        // Runs Docstrum segmentation (lines joined by a single space) over the words of page 1 of the
        // named document and asserts that the blocks, ordered by ascending bottom-left X and then by
        // descending bottom-left Y, have the expected texts.
        public void GetBlocks(string name, string[] expected)
        {
            var docstrumOptions = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions();
            docstrumOptions.LineSeparator = " ";

            var path = DlaHelper.GetDocumentPath(name);

            using (var pdf = PdfDocument.Open(path))
            {
                var letters = pdf.GetPage(1).Letters;
                var words   = NearestNeighbourWordExtractor.Instance.GetWords(letters);
                var found   = DocstrumBoundingBoxes.Instance.GetBlocks(words, docstrumOptions);

                // Compare the number of blocks first, then each block's text.
                Assert.Equal(expected.Length, found.Count);

                var byPosition = found
                    .OrderBy(block => block.BoundingBox.BottomLeft.X)
                    .ThenByDescending(block => block.BoundingBox.BottomLeft.Y)
                    .ToList();

                var index = 0;
                while (index < byPosition.Count)
                {
                    Assert.Equal(expected[index], byPosition[index].Text);
                    index++;
                }
            }
        }
        // Reproduces the Docstrum pipeline step by step through the static helpers of
        // DocstrumBoundingBoxes (spacing estimation, line building, structural block building), using the
        // values of a DocstrumBoundingBoxesOptions instance whose LineSeparator is a single space. The
        // blocks found on page 1 are then checked against the expected texts, ordered by ascending
        // bottom-left X and then descending bottom-left Y.
        public void GetBlocksStatic(string name, string[] expected)
        {
            const string rotatedDocument = "90 180 270 rotated.pdf";

            if (name == rotatedDocument)
            {
                // This document relies on the 'TimesNewRomanPSMT' font. Where the font is not installed
                // (e.g. OSX, Linux, etc.) the output is not reliable, so the case is skipped.
                var timesNewRoman = SystemFontFinder.Instance.GetTrueTypeFont("TimesNewRomanPSMT");
                Skip.If(timesNewRoman == null, "Skipped because the font TimesNewRomanPSMT could not be found in the execution environment.");
            }

            var settings = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions { LineSeparator = " " };

            using (var pdf = PdfDocument.Open(DlaHelper.GetDocumentPath(name)))
            {
                var firstPage = pdf.GetPage(1);

                // Extract the words and drop the ones made only of white space.
                var words = NearestNeighbourWordExtractor.Instance.GetWords(firstPage.Letters)
                                                         .Where(word => !string.IsNullOrWhiteSpace(word.Text))
                                                         .ToList();

                // Step 1: estimate the within-line and between-line spacing.
                bool estimated = DocstrumBoundingBoxes.GetSpacingEstimation(
                    words,
                    settings.WithinLineBounds,
                    settings.WithinLineBinSize,
                    settings.BetweenLineBounds,
                    settings.BetweenLineBinSize,
                    settings.MaxDegreeOfParallelism,
                    out double withinLineSpacing,
                    out double betweenLineSpacing);

                if (!estimated)
                {
                    // When the estimation reports failure, any NaN distance is replaced by zero.
                    withinLineSpacing  = double.IsNaN(withinLineSpacing) ? 0 : withinLineSpacing;
                    betweenLineSpacing = double.IsNaN(betweenLineSpacing) ? 0 : betweenLineSpacing;
                }

                // Step 2: build the text lines.
                double lineDistanceLimit = settings.WithinLineMultiplier * withinLineSpacing;
                var textLines = DocstrumBoundingBoxes.GetLines(
                    words,
                    lineDistanceLimit,
                    settings.WithinLineBounds,
                    settings.WordSeparator,
                    settings.MaxDegreeOfParallelism).ToArray();

                // Step 3: build the structural blocks.
                double blockDistanceLimit = settings.BetweenLineMultiplier * betweenLineSpacing;
                var found = DocstrumBoundingBoxes.GetStructuralBlocks(
                    textLines,
                    blockDistanceLimit,
                    settings.AngularDifferenceBounds,
                    settings.Epsilon,
                    settings.LineSeparator,
                    settings.MaxDegreeOfParallelism).ToList();

                // The block count must match before the texts are compared one by one.
                Assert.Equal(expected.Length, found.Count);

                var sorted = (from item in found
                              orderby item.BoundingBox.BottomLeft.X, item.BoundingBox.BottomLeft.Y descending
                              select item).ToList();

                var position = 0;
                foreach (var block in sorted)
                {
                    Assert.Equal(expected[position], block.Text);
                    position++;
                }
            }
        }
        // Writes the text of the PDF at filePath to the console. For each page: letters are grouped into
        // words with a custom nearest-neighbour filter, the words are segmented into blocks with Docstrum,
        // the blocks are put into reading order, and each block's text is emitted on its own line after
        // Unicode normalisation (form KC). An empty line follows every page.
        public static void Run(string filePath)
        {
            var output = new StringBuilder();

            using (var pdf = PdfDocument.Open(filePath))
            {
                foreach (var page in pdf.GetPages())
                {
                    // Step 1: word extraction. The page letters are used as they are (no preprocessing).
                    var extractionOptions = new NearestNeighbourWordExtractor.NearestNeighbourWordExtractorOptions
                    {
                        // Decides whether 'candidate' may be attached to the same word as 'pivot'.
                        Filter = (pivot, candidate) =>
                        {
                            // A null or white-space candidate never joins the pivot's word.
                            if (string.IsNullOrWhiteSpace(candidate.Value))
                            {
                                return false;
                            }

                            // Reject the pair when one letter is more than twice as large as the other
                            // (the ratio is only evaluated when the smaller size is non-zero).
                            var larger       = Math.Max(pivot.PointSize, candidate.PointSize);
                            var smaller      = Math.Min(pivot.PointSize, candidate.PointSize);
                            var sizeMismatch = smaller != 0 && larger / smaller > 2.0;
                            if (sizeMismatch)
                            {
                                return false;
                            }

                            // Finally, both letters must have the same RGB colour values.
                            return pivot.Color.ToRGBValues().Equals(candidate.Color.ToRGBValues());
                        }
                    };

                    var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters, extractionOptions);

                    // Step 2: page segmentation with default Docstrum options.
                    var segmentationOptions = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions();
                    var textBlocks          = DocstrumBoundingBoxes.Instance.GetBlocks(words, segmentationOptions);

                    // Steps 3 and 4: put the blocks into reading order and collect their normalised text.
                    foreach (var block in UnsupervisedReadingOrderDetector.Instance.Get(textBlocks))
                    {
                        output.AppendLine(block.Text.Normalize(NormalizationForm.FormKC));
                    }

                    // Empty line between pages.
                    output.AppendLine();
                }
            }

            Console.WriteLine(output.ToString());
        }