Example #1
0
        /// <summary>
        /// Get the blocks.
        /// </summary>
        /// <param name="words">The words in the page. Words whose text is null or white space are ignored.</param>
        /// <param name="minimumWidth">The minimum width for a block.</param>
        /// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
        /// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
        /// <param name="wordSeparator">The separator passed to <c>GetLines</c> when the lines of a block are built.</param>
        /// <param name="lineSeparator">The separator passed to the <c>TextBlock</c> constructor together with its lines.</param>
        /// <returns>
        /// One <c>TextBlock</c> per leaf of the tree returned by <c>VerticalCut</c>;
        /// an empty list when no non-blank word is supplied or when the tree has no leaves.
        /// </returns>
        private IReadOnlyList <TextBlock> GetBlocks(IEnumerable <Word> words, double minimumWidth,
                                                    Func <IEnumerable <Letter>, double> dominantFontWidthFunc,
                                                    Func <IEnumerable <Letter>, double> dominantFontHeightFunc,
                                                    string wordSeparator, string lineSeparator)
        {
            // Filter out white spaces. The result is materialised once so that the
            // source sequence is not enumerated again by the emptiness check and by XYLeaf.
            var nonBlankWords = words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).ToList();
            if (nonBlankWords.Count == 0)
            {
                return(EmptyArray <TextBlock> .Instance);
            }

            XYLeaf root = new XYLeaf(nonBlankWords); // Create a root node.
            XYNode node = VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);

            if (node.IsLeaf)
            {
                // No cut was made: the whole page is a single block.
                // Direct cast so that an unexpected node type fails with a clear
                // InvalidCastException rather than a NullReferenceException.
                var singleLeaf = (XYLeaf)node;
                return(new List <TextBlock> {
                    new TextBlock(singleLeaf.GetLines(wordSeparator), lineSeparator)
                });
            }

            var leaves = node.GetLeaves();
            if (leaves.Count > 0)
            {
                return(leaves.ConvertAll(l => new TextBlock(l.GetLines(wordSeparator), lineSeparator)));
            }

            return(new List <TextBlock>());
        }
Example #2
0
        /// <summary>
        /// Get the blocks.
        /// </summary>
        /// <param name="pageWords">The words in the page.</param>
        /// <param name="minimumWidth">The minimum width for a block.</param>
        /// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
        /// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
        /// <returns>
        /// One <c>TextBlock</c> per leaf of the tree returned by <c>VerticalCut</c>;
        /// an empty list when <paramref name="pageWords"/> is empty or when the tree has no leaves.
        /// </returns>
        public IReadOnlyList <TextBlock> GetBlocks(IEnumerable <Word> pageWords, double minimumWidth,
                                                   Func <IEnumerable <Letter>, double> dominantFontWidthFunc,
                                                   Func <IEnumerable <Letter>, double> dominantFontHeightFunc)
        {
            // Materialise once: the sequence is needed both for the emptiness check and
            // to build the root leaf, and the caller may have passed a deferred query.
            var wordList = pageWords.ToList();
            if (wordList.Count == 0)
            {
                return(EmptyArray <TextBlock> .Instance);
            }

            XYLeaf root = new XYLeaf(wordList); // Create a root node.
            XYNode node = VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);

            if (node.IsLeaf)
            {
                // No cut was made: the whole page is a single block.
                // Direct cast so that an unexpected node type fails with a clear
                // InvalidCastException rather than a NullReferenceException.
                var singleLeaf = (XYLeaf)node;
                return(new List <TextBlock> {
                    new TextBlock(singleLeaf.GetLines())
                });
            }

            var leaves = node.GetLeaves();
            if (leaves.Count > 0)
            {
                return(leaves.Select(l => new TextBlock(l.GetLines())).ToList());
            }

            return(new List <TextBlock>());
        }
Example #3
0
        /// <summary>
        /// Groups the words of <paramref name="leaf"/> into projections along the Bottom/Top axis of their
        /// normalised bounding boxes and passes each non-empty group to <c>VerticalCut</c>.
        /// </summary>
        /// <param name="leaf">The leaf whose words are examined.</param>
        /// <param name="minimumWidth">Not read here; forwarded unchanged to <c>VerticalCut</c>.</param>
        /// <param name="dominantFontWidthFunc">Not invoked here; forwarded unchanged to <c>VerticalCut</c>.</param>
        /// <param name="dominantFontHeightFunc">
        /// Invoked on the letters of all words. A gap no larger than the returned value does not produce a cut.
        /// </param>
        /// <param name="level">
        /// When exactly one projection is found: the leaf is returned as is if this is 1 or more,
        /// otherwise it is incremented before being forwarded to <c>VerticalCut</c>.
        /// </param>
        /// <returns>
        /// An <c>XYNode</c> built from <c>null</c> when the leaf has no words; the leaf itself when cutting stops;
        /// otherwise an <c>XYNode</c> holding the results of <c>VerticalCut</c> plus one leaf per unassigned word.
        /// </returns>
        private XYNode HorizontalCut(XYLeaf leaf, double minimumWidth,
                                     Func <IEnumerable <Letter>, double> dominantFontWidthFunc,
                                     Func <IEnumerable <Letter>, double> dominantFontHeightFunc, int level = 0)
        {
            // Order words bottom to top
            var words = leaf.Words.OrderBy(w => w.BoundingBox.Normalise().Bottom).ToArray();

            if (words.Length == 0)
            {
                return(new XYNode(null));
            }

            // Rebuild the leaf from the ordered words (no filtering is done in this method).
            leaf = new XYLeaf(words);

            if (leaf.CountWords() <= 1)
            {
                // We stop cutting if
                // - only one word remains
                return(leaf);
            }

            // Determine dominant font height
            double dominantFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters));

            List <Projection> projectionProfile = new List <Projection>();

            var        firstWordBound    = words[0].BoundingBox.Normalise();
            Projection currentProjection = new Projection(firstWordBound.Bottom, firstWordBound.Top);
            int        wordsCount        = words.Length;

            for (int i = 1; i < wordsCount; i++)
            {
                var currentWordBound = words[i].BoundingBox.Normalise();

                if (currentProjection.Contains(currentWordBound.Bottom) || currentProjection.Contains(currentWordBound.Top))
                {
                    // It is overlapping: only grow the projection when the word starts
                    // inside it and ends above it.
                    if (currentWordBound.Bottom >= currentProjection.LowerBound &&
                        currentWordBound.Bottom <= currentProjection.UpperBound &&
                        currentWordBound.Top > currentProjection.UpperBound)
                    {
                        currentProjection.UpperBound = currentWordBound.Top;
                    }
                }
                else
                {
                    // No overlap
                    if (currentWordBound.Bottom - currentProjection.UpperBound <= dominantFontHeight)
                    {
                        // If gap too small -> don't cut
                        // |____| |____|
                        currentProjection.UpperBound = currentWordBound.Top;
                    }
                    else
                    {
                        // If gap big enough -> cut!
                        // |____|   |   |____|
                        // NOTE(review): when this happens on the last word, no projection is opened
                        // for it; it is expected to be picked up by the 'lost' pass below.
                        if (i != wordsCount - 1) // Will always add the last one after
                        {
                            projectionProfile.Add(currentProjection);
                            currentProjection = new Projection(currentWordBound.Bottom, currentWordBound.Top);
                        }
                    }
                }

                if (i == wordsCount - 1)
                {
                    projectionProfile.Add(currentProjection);
                }
            }

            if (projectionProfile.Count == 1)
            {
                if (level >= 1)
                {
                    return(leaf);
                }
                else
                {
                    level++;
                }
            }

            // Get the words that are contained in each projection. The groups are materialised
            // once so the filter is not re-run for the emptiness check, the leaf construction
            // and the 'lost' computation.
            var wordGroups = projectionProfile.Select(p => leaf.Words.Where(w =>
            {
                var normalisedBB = w.BoundingBox.Normalise();
                return(normalisedBB.Bottom >= p.LowerBound && normalisedBB.Top <= p.UpperBound);
            }).ToList()).ToList();

            var newNodes = wordGroups.Where(g => g.Count > 0)
                           .Select(g => VerticalCut(new XYLeaf(g), minimumWidth,
                                                    dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList();

            // Non-blank words that did not fit entirely inside any projection get a leaf of their own.
            var lost = leaf.Words.Except(wordGroups.SelectMany(g => g)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();

            if (lost.Count > 0)
            {
                newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
            }
            return(new XYNode(newNodes));
        }
Example #4
0
        /// <summary>
        /// Groups the words of <paramref name="leaf"/> into projections along the Left/Right axis of their
        /// normalised bounding boxes and passes each non-empty group to <c>HorizontalCut</c>.
        /// </summary>
        /// <param name="leaf">The leaf whose words are examined.</param>
        /// <param name="minimumWidth">
        /// Cutting stops when the leaf's bounding box is no wider than this value, and a projection
        /// narrower than this value is extended instead of being cut.
        /// </param>
        /// <param name="dominantFontWidthFunc">
        /// Invoked on the letters of all words. A gap no larger than the returned value does not produce a cut.
        /// </param>
        /// <param name="dominantFontHeightFunc">Not invoked here; forwarded unchanged to <c>HorizontalCut</c>.</param>
        /// <param name="level">Not read here; forwarded unchanged to <c>HorizontalCut</c>.</param>
        /// <returns>
        /// An <c>XYNode</c> built from <c>null</c> when the leaf has no words; the leaf itself when cutting stops;
        /// otherwise an <c>XYNode</c> holding the results of <c>HorizontalCut</c> plus one leaf per unassigned word.
        /// </returns>
        private XYNode VerticalCut(XYLeaf leaf, double minimumWidth,
                                   Func <IEnumerable <Letter>, double> dominantFontWidthFunc,
                                   Func <IEnumerable <Letter>, double> dominantFontHeightFunc, int level = 0)
        {
            // Order words left to right
            var words = leaf.Words.OrderBy(w => w.BoundingBox.Normalise().Left).ToArray();

            if (words.Length == 0)
            {
                return(new XYNode(null));
            }

            // Rebuild the leaf from the ordered words (no filtering is done in this method).
            leaf = new XYLeaf(words);

            if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth)
            {
                // We stop cutting if
                // - only one word remains
                // - width is too small
                return(leaf);
            }

            // Determine dominant font width
            double dominantFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters));

            List <Projection> projectionProfile = new List <Projection>();

            var        firstWordBound    = words[0].BoundingBox.Normalise();
            Projection currentProjection = new Projection(firstWordBound.Left, firstWordBound.Right);
            int        wordsCount        = words.Length;

            for (int i = 1; i < wordsCount; i++)
            {
                var currentWordBound = words[i].BoundingBox.Normalise();

                if (currentProjection.Contains(currentWordBound.Left) || currentProjection.Contains(currentWordBound.Right))
                {
                    // It is overlapping
                    if (currentWordBound.Left >= currentProjection.LowerBound &&
                        currentWordBound.Left <= currentProjection.UpperBound &&
                        currentWordBound.Right > currentProjection.UpperBound)
                    {
                        // |____|
                        //    |____|
                        // |_______|    <- updated
                        currentProjection.UpperBound = currentWordBound.Right;
                    }

                    // We ignore the following cases:
                    //    |____|
                    // |____|          (not possible because of OrderBy)
                    //
                    //    |____|
                    //|___________|    (not possible because of OrderBy)
                    //
                    //  |____|
                    //   |_|
                }
                else
                {
                    // No overlap
                    if (currentWordBound.Left - currentProjection.UpperBound <= dominantFontWidth)
                    {
                        // If gap too small -> don't cut
                        // |____| |____|
                        currentProjection.UpperBound = currentWordBound.Right;
                    }
                    else if (currentProjection.UpperBound - currentProjection.LowerBound < minimumWidth)
                    {
                        // Still too small
                        currentProjection.UpperBound = currentWordBound.Right;
                    }
                    else
                    {
                        // If gap big enough -> cut!
                        // |____|   |   |____|
                        // NOTE(review): when this happens on the last word, no projection is opened
                        // for it; it is expected to be picked up by the 'lost' pass below.
                        if (i != wordsCount - 1) // Will always add the last one after
                        {
                            projectionProfile.Add(currentProjection);
                            currentProjection = new Projection(currentWordBound.Left, currentWordBound.Right);
                        }
                    }
                }

                if (i == wordsCount - 1)
                {
                    projectionProfile.Add(currentProjection);
                }
            }

            // Get the words that are contained in each projection. The groups are materialised
            // once so the filter is not re-run for the emptiness check, the leaf construction
            // and the 'lost' computation.
            var wordGroups = projectionProfile.Select(p => leaf.Words.Where(w =>
            {
                var normalisedBB = w.BoundingBox.Normalise();
                return(normalisedBB.Left >= p.LowerBound && normalisedBB.Right <= p.UpperBound);
            }).ToList()).ToList();

            var newNodes = wordGroups.Where(g => g.Count > 0)
                           .Select(g => HorizontalCut(new XYLeaf(g), minimumWidth,
                                                      dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList();

            // Non-blank words that did not fit entirely inside any projection get a leaf of their own.
            var lost = leaf.Words.Except(wordGroups.SelectMany(g => g)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();

            if (lost.Count > 0)
            {
                newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
            }

            return(new XYNode(newNodes));
        }
Example #5
0
        /// <summary>
        /// Groups the non-blank words of <paramref name="leaf"/> into projections along the Left/Right axis
        /// of their bounding boxes and passes each non-empty group to <c>HorizontalCut</c>.
        /// </summary>
        /// <param name="leaf">The leaf whose words are examined.</param>
        /// <param name="minimumWidth">
        /// Cutting stops when the leaf's bounding box is no wider than this value, and a projection
        /// narrower than this value is extended instead of being cut.
        /// </param>
        /// <param name="dominantFontWidthFunc">
        /// Invoked on the absolute glyph widths of all letters. A gap no larger than the returned value
        /// does not produce a cut.
        /// </param>
        /// <param name="dominantFontHeightFunc">Not invoked here; forwarded unchanged to <c>HorizontalCut</c>.</param>
        /// <param name="level">Not read here; forwarded unchanged to <c>HorizontalCut</c>.</param>
        /// <returns>
        /// An <c>XYNode</c> built from <c>null</c> when no non-blank word exists; the leaf itself when cutting
        /// stops; otherwise an <c>XYNode</c> holding the results of <c>HorizontalCut</c> plus one leaf per
        /// unassigned word.
        /// </returns>
        private XYNode VerticalCut(XYLeaf leaf, double minimumWidth,
                                   Func <IEnumerable <double>, double> dominantFontWidthFunc,
                                   Func <IEnumerable <double>, double> dominantFontHeightFunc, int level = 0)
        {
            // order words left to right
            var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Left).ToArray();

            if (words.Length == 0)
            {
                return(new XYNode(null));
            }

            // Create new leaf with non-whitespace words.
            leaf = new XYLeaf(words);

            if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth)
            {
                // we stop cutting if
                // - only one word remains
                // - width is too small
                return(leaf);
            }

            // determine dominantFontWidth (the dominant height is not needed for a vertical cut)
            double domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters)
                                                        .Select(x => Math.Abs(x.GlyphRectangle.Width)));

            List <double[]> projectionProfile = new List <double[]>();

            // currentProj[0] is the lower (left) bound, currentProj[1] the upper (right) bound.
            double[] currentProj = new double[2] {
                words[0].BoundingBox.Left, words[0].BoundingBox.Right
            };
            int wordsCount = words.Length;

            for (int i = 1; i < wordsCount; i++)
            {
                double wordLeft  = words[i].BoundingBox.Left;
                double wordRight = words[i].BoundingBox.Right;

                bool leftInside  = wordLeft >= currentProj[0] && wordLeft <= currentProj[1];
                bool rightInside = wordRight >= currentProj[0] && wordRight <= currentProj[1];

                if (leftInside || rightInside)
                {
                    // it is overlapping
                    if (leftInside && wordRight > currentProj[1])
                    {
                        // |____|
                        //    |____|
                        // |_______|    <- updated
                        currentProj[1] = wordRight;
                    }

                    // we ignore the following cases:
                    //    |____|
                    // |____|          (not possible because of OrderBy)
                    //
                    //    |____|
                    //|___________|    (not possible because of OrderBy)
                    //
                    //  |____|
                    //   |_|
                }
                else
                {
                    // no overlap
                    if (wordLeft - currentProj[1] <= domFontWidth)
                    {
                        // if gap too small -> don't cut
                        // |____| |____|
                        currentProj[1] = wordRight;
                    }
                    else if (currentProj[1] - currentProj[0] < minimumWidth)
                    {
                        // still too small
                        currentProj[1] = wordRight;
                    }
                    else
                    {
                        // if gap big enough -> cut!
                        // |____|   |   |____|
                        // NOTE(review): when this happens on the last word, no projection is opened
                        // for it; it is expected to be picked up by the 'lost' pass below.
                        if (i != wordsCount - 1) // will always add the last one after
                        {
                            projectionProfile.Add(currentProj);
                            currentProj = new double[2] {
                                wordLeft, wordRight
                            };
                        }
                    }
                }
                if (i == wordsCount - 1)
                {
                    projectionProfile.Add(currentProj);
                }
            }

            // Words contained in each projection. The groups are materialised once so the filter
            // is not re-run for the emptiness check, the leaf construction and the 'lost' computation.
            var wordGroups = projectionProfile
                             .Select(p => leaf.Words.Where(w => w.BoundingBox.Left >= p[0] && w.BoundingBox.Right <= p[1]).ToList())
                             .ToList();

            var newNodes = wordGroups.Where(g => g.Count > 0)
                           .Select(g => HorizontalCut(new XYLeaf(g), minimumWidth,
                                                      dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList();

            // Non-blank words that did not fit entirely inside any projection get a leaf of their own.
            var lost = leaf.Words.Except(wordGroups.SelectMany(g => g)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();

            if (lost.Count > 0)
            {
                newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
            }

            return(new XYNode(newNodes));
        }
Example #6
0
        /// <summary>
        /// Groups the non-blank words of <paramref name="leaf"/> into projections along the Bottom/Top axis
        /// of their bounding boxes and passes each non-empty group to <c>VerticalCut</c>.
        /// </summary>
        /// <param name="leaf">The leaf whose words are examined.</param>
        /// <param name="minimumWidth">Not read here; forwarded unchanged to <c>VerticalCut</c>.</param>
        /// <param name="dominantFontWidthFunc">Not invoked here; forwarded unchanged to <c>VerticalCut</c>.</param>
        /// <param name="dominantFontHeightFunc">
        /// Invoked on the absolute glyph heights of all letters. A gap no larger than the returned value
        /// does not produce a cut.
        /// </param>
        /// <param name="level">
        /// When exactly one projection is found: the leaf is returned as is if this is 1 or more,
        /// otherwise it is incremented before being forwarded to <c>VerticalCut</c>.
        /// </param>
        /// <returns>
        /// An <c>XYNode</c> built from <c>null</c> when no non-blank word exists; the leaf itself when cutting
        /// stops; otherwise an <c>XYNode</c> holding the results of <c>VerticalCut</c> plus one leaf per
        /// unassigned word.
        /// </returns>
        private XYNode HorizontalCut(XYLeaf leaf, double minimumWidth,
                                     Func <IEnumerable <double>, double> dominantFontWidthFunc,
                                     Func <IEnumerable <double>, double> dominantFontHeightFunc, int level = 0)
        {
            var words = leaf.Words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).OrderBy(w => w.BoundingBox.Bottom).ToArray(); // order bottom to top

            if (words.Length == 0)
            {
                return(new XYNode(null));
            }

            // Create new leaf with non-whitespace words.
            leaf = new XYLeaf(words);

            if (leaf.CountWords() <= 1)
            {
                // we stop cutting if
                // - only one word remains
                return(leaf);
            }

            // determine dominantFontHeight (the dominant width is not needed for a horizontal cut)
            double domFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters)
                                                          .Select(x => Math.Abs(x.GlyphRectangle.Height)));

            List <double[]> projectionProfile = new List <double[]>();

            // currentProj[0] is the lower (bottom) bound, currentProj[1] the upper (top) bound.
            double[] currentProj = new double[2] {
                words[0].BoundingBox.Bottom, words[0].BoundingBox.Top
            };
            int wordsCount = words.Length;

            for (int i = 1; i < wordsCount; i++)
            {
                double wordBottom = words[i].BoundingBox.Bottom;
                double wordTop    = words[i].BoundingBox.Top;

                bool bottomInside = wordBottom >= currentProj[0] && wordBottom <= currentProj[1];
                bool topInside    = wordTop >= currentProj[0] && wordTop <= currentProj[1];

                if (bottomInside || topInside)
                {
                    // it is overlapping: only grow the projection when the word starts
                    // inside it and ends above it.
                    if (bottomInside && wordTop > currentProj[1])
                    {
                        currentProj[1] = wordTop;
                    }
                }
                else
                {
                    // no overlap
                    if (wordBottom - currentProj[1] <= domFontHeight)
                    {
                        // if gap too small -> don't cut
                        // |____| |____|
                        currentProj[1] = wordTop;
                    }
                    else
                    {
                        // if gap big enough -> cut!
                        // |____|   |   |____|
                        // NOTE(review): when this happens on the last word, no projection is opened
                        // for it; it is expected to be picked up by the 'lost' pass below.
                        if (i != wordsCount - 1) // will always add the last one after
                        {
                            projectionProfile.Add(currentProj);
                            currentProj = new double[2] {
                                wordBottom, wordTop
                            };
                        }
                    }
                }
                if (i == wordsCount - 1)
                {
                    projectionProfile.Add(currentProj);
                }
            }

            if (projectionProfile.Count == 1)
            {
                if (level >= 1)
                {
                    return(leaf);
                }
                else
                {
                    level++;
                }
            }

            // Words contained in each projection. The groups are materialised once so the filter
            // is not re-run for the emptiness check, the leaf construction and the 'lost' computation.
            var wordGroups = projectionProfile
                             .Select(p => leaf.Words.Where(w => w.BoundingBox.Bottom >= p[0] && w.BoundingBox.Top <= p[1]).ToList())
                             .ToList();

            var newNodes = wordGroups.Where(g => g.Count > 0)
                           .Select(g => VerticalCut(new XYLeaf(g), minimumWidth,
                                                    dominantFontWidthFunc, dominantFontHeightFunc, level)).ToList();

            // Non-blank words that did not fit entirely inside any projection get a leaf of their own.
            var lost = leaf.Words.Except(wordGroups.SelectMany(g => g)).Where(x => !string.IsNullOrWhiteSpace(x.Text)).ToList();

            if (lost.Count > 0)
            {
                newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
            }
            return(new XYNode(newNodes));
        }