Ejemplo n.º 1
0
        /// <summary>
        /// Gets the bounding boxes of the blocks found by recursively cutting the page's paths,
        /// starting with a vertical cut.
        /// </summary>
        /// <param name="pagePaths">The paths in the page. Null paths and paths without commands are ignored.</param>
        /// <param name="minimumWidth">The minimum width for a block.</param>
        /// <param name="dominantFontWidth">The dominant font width; horizontal gaps up to this size are not cut.</param>
        /// <param name="dominantFontHeight">The dominant font height; vertical gaps up to this size are not cut.</param>
        /// <returns>The bounding box of each leaf of the resulting tree, or an empty list when there is no usable path.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="pagePaths"/> is null.</exception>
        public IReadOnlyList <PdfRectangle> GetBlocks(IEnumerable <PdfPath> pagePaths, decimal minimumWidth,
                                                      decimal dominantFontWidth, decimal dominantFontHeight)
        {
            // Clean the paths and materialize them once, so the lazy filter is not
            // re-evaluated every time the tree enumerates its paths.
            var cleanPaths = pagePaths.Where(p => p != null && p.Commands.Count > 0).ToList();

            if (cleanPaths.Count == 0)
            {
                // Nothing to cut: avoid building a root leaf over an empty set of paths.
                return new List<PdfRectangle>();
            }

            XYLeafP root = new XYLeafP(cleanPaths); // Create a root node.
            XYNodeP node = VerticalCut(root, minimumWidth, dominantFontWidth, dominantFontHeight);

            // Select over an empty leaf list already yields an empty list, so no extra check is needed.
            return node.GetLeafs().Select(l => l.BoundingBox).ToList();
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Cuts a leaf vertically: its paths are projected on the horizontal axis, grouped into
        /// intervals separated by gaps wider than <paramref name="dominantFontWidth"/>, and each
        /// group is then passed to <c>HorizontalCut</c>.
        /// </summary>
        /// <param name="leaf">The leaf to cut.</param>
        /// <param name="minimumWidth">The minimum width for a block; narrower leaves and intervals are not cut.</param>
        /// <param name="dominantFontWidth">The largest horizontal gap that does not trigger a cut.</param>
        /// <param name="dominantFontHeight">Not used here; forwarded to <c>HorizontalCut</c>.</param>
        /// <param name="level">Not used here; forwarded unchanged to <c>HorizontalCut</c>.</param>
        /// <returns>The leaf itself when it is not cut, otherwise a node holding the results of the cut.</returns>
        private XYNodeP VerticalCut(XYLeafP leaf, decimal minimumWidth,
                                    decimal dominantFontWidth, decimal dominantFontHeight, int level = 0)
        {
            if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth)
            {
                // we stop cutting if
                // - only one path remains
                // - width is too small
                return leaf;
            }

            // Compute each bounding box exactly once (in the leaf's own order); the boxes are
            // needed for sorting, for building the profile and for assigning paths to intervals.
            var measured = leaf.Paths.Select(p => new { Path = p, Bounds = p.BoundingBox() }).ToList();

            // order paths left to right (OrderBy is stable, ties keep the leaf's order)
            var ordered = measured.OrderBy(m => m.Bounds.Left).ToArray();

            // Each entry is a [left, right] interval on the horizontal axis.
            var projectionProfile = new List<decimal[]>();
            decimal[] currentProj = { ordered[0].Bounds.Left, ordered[0].Bounds.Right };

            for (int i = 1; i < ordered.Length; i++)
            {
                var box = ordered[i].Bounds;
                bool leftInside = box.Left >= currentProj[0] && box.Left <= currentProj[1];
                bool rightInside = box.Right >= currentProj[0] && box.Right <= currentProj[1];

                if (leftInside || rightInside)
                {
                    // it is overlapping
                    if (leftInside && box.Right > currentProj[1])
                    {
                        // |____|
                        //    |____|
                        // |_______|    <- updated
                        currentProj[1] = box.Right;
                    }

                    // Nothing to do when the path lies entirely inside the current interval.
                    // A path starting left of the interval cannot occur because of the ordering.
                }
                else if (box.Left - currentProj[1] <= dominantFontWidth ||
                         currentProj[1] - currentProj[0] < minimumWidth)
                {
                    // no overlap, but either the gap is too small or the current interval
                    // is still narrower than the minimum width -> don't cut, extend instead
                    // |____| |____|
                    currentProj[1] = box.Right;
                }
                else
                {
                    // gap big enough -> cut!
                    // |____|   |   |____|
                    // The cut is made for the last path as well, so that it gets its own
                    // interval instead of being dropped from the profile.
                    projectionProfile.Add(currentProj);
                    currentProj = new decimal[2] { box.Left, box.Right };
                }
            }

            // The interval still open after the last path closes the profile.
            projectionProfile.Add(currentProj);

            // Assign the paths to the intervals once; empty groups are discarded.
            var groups = projectionProfile
                         .Select(p => measured.Where(m => m.Bounds.Left >= p[0] && m.Bounds.Right <= p[1])
                                              .Select(m => m.Path)
                                              .ToList())
                         .Where(g => g.Count > 0)
                         .ToList();

            var newNodes = groups.Select(g => HorizontalCut(new XYLeafP(g), minimumWidth,
                                                            dominantFontWidth, dominantFontHeight, level)).ToList();

            // Safety net: any path that ended up in no interval becomes a leaf of its own.
            var lost = leaf.Paths.Except(groups.SelectMany(g => g)).ToList();

            if (lost.Count > 0)
            {
                newNodes.AddRange(lost.Select(w => new XYLeafP(w)));
            }

            return new XYNodeP(newNodes);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Cuts a leaf horizontally: its paths are projected on the vertical axis, grouped into
        /// intervals separated by gaps taller than <paramref name="dominantFontHeight"/>, and each
        /// group is then passed to <c>VerticalCut</c>.
        /// </summary>
        /// <param name="leaf">The leaf to cut.</param>
        /// <param name="minimumWidth">Not used here; forwarded to <c>VerticalCut</c>.</param>
        /// <param name="dominantFontWidth">Not used here; forwarded to <c>VerticalCut</c>.</param>
        /// <param name="dominantFontHeight">The largest vertical gap that does not trigger a cut.</param>
        /// <param name="level">
        /// When no cut is found (a single interval) and this is at least 1, the leaf is returned
        /// as is; when it is lower, it is incremented before being forwarded to <c>VerticalCut</c>.
        /// </param>
        /// <returns>The leaf itself when it is not cut, otherwise a node holding the results of the cut.</returns>
        private XYNodeP HorizontalCut(XYLeafP leaf, decimal minimumWidth,
                                      decimal dominantFontWidth, decimal dominantFontHeight, int level = 0)
        {
            if (leaf.CountWords() <= 1)
            {
                // we stop cutting if
                // - only one path remains
                return leaf;
            }

            // Compute each bounding box exactly once (in the leaf's own order); the boxes are
            // needed for sorting, for building the profile and for assigning paths to intervals.
            var measured = leaf.Paths.Select(p => new { Path = p, Bounds = p.BoundingBox() }).ToList();

            // order paths bottom to top (OrderBy is stable, ties keep the leaf's order)
            var ordered = measured.OrderBy(m => m.Bounds.Bottom).ToArray();

            // Each entry is a [bottom, top] interval on the vertical axis.
            var projectionProfile = new List<decimal[]>();
            decimal[] currentProj = { ordered[0].Bounds.Bottom, ordered[0].Bounds.Top };

            for (int i = 1; i < ordered.Length; i++)
            {
                var box = ordered[i].Bounds;
                bool bottomInside = box.Bottom >= currentProj[0] && box.Bottom <= currentProj[1];
                bool topInside = box.Top >= currentProj[0] && box.Top <= currentProj[1];

                if (bottomInside || topInside)
                {
                    // it is overlapping: extend the interval if the path sticks out at the top
                    if (bottomInside && box.Top > currentProj[1])
                    {
                        currentProj[1] = box.Top;
                    }
                }
                else if (box.Bottom - currentProj[1] <= dominantFontHeight)
                {
                    // no overlap, but gap too small -> don't cut, extend instead
                    currentProj[1] = box.Top;
                }
                else
                {
                    // gap big enough -> cut!
                    // The cut is made for the last path as well. Skipping it left a single
                    // interval in the profile, so a topmost path isolated by a large gap was
                    // reported as "no cut possible" and kept in the same block at level >= 1.
                    projectionProfile.Add(currentProj);
                    currentProj = new decimal[2] { box.Bottom, box.Top };
                }
            }

            // The interval still open after the last path closes the profile.
            projectionProfile.Add(currentProj);

            if (projectionProfile.Count == 1)
            {
                // No horizontal cut was possible.
                if (level >= 1)
                {
                    return leaf;
                }

                level++;
            }

            // Assign the paths to the intervals once; empty groups are discarded.
            var groups = projectionProfile
                         .Select(p => measured.Where(m => m.Bounds.Bottom >= p[0] && m.Bounds.Top <= p[1])
                                              .Select(m => m.Path)
                                              .ToList())
                         .Where(g => g.Count > 0)
                         .ToList();

            var newNodes = groups.Select(g => VerticalCut(new XYLeafP(g), minimumWidth,
                                                          dominantFontWidth, dominantFontHeight, level)).ToList();

            // Safety net: any path that ended up in no interval becomes a leaf of its own.
            var lost = leaf.Paths.Except(groups.SelectMany(g => g)).ToList();

            if (lost.Count > 0)
            {
                newNodes.AddRange(lost.Select(w => new XYLeafP(w)));
            }

            return new XYNodeP(newNodes);
        }