/// <summary>
/// Get the blocks: the page's paths are cleaned, wrapped in a single root leaf and
/// cut starting with a vertical cut; the bounding box of every resulting leaf is returned.
/// </summary>
/// <param name="pagePaths">The paths in the page. Null entries and paths without any command are ignored.</param>
/// <param name="minimumWidth">The minimum width for a block.</param>
/// <param name="dominantFontWidth">The dominant font width, used as the horizontal gap threshold when cutting.</param>
/// <param name="dominantFontHeight">The dominant font height, used as the vertical gap threshold when cutting.</param>
/// <returns>The bounding boxes of the leafs found; an empty list when there are none.</returns>
public IReadOnlyList<PdfRectangle> GetBlocks(IEnumerable<PdfPath> pagePaths, decimal minimumWidth, decimal dominantFontWidth, decimal dominantFontHeight)
{
    if (pagePaths == null)
    {
        // Same exception type the LINQ call below would raise, but naming the right parameter.
        throw new System.ArgumentNullException(nameof(pagePaths));
    }

    // Clean the paths. The result is materialised once so the filter is not re-evaluated
    // if the sequence is enumerated more than once downstream.
    pagePaths = pagePaths.Where(p => p != null && p.Commands.Count > 0).ToList();

    XYLeafP root = new XYLeafP(pagePaths); // Create a root node.
    XYNodeP node = VerticalCut(root, minimumWidth, dominantFontWidth, dominantFontHeight);

    // Projecting an empty leaf list already yields an empty list, so no special case is needed.
    var leafs = node.GetLeafs();
    return leafs.Select(l => l.BoundingBox).ToList();
}
/// <summary>
/// Cut the leaf along the horizontal axis: its paths are sorted left to right and merged into
/// [left, right] projections; a new projection is started when the gap to the next path is
/// larger than <paramref name="dominantFontWidth"/> and the current projection is at least
/// <paramref name="minimumWidth"/> wide. Each projection is then passed to <c>HorizontalCut</c>.
/// </summary>
/// <param name="leaf">The leaf to cut.</param>
/// <param name="minimumWidth">The minimum width for a block; narrower leafs are returned as is.</param>
/// <param name="dominantFontWidth">The horizontal gap threshold; gaps up to this size are not cut.</param>
/// <param name="dominantFontHeight">The vertical gap threshold, passed on to <c>HorizontalCut</c>.</param>
/// <param name="level">The recursion marker, passed on unchanged to <c>HorizontalCut</c>.</param>
/// <returns>The leaf itself when no cut is attempted, otherwise a node holding the results of the cut.</returns>
private XYNodeP VerticalCut(XYLeafP leaf, decimal minimumWidth, decimal dominantFontWidth, decimal dominantFontHeight, int level = 0)
{
    if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth)
    {
        // we stop cutting if
        // - only one word remains
        // - width is too small
        return leaf;
    }

    // order paths left to right
    var paths = leaf.Paths.OrderBy(w => w.BoundingBox().Left).ToArray();
    int pathsCount = paths.Length;

    List<decimal[]> projectionProfile = new List<decimal[]>();

    var firstBox = paths[0].BoundingBox();
    decimal[] currentProj = new decimal[2] { firstBox.Left, firstBox.Right };

    for (int i = 1; i < pathsCount; i++)
    {
        // Compute the bounding box once per path instead of once per comparison.
        var box = paths[i].BoundingBox();
        decimal left = box.Left;
        decimal right = box.Right;

        bool leftInside = left >= currentProj[0] && left <= currentProj[1];
        bool rightInside = right >= currentProj[0] && right <= currentProj[1];

        if (leftInside || rightInside)
        {
            // it is overlapping
            if (leftInside && right > currentProj[1])
            {
                // |____|
                //    |____|
                // |_______|    <- updated
                currentProj[1] = right;
            }

            // we ignore the following cases:
            //    |____|
            // |____|          (not possible because of OrderBy)
            //
            //    |____|
            // |___________|   (not possible because of OrderBy)
            //
            // |____|
            //   |_|
        }
        else if (left - currentProj[1] <= dominantFontWidth)
        {
            // no overlap, but gap too small -> don't cut
            // |____| |____|
            currentProj[1] = right;
        }
        else if (currentProj[1] - currentProj[0] < minimumWidth)
        {
            // no overlap, but the current projection is still too small -> don't cut
            currentProj[1] = right;
        }
        else if (i != pathsCount - 1)
        {
            // gap big enough -> cut!
            // |____|   |   |____|
            // Not done for the last path: the current projection is always added below,
            // and a last path left outside of it is picked up by the 'lost' handling.
            projectionProfile.Add(currentProj);
            currentProj = new decimal[2] { left, right };
        }

        if (i == pathsCount - 1)
        {
            projectionProfile.Add(currentProj);
        }
    }

    // Paths fully contained in each projection, in the leaf's original order.
    // Materialised once: these groups are read several times below.
    var pathsByProjection = projectionProfile
        .Select(p => leaf.Paths.Where(w =>
        {
            var b = w.BoundingBox();
            return b.Left >= p[0] && b.Right <= p[1];
        }).ToList())
        .ToList();

    var newNodes = pathsByProjection
        .Where(g => g.Count > 0)
        .Select(g => HorizontalCut(new XYLeafP(g), minimumWidth, dominantFontWidth, dominantFontHeight, level))
        .ToList();

    // Paths that ended up in no projection each become their own leaf, so nothing is dropped.
    var lost = leaf.Paths.Except(pathsByProjection.SelectMany(g => g)).ToList();
    newNodes.AddRange(lost.Select(w => new XYLeafP(w)));

    return new XYNodeP(newNodes);
}
/// <summary>
/// Cut the leaf along the vertical axis: its paths are sorted bottom to top and merged into
/// [bottom, top] projections; a new projection is started when the gap to the next path is
/// larger than <paramref name="dominantFontHeight"/>. Each projection is then passed to
/// <c>VerticalCut</c>.
/// </summary>
/// <param name="leaf">The leaf to cut.</param>
/// <param name="minimumWidth">The minimum width for a block, passed on to <c>VerticalCut</c>.</param>
/// <param name="dominantFontWidth">The horizontal gap threshold, passed on to <c>VerticalCut</c>.</param>
/// <param name="dominantFontHeight">The vertical gap threshold; gaps up to this size are not cut.</param>
/// <param name="level">
/// The recursion marker: it is incremented the first time this cut yields a single projection;
/// when a single projection is found with a level of 1 or more, the leaf is returned as is.
/// </param>
/// <returns>The leaf itself when cutting stops, otherwise a node holding the results of the cut.</returns>
private XYNodeP HorizontalCut(XYLeafP leaf, decimal minimumWidth, decimal dominantFontWidth, decimal dominantFontHeight, int level = 0)
{
    if (leaf.CountWords() <= 1)
    {
        // we stop cutting if
        // - only one word remains
        return leaf;
    }

    // order paths bottom to top
    var paths = leaf.Paths.OrderBy(w => w.BoundingBox().Bottom).ToArray();
    int pathsCount = paths.Length;

    List<decimal[]> projectionProfile = new List<decimal[]>();

    var firstBox = paths[0].BoundingBox();
    decimal[] currentProj = new decimal[2] { firstBox.Bottom, firstBox.Top };

    for (int i = 1; i < pathsCount; i++)
    {
        // Compute the bounding box once per path instead of once per comparison.
        var box = paths[i].BoundingBox();
        decimal bottom = box.Bottom;
        decimal top = box.Top;

        bool bottomInside = bottom >= currentProj[0] && bottom <= currentProj[1];
        bool topInside = top >= currentProj[0] && top <= currentProj[1];

        if (bottomInside || topInside)
        {
            // it is overlapping: extend the projection only when the path reaches higher
            if (bottomInside && top > currentProj[1])
            {
                currentProj[1] = top;
            }
        }
        else if (bottom - currentProj[1] <= dominantFontHeight)
        {
            // no overlap, but gap too small -> don't cut
            currentProj[1] = top;
        }
        else if (i != pathsCount - 1)
        {
            // gap big enough -> cut!
            // Not done for the last path: the current projection is always added below,
            // and a last path left outside of it is picked up by the 'lost' handling.
            projectionProfile.Add(currentProj);
            currentProj = new decimal[2] { bottom, top };
        }

        if (i == pathsCount - 1)
        {
            projectionProfile.Add(currentProj);
        }
    }

    if (projectionProfile.Count == 1)
    {
        if (level >= 1)
        {
            // A single projection was already seen once: stop here.
            return leaf;
        }

        level++;
    }

    // Paths fully contained in each projection, in the leaf's original order.
    // Materialised once: these groups are read several times below.
    var pathsByProjection = projectionProfile
        .Select(p => leaf.Paths.Where(w =>
        {
            var b = w.BoundingBox();
            return b.Bottom >= p[0] && b.Top <= p[1];
        }).ToList())
        .ToList();

    var newNodes = pathsByProjection
        .Where(g => g.Count > 0)
        .Select(g => VerticalCut(new XYLeafP(g), minimumWidth, dominantFontWidth, dominantFontHeight, level))
        .ToList();

    // Paths that ended up in no projection each become their own leaf, so nothing is dropped.
    var lost = leaf.Paths.Except(pathsByProjection.SelectMany(g => g)).ToList();
    newNodes.AddRange(lost.Select(w => new XYLeafP(w)));

    return new XYNodeP(newNodes);
}