/// <summary>
/// Get the blocks.
/// </summary>
/// <param name="words">The words in the page.</param>
/// <param name="minimumWidth">The minimum width for a block.</param>
/// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
/// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
/// <param name="wordSeparator">The separator passed to <c>GetLines</c> when the lines of each leaf are built.</param>
/// <param name="lineSeparator">The separator passed to the <c>TextBlock</c> constructor of each block.</param>
/// <returns>
/// A single block when the cut produces a leaf, otherwise one block per leaf of the resulting tree.
/// Empty when <paramref name="words"/> contains no non-blank word.
/// </returns>
private IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, double minimumWidth,
    Func<IEnumerable<Letter>, double> dominantFontWidthFunc,
    Func<IEnumerable<Letter>, double> dominantFontHeightFunc,
    string wordSeparator, string lineSeparator)
{
    // Filter out white spaces. The result is materialised once so that the emptiness
    // check and the leaf construction below do not each re-run the filter over the source.
    words = words.Where(w => !string.IsNullOrWhiteSpace(w.Text)).ToList();

    if (!words.Any())
    {
        return EmptyArray<TextBlock>.Instance;
    }

    XYLeaf root = new XYLeaf(words); // Create a root node.
    XYNode node = VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);

    if (node.IsLeaf)
    {
        // Direct cast: a node reporting IsLeaf that is not an XYLeaf fails with a clear
        // InvalidCastException instead of a NullReferenceException.
        return new List<TextBlock>
        {
            new TextBlock(((XYLeaf)node).GetLines(wordSeparator), lineSeparator)
        };
    }

    var leaves = node.GetLeaves();
    if (leaves.Count > 0)
    {
        return leaves.ConvertAll(l => new TextBlock(l.GetLines(wordSeparator), lineSeparator));
    }

    return new List<TextBlock>();
}
/// <summary>
/// Get the blocks.
/// </summary>
/// <param name="pageWords">The words in the page.</param>
/// <param name="minimumWidth">The minimum width for a block.</param>
/// <param name="dominantFontWidthFunc">The function that determines the dominant font width.</param>
/// <param name="dominantFontHeightFunc">The function that determines the dominant font height.</param>
/// <returns>
/// A single block when the cut produces a leaf, otherwise one block per leaf of the resulting tree.
/// Empty when <paramref name="pageWords"/> is empty.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="pageWords"/> is <c>null</c>.</exception>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, double minimumWidth,
    Func<IEnumerable<Letter>, double> dominantFontWidthFunc,
    Func<IEnumerable<Letter>, double> dominantFontHeightFunc)
{
    if (pageWords == null)
    {
        throw new ArgumentNullException(nameof(pageWords));
    }

    // Materialise once: the emptiness check and the leaf construction would otherwise
    // each enumerate the caller's sequence (and Count() walks all of it just to test for zero).
    IEnumerable<Word> words = pageWords.ToList();

    if (!words.Any())
    {
        return EmptyArray<TextBlock>.Instance;
    }

    XYLeaf root = new XYLeaf(words); // Create a root node.
    XYNode node = VerticalCut(root, minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc);

    if (node.IsLeaf)
    {
        // Direct cast: a node reporting IsLeaf that is not an XYLeaf fails with a clear
        // InvalidCastException instead of a NullReferenceException.
        return new List<TextBlock> { new TextBlock(((XYLeaf)node).GetLines()) };
    }

    var leaves = node.GetLeaves();
    if (leaves.Count > 0)
    {
        return leaves.Select(l => new TextBlock(l.GetLines())).ToList();
    }

    return new List<TextBlock>();
}
/// <summary>
/// Splits a leaf into horizontal bands: words are projected on the vertical axis and the leaf
/// is cut wherever the gap between two consecutive projections exceeds the dominant font height.
/// Each resulting band is then handed to <c>VerticalCut</c>.
/// </summary>
/// <param name="leaf">The leaf to cut.</param>
/// <param name="minimumWidth">Not used by this pass itself; forwarded to <c>VerticalCut</c>.</param>
/// <param name="dominantFontWidthFunc">Not invoked by this pass; forwarded to <c>VerticalCut</c>.</param>
/// <param name="dominantFontHeightFunc">Computes the dominant font height from the letters of the leaf; used as the gap threshold.</param>
/// <param name="level">
/// Recursion marker. When only one projection is found and <paramref name="level"/> is at least 1 the leaf
/// is returned uncut; otherwise it is incremented before recursing.
/// </param>
/// <returns>The leaf itself when no cut is made, otherwise a node holding the results of the sub-cuts.</returns>
private XYNode HorizontalCut(XYLeaf leaf, double minimumWidth,
    Func<IEnumerable<Letter>, double> dominantFontWidthFunc,
    Func<IEnumerable<Letter>, double> dominantFontHeightFunc, int level = 0)
{
    // Order words bottom to top
    var words = leaf.Words.OrderBy(w => w.BoundingBox.Normalise().Bottom).ToArray();

    if (words.Length == 0)
    {
        return new XYNode(null);
    }

    // Rebuild the leaf from the ordered words (no filtering happens here).
    leaf = new XYLeaf(words);

    if (leaf.CountWords() <= 1)
    {
        // We stop cutting if
        // - only one word remains
        return leaf;
    }

    // Determine dominant font height
    double dominantFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters));

    List<Projection> projectionProfile = new List<Projection>();

    var firstWordBound = words[0].BoundingBox.Normalise();
    Projection currentProjection = new Projection(firstWordBound.Bottom, firstWordBound.Top);
    int wordsCount = words.Length;

    for (int i = 1; i < wordsCount; i++)
    {
        var currentWordBound = words[i].BoundingBox.Normalise();

        if (currentProjection.Contains(currentWordBound.Bottom) || currentProjection.Contains(currentWordBound.Top))
        {
            // It is overlapping
            if (currentWordBound.Bottom >= currentProjection.LowerBound
                && currentWordBound.Bottom <= currentProjection.UpperBound
                && currentWordBound.Top > currentProjection.UpperBound)
            {
                currentProjection.UpperBound = currentWordBound.Top;
            }
        }
        else
        {
            // No overlap
            if (currentWordBound.Bottom - currentProjection.UpperBound <= dominantFontHeight)
            {
                // If gap too small -> don't cut
                // |____| |____|
                currentProjection.UpperBound = currentWordBound.Top;
            }
            else
            {
                // If gap big enough -> cut!
                // |____|   |   |____|
                // The last word never opens a new projection: if it lies beyond the current
                // one it is recovered by the 'lost' pass below and becomes its own leaf.
                if (i != wordsCount - 1) // Will always add the last one after
                {
                    projectionProfile.Add(currentProjection);
                    currentProjection = new Projection(currentWordBound.Bottom, currentWordBound.Top);
                }
            }
        }

        if (i == wordsCount - 1)
        {
            projectionProfile.Add(currentProjection);
        }
    }

    if (projectionProfile.Count == 1)
    {
        if (level >= 1)
        {
            return leaf;
        }

        level++;
    }

    // Get the words that are contained in each projection. Each group is materialised once:
    // it is read for the emptiness test, for the leaf construction and again for the 'lost' pass.
    var wordsPerProjection = new List<IEnumerable<Word>>(projectionProfile.Count);
    foreach (var projection in projectionProfile)
    {
        wordsPerProjection.Add(leaf.Words.Where(w =>
        {
            var normalisedBB = w.BoundingBox.Normalise();
            return normalisedBB.Bottom >= projection.LowerBound && normalisedBB.Top <= projection.UpperBound;
        }).ToList());
    }

    var newNodes = wordsPerProjection
        .Where(e => e.Any())
        .Select(e => VerticalCut(new XYLeaf(e), minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc, level))
        .ToList();

    // Words that fell in no projection are kept as single-word leaves.
    var lost = leaf.Words
        .Except(wordsPerProjection.SelectMany(x => x))
        .Where(x => !string.IsNullOrWhiteSpace(x.Text))
        .ToList();

    if (lost.Count > 0)
    {
        newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
    }

    return new XYNode(newNodes);
}
/// <summary>
/// Splits a leaf into vertical columns: words are projected on the horizontal axis and the leaf
/// is cut wherever the gap between two consecutive projections exceeds the dominant font width,
/// provided the current projection is already at least <paramref name="minimumWidth"/> wide.
/// Each resulting column is then handed to <c>HorizontalCut</c>.
/// </summary>
/// <param name="leaf">The leaf to cut.</param>
/// <param name="minimumWidth">The minimum width for a block; narrower leaves and projections are not cut.</param>
/// <param name="dominantFontWidthFunc">Computes the dominant font width from the letters of the leaf; used as the gap threshold.</param>
/// <param name="dominantFontHeightFunc">Not invoked by this pass; forwarded to <c>HorizontalCut</c>.</param>
/// <param name="level">Recursion marker; not read here, forwarded unchanged to <c>HorizontalCut</c>.</param>
/// <returns>The leaf itself when no cut is made, otherwise a node holding the results of the sub-cuts.</returns>
private XYNode VerticalCut(XYLeaf leaf, double minimumWidth,
    Func<IEnumerable<Letter>, double> dominantFontWidthFunc,
    Func<IEnumerable<Letter>, double> dominantFontHeightFunc, int level = 0)
{
    // Order words left to right
    var words = leaf.Words.OrderBy(w => w.BoundingBox.Normalise().Left).ToArray();

    if (words.Length == 0)
    {
        return new XYNode(null);
    }

    // Rebuild the leaf from the ordered words (no filtering happens here).
    leaf = new XYLeaf(words);

    if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth)
    {
        // We stop cutting if
        // - only one word remains
        // - width is too small
        return leaf;
    }

    // Determine dominant font width
    double dominantFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters));

    List<Projection> projectionProfile = new List<Projection>();

    var firstWordBound = words[0].BoundingBox.Normalise();
    Projection currentProjection = new Projection(firstWordBound.Left, firstWordBound.Right);
    int wordsCount = words.Length;

    for (int i = 1; i < wordsCount; i++)
    {
        var currentWordBound = words[i].BoundingBox.Normalise();

        if (currentProjection.Contains(currentWordBound.Left) || currentProjection.Contains(currentWordBound.Right))
        {
            // It is overlapping
            if (currentWordBound.Left >= currentProjection.LowerBound
                && currentWordBound.Left <= currentProjection.UpperBound
                && currentWordBound.Right > currentProjection.UpperBound)
            {
                // |____|
                //    |____|
                // |_______|    <- updated
                currentProjection.UpperBound = currentWordBound.Right;
            }

            // We ignore the following cases:
            //    |____|
            // |____|          (not possible because of OrderBy)
            //
            //    |____|
            // |___________|   (not possible because of OrderBy)
            //
            //  |____|
            //   |_|
        }
        else
        {
            // No overlap
            if (currentWordBound.Left - currentProjection.UpperBound <= dominantFontWidth)
            {
                // If gap too small -> don't cut
                // |____| |____|
                currentProjection.UpperBound = currentWordBound.Right;
            }
            else if (currentProjection.UpperBound - currentProjection.LowerBound < minimumWidth)
            {
                // Still too small
                currentProjection.UpperBound = currentWordBound.Right;
            }
            else
            {
                // If gap big enough -> cut!
                // |____|   |   |____|
                // The last word never opens a new projection: if it lies beyond the current
                // one it is recovered by the 'lost' pass below and becomes its own leaf.
                if (i != wordsCount - 1) // Will always add the last one after
                {
                    projectionProfile.Add(currentProjection);
                    currentProjection = new Projection(currentWordBound.Left, currentWordBound.Right);
                }
            }
        }

        if (i == wordsCount - 1)
        {
            projectionProfile.Add(currentProjection);
        }
    }

    // Get the words that are contained in each projection. Each group is materialised once:
    // it is read for the emptiness test, for the leaf construction and again for the 'lost' pass.
    var wordsPerProjection = new List<IEnumerable<Word>>(projectionProfile.Count);
    foreach (var projection in projectionProfile)
    {
        wordsPerProjection.Add(leaf.Words.Where(w =>
        {
            var normalisedBB = w.BoundingBox.Normalise();
            return normalisedBB.Left >= projection.LowerBound && normalisedBB.Right <= projection.UpperBound;
        }).ToList());
    }

    var newNodes = wordsPerProjection
        .Where(e => e.Any())
        .Select(e => HorizontalCut(new XYLeaf(e), minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc, level))
        .ToList();

    // Words that fell in no projection are kept as single-word leaves.
    var lost = leaf.Words
        .Except(wordsPerProjection.SelectMany(x => x))
        .Where(x => !string.IsNullOrWhiteSpace(x.Text))
        .ToList();

    if (lost.Count > 0)
    {
        newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
    }

    return new XYNode(newNodes);
}
/// <summary>
/// Splits a leaf into vertical columns: non-blank words are projected on the horizontal axis and
/// the leaf is cut wherever the gap between two consecutive projections exceeds the dominant font
/// width, provided the current projection is already at least <paramref name="minimumWidth"/> wide.
/// Each resulting column is then handed to <c>HorizontalCut</c>.
/// </summary>
/// <param name="leaf">The leaf to cut.</param>
/// <param name="minimumWidth">The minimum width for a block; narrower leaves and projections are not cut.</param>
/// <param name="dominantFontWidthFunc">Computes the dominant font width from the absolute glyph widths of the leaf's letters; used as the gap threshold.</param>
/// <param name="dominantFontHeightFunc">Not invoked by this pass; forwarded to <c>HorizontalCut</c>.</param>
/// <param name="level">Recursion marker; not read here, forwarded unchanged to <c>HorizontalCut</c>.</param>
/// <returns>The leaf itself when no cut is made, otherwise a node holding the results of the sub-cuts.</returns>
private XYNode VerticalCut(XYLeaf leaf, double minimumWidth,
    Func<IEnumerable<double>, double> dominantFontWidthFunc,
    Func<IEnumerable<double>, double> dominantFontHeightFunc, int level = 0)
{
    // Order words left to right, ignoring blank ones
    var words = leaf.Words
        .Where(w => !string.IsNullOrWhiteSpace(w.Text))
        .OrderBy(w => w.BoundingBox.Left)
        .ToArray();

    if (words.Length == 0)
    {
        return new XYNode(null);
    }

    // Create new leaf with non-whitespace words.
    leaf = new XYLeaf(words);

    if (leaf.CountWords() <= 1 || leaf.BoundingBox.Width <= minimumWidth)
    {
        // We stop cutting if
        // - only one word remains
        // - width is too small
        return leaf;
    }

    // Determine the dominant font width. The dominant font height is not needed for a
    // vertical cut, so it is no longer computed here.
    double domFontWidth = dominantFontWidthFunc(words.SelectMany(x => x.Letters)
                                                     .Select(x => Math.Abs(x.GlyphRectangle.Width)));

    // Each projection is a two-element array: [0] = lower (left) bound, [1] = upper (right) bound.
    List<double[]> projectionProfile = new List<double[]>();
    double[] currentProj = new double[2] { words[0].BoundingBox.Left, words[0].BoundingBox.Right };
    int wordsCount = words.Length;

    for (int i = 1; i < wordsCount; i++)
    {
        var wordBox = words[i].BoundingBox;

        if ((wordBox.Left >= currentProj[0] && wordBox.Left <= currentProj[1])
            || (wordBox.Right >= currentProj[0] && wordBox.Right <= currentProj[1]))
        {
            // It is overlapping
            if (wordBox.Left >= currentProj[0] && wordBox.Left <= currentProj[1]
                && wordBox.Right > currentProj[1])
            {
                // |____|
                //    |____|
                // |_______|    <- updated
                currentProj[1] = wordBox.Right;
            }

            // We ignore the following cases:
            //    |____|
            // |____|          (not possible because of OrderBy)
            //
            //    |____|
            // |___________|   (not possible because of OrderBy)
            //
            //  |____|
            //   |_|
        }
        else
        {
            // No overlap
            if (wordBox.Left - currentProj[1] <= domFontWidth)
            {
                // If gap too small -> don't cut
                // |____| |____|
                currentProj[1] = wordBox.Right;
            }
            else if (currentProj[1] - currentProj[0] < minimumWidth)
            {
                // Still too small
                currentProj[1] = wordBox.Right;
            }
            else
            {
                // If gap big enough -> cut!
                // |____|   |   |____|
                // The last word never opens a new projection: if it lies beyond the current
                // one it is recovered by the 'lost' pass below and becomes its own leaf.
                if (i != wordsCount - 1) // Will always add the last one after
                {
                    projectionProfile.Add(currentProj);
                    currentProj = new double[2] { wordBox.Left, wordBox.Right };
                }
            }
        }

        if (i == wordsCount - 1)
        {
            projectionProfile.Add(currentProj);
        }
    }

    // Get the words that are contained in each projection. Each group is materialised once:
    // it is read for the emptiness test, for the leaf construction and again for the 'lost' pass.
    var wordsPerProjection = new List<IEnumerable<Word>>(projectionProfile.Count);
    foreach (var p in projectionProfile)
    {
        wordsPerProjection.Add(
            leaf.Words.Where(w => w.BoundingBox.Left >= p[0] && w.BoundingBox.Right <= p[1]).ToList());
    }

    var newNodes = wordsPerProjection
        .Where(e => e.Any())
        .Select(e => HorizontalCut(new XYLeaf(e), minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc, level))
        .ToList();

    // Words that fell in no projection are kept as single-word leaves.
    var lost = leaf.Words
        .Except(wordsPerProjection.SelectMany(x => x))
        .Where(x => !string.IsNullOrWhiteSpace(x.Text))
        .ToList();

    if (lost.Count > 0)
    {
        newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
    }

    return new XYNode(newNodes);
}
/// <summary>
/// Splits a leaf into horizontal bands: non-blank words are projected on the vertical axis and the
/// leaf is cut wherever the gap between two consecutive projections exceeds the dominant font height.
/// Each resulting band is then handed to <c>VerticalCut</c>.
/// </summary>
/// <param name="leaf">The leaf to cut.</param>
/// <param name="minimumWidth">Not used by this pass itself; forwarded to <c>VerticalCut</c>.</param>
/// <param name="dominantFontWidthFunc">Not invoked by this pass; forwarded to <c>VerticalCut</c>.</param>
/// <param name="dominantFontHeightFunc">Computes the dominant font height from the absolute glyph heights of the leaf's letters; used as the gap threshold.</param>
/// <param name="level">
/// Recursion marker. When only one projection is found and <paramref name="level"/> is at least 1 the leaf
/// is returned uncut; otherwise it is incremented before recursing.
/// </param>
/// <returns>The leaf itself when no cut is made, otherwise a node holding the results of the sub-cuts.</returns>
private XYNode HorizontalCut(XYLeaf leaf, double minimumWidth,
    Func<IEnumerable<double>, double> dominantFontWidthFunc,
    Func<IEnumerable<double>, double> dominantFontHeightFunc, int level = 0)
{
    // Order words bottom to top, ignoring blank ones
    var words = leaf.Words
        .Where(w => !string.IsNullOrWhiteSpace(w.Text))
        .OrderBy(w => w.BoundingBox.Bottom)
        .ToArray();

    if (words.Length == 0)
    {
        return new XYNode(null);
    }

    // Create new leaf with non-whitespace words.
    leaf = new XYLeaf(words);

    if (leaf.CountWords() <= 1)
    {
        // We stop cutting if
        // - only one word remains
        return leaf;
    }

    // Determine the dominant font height. The dominant font width is not needed for a
    // horizontal cut, so it is no longer computed here.
    double domFontHeight = dominantFontHeightFunc(words.SelectMany(x => x.Letters)
                                                       .Select(x => Math.Abs(x.GlyphRectangle.Height)));

    // Each projection is a two-element array: [0] = lower (bottom) bound, [1] = upper (top) bound.
    List<double[]> projectionProfile = new List<double[]>();
    double[] currentProj = new double[2] { words[0].BoundingBox.Bottom, words[0].BoundingBox.Top };
    int wordsCount = words.Length;

    for (int i = 1; i < wordsCount; i++)
    {
        var wordBox = words[i].BoundingBox;

        if ((wordBox.Bottom >= currentProj[0] && wordBox.Bottom <= currentProj[1])
            || (wordBox.Top >= currentProj[0] && wordBox.Top <= currentProj[1]))
        {
            // It is overlapping
            if (wordBox.Bottom >= currentProj[0] && wordBox.Bottom <= currentProj[1]
                && wordBox.Top > currentProj[1])
            {
                currentProj[1] = wordBox.Top;
            }
        }
        else
        {
            // No overlap
            if (wordBox.Bottom - currentProj[1] <= domFontHeight)
            {
                // If gap too small -> don't cut
                // |____| |____|
                currentProj[1] = wordBox.Top;
            }
            else
            {
                // If gap big enough -> cut!
                // |____|   |   |____|
                // The last word never opens a new projection: if it lies beyond the current
                // one it is recovered by the 'lost' pass below and becomes its own leaf.
                if (i != wordsCount - 1) // Will always add the last one after
                {
                    projectionProfile.Add(currentProj);
                    currentProj = new double[2] { wordBox.Bottom, wordBox.Top };
                }
            }
        }

        if (i == wordsCount - 1)
        {
            projectionProfile.Add(currentProj);
        }
    }

    if (projectionProfile.Count == 1)
    {
        if (level >= 1)
        {
            return leaf;
        }

        level++;
    }

    // Get the words that are contained in each projection. Each group is materialised once:
    // it is read for the emptiness test, for the leaf construction and again for the 'lost' pass.
    var wordsPerProjection = new List<IEnumerable<Word>>(projectionProfile.Count);
    foreach (var p in projectionProfile)
    {
        wordsPerProjection.Add(
            leaf.Words.Where(w => w.BoundingBox.Bottom >= p[0] && w.BoundingBox.Top <= p[1]).ToList());
    }

    var newNodes = wordsPerProjection
        .Where(e => e.Any())
        .Select(e => VerticalCut(new XYLeaf(e), minimumWidth, dominantFontWidthFunc, dominantFontHeightFunc, level))
        .ToList();

    // Words that fell in no projection are kept as single-word leaves.
    var lost = leaf.Words
        .Except(wordsPerProjection.SelectMany(x => x))
        .Where(x => !string.IsNullOrWhiteSpace(x.Text))
        .ToList();

    if (lost.Count > 0)
    {
        newNodes.AddRange(lost.Select(w => new XYLeaf(w)));
    }

    return new XYNode(newNodes);
}