/// <summary>
/// Groups blobs into rows by sweeping a one-pixel-high scanline from the top
/// to the bottom of <paramref name="cbi"/>'s bounds. A run of consecutive
/// scanlines that each intersect at least one blob forms a row; a scanline
/// that intersects no blob ends the current row. The resulting rows are passed
/// through <c>FindAndRemoveHeaderAndFooter</c>, assigned to
/// <c>cbi.Children</c>, and the layout bounds are then recomputed via
/// <c>SetBoundsFromNodes(true)</c>.
/// </summary>
/// <param name="cbi">Page layout whose bounds are scanned and whose children are replaced.</param>
/// <param name="blobs">Detected blobs; each blob's <c>Rectangle</c> is tested against the scanline.</param>
/// <remarks>
/// Does nothing when <paramref name="blobs"/> is empty or the layout bounds
/// equal <c>Rectangle.Empty</c>. Each blob is added to a row exactly once,
/// even though it is normally hit by many consecutive scanlines.
/// </remarks>
void DetectRowBounds(PageLayout cbi, Blob[] blobs)
{
    if (blobs.Length == 0 || cbi.Bounds == Rectangle.Empty)
    {
        return;
    }

    List<LayoutElement> rows = new List<LayoutElement>();
    LayoutElement currentRow = null;

    // Blobs already turned into children of currentRow. A blob taller than one
    // pixel intersects several consecutive scanlines; without this set it would
    // be appended to the row once per scanline.
    HashSet<Blob> blobsInCurrentRow = new HashSet<Blob>();

    // Attempt drawing horizontal lines between the rows.
    for (int y = cbi.Bounds.Top; y < cbi.Bounds.Bottom; y++)
    {
        Rectangle scanline = new Rectangle(cbi.Bounds.Left, y, cbi.Bounds.Width, 1);

        // Materialize once: the result is used for both the emptiness check
        // and for building the row's children.
        List<Blob> hits = blobs.Where(b => b.Rectangle.IntersectsWith(scanline)).ToList();

        if (hits.Count == 0)
        {
            // Empty scanline detected: commit the current row (if any).
            TryAddRow(rows, currentRow);
            currentRow = null;
            blobsInCurrentRow.Clear();
            continue;
        }

        // Start a new row if needed.
        if (currentRow == null)
        {
            currentRow = new LayoutElement();
            currentRow.Type = LayoutElementType.Row;
        }

        foreach (Blob blob in hits)
        {
            // HashSet.Add returns false when the blob was already recorded.
            if (blobsInCurrentRow.Add(blob))
            {
                currentRow.Children.Add(LayoutElement.NewWord(cbi.PageSize, blob.Rectangle));
            }
        }

        // TODO: y could jump straight to the bottom of the current row to skip
        // redundant scanlines (beware of off-by-one).
    }

    // Commit the row still open when the bottom of the bounds is reached.
    TryAddRow(rows, currentRow);

    FindAndRemoveHeaderAndFooter(cbi, rows);

    cbi.Children = rows;
    cbi.SetBoundsFromNodes(true);
}
/// <summary>
/// Builds a <c>PageLayout</c> for a PDF page from the page's word list.
/// Non-empty words are split into rows (via <c>StartsNewRow</c>) and the rows
/// into columns (via <c>StartsNewColumn</c>); the columns become the layout's
/// children.
/// </summary>
/// <param name="doc">Wrapper whose <c>o.PageWidth</c> and <c>o.PageHeight</c> give the page size.</param>
/// <param name="page">Page supplying the text, the word list and the image count.</param>
/// <returns>
/// The detected layout. When the computed bounds are negative or empty, the
/// problem is logged and <c>UnitBounds</c> is replaced by a fallback rectangle.
/// </returns>
PageLayout DetectLayout(DW<PDFWrapper> doc, PDFPage page)
{
    Size pageSize = new Size(doc.o.PageWidth, doc.o.PageHeight);
    PageLayout result = new PageLayout(pageSize);

    // Get text
    // TODO: check how text is split in the multicolumn case -- is this the
    // method with the correct options (flow, not physical)?
    result.Text = page.Text;

    // Note: copying page.WordList straight into a List<PDFTextWord> with
    // AddRange showed a strange bug (first word missing, last word blank).
    // Going through a LINQ query, as below, works fine.
    List<LayoutElement> wordElements = page.WordList
        .Where(w => !w.Bounds.IsEmpty && !w.Word.IsEmpty())
        .Select(w => LayoutElement.NewWord(pageSize, w.Bounds, w.Word))
        .ToList();

    // Detect rows, then group the rows into columns.
    var rowElements = wordElements
        .Split(StartsNewRow)
        .Select(group => LayoutElement.NewRow(group, LayoutElementType.Row));
    var columnElements = rowElements
        .Split(StartsNewColumn)
        .Select(group => LayoutElement.NewRow(group, LayoutElementType.Column));

    // TODO: detect header/footer (if any)
    result.Children.AddRange(columnElements);

    if (result.Children.Count > 0)
    {
        result.SetBoundsFromNodes(true);

        // A former experiment (currently disabled) widened the bounds by 5% of
        // their width to avoid cutting off words that may not be recognized
        // properly.
    }

    // Error checking: bounds must start at non-negative coordinates and have
    // a positive size.
    var bounds = result.Bounds;
    bool boundsAreValid = bounds.X >= 0
        && bounds.Y >= 0
        && bounds.Width > 0
        && bounds.Height > 0;

    if (!boundsAreValid)
    {
        logger.Error("Wrong bounds: " + bounds + " images: " + page.ImagesCount);

        // Fallback height is 1 when the page contains images, 0.1 otherwise.
        float fallbackHeight;
        if (page.ImagesCount > 0)
        {
            fallbackHeight = 1;
        }
        else
        {
            fallbackHeight = 0.1f;
        }

        result.UnitBounds = new RectangleF(0, 0, 1, fallbackHeight);
    }

    return result;
}