/// <summary>
/// Creates an instance describing one page together with its layout.
/// </summary>
/// <param name="pageNum">
/// Page number. Checked with <c>ArgCheck.GreaterThanOrEqual(pageNum, 1, ...)</c>
/// before anything is stored (presumably throws on violation -- confirm in ArgCheck).
/// </param>
/// <param name="layout">Layout stored in <c>Layout</c>; it is not validated here.</param>
public PageOnScreen(int pageNum, PageLayout layout)
{
    // Validate before touching any state so a rejected page number leaves nothing assigned.
    ArgCheck.GreaterThanOrEqual(pageNum, 1, "pageNum");

    this.PageNum = pageNum;
    this.Layout = layout;
}
/// <summary>
/// Builds a <see cref="PageLayout"/> for one page of a book using only the size
/// of the rendered page image; no content analysis is performed.
/// </summary>
/// <param name="book">Book whose <c>BookProvider.o</c> renders the page image.</param>
/// <param name="pageNum">Page number passed through to <c>RenderPageImage</c>.</param>
/// <returns>A new layout constructed from the rendered bitmap's size.</returns>
public PageLayout DetectLayoutFromBook(IBookContent book, int pageNum)
{
    // This is hacky, but OK for now: the page is rendered only to learn its size.
    // Size.Empty is passed as the requested size (presumably "no scaling" -- confirm
    // against RenderPageImage).
    //
    // The using statement releases the bitmap on every path, including when
    // reading the size or constructing the PageLayout throws.
    using (Bitmap pageImage = book.BookProvider.o.RenderPageImage(pageNum, Size.Empty))
    {
        return new PageLayout(pageImage.Size);
    }
}
/// <summary>
/// Detects the layout of a page image: creates a layout sized like the bitmap,
/// finds blobs in the image and derives row bounds from them.
/// </summary>
/// <param name="bmp">Page image; checked with <c>ArgCheck.NotNull</c>.</param>
/// <returns>The layout after <c>DetectRowBounds</c> has processed it.</returns>
public PageLayout DetectLayoutFromImage(Bitmap bmp)
{
    ArgCheck.NotNull(bmp, "bmp");

    PageLayout result = new PageLayout(bmp.Size);

    // Blobs are detected first, then grouped into rows on the layout.
    DetectRowBounds(result, DetectBlobs(bmp));

    return result;
}
/// <summary>
/// Groups blobs into rows by sweeping a one-pixel-high horizontal scan line from
/// the top to the bottom of <c>cbi.Bounds</c>. A run of consecutive scan lines that
/// each touch at least one blob forms a row; a scan line touching no blob ends
/// the current row. The resulting rows are passed through
/// <c>FindAndRemoveHeaderAndFooter</c>, assigned to <c>cbi.Children</c>, and the
/// layout bounds are refreshed with <c>cbi.SetBoundsFromNodes(true)</c>.
/// </summary>
/// <remarks>
/// Each blob contributes exactly one word element to its row. A blob intersects
/// a contiguous range of scan lines, so it can only ever belong to one row;
/// a per-blob flag is enough to avoid adding it again on every scan line it spans.
/// </remarks>
/// <param name="cbi">Layout to update. Left untouched when its bounds are empty.</param>
/// <param name="blobs">Detected blobs. Nothing happens when the array is empty.</param>
void DetectRowBounds(PageLayout cbi, Blob[] blobs)
{
    if (blobs.Length == 0)
    {
        return;
    }

    if (cbi.Bounds == Rectangle.Empty)
    {
        return;
    }

    // The sweep below never modifies cbi, so its bounds can be read once.
    Rectangle bounds = cbi.Bounds;

    List<LayoutElement> rows = new List<LayoutElement>();
    LayoutElement currentRow = null;

    // alreadyAdded[i] is true once blobs[i] has been added to a row.
    bool[] alreadyAdded = new bool[blobs.Length];

    // Attempt drawing lines between the rows.
    for (int y = bounds.Top; y < bounds.Bottom; y++)
    {
        Rectangle scanLine = new Rectangle(bounds.Left, y, bounds.Width, 1);
        bool lineTouchesBlob = false;

        for (int i = 0; i < blobs.Length; i++)
        {
            if (!blobs[i].Rectangle.IntersectsWith(scanLine))
            {
                continue;
            }

            lineTouchesBlob = true;

            // Start new row if needed
            if (currentRow == null)
            {
                currentRow = new LayoutElement();
                currentRow.Type = LayoutElementType.Row;
            }

            // Add the blob only the first time a scan line reaches it; later scan
            // lines crossing the same blob must not create duplicate children.
            if (!alreadyAdded[i])
            {
                alreadyAdded[i] = true;
                currentRow.Children.Add(LayoutElement.NewWord(cbi.PageSize, blobs[i].Rectangle));
            }
        }

        if (!lineTouchesBlob)
        {
            // Empty line detected. Commit current row (if any)
            TryAddRow(rows, currentRow);
            currentRow = null;
        }

        // TODO (optimization only): y could jump to the bottom of the current row
        // instead of testing every scan line; beware of off-by-1.
    }

    // Add row at the end
    TryAddRow(rows, currentRow);

    FindAndRemoveHeaderAndFooter(cbi, rows);

    cbi.Children = rows;
    cbi.SetBoundsFromNodes(true);
}
/// <summary>
/// Returns a layout constructed solely from the size of the given page image;
/// the image content is not analysed.
/// </summary>
/// <param name="physicalPage">Page image whose <c>Size</c> is used. Not null-checked here.</param>
/// <returns>A new <see cref="PageLayout"/> built from <c>physicalPage.Size</c>.</returns>
public PageLayout DetectLayoutFromImage(Bitmap physicalPage)
{
    return new PageLayout(physicalPage.Size);
}
/// <summary>
/// Heuristically decides whether the first row is a page header and the last row
/// a page footer, and removes the detected ones from <paramref name="rows"/>.
/// </summary>
/// <remarks>
/// For more than three rows, a row is treated as header/footer when the gap that
/// separates it from the neighbouring content is larger than 1.2x the average
/// gap between rows and its own height is below 1.5x the average row height.
///
/// Design notes:
///  - KEY HEURISTIC: do most OTHER pages have headers and footers.
///    Difficult to implement at this level, but ought to be reliable.
///  - PROBLEM: a section heading is sometimes recognized as a header.
///    Height filtering could fix this in theory, but it's bad in other cases.
///  - FALSE POSITIVES are terrible (worse than missing a header/footer).
///  - out006, out038, out044: false positive footer on the last line. Heuristic?
///  - Left aligned? By itself, left alignment does not disqualify it.
///  - Maybe this is a learning problem -- extract features, make a probabilistic
///    analysis. Needs training data: page pictures labeled HasHeader/HasFooter.
///  - The width heuristic is wrong -- a header can be wide.
/// </remarks>
/// <param name="cbi">Page layout; not read by this method.</param>
/// <param name="rows">Rows in top-to-bottom order; modified in place.</param>
void FindAndRemoveHeaderAndFooter(PageLayout cbi, List<LayoutElement> rows)
{
    // Minimum number of rows on a sensible page
    if (rows.Count < 2)
    {
        return;
    }

    int lastIdx = rows.Count - 1;
    LayoutElement header = null;
    LayoutElement footer = null;

    // Special case for small row counts (e.g. 2 elements, one much smaller than the other).
    if (rows.Count <= 3)
    {
        // Header candidate: first row less than half as tall as the second.
        if (rows[0].UnitBounds.Height < rows[1].UnitBounds.Height / 2)
        {
            header = rows[0];
        }

        // Footer candidate: last row less than half as tall as the one above it.
        if (rows[lastIdx].UnitBounds.Height < rows[lastIdx - 1].UnitBounds.Height / 2)
        {
            footer = rows[lastIdx];
        }

        // NOTE(review): this returns before the removal step at the end of the
        // method, so the candidates found above are never removed for pages with
        // 2-3 rows. Unclear whether that is deliberate (to avoid false positives)
        // or an oversight -- confirm before changing.
        return;
    }

    // Average vertical gap between consecutive rows (accumulated in float).
    float gapTotal = 0;
    for (int rowIdx = 1; rowIdx < rows.Count; rowIdx++)
    {
        gapTotal += DistanceAboveRow(rowIdx, rows);
    }
    float gapAverage = gapTotal / (rows.Count - 1);
    float minDistance = gapAverage * 1.2f;

    // Average row height.
    float heightAverage = rows.Average(row => (float)row.UnitBounds.Height);
    float maxHeight = heightAverage * 1.5f;

    // Header: first row, judged by the gap between it and the second row.
    float headerHeight = rows[0].UnitBounds.Height;
    float headerDistance = DistanceAboveRow(1, rows);
    bool headerFound = headerDistance > minDistance && headerHeight < maxHeight;
    if (headerFound)
    {
        header = rows[0];
    }

    // Footer: last row, judged by the gap between it and the row above.
    float footerHeight = rows[lastIdx].UnitBounds.Height;
    float footerDistance = DistanceAboveRow(lastIdx, rows);
    bool footerFound = footerDistance > minDistance && footerHeight < maxHeight;
    if (footerFound)
    {
        footer = rows[lastIdx];
    }

    // Remove header and footer from rows. Bounds are not recomputed in this
    // method; that is left to whoever consumes the row list.
    if (header != null)
    {
        rows.Remove(header);
    }

    if (footer != null)
    {
        rows.Remove(footer);
    }
}
/// <summary>
/// Builds the layout of a PDF page from its text and word list: non-empty words
/// are grouped into rows, rows into columns, and the columns become the layout's
/// children. If the resulting bounds are invalid, an error is logged and
/// fallback unit bounds are assigned.
/// </summary>
/// <param name="doc">Document wrapper; <c>doc.o.PageWidth</c>/<c>PageHeight</c> give the page size.</param>
/// <param name="page">Page supplying <c>Text</c>, <c>WordList</c> and <c>ImagesCount</c>.</param>
/// <returns>The populated layout.</returns>
PageLayout DetectLayout(DW<PDFWrapper> doc, PDFPage page)
{
    Size pageSize = new Size(doc.o.PageWidth, doc.o.PageHeight);
    PageLayout layout = new PageLayout(pageSize);

    // Get text
    // TODO: check how text is split in multicolumn case -- is this the method
    // with correct options (flow, not physical)
    layout.Text = page.Text;

    // Keep only words that have both non-empty bounds and non-empty text.
    //
    // Strange bug -- copying page.WordList directly into a List<PDFTextWord>
    // via AddRange loses the first word and leaves the last word blank.
    // Going through a LINQ query, as done here, works fine.
    List<LayoutElement> words = page.WordList
        .Where(w => !(w.Bounds.IsEmpty || w.Word.IsEmpty()))
        .Select(w => LayoutElement.NewWord(pageSize, w.Bounds, w.Word))
        .ToList();

    // Detect rows and columns
    var rowElements = words
        .Split(StartsNewRow)
        .Select(rowWords => LayoutElement.NewRow(rowWords, LayoutElementType.Row));
    var columnElements = rowElements
        .Split(StartsNewColumn)
        .Select(columnRows => LayoutElement.NewRow(columnRows, LayoutElementType.Column));

    // TODO: detect header/footer
    layout.Children.AddRange(columnElements);

    if (layout.Children.Count > 0)
    {
        layout.SetBoundsFromNodes(true);

        // Disabled idea: widen the bounds by ~5% of the width (half on each side)
        // to avoid cutting off words that may not be recognized properly,
        // applying it only if the expanded unit bounds stay within [0, 1].
    }

    // error checking
    var bounds = layout.Bounds;
    bool boundsAreValid = bounds.X >= 0
        && bounds.Y >= 0
        && bounds.Width > 0
        && bounds.Height > 0;

    if (!boundsAreValid)
    {
        logger.Error("Wrong bounds: " + bounds + " images: " + page.ImagesCount);

        // Fallback: full height when the page has images, otherwise a thin strip.
        float height;
        if (page.ImagesCount > 0)
        {
            height = 1;
        }
        else
        {
            height = 0.1f;
        }

        layout.UnitBounds = new RectangleF(0, 0, 1, height);
    }

    // TODO: detect rows
    // TODO: detect header/footer (if any)
    return layout;
}
/// <summary>
/// Evaluates a detected layout against <c>ExpectedLayout</c>.
/// </summary>
/// <param name="layout">Detected layout whose <c>UnitBounds</c> are compared.</param>
/// <returns>
/// <c>TestCaseStatus.Unknown</c> when no expected layout is set;
/// <c>TestCaseStatus.Pass_Good</c> when the unit bounds match according to
/// <c>AlmostEquals</c> with a 0.003 tolerance; otherwise <c>TestCaseStatus.Fail</c>.
/// </returns>
internal TestCaseStatus GetStatus(PageLayout layout)
{
    if (ExpectedLayout == null)
    {
        return TestCaseStatus.Unknown;
    }

    // Shallow comparison, just content bounds for now
    bool boundsMatch = ExpectedLayout.UnitBounds.AlmostEquals(layout.UnitBounds, 0.003f);

    return boundsMatch ? TestCaseStatus.Pass_Good : TestCaseStatus.Fail;
}