Пример #1
0
        /// <summary>
        /// Creates an entry that pairs a page number with the layout supplied for it.
        /// </summary>
        /// <param name="pageNum">Page number; validated through <c>ArgCheck.GreaterThanOrEqual</c> against 1.</param>
        /// <param name="layout">Layout stored in <c>Layout</c> exactly as given (no validation is done here).</param>
        public PageOnScreen(int pageNum, PageLayout layout)
        {
            // Validate before any state is assigned.
            ArgCheck.GreaterThanOrEqual(pageNum, 1, "pageNum");

            this.PageNum = pageNum;
            this.Layout = layout;
        }
 /// <summary>
 /// Creates a page layout whose size is taken from a rendered image of the given book page.
 /// </summary>
 /// <param name="book">Book whose provider renders the page image.</param>
 /// <param name="pageNum">Number of the page to render; passed straight to <c>RenderPageImage</c>.</param>
 /// <returns>A new <see cref="PageLayout"/> constructed from the rendered bitmap's size.</returns>
 public PageLayout DetectLayoutFromBook(IBookContent book, int pageNum)
 {
     // This is hacky, but OK for now: the page is rendered only to learn its size.
     // The using block releases the bitmap even if reading its size or
     // constructing the layout throws.
     using (Bitmap b = book.BookProvider.o.RenderPageImage(pageNum, Size.Empty))
     {
         return new PageLayout(b.Size);
     }
 }
        /// <summary>
        /// Builds a page layout from a page image: the layout is sized from the bitmap,
        /// blobs are detected in it, and row bounds are derived from those blobs.
        /// </summary>
        /// <param name="bmp">Page image; checked with <c>ArgCheck.NotNull</c>.</param>
        /// <returns>The layout after <c>DetectRowBounds</c> has processed it.</returns>
        public PageLayout DetectLayoutFromImage(Bitmap bmp)
        {
            ArgCheck.NotNull(bmp, "bmp");

            PageLayout result = new PageLayout(bmp.Size);

            // Blob detection runs first; its output drives the row detection.
            DetectRowBounds(result, DetectBlobs(bmp));

            return result;
        }
        /// <summary>
        /// Groups blobs into rows by sweeping one-pixel-high scanlines from the top to the
        /// bottom of the layout bounds. Consecutive scanlines that touch at least one blob
        /// form a row; a scanline touching no blob ends the current row.
        /// </summary>
        /// <param name="cbi">Layout to fill in. Its <c>Children</c> are replaced with the detected rows
        /// and its bounds are recomputed from them.</param>
        /// <param name="blobs">Blobs detected on the page. Nothing is done when the array is empty.</param>
        void DetectRowBounds(PageLayout cbi, Blob[] blobs)
        {
            if (blobs.Length == 0) { return; }
            if (cbi.Bounds == Rectangle.Empty) { return; }

            // The bounds are not modified until the sweep is finished, so read them once.
            Rectangle bounds = cbi.Bounds;

            List<LayoutElement> rows = new List<LayoutElement>();

            // A blob taller than one pixel intersects several consecutive scanlines.
            // Remember which blobs were already placed so each becomes a single word
            // element instead of one duplicate per scanline.
            HashSet<Blob> placedBlobs = new HashSet<Blob>();

            LayoutElement currentRow = null;

            // Attempt drawing lines between the rows.
            for (int y = bounds.Top; y < bounds.Bottom; y++)
            {
                Rectangle rowRect = new Rectangle(bounds.Left, y, bounds.Width, 1);

                // Materialize once; the result is both tested for emptiness and iterated.
                List<Blob> blobsInRow = blobs.Where(b => b.Rectangle.IntersectsWith(rowRect)).ToList();

                if (blobsInRow.Count == 0)
                {
                    // Empty scanline detected. Commit current row (if any).
                    TryAddRow(rows, currentRow);
                    currentRow = null;
                    continue;
                }

                // Start new row if needed
                if (currentRow == null)
                {
                    currentRow = new LayoutElement();
                    currentRow.Type = LayoutElementType.Row;
                }

                foreach (Blob blob in blobsInRow)
                {
                    // Add returns false for a blob seen on an earlier scanline.
                    if (placedBlobs.Add(blob))
                    {
                        currentRow.Children.Add(LayoutElement.NewWord(cbi.PageSize, blob.Rectangle));
                    }
                }
            }

            // Add row at the end (the sweep may finish while a row is still open)
            TryAddRow(rows, currentRow);

            FindAndRemoveHeaderAndFooter(cbi, rows);

            cbi.Children = rows;
            cbi.SetBoundsFromNodes(true);
        }
 /// <summary>
 /// Creates a layout sized to the given page image. No further analysis of the
 /// image is performed by this implementation.
 /// </summary>
 /// <param name="physicalPage">Image of the page; only its <c>Size</c> is read.</param>
 /// <returns>A new <see cref="PageLayout"/> constructed from the image size.</returns>
 public PageLayout DetectLayoutFromImage(Bitmap physicalPage)
 {
     return new PageLayout(physicalPage.Size);
 }
        /// <summary>
        /// Heuristically decides whether the first row is a page header and the last row is a
        /// page footer, and removes those rows from <paramref name="rows"/> in place.
        /// </summary>
        /// <param name="cbi">Layout the rows belong to. Not read by this method.</param>
        /// <param name="rows">Rows in top-to-bottom order; modified in place. Lists with fewer
        /// than two rows are left untouched.</param>
        void FindAndRemoveHeaderAndFooter(PageLayout cbi, List<LayoutElement> rows)
        {
            // KEY HEURISTIC: do most OTHER pages have headers and footers.
            // Difficult to implement at this level, but ought to be reliable.

            // PROBLEM: a section heading is sometimes recognized as a header.
            // Height filtering could fix this in theory, but it's bad in other cases.

            // FALSE POSITIVES are terrible (worse than missing a header/footer).
            // Test pages out006, out038 and out044 get a false-positive footer (the last line).
            // Heuristic? Left aligned? By itself, left alignment does not disqualify it.

            // Maybe this is a learning problem -- extract features, make probabilistic
            // analysis. Need training data -- set of page pictures labeled with HasHeader/HasFooter.

            // Minimum number of rows on a sensible page
            if (rows.Count < 2) { return; }

            int lastIdx = rows.Count - 1;

            bool hasHeader;
            bool hasFooter;

            if (rows.Count <= 3)
            {
                // Too few rows for averages to mean anything: an edge row counts as a
                // header/footer when it is less than half as tall as its neighbour.
                hasHeader = rows[0].UnitBounds.Height < rows[1].UnitBounds.Height / 2;
                hasFooter = rows[lastIdx].UnitBounds.Height < rows[lastIdx - 1].UnitBounds.Height / 2;
            }
            else
            {
                // Average gap above each row (rows 1..last).
                float distanceSum = 0;
                for (int i = 1; i < rows.Count; i++)
                {
                    distanceSum += DistanceAboveRow(i, rows);
                }
                float distanceAvg = distanceSum / (rows.Count - 1);
                float minDistance = distanceAvg * 1.2f;

                float heightAvg = rows.Average(r => (float)r.UnitBounds.Height);
                float maxHeight = heightAvg * 1.5f;

                // Header: separated from the second row by more than 1.2x the average gap,
                // and shorter than 1.5x the average row height.
                float headerHeight = rows[0].UnitBounds.Height;
                float headerDistance = DistanceAboveRow(1, rows);
                hasHeader = headerDistance > minDistance && headerHeight < maxHeight;

                // Footer: same test applied to the last row and the gap above it.
                float footerHeight = rows[lastIdx].UnitBounds.Height;
                float footerDistance = DistanceAboveRow(lastIdx, rows);
                hasFooter = footerDistance > minDistance && footerHeight < maxHeight;

                // Note: width heuristic is wrong -- header can be wide
            }

            // Remove header and footer from rows. Both detection branches reach this point,
            // so a decision made for a short page is applied rather than discarded.
            // The footer goes first so that index 0 still refers to the header afterwards.
            if (hasFooter) { rows.RemoveAt(lastIdx); }
            if (hasHeader) { rows.RemoveAt(0); }
        }
        /// <summary>
        /// Builds the layout of a single PDF page from its text and word list: non-empty words
        /// are grouped into rows, rows into columns, and the layout bounds are derived from them.
        /// </summary>
        /// <param name="doc">Document wrapper; supplies the page width and height.</param>
        /// <param name="page">Page whose text, words and image count are read.</param>
        /// <returns>The populated layout. If the resulting bounds are invalid, an error is logged
        /// and fallback unit bounds are assigned.</returns>
        PageLayout DetectLayout(DW<PDFWrapper> doc, PDFPage page)
        {
            Size pageSize = new Size(doc.o.PageWidth, doc.o.PageHeight);
            PageLayout layout = new PageLayout(pageSize);

            // Get text
            // TODO: check how text is split in multicolumn case -- is this the method with correct options (flow, not physical)
            layout.Text = page.Text;

            // Keep only words that have both a non-empty rectangle and non-empty text.
            // NOTE: the word list must be consumed through a LINQ query. Copying it with
            // AddRange(page.WordList) was observed to lose the first word and leave the
            // last word blank (strange bug, cause unknown).
            List<LayoutElement> words = page.WordList
                .Where(w => !w.Bounds.IsEmpty && !w.Word.IsEmpty())
                .Select(w => LayoutElement.NewWord(pageSize, w.Bounds, w.Word))
                .ToList();

            // Detect rows and columns
            var rowElements = words
                .Split(StartsNewRow)
                .Select(group => LayoutElement.NewRow(group, LayoutElementType.Row));
            var columnElements = rowElements
                .Split(StartsNewColumn)
                .Select(group => LayoutElement.NewRow(group, LayoutElementType.Column));

            // TODO: detect header/footer
            layout.Children.AddRange(columnElements);

            if (layout.Children.Count > 0)
            {
                layout.SetBoundsFromNodes(true);

                // Disabled idea: widen the bounds by about 5% so that words which were not
                // recognized properly are not cut off.
            }

            // error checking
            bool boundsInvalid =
                layout.Bounds.X < 0 || layout.Bounds.Y < 0 ||
                layout.Bounds.Width <= 0 || layout.Bounds.Height <= 0;

            if (boundsInvalid)
            {
                logger.Error("Wrong bounds: " + layout.Bounds + " images: " + page.ImagesCount);

                // Fallback: full unit height when the page has images, otherwise a thin strip.
                float fallbackHeight;
                if (page.ImagesCount > 0)
                {
                    fallbackHeight = 1;
                }
                else
                {
                    fallbackHeight = 0.1f;
                }

                layout.UnitBounds = new RectangleF(0, 0, 1, fallbackHeight);
            }

            // TODO: detect rows

            // TODO: detect header/footer (if any)

            return layout;
        }
        /// <summary>
        /// Compares a detected layout against the expected layout of this test case.
        /// </summary>
        /// <param name="layout">Layout to check.</param>
        /// <returns>
        /// <c>Unknown</c> when no expected layout is set; <c>Pass_Good</c> when the unit bounds
        /// match the expected ones via <c>AlmostEquals</c> with a 0.003 tolerance argument;
        /// otherwise <c>Fail</c>.
        /// </returns>
        internal TestCaseStatus GetStatus(PageLayout layout)
        {
            if (ExpectedLayout == null)
            {
                return TestCaseStatus.Unknown;
            }

            // Shallow comparison, just content bounds for now
            bool boundsMatch = ExpectedLayout.UnitBounds.AlmostEquals(layout.UnitBounds, 0.003f);

            return boundsMatch ? TestCaseStatus.Pass_Good : TestCaseStatus.Fail;
        }