Ejemplo n.º 1
0
        public void TestShouldDetectRulings()
        {
            using (PdfDocument pdf_document = PdfDocument.Open("Resources/should_detect_rulings.pdf", new ParsingOptions()
            {
                ClipPaths = true
            }))
            {
                ObjectExtractor oe = new ObjectExtractor(pdf_document);
                PageIterator    pi = oe.Extract();

                PageArea page = pi.Next();
                IReadOnlyList <Ruling> rulings = page.GetRulings();

                foreach (Ruling r in rulings)
                {
                    Assert.True(page.BoundingBox.Contains(r.Line.GetBoundingRectangle(), true));
                }
            }
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Extracts the tables in the page.
 /// </summary>
 /// <param name="page">The page where to extract the tables.</param>
 public List <Table> Extract(PageArea page)
 {
     return(Extract(page, page.GetRulings()));
 }
Ejemplo n.º 3
0
        /// <summary>
        /// Detects the tables in the page.
        /// </summary>
        /// <param name="page"></param>
        public List <TableRectangle> Detect(PageArea page)
        {
            // get horizontal & vertical lines
            // we get these from an image of the PDF and not the PDF itself because sometimes there are invisible PDF
            // instructions that are interpreted incorrectly as visible elements - we really want to capture what a
            // person sees when they look at the PDF
            // BobLd: hack here, we don't convert to an image
            var           pageRulings       = page.GetRulings();
            List <Ruling> horizontalRulings = this.getHorizontalRulings(pageRulings);
            List <Ruling> verticalRulings   = this.getVerticalRulings(pageRulings);
            // end hack here

            List <Ruling> allEdges = new List <Ruling>(horizontalRulings);

            allEdges.AddRange(verticalRulings);

            List <TableRectangle> tableAreas = new List <TableRectangle>();

            // if we found some edges, try to find some tables based on them
            if (allEdges.Count > 0)
            {
                // now we need to snap edge endpoints to a grid
                Utils.SnapPoints(allEdges, POINT_SNAP_DISTANCE_THRESHOLD, POINT_SNAP_DISTANCE_THRESHOLD);

                // normalize the rulings to make sure snapping didn't create any wacky non-horizontal/vertical rulings
                foreach (List <Ruling> rulings in new[] { horizontalRulings, verticalRulings }) //Arrays.asList(horizontalRulings, verticalRulings))
                {
                    //for (Iterator<Ruling> iterator = rulings.iterator(); iterator.hasNext();)
                    foreach (var ruling in rulings.ToList()) // use ToList to be able to remove
                    {
                        ruling.Normalize();
                        if (ruling.IsOblique)
                        {
                            rulings.Remove(ruling);
                        }
                    }
                }

                // merge the edge lines into rulings - this makes finding edges between crossing points in the next step easier
                // we use a larger pixel expansion than the normal spreadsheet extraction method to cover gaps in the
                // edge detection/pixel snapping steps
                horizontalRulings = Ruling.CollapseOrientedRulings(horizontalRulings, 5);
                verticalRulings   = Ruling.CollapseOrientedRulings(verticalRulings, 5);

                // use the rulings and points to find cells
                List <TableRectangle> cells = SpreadsheetExtractionAlgorithm.FindCells(horizontalRulings, verticalRulings).Cast <TableRectangle>().ToList();

                // then use those cells to make table areas
                tableAreas = getTableAreasFromCells(cells);
            }

            // next find any vertical rulings that intersect tables - sometimes these won't have completely been captured as
            // cells if there are missing horizontal lines (which there often are)
            // let's assume though that these lines should be part of the table
            foreach (Ruling verticalRuling in verticalRulings) // Line2D.Float
            {
                foreach (TableRectangle tableArea in tableAreas)
                {
                    if (verticalRuling.Intersects(tableArea) &&
                        !(tableArea.Contains(verticalRuling.P1) && tableArea.Contains(verticalRuling.P2)))
                    {
                        tableArea.SetTop(Math.Ceiling(Math.Max(tableArea.Top, verticalRuling.Y2)));     // bobld: Floor and Min, Y1
                        tableArea.SetBottom(Math.Floor(Math.Min(tableArea.Bottom, verticalRuling.Y1))); // bobld: Ceiling and Max, Y2
                        break;
                    }
                }
            }

            /* BobLd: not sure this is the case in tabula-sharp/PdfPig
             * // the tabula Page coordinate space is half the size of the PDFBox image coordinate space
             * // so halve the table area size before proceeding and add a bit of padding to make sure we capture everything
             * foreach (TableRectangle area in tableAreas)
             * {
             *  area.x = (float)Math.floor(area.x / 2) - TABLE_PADDING_AMOUNT;
             *  area.y = (float)Math.floor(area.y / 2) - TABLE_PADDING_AMOUNT;
             *  area.width = (float)Math.ceil(area.width / 2) + TABLE_PADDING_AMOUNT;
             *  area.height = (float)Math.ceil(area.height / 2) + TABLE_PADDING_AMOUNT;
             * }
             *
             * // we're going to want halved horizontal lines later too
             * foreach (Ruling ruling in horizontalRulings) // Line2D.Float
             * {
             *  ruling.x1 = ruling.x1 / 2;
             *  ruling.y1 = ruling.y1 / 2;
             *  ruling.x2 = ruling.x2 / 2;
             *  ruling.y2 = ruling.y2 / 2;
             * }
             */

            // now look at text rows to help us find more tables and flesh out existing ones
            List <TextChunk> textChunks = TextElement.MergeWords(page.GetText());
            List <TableLine> lines      = TextChunk.GroupByLines(textChunks);

            // first look for text rows that intersect an existing table - those lines should probably be part of the table
            foreach (TableLine textRow in lines)
            {
                foreach (TableRectangle tableArea in tableAreas)
                {
                    if (!tableArea.Contains(textRow) && textRow.Intersects(tableArea))
                    {
                        tableArea.SetLeft(Math.Floor(Math.Min(textRow.Left, tableArea.Left)));
                        tableArea.SetRight(Math.Ceiling(Math.Max(textRow.Right, tableArea.Right)));
                    }
                }
            }

            // get rid of tables that DO NOT intersect any text areas - these are likely graphs or some sort of graphic
            //for (Iterator<Rectangle> iterator = tableAreas.iterator(); iterator.hasNext();)
            foreach (TableRectangle table in tableAreas.ToList()) // use tolist to be able to remove
            {
                bool intersectsText = false;
                foreach (TableLine textRow in lines)
                {
                    if (table.Intersects(textRow))
                    {
                        intersectsText = true;
                        break;
                    }
                }

                if (!intersectsText)
                {
                    tableAreas.Remove(table);
                }
            }

            // lastly, there may be some tables that don't have any vertical rulings at all
            // we'll use text edges we've found to try and guess which text rows are part of a table

            // in his thesis nurminen goes through every row to try to assign a probability that the line is in a table
            // we're going to try a general heuristic instead, trying to find what type of edge (left/right/mid) intersects
            // the most text rows, and then use that magic number of "relevant" edges to decide what text rows should be
            // part of a table.

            bool foundTable;

            do
            {
                foundTable = false;

                // get rid of any text lines contained within existing tables, this allows us to find more tables
                //for (Iterator<TableLine> iterator = lines.iterator(); iterator.hasNext();)
                foreach (var textRow in lines.ToList())
                {
                    foreach (TableRectangle table in tableAreas)
                    {
                        if (table.Contains(textRow))
                        {
                            lines.Remove(textRow);
                            break;
                        }
                    }
                }

                // get text edges from remaining lines in the document
                TextEdges textEdges = getTextEdges(lines);
                //List<TextEdge> leftTextEdges = textEdges[TextEdge.LEFT];
                //List<TextEdge> midTextEdges = textEdges[TextEdge.MID];
                //List<TextEdge> rightTextEdges = textEdges[TextEdge.RIGHT];

                // find the relevant text edges (the ones we think define where a table is)
                RelevantEdges relevantEdgeInfo = getRelevantEdges(textEdges, lines);

                // we found something relevant so let's look for rows that fit our criteria
                if (relevantEdgeInfo.edgeType != -1)
                {
                    List <TextEdge> relevantEdges = null;
                    switch (relevantEdgeInfo.edgeType)
                    {
                    case TextEdge.LEFT:
                        relevantEdges = textEdges[TextEdge.LEFT];       // leftTextEdges;
                        break;

                    case TextEdge.MID:
                        relevantEdges = textEdges[TextEdge.MID];        // midTextEdges;
                        break;

                    case TextEdge.RIGHT:
                        relevantEdges = textEdges[TextEdge.RIGHT];      // rightTextEdges;
                        break;
                    }

                    TableRectangle table = getTableFromText(lines, relevantEdges, relevantEdgeInfo.edgeCount, horizontalRulings);

                    if (table != null)
                    {
                        foundTable = true;
                        tableAreas.Add(table);
                    }
                }
            } while (foundTable);

            // create a set of our current tables that will eliminate duplicate tables
            SortedSet <TableRectangle> tableSet = new SortedSet <TableRectangle>(new TreeSetComparer()); //Set<Rectangle> tableSet = new TreeSet<>(new Comparator<Rectangle>() {...

            foreach (var table in tableAreas.OrderByDescending(t => t.Area))
            {
                tableSet.Add(table);
            }

            return(tableSet.ToList());
        }