Пример #1
0
        /// <summary>
        /// Heuristically decides whether the content of a page looks like a ruled (spreadsheet-style) table.
        /// The row and column counts of the first table found by <c>SpreadsheetExtractionAlgorithm</c>
        /// are compared with those of the first table found by <c>BasicExtractionAlgorithm</c> on the
        /// same text-bounded region of the page.
        /// </summary>
        /// <param name="page">The page area to inspect.</param>
        /// <returns>
        /// <c>true</c> when the average of the column-count ratio and the row-count ratio lies strictly
        /// between <c>MAGIC_HEURISTIC_NUMBER</c> and <c>1 / MAGIC_HEURISTIC_NUMBER</c>; <c>false</c>
        /// otherwise, including when the page has no text or when either extraction finds no table.
        /// </returns>
        public bool IsTabular(PageArea page)
        {
            // if there's no text at all on the page, it's not a table
            // (we won't be able to do anything with it though)
            var pageText = page.GetText();
            if (pageText.Count == 0)
            {
                return false;
            }

            // get minimal region of page that contains every character (in effect,
            // removes white "margins")
            PageArea minimalRegion = page.GetArea(Utils.Bounds(pageText.Select(t => t.BoundingBox).ToList()));

            // table as defined by ruling lines
            List<Table> ruledTables = new SpreadsheetExtractionAlgorithm().Extract(minimalRegion);
            if (ruledTables.Count == 0)
            {
                return false;
            }

            int rowsDefinedByLines = ruledTables[0].RowCount;
            int colsDefinedByLines = ruledTables[0].ColumnCount;

            // table as defined by text positions only
            List<Table> textTables = new BasicExtractionAlgorithm().Extract(minimalRegion);
            if (textTables.Count == 0)
            {
                // Without a text-based table there is nothing to compare against; indexing the
                // empty list would throw, so report "not tabular" instead.
                System.Diagnostics.Debug.Write("SpreadsheetExtractionAlgorithm.isTabular(): no table found.");
                return false;
            }

            int rowsDefinedWithoutLines = textTables[0].RowCount;
            int colsDefinedWithoutLines = textTables[0].ColumnCount;

            // Floating-point division: a zero row/column count yields Infinity or NaN rather than
            // an exception, and such a ratio is expected to fail the range check below.
            float ratio = (((float)colsDefinedByLines / colsDefinedWithoutLines) + ((float)rowsDefinedByLines / rowsDefinedWithoutLines)) / 2.0f;

            return ratio > MAGIC_HEURISTIC_NUMBER && ratio < (1 / MAGIC_HEURISTIC_NUMBER);
        }
Пример #2
0
        public void TestTextElementsContainedInPage()
        {
            // Parse the sample document with the ClipPaths option switched on.
            var parsingOptions = new ParsingOptions { ClipPaths = true };

            using (PdfDocument document = PdfDocument.Open("Resources/cs-en-us-pbms.pdf", parsingOptions))
            {
                PageArea firstPage = new ObjectExtractor(document).ExtractPage(1);

                // Every text element of the page must lie inside the page's bounding box.
                foreach (TextElement element in firstPage.GetText())
                {
                    Assert.True(firstPage.BoundingBox.Contains(element.BoundingBox));
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Extracts the tables in the page.
        /// Text chunks are grouped into lines (the table rows) and each chunk is placed in the first
        /// column whose boundary is not to the left of the chunk; boundaries come from the vertical
        /// rulings when they were supplied, otherwise from <c>ColumnPositions</c>.
        /// </summary>
        /// <param name="page">The page where to extract the tables.</param>
        /// <returns>
        /// A list holding exactly one table: <c>Table.EMPTY</c> when the page has no text,
        /// otherwise the table built from the page text.
        /// </returns>
        public List<Table> Extract(PageArea page)
        {
            // Nothing to extract from a page without text.
            if (page.GetText().Count == 0)
            {
                return new List<Table> { Table.EMPTY };
            }

            bool hasVerticalRulings = this.verticalRulings != null;

            // Merge the characters into words, splitting on the vertical rulings when available.
            List<TextChunk> chunks;
            if (hasVerticalRulings)
            {
                chunks = TextElement.MergeWords(page.GetText(), this.verticalRulings);
            }
            else
            {
                chunks = TextElement.MergeWords(page.GetText());
            }

            List<TableLine> textLines = TextChunk.GroupByLines(chunks);

            List<double> columnEdges;
            if (hasVerticalRulings)
            {
                // added by bobld: clipping verticalRulings because testExtractColumnsCorrectly2() fails
                var croppedRulings = Ruling.CropRulingsToArea(this.verticalRulings, page.BoundingBox);
                croppedRulings.Sort(new VerticalRulingComparer());
                columnEdges = croppedRulings.Select(ruling => (double)ruling.Left).ToList();
            }
            else
            {
                columnEdges = ColumnPositions(textLines);
            }

            // added by bobld: remove duplicates because testExtractColumnsCorrectly2() fails,
            // why do we need it here and not in the java version??
            columnEdges = columnEdges.Distinct().ToList();

            Table result = new Table(this);
            result.SetRect(page.BoundingBox);

            for (int row = 0; row < textLines.Count; row++)
            {
                List<TextChunk> rowChunks = textLines[row].TextElements.ToList();
                rowChunks.Sort(new TextChunkComparer());

                foreach (TextChunk chunk in rowChunks)
                {
                    // Chunks made only of white-space characters do not become cells.
                    if (chunk.IsSameChar(TableLine.WHITE_SPACE_CHARS))
                    {
                        continue;
                    }

                    // First boundary at or to the right of the chunk's left edge; when there is
                    // none, the chunk goes into the column after the last boundary.
                    int column = columnEdges.FindIndex(edge => chunk.Left <= edge);
                    if (column < 0)
                    {
                        column = columnEdges.Count;
                    }

                    result.Add(new Cell(chunk), row, column);
                }
            }

            return new List<Table> { result };
        }
Пример #4
0
        /// <summary>
        /// Extracts the tables in the page using rulings as separators.
        /// Cells are found from the collapsed horizontal and vertical rulings, grouped into
        /// spreadsheet areas, and one <c>TableWithRulingLines</c> is built per area.
        /// </summary>
        /// <param name="page">The page whose text is used to fill the cells.</param>
        /// <param name="rulings">The rulings to use as separators; rulings that are neither horizontal nor vertical are ignored.</param>
        /// <returns>The tables found, sorted with <c>TableRectangle.ILL_DEFINED_ORDER</c>.</returns>
        public List<Table> Extract(PageArea page, IReadOnlyList<Ruling> rulings)
        {
            // Partition the rulings by orientation; the horizontal check takes precedence.
            var horizontals = new List<Ruling>();
            var verticals = new List<Ruling>();
            for (int i = 0; i < rulings.Count; i++)
            {
                Ruling candidate = rulings[i];
                if (candidate.IsHorizontal)
                {
                    horizontals.Add(candidate);
                    continue;
                }

                if (candidate.IsVertical)
                {
                    verticals.Add(candidate);
                }
            }

            horizontals = Ruling.CollapseOrientedRulings(horizontals);
            verticals = Ruling.CollapseOrientedRulings(verticals);

            List<Cell> allCells = FindCells(horizontals, verticals);
            List<TableRectangle> areas = FindSpreadsheetsFromCells(allCells.Cast<TableRectangle>().ToList());

            var result = new List<Table>();
            foreach (TableRectangle area in areas)
            {
                // Collect the cells touching this area and fill them with the page text they cover.
                var areaCells = new List<Cell>();
                foreach (Cell cell in allCells)
                {
                    if (!cell.Intersects(area))
                    {
                        continue;
                    }

                    cell.SetTextElements(TextElement.MergeWords(page.GetText(cell.BoundingBox)));
                    areaCells.Add(cell);
                }

                // Keep only the rulings that cross this area.
                List<Ruling> areaHorizontals = horizontals.FindAll(ruling => area.IntersectsLine(ruling));
                List<Ruling> areaVerticals = verticals.FindAll(ruling => area.IntersectsLine(ruling));

                result.Add(new TableWithRulingLines(area, areaCells, areaHorizontals, areaVerticals, this));
            }

            Utils.Sort(result, new TableRectangle.ILL_DEFINED_ORDER());
            return result;
        }
Пример #5
0
        /// <summary>
        /// Detects the tables in the page.
        /// Works in stages: (1) builds candidate table areas from the cells formed by the page's
        /// horizontal and vertical rulings, (2) adjusts those areas using intersecting vertical rulings
        /// and text rows, (3) drops areas that intersect no text row, (4) repeatedly looks for further
        /// tables among the remaining text rows using text edges, and (5) de-duplicates the result.
        /// </summary>
        /// <param name="page">The page area to analyse; its rulings and text are read.</param>
        /// <returns>
        /// The detected table areas, as produced by a <c>SortedSet</c> ordered with <c>TreeSetComparer</c>.
        /// </returns>
        public List <TableRectangle> Detect(PageArea page)
        {
            // get horizontal & vertical lines
            // we get these from an image of the PDF and not the PDF itself because sometimes there are invisible PDF
            // instructions that are interpreted incorrectly as visible elements - we really want to capture what a
            // person sees when they look at the PDF
            // BobLd: hack here, we don't convert to an image
            var           pageRulings       = page.GetRulings();
            List <Ruling> horizontalRulings = this.getHorizontalRulings(pageRulings);
            List <Ruling> verticalRulings   = this.getVerticalRulings(pageRulings);
            // end hack here

            // combined list of both orientations, used only for snapping below
            // NOTE(review): presumably Ruling is a reference type, so snapping allEdges also updates the
            // instances held in horizontalRulings/verticalRulings - confirm
            List <Ruling> allEdges = new List <Ruling>(horizontalRulings);

            allEdges.AddRange(verticalRulings);

            List <TableRectangle> tableAreas = new List <TableRectangle>();

            // if we found some edges, try to find some tables based on them
            if (allEdges.Count > 0)
            {
                // now we need to snap edge endpoints to a grid
                Utils.SnapPoints(allEdges, POINT_SNAP_DISTANCE_THRESHOLD, POINT_SNAP_DISTANCE_THRESHOLD);

                // normalize the rulings to make sure snapping didn't create any wacky non-horizontal/vertical rulings
                foreach (List <Ruling> rulings in new[] { horizontalRulings, verticalRulings }) //Arrays.asList(horizontalRulings, verticalRulings))
                {
                    //for (Iterator<Ruling> iterator = rulings.iterator(); iterator.hasNext();)
                    foreach (var ruling in rulings.ToList()) // iterate over a copy (ToList) so that we can remove from the original list
                    {
                        ruling.Normalize();
                        if (ruling.IsOblique)
                        {
                            rulings.Remove(ruling);
                        }
                    }
                }

                // merge the edge lines into rulings - this makes finding edges between crossing points in the next step easier
                // we use a larger pixel expansion than the normal spreadsheet extraction method to cover gaps in the
                // edge detection/pixel snapping steps
                horizontalRulings = Ruling.CollapseOrientedRulings(horizontalRulings, 5);
                verticalRulings   = Ruling.CollapseOrientedRulings(verticalRulings, 5);

                // use the rulings and points to find cells
                List <TableRectangle> cells = SpreadsheetExtractionAlgorithm.FindCells(horizontalRulings, verticalRulings).Cast <TableRectangle>().ToList();

                // then use those cells to make table areas
                tableAreas = getTableAreasFromCells(cells);
            }

            // next find any vertical rulings that intersect tables - sometimes these won't have completely been captured as
            // cells if there are missing horizontal lines (which there often are)
            // let's assume though that these lines should be part of the table
            foreach (Ruling verticalRuling in verticalRulings) // Line2D.Float
            {
                foreach (TableRectangle tableArea in tableAreas)
                {
                    // only rulings that cross the area without being fully contained in it adjust the area
                    if (verticalRuling.Intersects(tableArea) &&
                        !(tableArea.Contains(verticalRuling.P1) && tableArea.Contains(verticalRuling.P2)))
                    {
                        // the trailing "bobld" notes record what the original (Java) code used instead
                        tableArea.SetTop(Math.Ceiling(Math.Max(tableArea.Top, verticalRuling.Y2)));     // bobld: Floor and Min, Y1
                        tableArea.SetBottom(Math.Floor(Math.Min(tableArea.Bottom, verticalRuling.Y1))); // bobld: Ceiling and Max, Y2
                        // each vertical ruling adjusts at most one table area (the first match)
                        break;
                    }
                }
            }

            /* BobLd: not sure this is the case in tabula-sharp/PdfPig
             * // the tabula Page coordinate space is half the size of the PDFBox image coordinate space
             * // so halve the table area size before proceeding and add a bit of padding to make sure we capture everything
             * foreach (TableRectangle area in tableAreas)
             * {
             *  area.x = (float)Math.floor(area.x / 2) - TABLE_PADDING_AMOUNT;
             *  area.y = (float)Math.floor(area.y / 2) - TABLE_PADDING_AMOUNT;
             *  area.width = (float)Math.ceil(area.width / 2) + TABLE_PADDING_AMOUNT;
             *  area.height = (float)Math.ceil(area.height / 2) + TABLE_PADDING_AMOUNT;
             * }
             *
             * // we're going to want halved horizontal lines later too
             * foreach (Ruling ruling in horizontalRulings) // Line2D.Float
             * {
             *  ruling.x1 = ruling.x1 / 2;
             *  ruling.y1 = ruling.y1 / 2;
             *  ruling.x2 = ruling.x2 / 2;
             *  ruling.y2 = ruling.y2 / 2;
             * }
             */

            // now look at text rows to help us find more tables and flesh out existing ones
            List <TextChunk> textChunks = TextElement.MergeWords(page.GetText());
            List <TableLine> lines      = TextChunk.GroupByLines(textChunks);

            // first look for text rows that intersect an existing table - those lines should probably be part of the table
            foreach (TableLine textRow in lines)
            {
                foreach (TableRectangle tableArea in tableAreas)
                {
                    if (!tableArea.Contains(textRow) && textRow.Intersects(tableArea))
                    {
                        // widen the area horizontally so that it covers the whole text row
                        tableArea.SetLeft(Math.Floor(Math.Min(textRow.Left, tableArea.Left)));
                        tableArea.SetRight(Math.Ceiling(Math.Max(textRow.Right, tableArea.Right)));
                    }
                }
            }

            // get rid of tables that DO NOT intersect any text areas - these are likely graphs or some sort of graphic
            //for (Iterator<Rectangle> iterator = tableAreas.iterator(); iterator.hasNext();)
            foreach (TableRectangle table in tableAreas.ToList()) // iterate over a copy (ToList) so that we can remove from the original list
            {
                bool intersectsText = false;
                foreach (TableLine textRow in lines)
                {
                    if (table.Intersects(textRow))
                    {
                        intersectsText = true;
                        break;
                    }
                }

                if (!intersectsText)
                {
                    tableAreas.Remove(table);
                }
            }

            // lastly, there may be some tables that don't have any vertical rulings at all
            // we'll use text edges we've found to try and guess which text rows are part of a table

            // in his thesis nurminen goes through every row to try to assign a probability that the line is in a table
            // we're going to try a general heuristic instead, trying to find what type of edge (left/right/mid) intersects
            // the most text rows, and then use that magic number of "relevant" edges to decide what text rows should be
            // part of a table.

            bool foundTable;

            // keep searching until a pass adds no new table; each pass first discards the text rows
            // already covered by a known table
            do
            {
                foundTable = false;

                // get rid of any text lines contained within existing tables, this allows us to find more tables
                //for (Iterator<TableLine> iterator = lines.iterator(); iterator.hasNext();)
                foreach (var textRow in lines.ToList())
                {
                    foreach (TableRectangle table in tableAreas)
                    {
                        if (table.Contains(textRow))
                        {
                            lines.Remove(textRow);
                            break;
                        }
                    }
                }

                // get text edges from remaining lines in the document
                TextEdges textEdges = getTextEdges(lines);
                //List<TextEdge> leftTextEdges = textEdges[TextEdge.LEFT];
                //List<TextEdge> midTextEdges = textEdges[TextEdge.MID];
                //List<TextEdge> rightTextEdges = textEdges[TextEdge.RIGHT];

                // find the relevant text edges (the ones we think define where a table is)
                RelevantEdges relevantEdgeInfo = getRelevantEdges(textEdges, lines);

                // we found something relevant so let's look for rows that fit our criteria
                // (an edgeType of -1 is treated as "nothing relevant found")
                if (relevantEdgeInfo.edgeType != -1)
                {
                    // NOTE(review): the switch has no default case, so relevantEdges stays null if edgeType
                    // is neither -1 nor LEFT/MID/RIGHT - confirm getTableFromText tolerates null
                    List <TextEdge> relevantEdges = null;
                    switch (relevantEdgeInfo.edgeType)
                    {
                    case TextEdge.LEFT:
                        relevantEdges = textEdges[TextEdge.LEFT];       // leftTextEdges;
                        break;

                    case TextEdge.MID:
                        relevantEdges = textEdges[TextEdge.MID];        // midTextEdges;
                        break;

                    case TextEdge.RIGHT:
                        relevantEdges = textEdges[TextEdge.RIGHT];      // rightTextEdges;
                        break;
                    }

                    TableRectangle table = getTableFromText(lines, relevantEdges, relevantEdgeInfo.edgeCount, horizontalRulings);

                    if (table != null)
                    {
                        foundTable = true;
                        tableAreas.Add(table);
                    }
                }
            } while (foundTable);

            // create a set of our current tables that will eliminate duplicate tables
            // (which tables count as duplicates is decided by TreeSetComparer, not visible here)
            SortedSet <TableRectangle> tableSet = new SortedSet <TableRectangle>(new TreeSetComparer()); //Set<Rectangle> tableSet = new TreeSet<>(new Comparator<Rectangle>() {...

            // insert the largest areas first; presumably so that the larger of two "equal" tables is the one kept - confirm
            foreach (var table in tableAreas.OrderByDescending(t => t.Area))
            {
                tableSet.Add(table);
            }

            return(tableSet.ToList());
        }