/// <summary>
/// Feeds the two ruling lists of <c>TWO_SINGLE_CELL_RULINGS</c> to
/// <c>SpreadsheetExtractionAlgorithm.FindCells</c> and checks that exactly two cells
/// come back and that they do not intersect each other.
/// </summary>
public void TestDetectTwoSingleCells()
{
    var firstRulings = TWO_SINGLE_CELL_RULINGS[0].ToList();
    var secondRulings = TWO_SINGLE_CELL_RULINGS[1].ToList();

    List<Cell> detected = SpreadsheetExtractionAlgorithm.FindCells(firstRulings, secondRulings);

    Assert.Equal(2, detected.Count);

    // The two detected cells are distinct regions: they must not overlap.
    Cell first = detected[0];
    Cell second = detected[1];
    Assert.False(first.Intersects(second));
}
/// <summary>
/// Detects the tables in the page from its horizontal and vertical rulings.
/// </summary>
/// <param name="page">The page where to detect the tables.</param>
/// <returns>
/// The rectangles produced by <c>SpreadsheetExtractionAlgorithm.FindSpreadsheetsFromCells</c>,
/// sorted with <c>TableRectangle.ILL_DEFINED_ORDER</c>.
/// </returns>
public List<TableRectangle> Detect(PageArea page)
{
    var horizontal = page.HorizontalRulings;
    var vertical = page.VerticalRulings;

    // Find the cells delimited by the rulings and view them as plain rectangles.
    List<TableRectangle> cellRectangles = SpreadsheetExtractionAlgorithm
        .FindCells(horizontal, vertical)
        .Cast<TableRectangle>()
        .ToList();

    List<TableRectangle> detected = SpreadsheetExtractionAlgorithm.FindSpreadsheetsFromCells(cellRectangles);

    // The intent is for tables to be returned from top to bottom on the page.
    Utils.Sort(detected, new TableRectangle.ILL_DEFINED_ORDER());

    return detected;
}
/// <summary>
/// Feeds the two ruling lists of <c>SINGLE_CELL_RULINGS</c> to
/// <c>SpreadsheetExtractionAlgorithm.FindCells</c> and checks that a single cell with the
/// expected position and size is found (compared with <c>Utils.Feq</c>).
/// </summary>
public void TestDetectSingleCell()
{
    const double expectedLeft = 151.65355;
    const double expectedBottom = 185.6693;
    const double expectedWidth = 229.08083;
    const double expectedHeight = 128.97636;

    var firstRulings = SINGLE_CELL_RULINGS[0].ToList();
    var secondRulings = SINGLE_CELL_RULINGS[1].ToList();
    List<Cell> detected = SpreadsheetExtractionAlgorithm.FindCells(firstRulings, secondRulings);

    // Assert.Single fails unless there is exactly one element, and hands that element back.
    Cell only = Assert.Single(detected);

    Assert.True(Utils.Feq(expectedLeft, only.Left));
    // Port note carried over from the original test: this value was checked through .getTop().
    Assert.True(Utils.Feq(expectedBottom, only.Bottom));
    Assert.True(Utils.Feq(expectedWidth, only.Width));
    Assert.True(Utils.Feq(expectedHeight, only.Height));
}
/// <summary>
/// Builds cells from <c>HORIZONTAL_RULING_LINES</c> and <c>VERTICAL_RULING_LINES</c>, sorts both
/// the result and <c>EXPECTED_CELLS</c> with <c>TableRectangle.ILL_DEFINED_ORDER</c>, and checks
/// that the two lists match in size, element by element, and as whole collections.
/// </summary>
public void TestLinesToCells()
{
    var horizontal = HORIZONTAL_RULING_LINES.ToList();
    var vertical = VERTICAL_RULING_LINES.ToList();

    List<Cell> actual = SpreadsheetExtractionAlgorithm.FindCells(horizontal, vertical);
    Utils.Sort(actual, new TableRectangle.ILL_DEFINED_ORDER());

    List<Cell> wanted = EXPECTED_CELLS.ToList();
    Utils.Sort(wanted, new TableRectangle.ILL_DEFINED_ORDER());

    Assert.Equal(wanted.Count, actual.Count);

    // Element-wise comparison first, so a failure is reported for the individual cell.
    int position = 0;
    foreach (Cell wantedCell in wanted)
    {
        Assert.Equal(wantedCell, actual[position]);
        position++;
    }

    // Then compare the two collections as a whole.
    Assert.Equal(wanted, actual);
}
/// <summary>
/// Detects the tables in the page.
/// </summary>
/// <remarks>
/// The method runs the following passes, in order:
/// <list type="number">
/// <item>Read the page rulings and split them into horizontal and vertical lists.</item>
/// <item>If any ruling exists: snap endpoints, normalize, drop oblique rulings, collapse the
/// rulings, find cells and turn the cells into candidate table areas.</item>
/// <item>Adjust the top/bottom of a table area for vertical rulings that intersect it without
/// having both endpoints inside it.</item>
/// <item>Widen table areas to the left/right for text rows that intersect them without being
/// contained in them.</item>
/// <item>Drop table areas that intersect no text row.</item>
/// <item>Repeatedly look for additional tables using text edges of the rows not yet contained
/// in a table, until no new table is found.</item>
/// <item>De-duplicate the table areas through a <c>SortedSet</c> using <c>TreeSetComparer</c>.</item>
/// </list>
/// </remarks>
/// <param name="page">The page where to detect the tables.</param>
/// <returns>The table areas left in the de-duplicating set, as a list.</returns>
public List <TableRectangle> Detect(PageArea page)
{
    // get horizontal & vertical lines
    // the algorithm this was ported from gets these from an image of the PDF and not the PDF itself because
    // sometimes there are invisible PDF instructions that are interpreted incorrectly as visible elements -
    // we really want to capture what a person sees when they look at the PDF
    // BobLd: hack here, we don't convert to an image - the rulings come straight from page.GetRulings()
    var pageRulings = page.GetRulings();
    List <Ruling> horizontalRulings = this.getHorizontalRulings(pageRulings);
    List <Ruling> verticalRulings = this.getVerticalRulings(pageRulings);
    // end hack here

    // allEdges is a new list holding the same Ruling instances as the two lists above
    List <Ruling> allEdges = new List <Ruling>(horizontalRulings);
    allEdges.AddRange(verticalRulings);

    List <TableRectangle> tableAreas = new List <TableRectangle>();

    // if we found some edges, try to find some tables based on them
    if (allEdges.Count > 0)
    {
        // now we need to snap edge endpoints to a grid
        // (the same threshold constant is passed for both threshold arguments)
        Utils.SnapPoints(allEdges, POINT_SNAP_DISTANCE_THRESHOLD, POINT_SNAP_DISTANCE_THRESHOLD);

        // normalize the rulings to make sure snapping didn't create any wacky non-horizontal/vertical rulings
        foreach (List <Ruling> rulings in new[] { horizontalRulings, verticalRulings }) // Java original: Arrays.asList(horizontalRulings, verticalRulings)
        {
            // Java original iterated with an Iterator so it could remove while iterating:
            //for (Iterator<Ruling> iterator = rulings.iterator(); iterator.hasNext();)
            foreach (var ruling in rulings.ToList()) // iterate over a copy (ToList) so we can remove from 'rulings'
            {
                ruling.Normalize();
                if (ruling.IsOblique)
                {
                    rulings.Remove(ruling);
                }
            }
        }

        // merge the edge lines into rulings - this makes finding edges between crossing points in the next step easier
        // we use a larger pixel expansion than the normal spreadsheet extraction method to cover gaps in the
        // edge detection/pixel snapping steps
        // NOTE: from here on horizontalRulings/verticalRulings refer to the collapsed lists; allEdges is not updated
        horizontalRulings = Ruling.CollapseOrientedRulings(horizontalRulings, 5);
        verticalRulings = Ruling.CollapseOrientedRulings(verticalRulings, 5);

        // use the rulings and points to find cells
        List <TableRectangle> cells = SpreadsheetExtractionAlgorithm.FindCells(horizontalRulings, verticalRulings).Cast <TableRectangle>().ToList();

        // then use those cells to make table areas
        tableAreas = getTableAreasFromCells(cells);
    }

    // next find any vertical rulings that intersect tables - sometimes these won't have completely been captured as
    // cells if there are missing horizontal lines (which there often are)
    // let's assume though that these lines should be part of the table
    foreach (Ruling verticalRuling in verticalRulings) // Java original type: Line2D.Float
    {
        foreach (TableRectangle tableArea in tableAreas)
        {
            // the ruling touches the area but at least one of its endpoints lies outside of it
            if (verticalRuling.Intersects(tableArea) && !(tableArea.Contains(verticalRuling.P1) && tableArea.Contains(verticalRuling.P2)))
            {
                // NOTE(review): the bobld notes record that the Java original used Floor/Min/Y1 for the top and
                // Ceiling/Max/Y2 for the bottom - presumably swapped here for a different y-axis direction; confirm
                tableArea.SetTop(Math.Ceiling(Math.Max(tableArea.Top, verticalRuling.Y2))); // bobld: Floor and Min, Y1
                tableArea.SetBottom(Math.Floor(Math.Min(tableArea.Bottom, verticalRuling.Y1))); // bobld: Ceiling and Max, Y2

                // only the first matching table area is adjusted for each vertical ruling
                break;
            }
        }
    }

    /* BobLd: not sure this is the case in tabula-sharp/PdfPig
     * // the tabula Page coordinate space is half the size of the PDFBox image coordinate space
     * // so halve the table area size before proceeding and add a bit of padding to make sure we capture everything
     * foreach (TableRectangle area in tableAreas)
     * {
     *     area.x = (float)Math.floor(area.x / 2) - TABLE_PADDING_AMOUNT;
     *     area.y = (float)Math.floor(area.y / 2) - TABLE_PADDING_AMOUNT;
     *     area.width = (float)Math.ceil(area.width / 2) + TABLE_PADDING_AMOUNT;
     *     area.height = (float)Math.ceil(area.height / 2) + TABLE_PADDING_AMOUNT;
     * }
     *
     * // we're going to want halved horizontal lines later too
     * foreach (Ruling ruling in horizontalRulings) // Line2D.Float
     * {
     *     ruling.x1 = ruling.x1 / 2;
     *     ruling.y1 = ruling.y1 / 2;
     *     ruling.x2 = ruling.x2 / 2;
     *     ruling.y2 = ruling.y2 / 2;
     * }
     */

    // now look at text rows to help us find more tables and flesh out existing ones
    List <TextChunk> textChunks = TextElement.MergeWords(page.GetText());
    List <TableLine> lines = TextChunk.GroupByLines(textChunks);

    // first look for text rows that intersect an existing table - those lines should probably be part of the table
    foreach (TableLine textRow in lines)
    {
        foreach (TableRectangle tableArea in tableAreas)
        {
            // the row overlaps the area without being fully inside it: widen the area horizontally to cover the row
            if (!tableArea.Contains(textRow) && textRow.Intersects(tableArea))
            {
                tableArea.SetLeft(Math.Floor(Math.Min(textRow.Left, tableArea.Left)));
                tableArea.SetRight(Math.Ceiling(Math.Max(textRow.Right, tableArea.Right)));
            }
        }
    }

    // get rid of tables that DO NOT intersect any text areas - these are likely graphs or some sort of graphic
    // Java original iterated with an Iterator so it could remove while iterating:
    //for (Iterator<Rectangle> iterator = tableAreas.iterator(); iterator.hasNext();)
    foreach (TableRectangle table in tableAreas.ToList()) // iterate over a copy (ToList) so we can remove from 'tableAreas'
    {
        bool intersectsText = false;
        foreach (TableLine textRow in lines)
        {
            if (table.Intersects(textRow))
            {
                intersectsText = true;
                break;
            }
        }

        if (!intersectsText)
        {
            tableAreas.Remove(table);
        }
    }

    // lastly, there may be some tables that don't have any vertical rulings at all
    // we'll use text edges we've found to try and guess which text rows are part of a table

    // in his thesis nurminen goes through every row to try to assign a probability that the line is in a table
    // we're going to try a general heuristic instead, trying to find what type of edge (left/right/mid) intersects
    // the most text rows, and then use that magic number of "relevant" edges to decide what text rows should be
    // part of a table.

    // each pass adds at most one table; the loop repeats for as long as a pass found one
    bool foundTable;
    do
    {
        foundTable = false;

        // get rid of any text lines contained within existing tables, this allows us to find more tables
        // Java original iterated with an Iterator so it could remove while iterating:
        //for (Iterator<TableLine> iterator = lines.iterator(); iterator.hasNext();)
        foreach (var textRow in lines.ToList()) // iterate over a copy so we can remove from 'lines'
        {
            foreach (TableRectangle table in tableAreas)
            {
                if (table.Contains(textRow))
                {
                    lines.Remove(textRow);
                    break;
                }
            }
        }

        // get text edges from remaining lines in the document
        TextEdges textEdges = getTextEdges(lines);
        // leftover from the Java original, where the three edge lists were pulled out into locals:
        //List<TextEdge> leftTextEdges = textEdges[TextEdge.LEFT];
        //List<TextEdge> midTextEdges = textEdges[TextEdge.MID];
        //List<TextEdge> rightTextEdges = textEdges[TextEdge.RIGHT];

        // find the relevant text edges (the ones we think define where a table is)
        RelevantEdges relevantEdgeInfo = getRelevantEdges(textEdges, lines);

        // we found something relevant so let's look for rows that fit our criteria
        // (an edgeType of -1 is treated as "nothing relevant found")
        if (relevantEdgeInfo.edgeType != -1)
        {
            // stays null if edgeType is none of LEFT/MID/RIGHT; it is then passed as null to getTableFromText
            List <TextEdge> relevantEdges = null;
            switch (relevantEdgeInfo.edgeType)
            {
                case TextEdge.LEFT:
                    relevantEdges = textEdges[TextEdge.LEFT]; // leftTextEdges;
                    break;
                case TextEdge.MID:
                    relevantEdges = textEdges[TextEdge.MID]; // midTextEdges;
                    break;
                case TextEdge.RIGHT:
                    relevantEdges = textEdges[TextEdge.RIGHT]; // rightTextEdges;
                    break;
            }

            TableRectangle table = getTableFromText(lines, relevantEdges, relevantEdgeInfo.edgeCount, horizontalRulings);

            if (table != null)
            {
                foundTable = true;
                tableAreas.Add(table);
            }
        }
    } while (foundTable);

    // create a set of our current tables that will eliminate duplicate tables
    // (what counts as a duplicate is decided by TreeSetComparer, which is defined outside this method)
    SortedSet <TableRectangle> tableSet = new SortedSet <TableRectangle>(new TreeSetComparer());
    // Java original: Set<Rectangle> tableSet = new TreeSet<>(new Comparator<Rectangle>() {...

    // tables are offered to the set from largest to smallest Area, so when the comparer treats two tables
    // as equal the larger one is the one that is kept
    foreach (var table in tableAreas.OrderByDescending(t => t.Area))
    {
        tableSet.Add(table);
    }

    return(tableSet.ToList());
}