/// <summary>
/// Heuristic check of whether the page looks like a ruled (spreadsheet-like) table.
/// <para>
/// The page is cropped to the bounds of its text, then extracted twice: once with
/// <c>SpreadsheetExtractionAlgorithm</c> (rows/columns defined by ruling lines) and once with
/// <c>BasicExtractionAlgorithm</c> (rows/columns defined without lines). The row-count ratio and
/// column-count ratio of the first table from each run are averaged, and the page is considered
/// tabular when that average lies strictly between <c>MAGIC_HEURISTIC_NUMBER</c> and
/// <c>1 / MAGIC_HEURISTIC_NUMBER</c>.
/// </para>
/// </summary>
/// <param name="page">The page to test.</param>
/// <returns>
/// <c>true</c> if both extractions roughly agree on the table dimensions; <c>false</c> otherwise,
/// including when the page has no text or when either extraction yields no usable table.
/// </returns>
public bool IsTabular(PageArea page)
{
    // if there's no text at all on the page, it's not a table
    // (we won't be able to do anything with it though)
    var pageText = page.GetText();
    if (pageText.Count == 0)
    {
        return false;
    }

    // get minimal region of page that contains every character (in effect,
    // removes white "margins")
    PageArea minimalRegion = page.GetArea(Utils.Bounds(pageText.Select(t => t.BoundingBox).ToList()));

    List<Table> tables = new SpreadsheetExtractionAlgorithm().Extract(minimalRegion);
    if (tables.Count == 0)
    {
        return false;
    }

    Table table = tables[0];
    int rowsDefinedByLines = table.RowCount;
    int colsDefinedByLines = table.ColumnCount;

    tables = new BasicExtractionAlgorithm().Extract(minimalRegion);
    if (tables.Count == 0)
    {
        // Without a line-less extraction there is nothing to compare against, so the page
        // cannot be confirmed as tabular. Previously execution fell through to tables[0]
        // and threw ArgumentOutOfRangeException.
        System.Diagnostics.Debug.Write("SpreadsheetExtractionAlgorithm.isTabular(): no table found.");
        return false;
    }

    table = tables[0];
    int rowsDefinedWithoutLines = table.RowCount;
    int colsDefinedWithoutLines = table.ColumnCount;

    // An empty comparison table would make the ratio Infinity/NaN, which can never fall
    // inside the accepted range; make that outcome explicit instead of relying on
    // floating-point division by zero.
    if (rowsDefinedWithoutLines == 0 || colsDefinedWithoutLines == 0)
    {
        return false;
    }

    float ratio = (((float)colsDefinedByLines / colsDefinedWithoutLines) +
                   ((float)rowsDefinedByLines / rowsDefinedWithoutLines)) / 2.0f;

    return ratio > MAGIC_HEURISTIC_NUMBER && ratio < (1 / MAGIC_HEURISTIC_NUMBER);
}
/// <summary>
/// Checks that every text element extracted from page 1 of the sample document
/// lies inside the bounding box of that page.
/// </summary>
public void TestTextElementsContainedInPage()
{
    ParsingOptions parsingOptions = new ParsingOptions() { ClipPaths = true };

    using (PdfDocument document = PdfDocument.Open("Resources/cs-en-us-pbms.pdf", parsingOptions))
    {
        PageArea firstPage = new ObjectExtractor(document).ExtractPage(1);

        foreach (TextElement element in firstPage.GetText())
        {
            bool insidePage = firstPage.BoundingBox.Contains(element.BoundingBox);
            Assert.True(insidePage);
        }
    }
}
/// <summary>
/// Extracts a single table from the page by grouping its text into lines and assigning
/// each text chunk to a column.
/// <para>
/// Column boundaries come from the configured vertical rulings when present, otherwise
/// they are computed from the text lines.
/// </para>
/// </summary>
/// <param name="page">The page where to extract the tables.</param>
/// <returns>
/// A one-element list: the extracted table, or <c>Table.EMPTY</c> when the page has no text.
/// </returns>
public List<Table> Extract(PageArea page)
{
    List<TextElement> textElements = page.GetText();
    if (textElements.Count == 0)
    {
        return new List<Table> { Table.EMPTY };
    }

    List<TextChunk> textChunks;
    if (this.verticalRulings == null)
    {
        textChunks = TextElement.MergeWords(page.GetText());
    }
    else
    {
        textChunks = TextElement.MergeWords(page.GetText(), this.verticalRulings);
    }

    List<TableLine> lines = TextChunk.GroupByLines(textChunks);

    List<double> columns;
    if (this.verticalRulings == null)
    {
        columns = ColumnPositions(lines);
    }
    else
    {
        // bobld: the rulings are clipped to the page area because
        // testExtractColumnsCorrectly2() fails otherwise
        var clippedRulings = Ruling.CropRulingsToArea(this.verticalRulings, page.BoundingBox);
        clippedRulings.Sort(new VerticalRulingComparer());

        columns = new List<double>(clippedRulings.Count);
        foreach (Ruling clipped in clippedRulings)
        {
            columns.Add(clipped.Left);
        }
    }

    // bobld: duplicates are removed because testExtractColumnsCorrectly2() fails otherwise
    // (open question: why is this needed here and not in the java version?)
    columns = columns.Distinct().ToList();

    Table result = new Table(this);
    result.SetRect(page.BoundingBox);

    for (int row = 0; row < lines.Count; row++)
    {
        List<TextChunk> rowChunks = lines[row].TextElements.ToList();
        rowChunks.Sort(new TextChunkComparer());

        foreach (TextChunk chunk in rowChunks)
        {
            if (!chunk.IsSameChar(TableLine.WHITE_SPACE_CHARS))
            {
                // the first boundary at or to the right of the chunk's left edge decides
                // the column; chunks beyond every boundary go into one extra last column
                int column = columns.FindIndex(boundary => chunk.Left <= boundary);
                if (column < 0)
                {
                    column = columns.Count;
                }

                result.Add(new Cell(chunk), row, column);
            }
        }
    }

    return new List<Table> { result };
}
/// <summary>
/// Extracts the tables in the page, using the supplied rulings as cell separators.
/// </summary>
/// <param name="page">The page providing the text that fills the cells.</param>
/// <param name="rulings">
/// The rulings to use; only horizontal and vertical ones are considered, the rest are ignored.
/// </param>
/// <returns>One table per spreadsheet area found, sorted with <c>TableRectangle.ILL_DEFINED_ORDER</c>.</returns>
public List<Table> Extract(PageArea page, IReadOnlyList<Ruling> rulings)
{
    // separate the rulings by orientation
    List<Ruling> horizontals = new List<Ruling>();
    List<Ruling> verticals = new List<Ruling>();
    for (int i = 0; i < rulings.Count; i++)
    {
        Ruling candidate = rulings[i];
        if (candidate.IsHorizontal)
        {
            horizontals.Add(candidate);
            continue;
        }

        if (candidate.IsVertical)
        {
            verticals.Add(candidate);
        }
    }

    horizontals = Ruling.CollapseOrientedRulings(horizontals);
    verticals = Ruling.CollapseOrientedRulings(verticals);

    List<Cell> cells = FindCells(horizontals, verticals);
    List<TableRectangle> cellRectangles = cells.Cast<TableRectangle>().ToList();
    List<TableRectangle> spreadsheetAreas = FindSpreadsheetsFromCells(cellRectangles);

    List<Table> result = new List<Table>();
    foreach (TableRectangle area in spreadsheetAreas)
    {
        // cells touching this area are filled with the page text found inside them
        List<Cell> areaCells = new List<Cell>();
        foreach (Cell cell in cells)
        {
            if (!cell.Intersects(area))
            {
                continue;
            }

            cell.SetTextElements(TextElement.MergeWords(page.GetText(cell.BoundingBox)));
            areaCells.Add(cell);
        }

        // rulings crossing this area, kept per orientation
        List<Ruling> areaHorizontals = horizontals.FindAll(h => area.IntersectsLine(h));
        List<Ruling> areaVerticals = verticals.FindAll(v => area.IntersectsLine(v));

        result.Add(new TableWithRulingLines(area, areaCells, areaHorizontals, areaVerticals, this));
    }

    Utils.Sort(result, new TableRectangle.ILL_DEFINED_ORDER());
    return result;
}
/// <summary>
/// Detects the tables in the page.
/// <para>
/// Detection runs in stages: (1) build table areas from cells formed by the page's horizontal
/// and vertical rulings; (2) adjust those areas using vertical rulings and text rows that
/// intersect them; (3) drop areas that intersect no text row; (4) repeatedly look for further
/// tables among the remaining text rows using text edges; (5) de-duplicate the result.
/// </para>
/// </summary>
/// <param name="page">The page to analyse; its rulings and text are read.</param>
/// <returns>
/// The detected table areas, in the order produced by a <c>SortedSet</c> using
/// <c>TreeSetComparer</c>. The list is empty when nothing is detected.
/// </returns>
public List <TableRectangle> Detect(PageArea page)
{
    // get horizontal & vertical lines
    // (the original implementation got these from an image of the PDF and not the PDF itself, because
    // sometimes there are invisible PDF instructions that are interpreted incorrectly as visible
    // elements - we really want to capture what a person sees when they look at the PDF)
    // BobLd: hack here, we don't convert to an image - the rulings are taken directly from the page
    var pageRulings = page.GetRulings();
    List <Ruling> horizontalRulings = this.getHorizontalRulings(pageRulings);
    List <Ruling> verticalRulings = this.getVerticalRulings(pageRulings);
    // end hack here

    // allEdges holds the same Ruling objects as the two lists above (new list, shared elements)
    List <Ruling> allEdges = new List <Ruling>(horizontalRulings);
    allEdges.AddRange(verticalRulings);

    List <TableRectangle> tableAreas = new List <TableRectangle>();

    // if we found some edges, try to find some tables based on them
    if (allEdges.Count > 0)
    {
        // now we need to snap edge endpoints to a grid
        // (same threshold passed for both threshold arguments)
        Utils.SnapPoints(allEdges, POINT_SNAP_DISTANCE_THRESHOLD, POINT_SNAP_DISTANCE_THRESHOLD);

        // normalize the rulings to make sure snapping didn't create any wacky non-horizontal/vertical rulings
        foreach (List <Ruling> rulings in new[] { horizontalRulings, verticalRulings })
        {
            // iterate over a copy (ToList) so that oblique rulings can be removed from the real list
            foreach (var ruling in rulings.ToList())
            {
                ruling.Normalize();
                if (ruling.IsOblique)
                {
                    rulings.Remove(ruling);
                }
            }
        }

        // merge the edge lines into rulings - this makes finding edges between crossing points in the next step easier
        // we use a larger pixel expansion than the normal spreadsheet extraction method to cover gaps in the
        // edge detection/pixel snapping steps
        horizontalRulings = Ruling.CollapseOrientedRulings(horizontalRulings, 5);
        verticalRulings = Ruling.CollapseOrientedRulings(verticalRulings, 5);

        // use the rulings and points to find cells
        List <TableRectangle> cells = SpreadsheetExtractionAlgorithm.FindCells(horizontalRulings, verticalRulings).Cast <TableRectangle>().ToList();

        // then use those cells to make table areas
        tableAreas = getTableAreasFromCells(cells);
    }

    // next find any vertical rulings that intersect tables - sometimes these won't have completely been captured as
    // cells if there are missing horizontal lines (which there often are)
    // let's assume though that these lines should be part of the table
    foreach (Ruling verticalRuling in verticalRulings)
    {
        foreach (TableRectangle tableArea in tableAreas)
        {
            // only rulings that cross the area without both end points lying inside it
            if (verticalRuling.Intersects(tableArea) &&
                !(tableArea.Contains(verticalRuling.P1) && tableArea.Contains(verticalRuling.P2)))
            {
                // bobld: the Java version used Floor/Min with Y1 for the top and Ceiling/Max with Y2 for
                // the bottom; the operations are swapped in this port.
                // NOTE(review): presumably because of a flipped y-axis in this port - confirm.
                tableArea.SetTop(Math.Ceiling(Math.Max(tableArea.Top, verticalRuling.Y2)));
                tableArea.SetBottom(Math.Floor(Math.Min(tableArea.Bottom, verticalRuling.Y1)));

                // a ruling adjusts at most one table area (the first match)
                break;
            }
        }
    }

    /* BobLd: not sure this is the case in tabula-sharp/PdfPig
     * // the tabula Page coordinate space is half the size of the PDFBox image coordinate space
     * // so halve the table area size before proceeding and add a bit of padding to make sure we capture everything
     * foreach (TableRectangle area in tableAreas)
     * {
     *     area.x = (float)Math.floor(area.x / 2) - TABLE_PADDING_AMOUNT;
     *     area.y = (float)Math.floor(area.y / 2) - TABLE_PADDING_AMOUNT;
     *     area.width = (float)Math.ceil(area.width / 2) + TABLE_PADDING_AMOUNT;
     *     area.height = (float)Math.ceil(area.height / 2) + TABLE_PADDING_AMOUNT;
     * }
     *
     * // we're going to want halved horizontal lines later too
     * foreach (Ruling ruling in horizontalRulings) // Line2D.Float
     * {
     *     ruling.x1 = ruling.x1 / 2;
     *     ruling.y1 = ruling.y1 / 2;
     *     ruling.x2 = ruling.x2 / 2;
     *     ruling.y2 = ruling.y2 / 2;
     * }
     */

    // now look at text rows to help us find more tables and flesh out existing ones
    List <TextChunk> textChunks = TextElement.MergeWords(page.GetText());
    List <TableLine> lines = TextChunk.GroupByLines(textChunks);

    // first look for text rows that intersect an existing table - those lines should probably be part of the table
    // (the area is widened horizontally to cover rows that overlap it without being contained in it)
    foreach (TableLine textRow in lines)
    {
        foreach (TableRectangle tableArea in tableAreas)
        {
            if (!tableArea.Contains(textRow) && textRow.Intersects(tableArea))
            {
                tableArea.SetLeft(Math.Floor(Math.Min(textRow.Left, tableArea.Left)));
                tableArea.SetRight(Math.Ceiling(Math.Max(textRow.Right, tableArea.Right)));
            }
        }
    }

    // get rid of tables that DO NOT intersect any text areas - these are likely graphs or some sort of graphic
    // (iterating over a copy so that entries can be removed from tableAreas)
    foreach (TableRectangle table in tableAreas.ToList())
    {
        bool intersectsText = false;
        foreach (TableLine textRow in lines)
        {
            if (table.Intersects(textRow))
            {
                intersectsText = true;
                break;
            }
        }

        if (!intersectsText)
        {
            tableAreas.Remove(table);
        }
    }

    // lastly, there may be some tables that don't have any vertical rulings at all
    // we'll use text edges we've found to try and guess which text rows are part of a table

    // in his thesis nurminen goes through every row to try to assign a probability that the line is in a table
    // we're going to try a general heuristic instead, trying to find what type of edge (left/right/mid) intersects
    // the most text rows, and then use that magic number of "relevant" edges to decide what text rows should be
    // part of a table.

    // Each pass may add one table; the loop repeats until a pass finds none.
    bool foundTable;
    do
    {
        foundTable = false;

        // get rid of any text lines contained within existing tables, this allows us to find more tables
        // (iterating over a copy so that rows can be removed from lines)
        foreach (var textRow in lines.ToList())
        {
            foreach (TableRectangle table in tableAreas)
            {
                if (table.Contains(textRow))
                {
                    lines.Remove(textRow);
                    break;
                }
            }
        }

        // get text edges from remaining lines in the document
        TextEdges textEdges = getTextEdges(lines);

        // find the relevant text edges (the ones we think define where a table is)
        RelevantEdges relevantEdgeInfo = getRelevantEdges(textEdges, lines);

        // we found something relevant so let's look for rows that fit our criteria
        // (an edgeType of -1 is treated as "nothing relevant found")
        if (relevantEdgeInfo.edgeType != -1)
        {
            // stays null if edgeType is none of LEFT / MID / RIGHT
            List <TextEdge> relevantEdges = null;
            switch (relevantEdgeInfo.edgeType)
            {
                case TextEdge.LEFT:
                    relevantEdges = textEdges[TextEdge.LEFT];
                    break;
                case TextEdge.MID:
                    relevantEdges = textEdges[TextEdge.MID];
                    break;
                case TextEdge.RIGHT:
                    relevantEdges = textEdges[TextEdge.RIGHT];
                    break;
            }

            TableRectangle table = getTableFromText(lines, relevantEdges, relevantEdgeInfo.edgeCount, horizontalRulings);

            if (table != null)
            {
                foundTable = true;
                tableAreas.Add(table);
            }
        }
    } while (foundTable);

    // create a set of our current tables that will eliminate duplicate tables
    // (what counts as a duplicate is decided by TreeSetComparer, which is not shown here)
    SortedSet <TableRectangle> tableSet = new SortedSet <TableRectangle>(new TreeSetComparer());

    // larger areas are inserted first, so when the comparer treats two areas as equal
    // the larger one is the one that is kept
    foreach (var table in tableAreas.OrderByDescending(t => t.Area))
    {
        tableSet.Add(table);
    }

    return(tableSet.ToList());
}