/// <summary>
/// Two rectangles that share part of their vertical range: the one further to the left
/// must compare as smaller.
/// </summary>
public void TestCompareVerticalOverlapRectangle()
{
    var leftBox = new TableRectangle(new PdfRectangle(0, 0, 10, 5));    // was: 5f, 0f, 10f, 10f
    var rightBox = new TableRectangle(new PdfRectangle(10, 0, 20, 10)); // was: 0f, 10f, 10f, 10f

    int order = leftBox.CompareTo(rightBox);

    Assert.True(order < 0);
}
/// <summary>
/// Two rectangles with the same x-range stacked on top of each other: the upper one
/// precedes the lower one (reading order), so comparing lower to upper is positive.
/// </summary>
public void TestCompareAlignedVerticalRectangle()
{
    var lowerBox = new TableRectangle(new PdfRectangle(0, 10, 10, 20)); // was: 10f, 0f, 10f, 10f
    var upperBox = new TableRectangle(new PdfRectangle(0, 20, 10, 30)); // was: 20f, 0f, 10f, 10f

    int order = lowerBox.CompareTo(upperBox);

    // upper precedes lower (reading order); the assertion used to be "< 0"
    Assert.True(order > 0);
}
/// <summary>
/// The two rectangles overlap vertically by only 0.2 units (y 9.8..10): the upper one is
/// still expected to precede the lower one, so comparing lower to upper is positive.
/// </summary>
public void TestCompareVerticalOverlapLessThresholdRectangle()
{
    var lowerBox = new TableRectangle(new PdfRectangle(10, 0, 20, 10));    // was: 0f, 10f, 10f, 10f
    var upperBox = new TableRectangle(new PdfRectangle(0, 9.8, 10, 19.8)); // was: 9.8f, 0f, 10f, 10f

    int order = lowerBox.CompareTo(upperBox);

    // upper precedes lower (reading order); the assertion used to be "< 0"
    Assert.True(order > 0);
}
/// <summary>
/// Two rectangles on the same row (identical y-range) placed side by side: the left one
/// must compare as smaller than the right one.
/// </summary>
public void TestCompareAlignedHorizontalRectangle()
{
    var leftBox = new TableRectangle(new PdfRectangle(10, 0, 20, 10));  // was: 0f, 10f, 10f, 10f
    var rightBox = new TableRectangle(new PdfRectangle(20, 0, 30, 10)); // was: 0f, 20f, 10f, 10f

    int order = leftBox.CompareTo(rightBox);

    Assert.True(order < 0);
}
/// <summary>
/// Regression test for a wrong sort order on real glyph boxes.
/// Expected: AARON, JOSHUA, N
/// but was:  AARON JOSHUA N , ,
/// Six glyph rectangles are shuffled, sorted with ILL_DEFINED_ORDER and compared against
/// the expected reading order.
/// </summary>
public void TestQuickSortRectangleList()
{
    // "A" -- was: 172.92999267578125f, 51.47999954223633f, 4.0f, 4.309999942779541f
    var letterA1 = new TableRectangle(new PdfRectangle(
        51.47999954223633,
        172.92999267578125,
        51.47999954223633 + 4.0,
        172.92999267578125 + 4.309999942779541));
    Assert.Equal(4, letterA1.Width);
    Assert.Equal(4.309999942779541, letterA1.Height);

    // "," -- was: 175.72000122070312f, 72.72000122070312f, 1.6699999570846558f, 1.5199999809265137f
    var comma1 = new TableRectangle(new PdfRectangle(
        72.72000122070312,
        175.72000122070312,
        72.72000122070312 + 1.6699999570846558,
        175.72000122070312 + 1.5199999809265137));
    Assert.Equal(1.6699999570846558, comma1.Width);
    Assert.Equal(1.5199999809265137, comma1.Height);

    // "A" -- was: 172.92999267578125f, 96.36000061035156f, 4.0f, 4.309999942779541f
    var letterA2 = new TableRectangle(new PdfRectangle(
        96.36000061035156,
        172.92999267578125,
        96.36000061035156 + 4.0,
        172.92999267578125 + 4.309999942779541));
    Assert.Equal(4.0, letterA2.Width);
    Assert.Equal(4.309999942779541, letterA2.Height);

    // "," -- was: 175.72000122070312f, 100.31999969482422f, 1.6699999570846558f, 1.5199999809265137f
    var comma2 = new TableRectangle(new PdfRectangle(
        100.31999969482422,
        175.72000122070312,
        100.31999969482422 + 1.6699999570846558,
        175.72000122070312 + 1.5199999809265137));
    Assert.Equal(1.6699999570846558, comma2.Width);
    Assert.Equal(1.5199999809265137, comma2.Height);

    // "N" -- was: 172.92999267578125f, 103.68000030517578f, 4.329999923706055f, 4.309999942779541f
    var letterN = new TableRectangle(new PdfRectangle(
        103.68000030517578,
        172.92999267578125,
        103.68000030517578 + 4.329999923706055,
        172.92999267578125 + 4.309999942779541));
    Assert.Equal(4.329999923706055, letterN.Width);
    Assert.Equal(4.309999942779541, letterN.Height);

    // "R" -- was: 169.2100067138672f, 161.16000366210938f, 4.329999923706055f, 4.309999942779541f
    var letterR = new TableRectangle(new PdfRectangle(
        161.16000366210938,
        169.2100067138672,
        161.16000366210938 + 4.329999923706055,
        169.2100067138672 + 4.309999942779541));
    Assert.Equal(4.329999923706055, letterR.Width);
    Assert.Equal(4.309999942779541, letterR.Height);

    // The "R" box used to be expected right after the first "A"; it now comes last,
    // following reading order.
    var expectedOrder = new List<TableRectangle>
    {
        letterA1,
        comma1,
        letterA2,
        comma2,
        letterN,
        letterR,
    };

    var shuffled = new List<TableRectangle>
    {
        letterR,
        comma1,
        letterA2,
        letterN,
        letterA1,
        comma2,
    };

    // Java original: Collections.sort(toSortList, TableRectangle.ILL_DEFINED_ORDER);
    Utils.Sort(shuffled, new TableRectangle.ILL_DEFINED_ORDER());

    Assert.Equal(expectedOrder, shuffled);
}
/// <summary>
/// Two rectangles that only touch at a corner: no horizontal overlap is reported and the
/// overlap ratio is zero.
/// </summary>
public void TestGetHorizontalOverlapShouldReturnZero()
{
    var first = new TableRectangle(new PdfRectangle(0, 0, 10, 10));    // was: 0f, 0f, 10f, 10f
    var second = new TableRectangle(new PdfRectangle(10, 10, 20, 20)); // was: 10f, 10f, 10f, 10f

    bool overlapsHorizontally = first.HorizontallyOverlaps(second);

    Assert.False(overlapsHorizontally);
    Assert.Equal(0f, first.OverlapRatio(second), 0);
}
/// <summary>
/// Two default-constructed rectangles must be equal to each other, in both directions.
/// </summary>
public void TestCompareEqualsRectangles()
{
    var left = new TableRectangle();
    var right = new TableRectangle();

    bool leftEqualsRight = left.Equals(right);
    Assert.True(leftEqualsRight);

    bool rightEqualsLeft = right.Equals(left);
    Assert.True(rightEqualsLeft);
}
/// <summary>
/// The bounds of the RULINGS fixture must start at the origin and measure 3 x 3.
/// </summary>
public void TestBoundsOfTwoRulings()
{
    // previously built from RULINGS.ToList()
    var bounds = new TableRectangle(Utils.Bounds(RULINGS));

    Assert.Equal(0, bounds.MinX, 0);
    Assert.Equal(0, bounds.MinY, 0);
    Assert.Equal(3, bounds.Width, 0);
    Assert.Equal(3, bounds.Height, 0);
}
/// <summary>
/// The bounds of a list holding a single rectangle must be equal to that rectangle.
/// </summary>
public void TestBoundsOfOneRectangle()
{
    List<TableRectangle> shapes = new List<TableRectangle>
    {
        new TableRectangle(new PdfRectangle(0, 0, 20, 40))
    };

    TableRectangle r = Utils.Bounds(shapes);

    // xUnit's signature is Assert.Equal(expected, actual): the known input rectangle is the
    // expectation and the computed bounds are the value under test, so a failure message
    // labels the two values correctly.
    Assert.Equal(shapes[0], r);
}
/// <summary>
/// A rectangle added to a spatial index must be returned when the index is queried with
/// that same rectangle.
/// </summary>
public void TestIntersects()
{
    var rectangle = new TableRectangle(new PdfRectangle());
    var index = new RectangleSpatialIndex<TableRectangle>();
    index.Add(rectangle);

    var matches = index.Intersects(rectangle);

    Assert.True(matches.Count > 0);
}
/// <summary>
/// Two 10 x 10 rectangles offset by (5, 5): they overlap on both axes by 5 units, and the
/// overlap ratio is expected to be 25 / 175.
/// </summary>
public void TestGetOverlapShouldReturnMoreThanZero()
{
    TableRectangle one = new TableRectangle(new PdfRectangle(0, 0, 10, 10)); // was: 0f, 0f, 10f, 10f
    TableRectangle two = new TableRectangle(new PdfRectangle(5, 5, 15, 15)); // was: 5f, 5f, 10f, 10f

    Assert.True(one.HorizontallyOverlaps(two));
    Assert.True(one.VerticallyOverlaps(two));
    Assert.Equal(5f, one.HorizontalOverlap(two), 0);
    Assert.Equal(5f, one.VerticalOverlap(two), 0);

    // The third argument of xUnit's Assert.Equal(double, double, int) is the number of
    // decimal places both values are rounded to before comparison -- it is NOT a delta.
    // With precision 0 the expected 25/175 (~0.1429) was rounded to 0, so the assertion
    // passed even for a ratio of exactly 0. Compare with 5 decimal places instead.
    Assert.Equal(25.0 / 175, one.OverlapRatio(two), 5);
}
/// <summary>
/// Two 10 x 10 rectangles placed side by side (touching on the x axis) whose y-ranges
/// overlap by 5: the vertical overlap is 5, the vertical overlap ratio is 0.5 and the
/// area overlap ratio is 0.
/// </summary>
public void TestGetVerticalOverlapShouldReturnMoreThanZero()
{
    TableRectangle lower = new TableRectangle(new PdfRectangle(10, 15, 20, 25)); // was: 15f, 10f, 10f, 10f
    TableRectangle upper = new TableRectangle(new PdfRectangle(0, 20, 10, 30));  // was: 20f, 0f, 10f, 10f

    double overlap = lower.VerticalOverlap(upper);

    Assert.Equal(5, overlap, 0);
    Assert.True(lower.VerticallyOverlaps(upper));

    // The third argument of xUnit's Assert.Equal(double, double, int) is the number of
    // decimal places both values are rounded to before comparison -- it is NOT a delta.
    // With precision 0 the expected 0.5 was rounded to 0, so the assertion passed even
    // for a ratio of exactly 0. Compare with 5 decimal places instead.
    Assert.Equal(0.5, lower.VerticalOverlapRatio(upper), 5);
    Assert.Equal(0, lower.OverlapRatio(upper), 0);
}
/// <summary>
/// The bounding box of two disjoint rectangles must span from the smallest to the largest
/// coordinate on each axis.
/// </summary>
public void TestGetBoundingBox()
{
    var boxes = new List<TableRectangle>
    {
        new TableRectangle(new PdfRectangle(0, 0, 10, 10)),  // was: 0f, 0f, 10f, 10f
        new TableRectangle(new PdfRectangle(30, 10, 40, 20)) // was: 20f, 30f, 10f, 10f
    };

    TableRectangle computed = TableRectangle.BoundingBoxOf(boxes);

    // was: 0f, 0f, 40f, 30f
    Assert.Equal(new TableRectangle(new PdfRectangle(0, 0, 40, 20)), computed);
}
/// <summary>
/// Merging a rectangle with an overlapping one grows it in place to cover both:
/// a 15 x 15 rectangle anchored at the origin.
/// </summary>
public void TestMergeOverlappingRectangles()
{
    var target = new TableRectangle(new PdfRectangle(0, 0, 10, 10)); // was: 0f, 0f, 10f, 10f
    var other = new TableRectangle(new PdfRectangle(5, 5, 15, 15));  // was: 5f, 5f, 10f, 10f

    target.Merge(other);

    Assert.Equal(15f, target.Width, 0);
    Assert.Equal(15f, target.Height, 0);
    Assert.Equal(0f, target.Left, 0);
    Assert.Equal(0f, target.Bottom, 0); // the Java original checked getTop() here
    Assert.Equal(15, target.Top, 0);
}
/// <summary>
/// Merging a rectangle with an adjacent, non-overlapping one grows it in place to cover
/// both: a 20 x 10 rectangle anchored at the origin.
/// </summary>
public void TestMergeNoOverlappingRectangles()
{
    var target = new TableRectangle(new PdfRectangle(0, 0, 10, 10)); // was: 0f, 0f, 10f, 10f
    var other = new TableRectangle(new PdfRectangle(10, 0, 20, 10)); // was: 0f, 10f, 10f, 10f

    target.Merge(other);

    Assert.Equal(20f, target.Width, 0);
    Assert.Equal(10f, target.Height, 0);
    Assert.Equal(0f, target.Left, 0);
    Assert.Equal(10, target.Top, 0);   // Java original: 0f, one.getTop(), 0
    Assert.Equal(0, target.Bottom, 0); // Java original: 10f, one.getBottom(), 0
    Assert.Equal(20f * 10f, target.Area, 0);
}
/// <summary>
/// Compares the comma that follows "AARON" with the "R" of "REGIONAL PULMONARY": the
/// comma is expected to sort first.
/// </summary>
public void TestQuickSortOneUpperThanOther()
{
    // "," (comma after AARON) -- was: 175.72f, 72.72f, 1.67f, 1.52f
    var comma = new TableRectangle(new PdfRectangle(72.72, 175.72, 72.72 + 1.67, 175.72 + 1.52));
    Assert.Equal(1.67, comma.Width, 2);
    Assert.Equal(1.52, comma.Height, 2);

    // "R" (REGIONAL PULMONARY) -- was: 169.21f, 161.16f, 4.33f, 4.31f
    var letterR = new TableRectangle(new PdfRectangle(161.16, 169.21, 161.16 + 4.33, 169.21 + 4.31));
    Assert.Equal(4.33, letterR.Width, 2);
    Assert.Equal(4.31, letterR.Height, 2);

    int order = comma.CompareTo(letterR);

    Assert.True(order < 0); // the assertion used to be "> 0"
}
/// <summary>
/// Checks that the ordering is transitive for three rectangles laid out as a descending
/// staircase: a before b, b before c, and therefore a before c.
/// </summary>
[Fact] //(Skip = "Comparison is not transitive. Transitivity needs to be implemented.")]
public void TestTransitiveComparison1()
{
    //   +-------+
    //   |       |
    //   |   a   | +-------+
    //   |       | |       |
    //   +-------+ |   b   | +-------+
    //             |       | |       |
    //             +-------+ |   c   |
    //                       |       |
    //                       +-------+
    var topLeft = new TableRectangle(new PdfRectangle(0, 2, 2, 4));     // a
    var middle = new TableRectangle(new PdfRectangle(1, 1, 3, 3));      // b
    var bottomRight = new TableRectangle(new PdfRectangle(2, 0, 4, 2)); // c

    int aVersusB = topLeft.CompareTo(middle);
    Assert.True(aVersusB < 0);

    int bVersusC = middle.CompareTo(bottomRight);
    Assert.True(bVersusC < 0);

    int aVersusC = topLeft.CompareTo(bottomRight);
    Assert.True(aVersusC < 0);
}
/// <summary>
/// The Points property must list the four corners starting at (20, 10), then (50, 10),
/// (50, 50) and (20, 50).
/// </summary>
public void TestRectangleGetPoints()
{
    var rectangle = new TableRectangle(new PdfRectangle(20, 10, 50, 50)); // was: 10f, 20f, 30f, 40f

    Assert.Equal(30, rectangle.Width);
    Assert.Equal(40, rectangle.Height);

    var expectedCorners = new PdfPoint[]
    {
        new PdfPoint(20, 10),
        new PdfPoint(50, 10),
        new PdfPoint(50, 50),
        new PdfPoint(20, 50)
    };

    PdfPoint[] corners = rectangle.Points;

    Assert.Equal(expectedCorners, corners);
}
/// <summary>
/// Second transitivity check on three staggered rectangles: a before b, b before c, and
/// therefore a before c. Flagged in the original comments as needing a rewrite.
/// </summary>
public void TestTransitiveComparison2()
{
    // need to rewrite
    //                       +-------+
    //                       |       |
    //             +-------+ |   C   |
    //             |       | |       |
    //   +-------+ |   B   | +-------+
    //   |       | |       |
    //   |   A   | +-------+
    //   |       |
    //   +-------+
    var rectC = new TableRectangle(new PdfRectangle(0, 2, 2, 4)); // was: 2, 0, 2, 2 (named "a")
    var rectB = new TableRectangle(new PdfRectangle(1, 1, 3, 3)); // was: 1, 1, 2, 2
    var rectA = new TableRectangle(new PdfRectangle(2, 0, 4, 2)); // was: 0, 2, 2, 2 (named "c")

    int aVersusB = rectA.CompareTo(rectB);
    Assert.True(aVersusB < 0);

    int bVersusC = rectB.CompareTo(rectC);
    Assert.True(bVersusC < 0);

    int aVersusC = rectA.CompareTo(rectC);
    Assert.True(aVersusC < 0);
}
/// <summary>
/// Loads rectangles from a CSV fixture (columns: top, left, width, height), sorts them with
/// ILL_DEFINED_ORDER and checks that every rectangle compares as smaller than its successor.
/// </summary>
public void TestNaturalOrderOfRectanglesOneMoreTime()
{
    // The fixture is machine-readable data: parse it with the invariant culture so the
    // test does not depend on the decimal separator of the machine running it.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    var parse = UtilsForTesting.LoadCsvLines("Resources/csv/TestBasicExtractor-RECTANGLE_TEST_NATURAL_ORDER.csv");

    List<TableRectangle> rectangles = new List<TableRectangle>();
    foreach (var record in parse)
    {
        double top = double.Parse(record[0], invariant);
        double left = double.Parse(record[1], invariant);
        double w = double.Parse(record[2], invariant);
        double h = double.Parse(record[3], invariant);

        rectangles.Add(new TableRectangle(new PdfRectangle(left, top, left + w, top + h)));
    }

    Utils.Sort(rectangles, new TableRectangle.ILL_DEFINED_ORDER());

    // After sorting, each element must be strictly before the next one.
    for (int i = 0; i < rectangles.Count - 1; i++)
    {
        TableRectangle rectangle = rectangles[i];
        TableRectangle nextRectangle = rectangles[i + 1];

        Assert.True(rectangle.CompareTo(nextRectangle) < 0);
    }
}
/// <summary>
/// Builds a table area from consecutive text rows that intersect enough of the given
/// "relevant" text edges, then extends that area using nearby horizontal rulings and
/// finally pads it by TABLE_PADDING_AMOUNT.
/// </summary>
/// <param name="lines">Text rows to scan, in the order they should be visited.
/// NOTE(review): the spacing maths (prevRow.Bottom - textRow.Bottom, asserted >= 0) assumes
/// rows are ordered top to bottom -- confirm against the caller.</param>
/// <param name="relevantEdges">Text edges a row is tested against with IntersectsLine.</param>
/// <param name="relevantEdgeCount">Number of edge intersections a row is expected to have;
/// one less is tolerated when this count is greater than 3.</param>
/// <param name="horizontalRulings">Horizontal rulings used to extend the table at its bottom
/// and top. NOTE(review): the early "break" in both loops presumably relies on the list
/// being sorted by Y -- confirm.</param>
/// <returns>The padded table rectangle, or null when the accumulated area is still zero
/// after scanning the rows.</returns>
private TableRectangle getTableFromText(List<TableLine> lines, List<TextEdge> relevantEdges, int relevantEdgeCount, List<Ruling> horizontalRulings)
{
    TableRectangle table = new TableRectangle();

    TableLine prevRow = null;
    TableLine firstTableRow = null;
    TableLine lastTableRow = null;

    // number of row-to-row gaps accumulated so far, and their summed size
    int tableSpaceCount = 0;
    double totalRowSpacing = 0;

    // go through the lines and find the ones that have the correct count of the relevant edges
    foreach (TableLine textRow in lines)
    {
        int numRelevantEdges = 0;

        if (firstTableRow != null && tableSpaceCount > 0)
        {
            // check to make sure this text row is within a line or so of the other lines already added
            // if it's not, we should stop the table here
            // (threshold = 2.5 x the average row spacing seen so far)
            double tableLineThreshold = (totalRowSpacing / tableSpaceCount) * 2.5;
            double lineDistance = prevRow.Bottom - textRow.Bottom; // bobld: textRow.Top - prevRow.Top
            System.Diagnostics.Debug.Assert(lineDistance >= 0);

            if (lineDistance > tableLineThreshold)
            {
                lastTableRow = prevRow;
                break;
            }
        }

        // for larger tables, be a little lenient on the number of relevant rows the text intersects
        // for smaller tables, not so much - otherwise we'll end up treating paragraphs as tables too
        int relativeEdgeDifferenceThreshold = 1;
        if (relevantEdgeCount <= 3)
        {
            relativeEdgeDifferenceThreshold = 0;
        }

        // count how many of the relevant edges this row crosses
        foreach (TextEdge edge in relevantEdges)
        {
            if (textRow.IntersectsLine(edge.Line))
            {
                numRelevantEdges++;
            }
        }

        // see if we have a candidate text row
        if (numRelevantEdges >= (relevantEdgeCount - relativeEdgeDifferenceThreshold))
        {
            // keep track of table row spacing
            if (prevRow != null && firstTableRow != null)
            {
                tableSpaceCount++;
                totalRowSpacing += prevRow.Bottom - textRow.Bottom; // bobld: textRow.Top - prevRow.Top
            }

            // row is part of a table
            if (table.Area == 0)
            {
                // first matching row: it defines the initial table rectangle
                firstTableRow = textRow;
                table.SetRect(textRow);
            }
            else
            {
                // later rows only widen the table and push its bottom down; the top is untouched here
                table.SetLeft(Math.Min(table.Left, textRow.Left));
                table.SetBottom(Math.Min(table.Bottom, textRow.Bottom)); // bobld: Max
                table.SetRight(Math.Max(table.Right, textRow.Right));
            }
        }
        else
        {
            // no dice
            // if we're at the end of the table, save the last row
            // (the scan itself continues; only the 2.5 x spacing check above breaks out)
            if (firstTableRow != null && lastTableRow == null)
            {
                lastTableRow = prevRow;
            }
        }

        prevRow = textRow;
    }

    // if we don't have a table now, we won't after the next step either
    if (table.Area == 0)
    {
        return(null);
    }

    if (lastTableRow == null)
    {
        // takes care of one-row tables or tables that end at the bottom of a page
        lastTableRow = prevRow;
    }

    // use the average row height and nearby horizontal lines to extend the table area
    double avgRowHeight;
    if (tableSpaceCount > 0)
    {
        System.Diagnostics.Debug.Assert(totalRowSpacing >= 0);
        avgRowHeight = totalRowSpacing / tableSpaceCount;
    }
    else
    {
        // no spacing was measured: fall back to the height of the last table row
        avgRowHeight = lastTableRow.Height;
    }

    double rowHeightThreshold = avgRowHeight * 1.5;

    // check lines after the bottom of the table
    //foreach (Ruling ruling in sortedHorizontalRulings) //Line2D.Float
    for (int i = horizontalRulings.Count - 1; i >= 0; i--) // reverse order
    {
        var ruling = horizontalRulings[i];
        // skip rulings that start above the current table bottom
        if (ruling.Y1 > table.Bottom) // bobld: <
        {
            continue;
        }

        double distanceFromTable = table.Bottom - ruling.Y2; // bobld: Y1
        System.Diagnostics.Debug.Assert(distanceFromTable >= 0);
        if (distanceFromTable <= rowHeightThreshold)
        {
            // use this ruling to help define the table
            table.SetBottom(Math.Min(table.Bottom, ruling.Y2)); // bobld: Max Y1
            table.SetLeft(Math.Min(table.Left, ruling.X1));
            table.SetRight(Math.Max(table.Right, ruling.X2));
        }
        else
        {
            // no use checking any further
            break;
        }
    }

    // do the same for lines at the top, but make the threshold greater since table headings tend to be
    // larger to fit up to three-ish rows of text (at least but we don't want to grab too much)
    rowHeightThreshold = avgRowHeight * 3.8;

    //for (int i = horizontalRulings.Count - 1; i >= 0; i--)
    for (int i = 0; i < horizontalRulings.Count; i++)
    {
        Ruling ruling = horizontalRulings[i];
        // skip rulings that start below the current table top
        if (ruling.Y1 < table.Top) //bobld: >
        {
            continue;
        }

        double distanceFromTable = ruling.Y1 - table.Top; // bobld: table.Top - ruling.Y1
        System.Diagnostics.Debug.Assert(distanceFromTable >= 0);
        if (distanceFromTable <= rowHeightThreshold)
        {
            table.SetTop(Math.Max(table.Top, ruling.Y2)); // bobld: Min Y1
            table.SetLeft(Math.Min(table.Left, ruling.X1));
            table.SetRight(Math.Max(table.Right, ruling.X2));
        }
        else
        {
            break;
        }
    }

    // add a bit of padding since the halved horizontal lines are a little fuzzy anyways
    // (every side is first rounded outwards, then moved outwards by TABLE_PADDING_AMOUNT)
    table.SetTop(Math.Ceiling(table.Top) + TABLE_PADDING_AMOUNT); // bobld: Floor -
    table.SetBottom(Math.Floor(table.Bottom) - TABLE_PADDING_AMOUNT); // bobld: Ceiling +
    table.SetLeft(Math.Floor(table.Left) - TABLE_PADDING_AMOUNT);
    table.SetRight(Math.Ceiling(table.Right) + TABLE_PADDING_AMOUNT);

    return(table);
}
/// <summary>
/// The bounds of the RECTANGLES fixture must be equal to its second element.
/// </summary>
public void TestBoundsOfOneEmptyRectangleAndAnotherNonEmpty()
{
    TableRectangle r = Utils.Bounds(RECTANGLES.ToList());

    // xUnit's signature is Assert.Equal(expected, actual): the fixture element is the
    // expectation and the computed bounds are the value under test, so a failure message
    // labels the two values correctly.
    Assert.Equal(RECTANGLES[1], r);
}
/// <summary>
/// Gets columns positions.
/// <para>The non-whitespace chunks of the first line seed one region each. For every
/// following line, chunks that horizontally overlap a region are merged into it; chunks
/// that overlap no existing region start new regions (consecutive leftovers that overlap
/// each other are merged into a single new region). The result is the right edge of
/// every region, sorted ascending.</para>
/// </summary>
/// <param name="lines">Must be an array of lines sorted by their +top+ attribute.</param>
/// <returns>a list of column boundaries (x axis); empty when <paramref name="lines"/> is empty.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="lines"/> is null.</exception>
public static List<double> ColumnPositions(IReadOnlyList<TableLine> lines)
{
    if (lines == null)
    {
        throw new System.ArgumentNullException(nameof(lines));
    }

    // No lines means no columns; without this guard lines[0] below would throw.
    if (lines.Count == 0)
    {
        return new List<double>();
    }

    // Seed the column regions with the non-whitespace chunks of the first line.
    List<TableRectangle> regions = new List<TableRectangle>();
    foreach (TextChunk tc in lines[0].TextElements)
    {
        if (tc.IsSameChar(TableLine.WHITE_SPACE_CHARS))
        {
            continue;
        }

        TableRectangle r = new TableRectangle();
        r.SetRect(tc);
        regions.Add(r);
    }

    foreach (TableLine l in lines.SubList(1, lines.Count))
    {
        // Non-whitespace chunks of this line that have not been assigned to a region yet.
        List<TextChunk> lineTextElements = new List<TextChunk>();
        foreach (TextChunk tc in l.TextElements)
        {
            if (!tc.IsSameChar(TableLine.WHITE_SPACE_CHARS))
            {
                lineTextElements.Add(tc);
            }
        }

        foreach (TableRectangle cr in regions)
        {
            // Collect the overlapping chunks first: merging changes cr, and the overlap
            // test of this pass must be done against cr as it was before merging.
            List<TextChunk> overlaps = new List<TextChunk>();
            foreach (TextChunk te in lineTextElements)
            {
                if (cr.HorizontallyOverlaps(te))
                {
                    overlaps.Add(te);
                }
            }

            foreach (TextChunk te in overlaps)
            {
                cr.Merge(te);
            }

            foreach (var rem in overlaps)
            {
                lineTextElements.Remove(rem);
            }
        }

        // added by bobld
        // We need more checks here: simply adding one region per remaining chunk
        // (as the Java original did) makes testExtractColumnsCorrectly3() fail.
        if (lineTextElements.Count > 0)
        {
            // need to check here if the remaining te in lineTextElements do overlap among themselves
            // might happen with multiline cell
            TableRectangle r = new TableRectangle();
            r.SetRect(lineTextElements[0]);
            foreach (var rem in lineTextElements.SubList(1, lineTextElements.Count))
            {
                if (r.HorizontallyOverlaps(rem))
                {
                    // they overlap!
                    // so this is multiline cell
                    r.Merge(rem);
                }
                else
                {
                    // do not overlap (anymore), so add the finished region and start a new one
                    regions.Add(r);
                    r = new TableRectangle();
                    r.SetRect(rem);
                }
            }

            // the region still being built has not been added yet
            regions.Add(r);
        }
        // end added
    }

    List<double> rv = new List<double>();
    foreach (TableRectangle r in regions)
    {
        rv.Add(r.Right);
    }

    rv.Sort(); // Java original: Collections.sort(rv);

    return rv;
}
/// <summary>
/// Detects the tables in the page.
/// <para>Steps, in order: (1) build table areas from the cells formed by the page rulings;
/// (2) stretch areas vertically to cover vertical rulings that cross them; (3) stretch areas
/// horizontally to cover text rows that cross them; (4) drop areas that intersect no text;
/// (5) repeatedly look for ruling-less tables from text edges until none is found;
/// (6) de-duplicate the areas through a sorted set.</para>
/// </summary>
/// <param name="page">Page area whose rulings (GetRulings) and text (GetText) are analysed.</param>
/// <returns>The detected table areas, as enumerated from a SortedSet built with TreeSetComparer.</returns>
public List<TableRectangle> Detect(PageArea page)
{
    // get horizontal & vertical lines
    // we get these from an image of the PDF and not the PDF itself because sometimes there are invisible PDF
    // instructions that are interpreted incorrectly as visible elements - we really want to capture what a
    // person sees when they look at the PDF
    // BobLd: hack here, we don't convert to an image
    var pageRulings = page.GetRulings();
    List<Ruling> horizontalRulings = this.getHorizontalRulings(pageRulings);
    List<Ruling> verticalRulings = this.getVerticalRulings(pageRulings);
    // end hack here

    // allEdges is a separate list holding the same Ruling objects as the two lists above
    List<Ruling> allEdges = new List<Ruling>(horizontalRulings);
    allEdges.AddRange(verticalRulings);

    List<TableRectangle> tableAreas = new List<TableRectangle>();

    // if we found some edges, try to find some tables based on them
    if (allEdges.Count > 0)
    {
        // now we need to snap edge endpoints to a grid
        Utils.SnapPoints(allEdges, POINT_SNAP_DISTANCE_THRESHOLD, POINT_SNAP_DISTANCE_THRESHOLD);

        // normalize the rulings to make sure snapping didn't create any wacky non-horizontal/vertical rulings
        foreach (List<Ruling> rulings in new[] { horizontalRulings, verticalRulings }) //Arrays.asList(horizontalRulings, verticalRulings))
        {
            //for (Iterator<Ruling> iterator = rulings.iterator(); iterator.hasNext();)
            foreach (var ruling in rulings.ToList()) // use ToList to be able to remove
            {
                ruling.Normalize();
                if (ruling.IsOblique)
                {
                    rulings.Remove(ruling);
                }
            }
        }

        // merge the edge lines into rulings - this makes finding edges between crossing points in the next step easier
        // we use a larger pixel expansion than the normal spreadsheet extraction method to cover gaps in the
        // edge detection/pixel snapping steps
        horizontalRulings = Ruling.CollapseOrientedRulings(horizontalRulings, 5);
        verticalRulings = Ruling.CollapseOrientedRulings(verticalRulings, 5);

        // use the rulings and points to find cells
        List<TableRectangle> cells = SpreadsheetExtractionAlgorithm.FindCells(horizontalRulings, verticalRulings).Cast<TableRectangle>().ToList();

        // then use those cells to make table areas
        tableAreas = getTableAreasFromCells(cells);
    }

    // next find any vertical rulings that intersect tables - sometimes these won't have completely been captured as
    // cells if there are missing horizontal lines (which there often are)
    // let's assume though that these lines should be part of the table
    foreach (Ruling verticalRuling in verticalRulings) // Line2D.Float
    {
        foreach (TableRectangle tableArea in tableAreas)
        {
            // the ruling crosses the area but is not fully inside it
            if (verticalRuling.Intersects(tableArea) && !(tableArea.Contains(verticalRuling.P1) && tableArea.Contains(verticalRuling.P2)))
            {
                tableArea.SetTop(Math.Ceiling(Math.Max(tableArea.Top, verticalRuling.Y2))); // bobld: Floor and Min, Y1
                tableArea.SetBottom(Math.Floor(Math.Min(tableArea.Bottom, verticalRuling.Y1))); // bobld: Ceiling and Max, Y2
                // only the first matching table area is extended for a given ruling
                break;
            }
        }
    }

    /* BobLd: not sure this is the case in tabula-sharp/PdfPig
     * // the tabula Page coordinate space is half the size of the PDFBox image coordinate space
     * // so halve the table area size before proceeding and add a bit of padding to make sure we capture everything
     * foreach (TableRectangle area in tableAreas)
     * {
     *     area.x = (float)Math.floor(area.x / 2) - TABLE_PADDING_AMOUNT;
     *     area.y = (float)Math.floor(area.y / 2) - TABLE_PADDING_AMOUNT;
     *     area.width = (float)Math.ceil(area.width / 2) + TABLE_PADDING_AMOUNT;
     *     area.height = (float)Math.ceil(area.height / 2) + TABLE_PADDING_AMOUNT;
     * }
     *
     * // we're going to want halved horizontal lines later too
     * foreach (Ruling ruling in horizontalRulings) // Line2D.Float
     * {
     *     ruling.x1 = ruling.x1 / 2;
     *     ruling.y1 = ruling.y1 / 2;
     *     ruling.x2 = ruling.x2 / 2;
     *     ruling.y2 = ruling.y2 / 2;
     * }
     */

    // now look at text rows to help us find more tables and flesh out existing ones
    List<TextChunk> textChunks = TextElement.MergeWords(page.GetText());
    List<TableLine> lines = TextChunk.GroupByLines(textChunks);

    // first look for text rows that intersect an existing table - those lines should probably be part of the table
    foreach (TableLine textRow in lines)
    {
        foreach (TableRectangle tableArea in tableAreas)
        {
            if (!tableArea.Contains(textRow) && textRow.Intersects(tableArea))
            {
                // widen the table only; its top and bottom are left unchanged here
                tableArea.SetLeft(Math.Floor(Math.Min(textRow.Left, tableArea.Left)));
                tableArea.SetRight(Math.Ceiling(Math.Max(textRow.Right, tableArea.Right)));
            }
        }
    }

    // get rid of tables that DO NOT intersect any text areas - these are likely graphs or some sort of graphic
    //for (Iterator<Rectangle> iterator = tableAreas.iterator(); iterator.hasNext();)
    foreach (TableRectangle table in tableAreas.ToList()) // use tolist to be able to remove
    {
        bool intersectsText = false;
        foreach (TableLine textRow in lines)
        {
            if (table.Intersects(textRow))
            {
                intersectsText = true;
                break;
            }
        }

        if (!intersectsText)
        {
            tableAreas.Remove(table);
        }
    }

    // lastly, there may be some tables that don't have any vertical rulings at all
    // we'll use text edges we've found to try and guess which text rows are part of a table

    // in his thesis nurminen goes through every row to try to assign a probability that the line is in a table
    // we're going to try a general heuristic instead, trying to find what type of edge (left/right/mid) intersects
    // the most text rows, and then use that magic number of "relevant" edges to decide what text rows should be
    // part of a table.

    // NOTE(review): the loop only ends when no new table is found; it presumably relies on each
    // new table containing (and so removing) at least one remaining text line -- confirm.
    bool foundTable;
    do
    {
        foundTable = false;

        // get rid of any text lines contained within existing tables, this allows us to find more tables
        //for (Iterator<TableLine> iterator = lines.iterator(); iterator.hasNext();)
        foreach (var textRow in lines.ToList())
        {
            foreach (TableRectangle table in tableAreas)
            {
                if (table.Contains(textRow))
                {
                    lines.Remove(textRow);
                    break;
                }
            }
        }

        // get text edges from remaining lines in the document
        TextEdges textEdges = getTextEdges(lines);
        //List<TextEdge> leftTextEdges = textEdges[TextEdge.LEFT];
        //List<TextEdge> midTextEdges = textEdges[TextEdge.MID];
        //List<TextEdge> rightTextEdges = textEdges[TextEdge.RIGHT];

        // find the relevant text edges (the ones we think define where a table is)
        RelevantEdges relevantEdgeInfo = getRelevantEdges(textEdges, lines);

        // we found something relevant so let's look for rows that fit our criteria
        // (an edgeType of -1 is treated as "nothing relevant found")
        if (relevantEdgeInfo.edgeType != -1)
        {
            List<TextEdge> relevantEdges = null;
            switch (relevantEdgeInfo.edgeType)
            {
                case TextEdge.LEFT:
                    relevantEdges = textEdges[TextEdge.LEFT]; // leftTextEdges;
                    break;
                case TextEdge.MID:
                    relevantEdges = textEdges[TextEdge.MID]; // midTextEdges;
                    break;
                case TextEdge.RIGHT:
                    relevantEdges = textEdges[TextEdge.RIGHT]; // rightTextEdges;
                    break;
            }

            TableRectangle table = getTableFromText(lines, relevantEdges, relevantEdgeInfo.edgeCount, horizontalRulings);

            if (table != null)
            {
                foundTable = true;
                tableAreas.Add(table);
            }
        }
    } while (foundTable);

    // create a set of our current tables that will eliminate duplicate tables
    SortedSet<TableRectangle> tableSet = new SortedSet<TableRectangle>(new TreeSetComparer()); //Set<Rectangle> tableSet = new TreeSet<>(new Comparator<Rectangle>() {...
    // areas are offered to the set from largest to smallest
    foreach (var table in tableAreas.OrderByDescending(t => t.Area))
    {
        tableSet.Add(table);
    }

    return(tableSet.ToList());
}