// Extracts a fixed region of page 3 of EU_017_PDF using the page's vertical rulings as column
// separators, then compares the result cell by cell against EU_017_EXPECTED.
public void TestExtractColumnsCorrectly2()
{
    // Only Windows is exercised; Linux was previously considered as well:
    // || RuntimeInformation.IsOSPlatform(OSPlatform.Linux)
    if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
    {
        // fails on linux and mac os. Linked to PdfPig not finding the correct font.
        // need to use apt-get -y install ttf-mscorefonts-installer
        // still have mscorefonts - eula license could not be presented
        return;
    }

    PageArea pageThree = UtilsForTesting.GetPage(EU_017_PDF, 3);
    BasicExtractionAlgorithm extractor = new BasicExtractionAlgorithm(pageThree.VerticalRulings);

    PdfRectangle tableBounds = new PdfRectangle(148.44, 543 - (711.875 - 299.625), 452.32, 543);
    PageArea tableArea = pageThree.GetArea(tableBounds);
    Table extracted = extractor.Extract(tableArea)[0];

    var actualRows = UtilsForTesting.TableToArrayOfRows(extracted);

    // Row count first, then each row's width, then every individual cell.
    Assert.Equal(EU_017_EXPECTED.Length, actualRows.Length);
    for (int row = 0; row < EU_017_EXPECTED.Length; row++)
    {
        var expectedCells = EU_017_EXPECTED[row];
        var actualCells = actualRows[row];

        Assert.Equal(expectedCells.Length, actualCells.Length);
        for (int col = 0; col < expectedCells.Length; col++)
        {
            Assert.Equal(expectedCells[col], actualCells[col]);
        }
    }
}
// Smoke test: opens eu-004.pdf, detects table regions on page 3 and runs the spreadsheet
// extraction over the first detected region. Nothing is asserted; the test only checks
// that the pipeline runs without throwing.
public void Eu004()
{
    var parsingOptions = new ParsingOptions() { ClipPaths = true };
    using (PdfDocument pdf = PdfDocument.Open("Resources/icdar2013-dataset/competition-dataset-eu/eu-004.pdf", parsingOptions))
    {
        ObjectExtractor extractor = new ObjectExtractor(pdf);
        PageArea pageThree = extractor.Extract(3);

        var detection = new SimpleNurminenDetectionAlgorithm();
        var detectedRegions = detection.Detect(pageThree);

        // Only the first detected region is extracted.
        var firstRegion = pageThree.GetArea(detectedRegions[0].BoundingBox);

        var spreadsheet = new SpreadsheetExtractionAlgorithm();
        var extractedTables = spreadsheet.Extract(firstRegion);

        // A disabled alternative iterated over every detected region and extracted each one
        // with a BasicExtractionAlgorithm instead of the spreadsheet algorithm.
    }
}
// Smoke test: opens test3.pdf, detects table regions on page 1 and runs the basic
// extraction algorithm over each detected region. Nothing is asserted.
public void TestLinesToCells()
{
    var parsingOptions = new ParsingOptions() { ClipPaths = true };
    using (PdfDocument pdf = PdfDocument.Open("test3.pdf", parsingOptions))
    {
        ObjectExtractor extractor = new ObjectExtractor(pdf);
        PageArea firstPage = extractor.Extract(1);

        SimpleNurminenDetectionAlgorithm detection = new SimpleNurminenDetectionAlgorithm();
        var detectedRegions = detection.Detect(firstPage);

        foreach (var region in detectedRegions)
        {
            // A fresh extractor is created for every region, as in the original flow.
            IExtractionAlgorithm basic = new BasicExtractionAlgorithm();
            var regionArea = firstPage.GetArea(region.BoundingBox);
            List<Table> extractedTables = basic.Extract(regionArea);
        }
    }
}
/// <summary>
/// Heuristically decides whether a page area looks like a ruled (spreadsheet-style) table.
/// The table found using ruling lines (<c>SpreadsheetExtractionAlgorithm</c>) is compared with
/// the table found without them (<c>BasicExtractionAlgorithm</c>): the column-count ratio and
/// the row-count ratio are averaged and tested against <c>MAGIC_HEURISTIC_NUMBER</c>.
/// </summary>
/// <param name="page">The page area to inspect.</param>
/// <returns>
/// <c>true</c> when the averaged ratio lies strictly between <c>MAGIC_HEURISTIC_NUMBER</c> and
/// its reciprocal; <c>false</c> otherwise, including when the page has no text or when either
/// extraction algorithm finds no table.
/// </returns>
public bool IsTabular(PageArea page)
{
    // if there's no text at all on the page, it's not a table
    // (we won't be able to do anything with it though)
    var textElements = page.GetText();
    if (textElements.Count == 0)
    {
        return false;
    }

    // get minimal region of page that contains every character (in effect,
    // removes white "margins")
    PageArea minimalRegion = page.GetArea(Utils.Bounds(textElements.Select(t => t.BoundingBox).ToList()));

    List<Table> tables = new SpreadsheetExtractionAlgorithm().Extract(minimalRegion);
    if (tables.Count == 0)
    {
        return false;
    }

    Table table = tables[0];
    int rowsDefinedByLines = table.RowCount;
    int colsDefinedByLines = table.ColumnCount;

    tables = new BasicExtractionAlgorithm().Extract(minimalRegion);
    if (tables.Count == 0)
    {
        // Without a line-free baseline the ratio cannot be computed. Previously execution
        // fell through to tables[0] and threw; report "not tabular" instead.
        System.Diagnostics.Debug.Write("SpreadsheetExtractionAlgorithm.isTabular(): no table found.");
        return false;
    }

    table = tables[0];
    int rowsDefinedWithoutLines = table.RowCount;
    int colsDefinedWithoutLines = table.ColumnCount;

    // Average of the two "with lines / without lines" ratios.
    float columnRatio = (float)colsDefinedByLines / colsDefinedWithoutLines;
    float rowRatio = (float)rowsDefinedByLines / rowsDefinedWithoutLines;
    float ratio = (columnRatio + rowRatio) / 2.0f;

    return ratio > MAGIC_HEURISTIC_NUMBER && ratio < (1 / MAGIC_HEURISTIC_NUMBER);
}