public void TestExtractTableWithExternallyDefinedRulings()
{
    // Extraction of us-007.pdf driven by the externally supplied ruling lines.
    PageArea page = UtilsForTesting.GetPage("Resources/us-007.pdf", 1);
    var algorithm = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = algorithm.Extract(page, EXTERNALLY_DEFINED_RULINGS.ToList());

    Assert.Single(tables);
    Table table = tables[0];
    Assert.Equal(18, table.Cells.Count);

    var rows = table.Rows;

    // Expected text per (row, column); embedded '\r' are in-cell line breaks.
    var expectedByRow = new[]
    {
        new[] { "Payroll Period", "One Withholding\rAllowance" },
        new[] { "Weekly", "$71.15" },
        new[] { "Biweekly", "142.31" },
        new[] { "Semimonthly", "154.17" },
        new[] { "Monthly", "308.33" },
        new[] { "Quarterly", "925.00" },
        new[] { "Semiannually", "1,850.00" },
        new[] { "Annually", "3,700.00" },
        new[] { "Daily or Miscellaneous\r(each day of the payroll period)", "14.23" }
    };

    for (int r = 0; r < expectedByRow.Length; r++)
    {
        Assert.Equal(expectedByRow[r][0], rows[r][0].GetText());
        Assert.Equal(expectedByRow[r][1], rows[r][1].GetText());
    }
}
public void TestExtractSpreadsheetWithinAnArea()
{
    // Extracts the spreadsheet inside a sub-area of puertos1.pdf and renders it to CSV.
    // Only the row count is asserted for now (see TODO below).
    // Original (top, left, bottom, right) area was 273.9035714285714f, 30.32142857142857f,
    // 554.8821428571429f, 546.7964285714286f; converted here using a 793-point page height.
    PageArea page = UtilsForTesting.GetAreaFromPage("Resources/puertos1.pdf", 1,
        new PdfRectangle(30.32142857142857, 793 - 554.8821428571429, 546.7964285714286, 793 - 273.9035714285714)); // 273.9035714285714f, 30.32142857142857f, 554.8821428571429f, 546.7964285714286f);
    SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
    List <Table> tables = se.Extract(page);
    Table table = tables[0];
    Assert.Equal(15, table.Rows.Count);

    // Expected CSV rendering, kept verbatim. Embedded "\n" inside quoted fields are
    // in-cell line breaks; unquoted "\n" separate records.
    const string expected =
        "\"\",TM,M.U$S,TM,M.U$S,TM,M.U$S,TM,M.U$S,TM,M.U$S,TM,M.U$S,TM\n" +
        "Peces vivos,1,25,1,23,2,38,1,37,2,67,2,89,1\n" +
        "\"Pescado fresco\n" +
        "o refrigerado.\n" +
        "exc. filetes\",7.704,7.175,8.931,6.892,12.635,10.255,16.742,13.688,14.357,11.674,13.035,13.429,9.727\n" +
        "\"Pescado congelado\n" +
        "exc. filetes\",90.560,105.950,112.645,108.416,132.895,115.874,152.767,133.765,148.882,134.847,156.619,165.134,137.179\n" +
        "\"Filetes y demás car-\n" +
        "nes de pescado\",105.434,200.563,151.142,218.389,152.174,227.780,178.123,291.863,169.422,313.735,176.427,381.640,144.814\n" +
        "\"Pescado sec./sal./\n" +
        "en salm. har./pol./\n" +
        "pell. aptos\n" +
        "p/c humano\",6.837,14.493,6.660,9.167,14.630,17.579,18.150,21.302,18.197,25.739,13.460,23.549,11.709\n" +
        "Crustáceos,61.691,375.798,52.488,251.043,47.635,387.783,27.815,217.443,7.123,86.019,39.488,373.583,45.191\n" +
        "Moluscos,162.027,174.507,109.436,111.443,90.834,104.741,57.695,109.141,98.182,206.304,187.023,251.352,157.531\n" +
        "\"Prod. no exp. en\n" +
        "otros capítulos.\n" +
        "No apto p/c humano\",203,328,7,35,521,343,\"1,710\",\"1,568\",125,246,124,263,131\n" +
        "\"Grasas y aceites de\n" +
        "pescado y mamíferos\n" +
        "marinos\",913,297,\"1,250\",476,\"1,031\",521,\"1,019\",642,690,483,489,710,959\n" +
        "\"Extractos y jugos de\n" +
        "pescado y mariscos\",5,25,1,3,4,4,31,93,39,117,77,230,80\n" +
        "\"Preparaciones y con-\n" +
        "servas de pescado\",846,\"3,737\",\"1,688\",\"4,411\",\"1,556\",\"3,681\",\"2,292\",\"5,474\",\"2,167\",\"7,494\",\"2,591\",\"8,833\",\"2,795\"\n" +
        "\"Preparaciones y con-\n" +
        "servas de mariscos\",348,\"3,667\",345,\"1,771\",738,\"3,627\",561,\"2,620\",607,\"3,928\",314,\"2,819\",250\n" +
        "\"Harina, polvo y pe-\n" +
        "llets de pescado.No\n" +
        "aptos p/c humano\",\"16,947\",\"8,547\",\"11,867\",\"6,315\",\"32,528\",\"13,985\",\"37,313\",\"18,989\",\"35,787\",\"19,914\",\"37,821\",\"27,174\",\"30,000\"\n" +
        "TOTAL,\"453,515\",\"895,111\",\"456,431\",\"718,382\",\"487,183\",\"886,211\",\"494,220\",\"816,623\",\"495,580\",\"810,565\",\"627,469\",\"1,248,804\",\"540,367\"\n";
    // TODO add better assertions

    StringBuilder sb = new StringBuilder();
    (new CSVWriter()).Write(sb, tables[0]);
    string result = sb.ToString();

    // Ported-from-Java comparison left for reference:
    //List<CSVRecord> parsedExpected = org.apache.commons.csv.CSVParser.parse(expected, CSVFormat.EXCEL).getRecords();
    //List<CSVRecord> parsedResult = org.apache.commons.csv.CSVParser.parse(result, CSVFormat.EXCEL).getRecords();

    // NOTE(review): Encoding.ASCII would mangle the accented characters (é, á, í) in
    // the CSV output; harmless while the assertions below remain commented out, but
    // switch to UTF-8 before re-enabling them.
    using (var csv = new CsvReader(new StreamReader(new MemoryStream(Encoding.ASCII.GetBytes(result))), CultureInfo.InvariantCulture))
    {
        /*
         * Assert.Equal(parsedResult.Count, parsedExpected.Count);
         * for (int i = 0; i < parsedResult.Count; i++)
         * {
         *     Assert.Equal(parsedResult[i].size(), parsedExpected[i].size());
         * }
         */
    }
}
public void TestRealLifeRTL()
{
    // Right-to-left (Arabic) extraction on a real-world document, mednine.pdf.
    PageArea page = UtilsForTesting.GetPage("Resources/mednine.pdf", 1);
    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = extractor.Extract(page);

    Assert.Single(tables);
    var rows = tables[0].Rows;

    Assert.Equal("الانتخابات التشريعية 2014", rows[0][0].GetText()); // the doubled spaces might be a bug in my implementation.
    // bobld: missing space or worng words order
    Assert.Equal("ورقة كشف نتائج دائرة مدنين", rows[1][0].GetText());

    // Row 4: vote counts in columns 0..9, then the list name in column 10.
    string[] row4Counts = { "426", "63", "43", "56", "58", "49", "55", "33", "32", "37" };
    for (int col = 0; col < row4Counts.Length; col++)
    {
        Assert.Equal(row4Counts[col], rows[4][col].GetText());
    }

    Assert.Equal("قائمة من أجل تحقيق سلطة الشعب", rows[4][10].GetText());

    // there is one remaining problems that are not yet addressed
    // - diacritics (e.g. Arabic's tanwinً and probably Hebrew nekudot) are put in the wrong place.
    // this should get fixed, but this is a good first stab at the problem.
    // these (commented-out) tests reflect the theoretical correct answer,
    // which is not currently possible because of the two problems listed above
    //Assert.Equal("مرحباً", rows[0][0].getText()); // really ought to be ً, but this is forgiveable for now
}
public void TestCSVSerializeInfinity()
{
    // Round-trip schools.pdf through the CSV writer and compare to the stored CSV.
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/schools.csv");

    // Original area (top, left, bottom, right) = 53.74f, 16.97f, 548.74f, 762.3f on a
    // 612-point-high page; 1 is subtracted because otherwise an empty line is added
    // at the top.
    PageArea page = UtilsForTesting.GetAreaFromFirstPage(
        "Resources/schools.pdf",
        new PdfRectangle(16.97, 612 - 548.74, 762.3, 612 - 53.74 - 1));

    Table table = new SpreadsheetExtractionAlgorithm().Extract(page)[0];

    var builder = new StringBuilder();
    new CSVWriter().Write(builder, table);

    Assert.Equal(expectedCsv.Trim(), builder.ToString().Replace("\r\n", "\n"));
}
public void Eu004()
{
    // Regression smoke test for eu-004 of the ICDAR 2013 competition dataset:
    // detect table regions on page 3 (with clipped paths) and run the spreadsheet
    // extractor on the first detected region.
    // Fix: the original test made no assertions and discarded the extraction
    // result, so it could only fail via an exception.
    using (PdfDocument document = PdfDocument.Open("Resources/icdar2013-dataset/competition-dataset-eu/eu-004.pdf", new ParsingOptions() { ClipPaths = true }))
    {
        ObjectExtractor oe = new ObjectExtractor(document);
        PageArea page = oe.Extract(3);

        var detector = new SimpleNurminenDetectionAlgorithm();
        var regions = detector.Detect(page);

        // regions[0] below would otherwise throw an unhelpful
        // ArgumentOutOfRangeException if detection found nothing.
        Assert.NotEmpty(regions);

        var newArea = page.GetArea(regions[0].BoundingBox);
        var sea = new SpreadsheetExtractionAlgorithm();
        var tables = sea.Extract(newArea);

        // Minimal assertion so the result is no longer silently discarded.
        Assert.NotNull(tables);
    }
}
/// <summary>
/// Loads page 1 of twotables.pdf and runs the spreadsheet extractor on it.
/// </summary>
private List<Table> GetTables()
{
    PageArea page = UtilsForTesting.GetPage("Resources/twotables.pdf", 1);
    return new SpreadsheetExtractionAlgorithm().Extract(page);
}
public void TestCSVMultilineRow()
{
    // frx_2012_disclosure.pdf contains cells spanning several text lines; the CSV
    // writer must keep each one inside a single quoted record.
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/frx_2012_disclosure.csv");

    // NaN rectangle — presumably selects the whole first page (TODO confirm);
    // the original explicit area was 53.0f, 49.0f, 735.0f, 550.0f.
    PageArea page = UtilsForTesting.GetAreaFromFirstPage(
        "Resources/frx_2012_disclosure.pdf",
        new PdfRectangle(double.NaN, double.NaN, double.NaN, double.NaN));

    Table table = new SpreadsheetExtractionAlgorithm().Extract(page)[0];

    var builder = new StringBuilder();
    new CSVWriter().Write(builder, table);

    Assert.Equal(expectedCsv, builder.ToString());
}
public void TestRTL()
{
    // Basic right-to-left (Arabic) text extraction checks on arabic.pdf.
    PageArea page = UtilsForTesting.GetPage("Resources/arabic.pdf", 1);
    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = extractor.Extract(page);
    // Assert.Equal(1, tables.size());

    var rows = tables[0].Rows;

    Assert.Equal("اسمي سلطان", rows[1][1].GetText());
    Assert.Equal("من اين انت؟", rows[2][1].GetText());
    Assert.Equal("1234", rows[3][0].GetText());
    Assert.Equal("هل انت شباك؟", rows[4][0].GetText());
    Assert.Equal("انا من ولاية كارولينا الشمال", rows[2][0].GetText()); // conjoined lam-alif gets missed
    Assert.Equal("اسمي Jeremy في الانجليزية", rows[4][1].GetText());    // conjoined lam-alif gets missed
    Assert.Equal("عندي 47 قطط", rows[3][1].GetText());                  // the real right answer is 47.
    Assert.Equal("Jeremy is جرمي in Arabic", rows[5][0].GetText());     // the real right answer is 47.
    Assert.Equal("مرحباً", rows[1][0].GetText());                        // really ought to be ً, but this is forgiveable for now

    // there is one remaining problems that are not yet addressed
    // - diacritics (e.g. Arabic's tanwinً and probably Hebrew nekudot) are put in the wrong place.
    // this should get fixed, but this is a good first stab at the problem.
    // these (commented-out) tests reflect the theoretical correct answer,
    // which is not currently possible because of the two problems listed above
    // Assert.Equal("مرحباً", table.getRows()[0][0].getText()); // really ought to be ً, but this is forgiveable for now
}
public void TestSpanningCells()
{
    // Compare the JSON serialization of spanning_cells.pdf against the stored
    // reference, allowing small coordinate differences. The reference coordinates
    // are converted with pageHeight (842) for top/bottom before comparing.
    PageArea page = UtilsForTesting.GetPage("Resources/spanning_cells.pdf", 1);
    string expectedJson = UtilsForTesting.LoadJson("Resources/json/spanning_cells.json");

    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = extractor.Extract(page);
    Assert.Equal(2, tables.Count);

    var expectedJObject = (JArray)JsonConvert.DeserializeObject(expectedJson);
    StringBuilder sb = new StringBuilder();
    new JSONWriter().Write(sb, tables);
    var actualJObject = (JArray)JsonConvert.DeserializeObject(sb.ToString());

    double pageHeight = 842;
    double precision = 2;

    // |floor(expected) - floor(actual)| must stay below the tolerance.
    void AssertClose(double expectedValue, double actualValue)
        => Assert.True(Math.Abs(Math.Floor(expectedValue) - Math.Floor(actualValue)) < precision);

    for (int i = 0; i < 2; i++)
    {
        Assert.Equal(expectedJObject[i]["extraction_method"], actualJObject[i]["extraction_method"]);

        AssertClose(pageHeight - expectedJObject[i]["top"].Value<double>(), actualJObject[i]["top"].Value<double>());
        AssertClose(expectedJObject[i]["left"].Value<double>(), actualJObject[i]["left"].Value<double>());
        AssertClose(expectedJObject[i]["width"].Value<double>(), actualJObject[i]["width"].Value<double>());
        AssertClose(expectedJObject[i]["height"].Value<double>(), actualJObject[i]["height"].Value<double>());
        AssertClose(expectedJObject[i]["right"].Value<double>(), actualJObject[i]["right"].Value<double>());
        AssertClose(pageHeight - expectedJObject[i]["bottom"].Value<double>(), actualJObject[i]["bottom"].Value<double>());

        var expectedData = (JArray)expectedJObject[i]["data"];
        var actualData = (JArray)actualJObject[i]["data"];
        Assert.Equal(expectedData.Count, actualData.Count);

        for (int r = 0; r < expectedData.Count; r++)
        {
            var rowExpected = (JArray)expectedData[r];
            var rowActual = (JArray)actualData[r];
            Assert.Equal(rowExpected.Count, rowActual.Count);

            for (int c = 0; c < rowExpected.Count; c++)
            {
                var cellExpected = (JObject)rowExpected[c];
                var cellActual = (JObject)rowActual[c];

                if (string.IsNullOrEmpty(cellExpected["text"].Value<string>()))
                {
                    continue; // empty cell have no coordinate data???
                }

                AssertClose(pageHeight - cellExpected["top"].Value<double>(), cellActual["top"].Value<double>());
                AssertClose(cellExpected["left"].Value<double>(), cellActual["left"].Value<double>());
                AssertClose(cellExpected["width"].Value<double>(), cellActual["width"].Value<double>());
                AssertClose(cellExpected["height"].Value<double>(), cellActual["height"].Value<double>());
                Assert.Equal(cellExpected["text"].Value<string>(), cellActual["text"].Value<string>());
            }
        }
    }
    //Assert.Equal(expectedJson, sb.ToString());
}
public void TestShouldDetectASingleSpreadsheet()
{
    // Original area: 68.08f, 16.44f, 680.85f, 597.84f; converted with a
    // 792-point page height.
    PageArea page = UtilsForTesting.GetAreaFromPage(
        "Resources/offense.pdf",
        1,
        new PdfRectangle(16.44, 792 - 680.85, 597.84, 792 - 16.44));

    var algorithm = new SpreadsheetExtractionAlgorithm();
    Assert.Single(algorithm.Extract(page));
}
public void TestIncompleteGrid()
{
    // china.pdf: page 1 is expected to yield exactly two tables even though the
    // ruling grid is incomplete.
    PageArea page = UtilsForTesting.GetPage("Resources/china.pdf", 1);
    var algorithm = new SpreadsheetExtractionAlgorithm();
    Assert.Equal(2, algorithm.Extract(page).Count);
}
public void TestSpreadsheetsSortedByTopAndRight()
{
    PageArea page = UtilsForTesting.GetPage("Resources/sydney_disclosure_contract.pdf", 1);
    var algorithm = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = algorithm.Extract(page);

    // Each table's Top must not exceed its predecessor's (non-increasing order).
    for (int i = 1; i < tables.Count; i++)
    {
        Assert.True(tables[i - 1].Top >= tables[i].Top);
        // Assert.True(tables[i - 1].getTop() <= tables[i].getTop());
    }
}
public void TestAnotherExtractTableWithExternallyDefinedRulings()
{
    // us-024.pdf with the second set of externally defined ruling lines.
    PageArea page = UtilsForTesting.GetPage("Resources/us-024.pdf", 1);
    var algorithm = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = algorithm.Extract(page, EXTERNALLY_DEFINED_RULINGS2.ToList());

    Assert.Single(tables);

    Table table = tables[0];
    Assert.Equal("Total Supply", table.Rows[4][0].GetText());
    Assert.Equal("6.6", table.Rows[6][2].GetText());
}
public void TestExtractColumnsCorrectly3()
{
    // Original area (top, left, bottom, right) = 106.01f, 48.09f, 227.31f, 551.89f:
    //   bottom = 792 - 227.31 = 564.69
    //   top    = 792 - 106.01 = 685.99
    // 685.99 was changed to 684.99 because it was adding an empty row at the top.
    PageArea page = UtilsForTesting.GetAreaFromFirstPage(
        "Resources/frx_2012_disclosure.pdf",
        new PdfRectangle(48.09, 564.69, 551.89, 684.99));

    Table table = new SpreadsheetExtractionAlgorithm().Extract(page)[0];
    Assert.Equal("REGIONAL PULMONARY & SLEEP\rMEDICINE", table.Rows[8][1].GetText());
}
public void TestDontStackOverflowQuicksort()
{
    // Regression test: sorting the tables of failing_sort.pdf must complete
    // (no stack overflow) and leave them in non-increasing Top order.
    PageArea page = UtilsForTesting.GetPage("Resources/failing_sort.pdf", 1);
    var algorithm = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = algorithm.Extract(page);

    for (int i = 1; i < tables.Count; i++)
    {
        Assert.True(tables[i - 1].Top >= tables[i].Top);
        //Assert.True(tables[i - 1].getTop() <= tables[i].getTop());
    }
}
public void TestSpanningCellsToCsv()
{
    // CSV serialization of tables containing spanning (merged) cells.
    PageArea page = UtilsForTesting.GetPage("Resources/spanning_cells.pdf", 1);
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/spanning_cells.csv");

    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = extractor.Extract(page);
    Assert.Equal(2, tables.Count);

    var builder = new StringBuilder();
    new CSVWriter().Write(builder, tables);

    Assert.Equal(expectedCsv, builder.ToString().Replace("\r\n", "\n").Trim());
}
public void TestJSONSerializeInfinity()
{
    string expectedJson = UtilsForTesting.LoadJson("Resources/json/schools.json");

    // NaN rectangle — presumably selects the whole first page (TODO confirm);
    // the original explicit area was 53.74f, 16.97f, 548.74f, 762.3f.
    PageArea page = UtilsForTesting.GetAreaFromFirstPage(
        "Resources/schools.pdf",
        new PdfRectangle(double.NaN, double.NaN, double.NaN, double.NaN));

    Table table = new SpreadsheetExtractionAlgorithm().Extract(page)[0];

    var builder = new StringBuilder();
    new JSONWriter().Write(builder, table);

    Assert.Equal(expectedJson, builder.ToString());
}
public void TestNaturalOrderOfRectanglesDoesNotBreakContract()
{
    // Serializes the first table of page 2 of us-017.pdf to CSV and compares it,
    // after newline normalization, against the expected text below.
    PageArea page = UtilsForTesting.GetPage("Resources/us-017.pdf", 2);
    SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
    List <Table> tables = se.Extract(page);

    // Expected CSV kept verbatim: "\r\n" separates records, a bare "\r" inside a
    // quoted field is an in-cell line break.
    string expected = "Project,Agency,Institution\r\nNanotechnology and its publics,NSF,Pennsylvania State University\r\n\"Public information and deliberation in nanoscience and\rnanotechnology policy (SGER)\",Interagency,\"North Carolina State\rUniversity\"\r\n\"Social and ethical research and education in agrifood\rnanotechnology (NIRT)\",NSF,Michigan State University\r\n\"From laboratory to society: developing an informed\rapproach to nanoscale science and engineering (NIRT)\",NSF,University of South Carolina\r\nDatabase and innovation timeline for nanotechnology,NSF,UCLA\r\nSocial and ethical dimensions of nanotechnology,NSF,University of Virginia\r\n\"Undergraduate exploration of nanoscience,\rapplications and societal implications (NUE)\",NSF,\"Michigan Technological\rUniversity\"\r\n\"Ethics and belief inside the development of\rnanotechnology (CAREER)\",NSF,University of Virginia\r\n\"All centers, NNIN and NCN have a societal\rimplications components\",\"NSF, DOE,\rDOD, and NIH\",\"All nanotechnology centers\rand networks\"";
    // \r\n

    StringBuilder sb = new StringBuilder();
    (new CSVWriter()).Write(sb, tables[0]);
    string result = sb.ToString().Trim();

    // Both sides are normalized to '\r'-separated records before comparison so the
    // writer's platform line endings do not matter.
    Assert.Equal(expected.Replace("\r\n", "\r"), result.Replace("\r\n", "\n").Replace("\n", "\r"));
}
public void TestSpreadsheetWithNoBoundingFrameShouldBeSpreadsheet()
{
    // Even without an outer bounding frame the area must be classified as tabular
    // and extract to the stored CSV.
    // Original area (top, left, bottom, right) = 150.56f, 58.9f, 654.7f, 536.12f,
    // converted with a 842-point page height.
    PageArea page = UtilsForTesting.GetAreaFromPage(
        "Resources/spreadsheet_no_bounding_frame.pdf",
        1,
        new PdfRectangle(58.9, 842 - 654.7, 536.12, 842 - 150.56));

    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/spreadsheet_no_bounding_frame.csv");

    var extractor = new SpreadsheetExtractionAlgorithm();
    Assert.True(extractor.IsTabular(page));

    List<Table> tables = extractor.Extract(page);
    var builder = new StringBuilder();
    new CSVWriter().Write(builder, tables[0]);

    Assert.Equal(expectedCsv, builder.ToString());
}
public void Latice1()
{
    PageArea page = UtilsForTesting.GetPage("Resources/data.pdf", 1);

    // data_lattice.csv was modified to add the last row, missing in tabula_py
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/data_lattice.csv");

    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = extractor.Extract(page);
    Assert.Single(tables);

    var builder = new StringBuilder();
    new CSVWriter().Write(builder, tables[0]);

    Assert.Equal(expectedCsv, builder.ToString().Replace("\r\n", "\n"));
}
public void TestSpreadsheetExtractionIssue656()
{
    // Regression test for issue 656.
    // page height = 482, width 762.3 // 612
    // original area (top, left, bottom, right) = 56.925f, 24.255f, 549.945f, 786.555f
    PageArea page = UtilsForTesting.GetAreaFromFirstPage(
        "Resources/Publication_of_award_of_Bids_for_Transport_Sector__August_2016.pdf",
        new PdfRectangle(24.255, 71, 786.555, 553));

    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/Publication_of_award_of_Bids_for_Transport_Sector__August_2016.csv");

    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = extractor.Extract(page);
    Assert.Single(tables);

    var builder = new StringBuilder();
    new CSVWriter().Write(builder, tables[0]);
    string result = builder.ToString();

    // Normalize both sides to '\r'-separated records before comparing so line-ending
    // differences between the stored CSV and the writer output do not matter.
    Assert.Equal(expectedCsv.Replace("\n", "\r"), result.Replace("\r\n", "\n").Replace("\n", "\r").Trim());
}