// Extracts the table inside a fixed area of page 1 of EU_002_PDF with the basic algorithm
// and compares every cell with EU_002_EXPECTED. Only runs its assertions on Windows.
public void TestExtractColumnsCorrectly()
{
    // Fails on Linux and macOS: linked to PdfPig not finding the correct font.
    // Would need "apt-get -y install ttf-mscorefonts-installer", but the mscorefonts
    // EULA license could not be presented.
    // (Previously considered: || RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
    if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
    {
        return;
    }

    var area = new PdfRectangle(70.0, 725 - (233 - 115), 510.0, 725);
    PageArea page = UtilsForTesting.GetAreaFromPage(EU_002_PDF, 1, area);

    var extractor = new BasicExtractionAlgorithm();
    Table table = extractor.Extract(page)[0];
    var actualRows = UtilsForTesting.TableToArrayOfRows(table);

    Assert.Equal(EU_002_EXPECTED.Length, actualRows.Length);
    for (int row = 0; row < EU_002_EXPECTED.Length; row++)
    {
        var expectedRow = EU_002_EXPECTED[row];
        var actualRow = actualRows[row];

        Assert.Equal(expectedRow.Length, actualRow.Length);
        for (int col = 0; col < expectedRow.Length; col++)
        {
            Assert.Equal(expectedRow[col], actualRow[col]);
        }
    }
}
// Extracts a fixed area of page 3 of EU_017_PDF with the basic algorithm, seeded with the
// page's vertical rulings, and compares every cell with EU_017_EXPECTED.
// Only runs its assertions on Windows.
public void TestExtractColumnsCorrectly2()
{
    // Fails on Linux and macOS: linked to PdfPig not finding the correct font.
    // Would need "apt-get -y install ttf-mscorefonts-installer", but the mscorefonts
    // EULA license could not be presented.
    // (Previously considered: || RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
    if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
    {
        return;
    }

    PageArea page = UtilsForTesting.GetPage(EU_017_PDF, 3);
    var extractor = new BasicExtractionAlgorithm(page.VerticalRulings);

    var region = new PdfRectangle(148.44, 543 - (711.875 - 299.625), 452.32, 543);
    Table table = extractor.Extract(page.GetArea(region))[0];
    var actualRows = UtilsForTesting.TableToArrayOfRows(table);

    Assert.Equal(EU_017_EXPECTED.Length, actualRows.Length);
    for (int row = 0; row < EU_017_EXPECTED.Length; row++)
    {
        var expectedRow = EU_017_EXPECTED[row];
        var actualRow = actualRows[row];

        Assert.Equal(expectedRow.Length, actualRow.Length);
        for (int col = 0; col < expectedRow.Length; col++)
        {
            Assert.Equal(expectedRow[col], actualRow[col]);
        }
    }
}
// Extracts the spreadsheet inside a sub-area of page 1 of puertos1.pdf, checks the number of
// rows, and verifies that the CSV serialization parses back into as many records as the
// reference CSV below.
public void TestExtractSpreadsheetWithinAnArea()
{
    // Source coordinates kept for reference:
    // 273.9035714285714f, 30.32142857142857f, 554.8821428571429f, 546.7964285714286f
    PageArea page = UtilsForTesting.GetAreaFromPage(
        "Resources/puertos1.pdf",
        1,
        new PdfRectangle(30.32142857142857, 793 - 554.8821428571429, 546.7964285714286, 793 - 273.9035714285714));

    SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = se.Extract(page);
    Table table = tables[0];
    Assert.Equal(15, table.Rows.Count);

    const string expected =
        "\"\",TM,M.U$S,TM,M.U$S,TM,M.U$S,TM,M.U$S,TM,M.U$S,TM,M.U$S,TM\n" +
        "Peces vivos,1,25,1,23,2,38,1,37,2,67,2,89,1\n" +
        "\"Pescado fresco\n" +
        "o refrigerado.\n" +
        "exc. filetes\",7.704,7.175,8.931,6.892,12.635,10.255,16.742,13.688,14.357,11.674,13.035,13.429,9.727\n" +
        "\"Pescado congelado\n" +
        "exc. filetes\",90.560,105.950,112.645,108.416,132.895,115.874,152.767,133.765,148.882,134.847,156.619,165.134,137.179\n" +
        "\"Filetes y demás car-\n" +
        "nes de pescado\",105.434,200.563,151.142,218.389,152.174,227.780,178.123,291.863,169.422,313.735,176.427,381.640,144.814\n" +
        "\"Pescado sec./sal./\n" +
        "en salm. har./pol./\n" +
        "pell. aptos\n" +
        "p/c humano\",6.837,14.493,6.660,9.167,14.630,17.579,18.150,21.302,18.197,25.739,13.460,23.549,11.709\n" +
        "Crustáceos,61.691,375.798,52.488,251.043,47.635,387.783,27.815,217.443,7.123,86.019,39.488,373.583,45.191\n" +
        "Moluscos,162.027,174.507,109.436,111.443,90.834,104.741,57.695,109.141,98.182,206.304,187.023,251.352,157.531\n" +
        "\"Prod. no exp. en\n" +
        "otros capítulos.\n" +
        "No apto p/c humano\",203,328,7,35,521,343,\"1,710\",\"1,568\",125,246,124,263,131\n" +
        "\"Grasas y aceites de\n" +
        "pescado y mamíferos\n" +
        "marinos\",913,297,\"1,250\",476,\"1,031\",521,\"1,019\",642,690,483,489,710,959\n" +
        "\"Extractos y jugos de\n" +
        "pescado y mariscos\",5,25,1,3,4,4,31,93,39,117,77,230,80\n" +
        "\"Preparaciones y con-\n" +
        "servas de pescado\",846,\"3,737\",\"1,688\",\"4,411\",\"1,556\",\"3,681\",\"2,292\",\"5,474\",\"2,167\",\"7,494\",\"2,591\",\"8,833\",\"2,795\"\n" +
        "\"Preparaciones y con-\n" +
        "servas de mariscos\",348,\"3,667\",345,\"1,771\",738,\"3,627\",561,\"2,620\",607,\"3,928\",314,\"2,819\",250\n" +
        "\"Harina, polvo y pe-\n" +
        "llets de pescado.No\n" +
        "aptos p/c humano\",\"16,947\",\"8,547\",\"11,867\",\"6,315\",\"32,528\",\"13,985\",\"37,313\",\"18,989\",\"35,787\",\"19,914\",\"37,821\",\"27,174\",\"30,000\"\n" +
        "TOTAL,\"453,515\",\"895,111\",\"456,431\",\"718,382\",\"487,183\",\"886,211\",\"494,220\",\"816,623\",\"495,580\",\"810,565\",\"627,469\",\"1,248,804\",\"540,367\"\n";

    StringBuilder sb = new StringBuilder();
    (new CSVWriter()).Write(sb, tables[0]);
    string result = sb.ToString();

    // Both documents must parse into the same number of CSV records.
    // TODO add better assertions (per-record field counts and cell contents).
    Assert.Equal(CountCsvRecords(expected), CountCsvRecords(result));

    // Counts the records of a CSV document. The text is read through a StringReader so that
    // non-ASCII characters are not degraded by a byte-encoding round trip.
    int CountCsvRecords(string csvText)
    {
        using (var csv = new CsvReader(new StringReader(csvText), CultureInfo.InvariantCulture))
        {
            int records = 0;
            while (csv.Read())
            {
                records++;
            }

            return records;
        }
    }
}
// Extracts a table from an area of the first page of schools.pdf with the spreadsheet
// algorithm and compares its CSV serialization with Resources/csv/schools.csv.
public void TestCSVSerializeInfinity()
{
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/schools.csv");

    // Source area as (top, left, bottom, right): 53.74f, 16.97f, 548.74f, 762.3f,
    // with a page height of 612. One unit is removed from the last coordinate because
    // an empty line is added at the top otherwise.
    var area = new PdfRectangle(16.97, 612 - 548.74, 762.3, 612 - 53.74 - 1);
    PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/schools.pdf", area);

    var extractor = new SpreadsheetExtractionAlgorithm();
    Table table = extractor.Extract(page)[0];

    var csvText = new StringBuilder();
    new CSVWriter().Write(csvText, table);
    string actualCsv = csvText.ToString();

    // Line endings of the written CSV are normalised before comparing.
    Assert.Equal(expectedCsv.Trim(), actualCsv.Replace("\r\n", "\n"));
}
// Writes the table returned by GetTable() as CSV through a StreamWriter and checks both the
// first line (EXPECTED_CSV_WRITER_OUTPUT) and the whole document against the reference file.
public void TestCSVWriter()
{
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/argentina_diputados_voting_record.csv");
    Table table = this.GetTable();

    using (var stream = new MemoryStream())
    using (var writer = new StreamWriter(stream) { AutoFlush = true })
    {
        (new CSVWriter()).Write(writer, table);

        var reader = new StreamReader(stream);
        stream.Position = 0;

        // Trim to remove the last new line, then normalise CRLF to LF once so that the
        // first-line check behaves the same whether records end with "\r\n" or "\n".
        string csv = reader.ReadToEnd().Trim().Replace("\r\n", "\n");
        string[] lines = csv.Split('\n');

        Assert.Equal(EXPECTED_CSV_WRITER_OUTPUT, lines[0]);
        Assert.Equal(expectedCsv, csv);
    }
}
// Extracts the single table of page 1 of mednine.pdf (right-to-left text) with the
// spreadsheet algorithm and checks selected cells.
public void TestRealLifeRTL()
{
    PageArea page = UtilsForTesting.GetPage("Resources/mednine.pdf", 1);
    List<Table> tables = new SpreadsheetExtractionAlgorithm().Extract(page);
    Assert.Single(tables);

    var rows = tables[0].Rows;

    // Note carried over from the original port: doubled spaces in this title might be a bug
    // in the implementation.
    Assert.Equal("الانتخابات التشريعية 2014", rows[0][0].GetText());

    // bobld: missing space or wrong word order.
    Assert.Equal("ورقة كشف نتائج دائرة مدنين", rows[1][0].GetText());

    // Numeric cells of row 4, columns 0 through 9.
    string[] expectedNumbers = { "426", "63", "43", "56", "58", "49", "55", "33", "32", "37" };
    for (int col = 0; col < expectedNumbers.Length; col++)
    {
        Assert.Equal(expectedNumbers[col], rows[4][col].GetText());
    }

    Assert.Equal("قائمة من أجل تحقيق سلطة الشعب", rows[4][10].GetText());

    // Remaining known problem: diacritics (e.g. Arabic tanwin and probably Hebrew nekudot)
    // are put in the wrong place, so the theoretically correct strings cannot be asserted yet.
}
// Extracts the first table of frx_2012_disclosure.pdf with the spreadsheet algorithm and
// compares its CSV serialization with Resources/csv/frx_2012_disclosure.csv.
public void TestCSVMultilineRow()
{
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/frx_2012_disclosure.csv");

    // All four bounds are NaN; the source area 53.0f, 49.0f, 735.0f, 550.0f is kept for reference.
    var nanBounds = new PdfRectangle(double.NaN, double.NaN, double.NaN, double.NaN);
    PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/frx_2012_disclosure.pdf", nanBounds);

    var extractor = new SpreadsheetExtractionAlgorithm();
    Table table = extractor.Extract(page)[0];

    var csvText = new StringBuilder();
    new CSVWriter().Write(csvText, table);
    string actualCsv = csvText.ToString();

    Assert.Equal(expectedCsv, actualCsv);
}
// Returns the tables that the spreadsheet algorithm extracts from page 1 of twotables.pdf.
private List<Table> GetTables()
{
    PageArea firstPage = UtilsForTesting.GetPage("Resources/twotables.pdf", 1);
    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> extracted = extractor.Extract(firstPage);
    return extracted;
}
// Writes the tables returned by GetTables() as CSV through a StreamWriter and compares the
// trimmed output with Resources/csv/twotables.csv.
public void TestCSVSerializeTwoTables()
{
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/twotables.csv");
    List<Table> tables = this.GetTables();

    using (var buffer = new MemoryStream())
    using (var writer = new StreamWriter(buffer) { AutoFlush = true })
    {
        new CSVWriter().Write(writer, tables);

        // Rewind the buffer and read back everything that was written.
        var reader = new StreamReader(buffer);
        buffer.Position = 0;
        string written = reader.ReadToEnd();

        // Trim to remove the last new line.
        string actualCsv = written.Trim();
        Assert.Equal(expectedCsv, actualCsv);
    }
}
// Serializes the tables returned by GetTables() to JSON and compares the text with
// Resources/json/twotables.json.
public void TestJSONSerializeTwoTables()
{
    string expectedJson = UtilsForTesting.LoadJson("Resources/json/twotables.json");
    List<Table> tables = this.GetTables();

    var jsonText = new StringBuilder();
    new JSONWriter().Write(jsonText, tables);
    string actualJson = jsonText.ToString();

    Assert.Equal(expectedJson, actualJson);
}
// Returns the first table that the basic algorithm extracts from a fixed area of the first
// page of argentina_diputados_voting_record.pdf.
private Table GetTable()
{
    // Source area kept for reference: 269.875f, 12.75f, 790.5f, 561f
    var area = new PdfRectangle(12.75, 55.0, 561, 567);
    PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/argentina_diputados_voting_record.pdf", area);

    var extractor = new BasicExtractionAlgorithm();
    Table first = extractor.Extract(page)[0];
    return first;
}
// Extracts page 1 of us-007.pdf with the spreadsheet algorithm, using
// EXTERNALLY_DEFINED_RULINGS, and checks the cell count and the text of each cell.
public void TestExtractTableWithExternallyDefinedRulings()
{
    PageArea page = UtilsForTesting.GetPage("Resources/us-007.pdf", 1);
    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = extractor.Extract(page, EXTERNALLY_DEFINED_RULINGS.ToList());

    Assert.Single(tables);
    Table table = tables[0];
    Assert.Equal(18, table.Cells.Count);

    // Expected text of the two columns of each of the nine rows, in row order.
    string[][] expectedText =
    {
        new[] { "Payroll Period", "One Withholding\rAllowance" },
        new[] { "Weekly", "$71.15" },
        new[] { "Biweekly", "142.31" },
        new[] { "Semimonthly", "154.17" },
        new[] { "Monthly", "308.33" },
        new[] { "Quarterly", "925.00" },
        new[] { "Semiannually", "1,850.00" },
        new[] { "Annually", "3,700.00" },
        new[] { "Daily or Miscellaneous\r(each day of the payroll period)", "14.23" },
    };

    var rows = table.Rows;
    for (int r = 0; r < expectedText.Length; r++)
    {
        for (int c = 0; c < expectedText[r].Length; c++)
        {
            Assert.Equal(expectedText[r][c], rows[r][c].GetText());
        }
    }
}
// Extracts the first table of page 1 of arabic.pdf with the spreadsheet algorithm and checks
// the text of cells containing right-to-left and mixed-direction content.
public void TestRTL()
{
    PageArea page = UtilsForTesting.GetPage("Resources/arabic.pdf", 1);
    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = extractor.Extract(page);

    // The table count is deliberately not asserted here.
    var rows = tables[0].Rows;

    // Shorthand for the text of the cell at (row, column).
    string TextAt(int row, int column) => rows[row][column].GetText();

    Assert.Equal("اسمي سلطان", TextAt(1, 1));
    Assert.Equal("من اين انت؟", TextAt(2, 1));
    Assert.Equal("1234", TextAt(3, 0));
    Assert.Equal("هل انت شباك؟", TextAt(4, 0));

    // Conjoined lam-alif gets missed in the next two cells.
    Assert.Equal("انا من ولاية كارولينا الشمال", TextAt(2, 0));
    Assert.Equal("اسمي Jeremy في الانجليزية", TextAt(4, 1));

    // Mixed numeric / Latin / Arabic content.
    Assert.Equal("عندي 47 قطط", TextAt(3, 1));
    Assert.Equal("Jeremy is جرمي in Arabic", TextAt(5, 0));

    // The diacritic position is not ideal, but this is forgivable for now.
    Assert.Equal("مرحباً", TextAt(1, 0));

    // Remaining known problem: diacritics (e.g. Arabic tanwin and probably Hebrew nekudot)
    // are put in the wrong place, so the theoretically correct strings cannot be asserted yet.
}
// Extracts the two tables of page 1 of spanning_cells.pdf, serializes them to JSON and
// compares table and cell geometry (floored, within a tolerance) and cell text with
// Resources/json/spanning_cells.json.
public void TestSpanningCells()
{
    PageArea page = UtilsForTesting.GetPage("Resources/spanning_cells.pdf", 1);
    string expectedJson = UtilsForTesting.LoadJson("Resources/json/spanning_cells.json");

    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = extractor.Extract(page);
    Assert.Equal(2, tables.Count);

    var expectedTables = (JArray)JsonConvert.DeserializeObject(expectedJson);

    var jsonText = new StringBuilder();
    new JSONWriter().Write(jsonText, tables);
    var actualTables = (JArray)JsonConvert.DeserializeObject(jsonText.ToString());

    double pageHeight = 842;
    double precision = 2;

    // Passes when the floored values differ by less than the tolerance.
    void AssertFloorsClose(double expected, double actual)
    {
        Assert.True(Math.Abs(Math.Floor(expected) - Math.Floor(actual)) < precision);
    }

    for (int t = 0; t < 2; t++)
    {
        var expectedTable = expectedTables[t];
        var actualTable = actualTables[t];

        Assert.Equal(expectedTable["extraction_method"], actualTable["extraction_method"]);

        // Expected "top" and "bottom" are subtracted from the page height before comparing.
        AssertFloorsClose(pageHeight - expectedTable["top"].Value<double>(), actualTable["top"].Value<double>());
        AssertFloorsClose(expectedTable["left"].Value<double>(), actualTable["left"].Value<double>());
        AssertFloorsClose(expectedTable["width"].Value<double>(), actualTable["width"].Value<double>());
        AssertFloorsClose(expectedTable["height"].Value<double>(), actualTable["height"].Value<double>());
        AssertFloorsClose(expectedTable["right"].Value<double>(), actualTable["right"].Value<double>());
        AssertFloorsClose(pageHeight - expectedTable["bottom"].Value<double>(), actualTable["bottom"].Value<double>());

        var expectedRows = (JArray)expectedTable["data"];
        var actualRows = (JArray)actualTable["data"];
        Assert.Equal(expectedRows.Count, actualRows.Count);

        for (int r = 0; r < expectedRows.Count; r++)
        {
            var expectedRow = (JArray)expectedRows[r];
            var actualRow = (JArray)actualRows[r];
            Assert.Equal(expectedRow.Count, actualRow.Count);

            for (int c = 0; c < expectedRow.Count; c++)
            {
                var expectedCell = (JObject)expectedRow[c];
                var actualCell = (JObject)actualRow[c];

                // Cells whose expected text is empty are skipped (possibly because they
                // carry no coordinate data).
                if (!string.IsNullOrEmpty(expectedCell["text"].Value<string>()))
                {
                    AssertFloorsClose(pageHeight - expectedCell["top"].Value<double>(), actualCell["top"].Value<double>());
                    AssertFloorsClose(expectedCell["left"].Value<double>(), actualCell["left"].Value<double>());
                    AssertFloorsClose(expectedCell["width"].Value<double>(), actualCell["width"].Value<double>());
                    AssertFloorsClose(expectedCell["height"].Value<double>(), actualCell["height"].Value<double>());
                    Assert.Equal(expectedCell["text"].Value<string>(), actualCell["text"].Value<string>());
                }
            }
        }
    }
}
// The spreadsheet algorithm must find exactly two tables on page 1 of china.pdf.
public void TestIncompleteGrid()
{
    PageArea page = UtilsForTesting.GetPage("Resources/china.pdf", 1);
    var extractor = new SpreadsheetExtractionAlgorithm();

    List<Table> found = extractor.Extract(page);

    Assert.Equal(2, found.Count);
}
// The spreadsheet algorithm must find exactly one table in the given area of page 1 of
// offense.pdf.
public void TestShouldDetectASingleSpreadsheet()
{
    // Source area kept for reference: 68.08f, 16.44f, 680.85f, 597.84f
    var area = new PdfRectangle(16.44, 792 - 680.85, 597.84, 792 - 16.44);
    PageArea page = UtilsForTesting.GetAreaFromPage("Resources/offense.pdf", 1, area);

    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> found = extractor.Extract(page);

    Assert.Single(found);
}
// Extracting the given area of page 1 of indictb1h_14.pdf with the basic algorithm must
// yield a table equal to EXPECTED_EMPTY_TABLE.
public void TestEmptyRegion()
{
    var area = new PdfRectangle(0, 700, 100.9, 800);
    PageArea page = UtilsForTesting.GetAreaFromPage("Resources/indictb1h_14.pdf", 1, area);

    var extractor = new BasicExtractionAlgorithm();
    Table table = extractor.Extract(page)[0];
    var actualRows = UtilsForTesting.TableToArrayOfRows(table);

    Assert.Equal(EXPECTED_EMPTY_TABLE, actualRows);
}
// Extracts an area of the first page of m27.pdf with the basic algorithm and checks the
// text of the second and third cells of the first row.
public void TestRemoveSequentialSpaces()
{
    var area = new PdfRectangle(28.28, 532 - (103.04 - 79.2), 732.6, 532);
    PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/m27.pdf", area);

    var extractor = new BasicExtractionAlgorithm();
    Table table = extractor.Extract(page)[0];

    var topRow = table.Rows[0];
    Assert.Equal("ALLEGIANT AIR", topRow[1].GetText());
    Assert.Equal("ALLEGIANT AIR LLC", topRow[2].GetText());
}
// Extracts an area of the first page of frx_2012_disclosure.pdf with the spreadsheet
// algorithm and checks the text of one multi-line cell.
public void TestExtractColumnsCorrectly3()
{
    // Source area as (top, left, bottom, right): 106.01f, 48.09f, 227.31f, 551.89f
    //   bottom = 792 - 227.31 = 564.69
    //   top    = 792 - 106.01 = 685.99
    // 685.99 was changed to 684.99 because an empty row was being added at the top.
    var area = new PdfRectangle(48.09, 564.69, 551.89, 684.99);
    PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/frx_2012_disclosure.pdf", area);

    var extractor = new SpreadsheetExtractionAlgorithm();
    Table table = extractor.Extract(page)[0];

    string cellText = table.Rows[8][1].GetText();
    Assert.Equal("REGIONAL PULMONARY & SLEEP\rMEDICINE", cellText);
}
// Extracting page 1 of failing_sort.pdf with the spreadsheet algorithm must complete, and
// the returned tables must be ordered by non-increasing Top.
public void TestDontStackOverflowQuicksort()
{
    PageArea page = UtilsForTesting.GetPage("Resources/failing_sort.pdf", 1);
    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = extractor.Extract(page);

    for (int index = 1; index < tables.Count; index++)
    {
        Table previous = tables[index - 1];
        Table current = tables[index];

        // The comparison is ">=" here; the earlier getTop()-based check used "<=".
        Assert.True(previous.Top >= current.Top);
    }
}
// Extracts page 1 of us-024.pdf with the spreadsheet algorithm, using
// EXTERNALLY_DEFINED_RULINGS2, and checks two cells of the single resulting table.
public void TestAnotherExtractTableWithExternallyDefinedRulings()
{
    PageArea page = UtilsForTesting.GetPage("Resources/us-024.pdf", 1);
    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = extractor.Extract(page, EXTERNALLY_DEFINED_RULINGS2.ToList());

    Assert.Single(tables);

    Table only = tables[0];
    Assert.Equal("Total Supply", only.Rows[4][0].GetText());
    Assert.Equal("6.6", only.Rows[6][2].GetText());
}
// The tables extracted from page 1 of sydney_disclosure_contract.pdf with the spreadsheet
// algorithm must be ordered by non-increasing Top.
public void TestSpreadsheetsSortedByTopAndRight()
{
    PageArea page = UtilsForTesting.GetPage("Resources/sydney_disclosure_contract.pdf", 1);
    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = extractor.Extract(page);

    for (int index = 1; index < tables.Count; index++)
    {
        Table previous = tables[index - 1];
        Table current = tables[index];

        // The comparison is ">=" here; the earlier getTop()-based check used "<=".
        Assert.True(previous.Top >= current.Top);
    }
}
// Extracts the first table of schools.pdf with the spreadsheet algorithm and compares its
// JSON serialization with Resources/json/schools.json.
public void TestJSONSerializeInfinity()
{
    string expectedJson = UtilsForTesting.LoadJson("Resources/json/schools.json");

    // All four bounds are NaN; the source area 53.74f, 16.97f, 548.74f, 762.3f is kept for reference.
    var nanBounds = new PdfRectangle(double.NaN, double.NaN, double.NaN, double.NaN);
    PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/schools.pdf", nanBounds);

    var extractor = new SpreadsheetExtractionAlgorithm();
    Table table = extractor.Extract(page)[0];

    var jsonText = new StringBuilder();
    new JSONWriter().Write(jsonText, table);
    string actualJson = jsonText.ToString();

    Assert.Equal(expectedJson, actualJson);
}
// Extracts an area of the first page of 12s0324.pdf with the basic algorithm and checks the
// text of the very first and very last cells of the table.
public void TestCheckSqueezeDoesntBreak()
{
    var area = new PdfRectangle(17.25, 342, 410.25, 560.5);
    PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/12s0324.pdf", area);

    var extractor = new BasicExtractionAlgorithm();
    Table table = extractor.Extract(page)[0];
    var rows = table.Rows;

    // Both texts are read before anything is asserted.
    string topLeftText = rows[0][0].GetText();
    var bottomRow = rows[rows.Count - 1];
    string bottomRightText = bottomRow[bottomRow.Count - 1].GetText();

    Assert.Equal("Violent crime . . . . . . . . . . . . . . . . . .", topLeftText);
    Assert.Equal("(X)", bottomRightText);
}
// Extracts the two tables of page 1 of spanning_cells.pdf and compares their CSV
// serialization with Resources/csv/spanning_cells.csv.
public void TestSpanningCellsToCsv()
{
    PageArea page = UtilsForTesting.GetPage("Resources/spanning_cells.pdf", 1);
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/spanning_cells.csv");

    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = extractor.Extract(page);
    Assert.Equal(2, tables.Count);

    var csvText = new StringBuilder();
    new CSVWriter().Write(csvText, tables);

    // Normalise line endings, then drop surrounding whitespace before comparing.
    string actualCsv = csvText.ToString().Replace("\r\n", "\n").Trim();
    Assert.Equal(expectedCsv, actualCsv);
}
// Extracts page 1 of data.pdf with the basic algorithm and compares the CSV of the first
// table with Resources/csv/data_stream_noguess.csv.
public void StreamNoGuess1()
{
    PageArea page = UtilsForTesting.GetPage("Resources/data.pdf", 1);

    // data_stream_noguess.csv was modified for decimal precision.
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/data_stream_noguess.csv");

    var extractor = new BasicExtractionAlgorithm();
    List<Table> tables = extractor.Extract(page);

    var csvText = new StringBuilder();
    new CSVWriter().Write(csvText, tables[0]);

    string actualCsv = csvText.ToString().Replace("\r\n", "\n");
    Assert.Equal(expectedCsv, actualCsv);
}
// Extracts page 2 of us-017.pdf with the spreadsheet algorithm and compares the CSV of the
// first table with the inline reference, after mapping every line break to "\r" on both sides.
public void TestNaturalOrderOfRectanglesDoesNotBreakContract()
{
    PageArea page = UtilsForTesting.GetPage("Resources/us-017.pdf", 2);
    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = extractor.Extract(page);

    // Reference CSV: records are separated by "\r\n", line breaks inside cells are "\r".
    string expected = "Project,Agency,Institution\r\nNanotechnology and its publics,NSF,Pennsylvania State University\r\n\"Public information and deliberation in nanoscience and\rnanotechnology policy (SGER)\",Interagency,\"North Carolina State\rUniversity\"\r\n\"Social and ethical research and education in agrifood\rnanotechnology (NIRT)\",NSF,Michigan State University\r\n\"From laboratory to society: developing an informed\rapproach to nanoscale science and engineering (NIRT)\",NSF,University of South Carolina\r\nDatabase and innovation timeline for nanotechnology,NSF,UCLA\r\nSocial and ethical dimensions of nanotechnology,NSF,University of Virginia\r\n\"Undergraduate exploration of nanoscience,\rapplications and societal implications (NUE)\",NSF,\"Michigan Technological\rUniversity\"\r\n\"Ethics and belief inside the development of\rnanotechnology (CAREER)\",NSF,University of Virginia\r\n\"All centers, NNIN and NCN have a societal\rimplications components\",\"NSF, DOE,\rDOD, and NIH\",\"All nanotechnology centers\rand networks\"";

    var csvText = new StringBuilder();
    new CSVWriter().Write(csvText, tables[0]);
    string result = csvText.ToString().Trim();

    string normalisedExpected = expected.Replace("\r\n", "\r");
    string normalisedResult = result.Replace("\r\n", "\n").Replace("\n", "\r");
    Assert.Equal(normalisedExpected, normalisedResult);
}
// An area of page 1 of spreadsheet_no_bounding_frame.pdf must be reported as tabular, and
// the CSV of its first extracted table must match the reference file.
public void TestSpreadsheetWithNoBoundingFrameShouldBeSpreadsheet()
{
    // Source area kept for reference: 150.56f, 58.9f, 654.7f, 536.12f
    var area = new PdfRectangle(58.9, 842 - 654.7, 536.12, 842 - 150.56);
    PageArea page = UtilsForTesting.GetAreaFromPage("Resources/spreadsheet_no_bounding_frame.pdf", 1, area);

    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/spreadsheet_no_bounding_frame.csv");

    var extractor = new SpreadsheetExtractionAlgorithm();
    Assert.True(extractor.IsTabular(page));

    List<Table> tables = extractor.Extract(page);

    var csvText = new StringBuilder();
    new CSVWriter().Write(csvText, tables[0]);

    Assert.Equal(expectedCsv, csvText.ToString());
}
// Page 1 of 20.pdf must expose exactly six vertical rulings whose Left values match the
// reference values to two decimal places.
public void TestMergeLinesCloseToEachOther()
{
    PageArea page = UtilsForTesting.GetPage("Resources/20.pdf", 1);
    IReadOnlyList<Ruling> rulings = page.VerticalRulings;
    Assert.Equal(6, rulings.Count);

    double[] expectedRulings = new double[] { 105.554812, 107.522417, 160.57705, 377.172662, 434.963828, 488.268507 };

    // The count was asserted above, so both collections have the same length here.
    for (int i = 0; i < rulings.Count; i++)
    {
        Assert.Equal(expectedRulings[i], rulings[i].Left, 2);
    }
}
// Extracts page 1 of data.pdf with the basic algorithm and compares the CSV of the first
// table with Resources/csv/data_stream_noguess.csv.
// Reference invocation noted in the original: tabula.read_pdf(pdf_path, stream=True, guess=False)
public void StreamNoGuess1()
{
    PageArea page = UtilsForTesting.GetPage("Resources/data.pdf", 1);
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/data_stream_noguess.csv");

    var extractor = new BasicExtractionAlgorithm();
    List<Table> tables = extractor.Extract(page);

    var csvText = new StringBuilder();
    new CSVWriter().Write(csvText, tables[0]);

    string actualCsv = csvText.ToString().Replace("\r\n", "\n");
    Assert.Equal(expectedCsv, actualCsv);
}