// Extracts the first table that SpreadsheetExtractionAlgorithm finds inside a fixed area of
// page 1 of puertos1.pdf, checks that it has 15 rows, and checks that its CSV rendering has
// the same number of records, and the same number of fields per record, as the expected CSV.
public void TestExtractSpreadsheetWithinAnArea()
{
    PageArea page = UtilsForTesting.GetAreaFromPage(
        "Resources/puertos1.pdf",
        1,
        new PdfRectangle(30.32142857142857, 793 - 554.8821428571429, 546.7964285714286, 793 - 273.9035714285714)); // 273.9035714285714f, 30.32142857142857f, 554.8821428571429f, 546.7964285714286f);

    SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = se.Extract(page);
    Table table = tables[0];
    Assert.Equal(15, table.Rows.Count);

    const string expected =
        "\"\",TM,M.U$S,TM,M.U$S,TM,M.U$S,TM,M.U$S,TM,M.U$S,TM,M.U$S,TM\n" +
        "Peces vivos,1,25,1,23,2,38,1,37,2,67,2,89,1\n" +
        "\"Pescado fresco\n" +
        "o refrigerado.\n" +
        "exc. filetes\",7.704,7.175,8.931,6.892,12.635,10.255,16.742,13.688,14.357,11.674,13.035,13.429,9.727\n" +
        "\"Pescado congelado\n" +
        "exc. filetes\",90.560,105.950,112.645,108.416,132.895,115.874,152.767,133.765,148.882,134.847,156.619,165.134,137.179\n" +
        "\"Filetes y demás car-\n" +
        "nes de pescado\",105.434,200.563,151.142,218.389,152.174,227.780,178.123,291.863,169.422,313.735,176.427,381.640,144.814\n" +
        "\"Pescado sec./sal./\n" +
        "en salm. har./pol./\n" +
        "pell. aptos\n" +
        "p/c humano\",6.837,14.493,6.660,9.167,14.630,17.579,18.150,21.302,18.197,25.739,13.460,23.549,11.709\n" +
        "Crustáceos,61.691,375.798,52.488,251.043,47.635,387.783,27.815,217.443,7.123,86.019,39.488,373.583,45.191\n" +
        "Moluscos,162.027,174.507,109.436,111.443,90.834,104.741,57.695,109.141,98.182,206.304,187.023,251.352,157.531\n" +
        "\"Prod. no exp. en\n" +
        "otros capítulos.\n" +
        "No apto p/c humano\",203,328,7,35,521,343,\"1,710\",\"1,568\",125,246,124,263,131\n" +
        "\"Grasas y aceites de\n" +
        "pescado y mamíferos\n" +
        "marinos\",913,297,\"1,250\",476,\"1,031\",521,\"1,019\",642,690,483,489,710,959\n" +
        "\"Extractos y jugos de\n" +
        "pescado y mariscos\",5,25,1,3,4,4,31,93,39,117,77,230,80\n" +
        "\"Preparaciones y con-\n" +
        "servas de pescado\",846,\"3,737\",\"1,688\",\"4,411\",\"1,556\",\"3,681\",\"2,292\",\"5,474\",\"2,167\",\"7,494\",\"2,591\",\"8,833\",\"2,795\"\n" +
        "\"Preparaciones y con-\n" +
        "servas de mariscos\",348,\"3,667\",345,\"1,771\",738,\"3,627\",561,\"2,620\",607,\"3,928\",314,\"2,819\",250\n" +
        "\"Harina, polvo y pe-\n" +
        "llets de pescado.No\n" +
        "aptos p/c humano\",\"16,947\",\"8,547\",\"11,867\",\"6,315\",\"32,528\",\"13,985\",\"37,313\",\"18,989\",\"35,787\",\"19,914\",\"37,821\",\"27,174\",\"30,000\"\n" +
        "TOTAL,\"453,515\",\"895,111\",\"456,431\",\"718,382\",\"487,183\",\"886,211\",\"494,220\",\"816,623\",\"495,580\",\"810,565\",\"627,469\",\"1,248,804\",\"540,367\"\n";

    // TODO add better assertions (cell contents are still not compared, only the CSV layout)
    StringBuilder sb = new StringBuilder();
    (new CSVWriter()).Write(sb, table);
    string result = sb.ToString();

    List<int> expectedLayout = CountFieldsPerRecord(expected);
    List<int> actualLayout = CountFieldsPerRecord(result);

    // Same number of CSV records, and the same number of fields in each record.
    Assert.Equal(expectedLayout.Count, actualLayout.Count);
    for (int i = 0; i < expectedLayout.Count; i++)
    {
        Assert.Equal(expectedLayout[i], actualLayout[i]);
    }

    // Parses csvText and returns the number of fields found in each record.
    // The text is read directly from the string (no byte round-trip), so the
    // accented characters in the data are not degraded by an ASCII encoding.
    List<int> CountFieldsPerRecord(string csvText)
    {
        var counts = new List<int>();
        using (var csv = new CsvReader(new StringReader(csvText), CultureInfo.InvariantCulture))
        {
            while (csv.Read())
            {
                // Probe successive indexes; TryGetField reports false once the
                // index is past the last field of the current record.
                int fieldCount = 0;
                while (csv.TryGetField<string>(fieldCount, out _))
                {
                    fieldCount++;
                }

                counts.Add(fieldCount);
            }
        }

        return counts;
    }
}
// Runs BasicExtractionAlgorithm over a fixed area of page 1 of EU_002_PDF and compares the
// first extracted table, cell by cell, with EU_002_EXPECTED. The checks only run on Windows;
// on any other platform the method returns without asserting anything.
public void TestExtractColumnsCorrectly()
{
    bool isWindows = RuntimeInformation.IsOSPlatform(OSPlatform.Windows); // || RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
    if (!isWindows)
    {
        // fails on linux and mac os. Linked to PdfPig not finding the correct font.
        // need to use apt-get -y install ttf-mscorefonts-installer
        // still have mscorefonts - eula license could not be presented
        return;
    }

    var area = new PdfRectangle(70.0, 725 - (233 - 115), 510.0, 725);
    PageArea page = UtilsForTesting.GetAreaFromPage(EU_002_PDF, 1, area);

    var extractor = new BasicExtractionAlgorithm();
    Table firstTable = extractor.Extract(page)[0];
    var actualRows = UtilsForTesting.TableToArrayOfRows(firstTable);

    // Same number of rows, then same number of cells per row, then same cell values.
    Assert.Equal(EU_002_EXPECTED.Length, actualRows.Length);
    for (int row = 0; row < EU_002_EXPECTED.Length; row++)
    {
        var expectedRow = EU_002_EXPECTED[row];
        var actualRow = actualRows[row];
        Assert.Equal(expectedRow.Length, actualRow.Length);

        for (int col = 0; col < expectedRow.Length; col++)
        {
            Assert.Equal(expectedRow[col], actualRow[col]);
        }
    }
}
// Checks that SpreadsheetExtractionAlgorithm returns exactly one table for the
// selected area of page 1 of offense.pdf.
public void TestShouldDetectASingleSpreadsheet()
{
    // NOTE(review): the trailing values list 68.08 first, but the rectangle uses
    // 792 - 16.44 for its last coordinate — confirm this is intended.
    var area = new PdfRectangle(16.44, 792 - 680.85, 597.84, 792 - 16.44); // 68.08f, 16.44f, 680.85f, 597.84f);
    PageArea page = UtilsForTesting.GetAreaFromPage("Resources/offense.pdf", 1, area);

    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> detected = extractor.Extract(page);

    Assert.Single(detected);
}
// Runs BasicExtractionAlgorithm over a fixed region of page 1 of indictb1h_14.pdf and
// checks that the first returned table converts to EXPECTED_EMPTY_TABLE.
public void TestEmptyRegion()
{
    var region = new PdfRectangle(0, 700, 100.9, 800);
    PageArea page = UtilsForTesting.GetAreaFromPage("Resources/indictb1h_14.pdf", 1, region);

    var extractor = new BasicExtractionAlgorithm();
    var extracted = extractor.Extract(page);
    Table firstTable = extracted[0];

    var actualRows = UtilsForTesting.TableToArrayOfRows(firstTable);
    Assert.Equal(EXPECTED_EMPTY_TABLE, actualRows);
}
// Checks that the selected area of spreadsheet_no_bounding_frame.pdf is reported as tabular
// by SpreadsheetExtractionAlgorithm, and that the first extracted table, written as CSV,
// equals the contents loaded from spreadsheet_no_bounding_frame.csv.
public void TestSpreadsheetWithNoBoundingFrameShouldBeSpreadsheet()
{
    var area = new PdfRectangle(58.9, 842 - 654.7, 536.12, 842 - 150.56); // 842 - 150.56)); // 150.56f, 58.9f, 654.7f, 536.12f);
    PageArea page = UtilsForTesting.GetAreaFromPage("Resources/spreadsheet_no_bounding_frame.pdf", 1, area);
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/spreadsheet_no_bounding_frame.csv");

    var extractor = new SpreadsheetExtractionAlgorithm();
    Assert.True(extractor.IsTabular(page));

    List<Table> tables = extractor.Extract(page);

    var csvText = new StringBuilder();
    var csvWriter = new CSVWriter();
    csvWriter.Write(csvText, tables[0]);

    string actualCsv = csvText.ToString();
    Assert.Equal(expectedCsv, actualCsv);
}
// Extracts the first table from a fixed area of page 2 of us-020.pdf with
// BasicExtractionAlgorithm, writes it as CSV into an in-memory stream, and compares the
// text (CRLF replaced by LF, then trimmed) with the contents loaded from us-020.csv.
public void TestTableWithMultilineHeader()
{
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/us-020.csv");

    var area = new PdfRectangle(35.0, 151, 560, 688.5);
    PageArea page = UtilsForTesting.GetAreaFromPage("Resources/us-020.pdf", 2, area);

    var extractor = new BasicExtractionAlgorithm();
    Table firstTable = extractor.Extract(page)[0];

    using (var buffer = new MemoryStream())
    {
        // AutoFlush pushes everything the CSV writer emits into the buffer immediately,
        // so the buffer can be read back while the writer is still open.
        using (var output = new StreamWriter(buffer) { AutoFlush = true })
        {
            var csvWriter = new CSVWriter();
            csvWriter.Write(output, firstTable);

            // Rewind so the reader starts from the first written byte.
            buffer.Position = 0;
            var input = new StreamReader(buffer);
            string written = input.ReadToEnd();

            string actualCsv = written.Replace("\r\n", "\n").Trim(); // trim to remove last new line
            Assert.Equal(expectedCsv, actualCsv);
        }
    }
}
// Extracts the first table from a fixed area of page 1 of indictb1h_14.pdf with
// BasicExtractionAlgorithm, writes it as CSV into an in-memory stream, and compares the
// text (CRLF replaced by LF, then trimmed) with the contents loaded from indictb1h_14.csv.
public void TestRealLifeRTL2()
{
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/indictb1h_14.csv");

    var area = new PdfRectangle(120.0, 842 - 622.82, 459.9, 842 - 120.0);
    PageArea page = UtilsForTesting.GetAreaFromPage("Resources/indictb1h_14.pdf", 1, area);

    var extractor = new BasicExtractionAlgorithm();
    Table firstTable = extractor.Extract(page)[0];

    using (var buffer = new MemoryStream())
    {
        // AutoFlush pushes everything the CSV writer emits into the buffer immediately,
        // so the buffer can be read back while the writer is still open.
        using (var output = new StreamWriter(buffer) { AutoFlush = true })
        {
            var csvWriter = new CSVWriter();
            csvWriter.Write(output, firstTable);

            // Rewind so the reader starts from the first written byte.
            buffer.Position = 0;
            var input = new StreamReader(buffer);
            string written = input.ReadToEnd();

            string actualCsv = written.Replace("\r\n", "\n").Trim(); // trim to remove last new line
            Assert.Equal(expectedCsv, actualCsv);
        }
    }
}