/// <summary>
/// Runs <see cref="SpreadsheetExtractionAlgorithm"/> on the first page of frx_2012_disclosure.pdf,
/// writes the first extracted table with <see cref="CSVWriter"/> and checks the output against
/// the reference file Resources/csv/frx_2012_disclosure.csv.
/// </summary>
public void TestCSVMultilineRow()
{
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/frx_2012_disclosure.csv");

    // Area previously noted as 53.0f, 49.0f, 735.0f, 550.0f; an all-NaN rectangle is passed instead.
    // NOTE(review): presumably NaN bounds mean "do not crop" - confirm against GetAreaFromFirstPage.
    var area = new PdfRectangle(double.NaN, double.NaN, double.NaN, double.NaN);
    PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/frx_2012_disclosure.pdf", area);

    var extractor = new SpreadsheetExtractionAlgorithm();
    Table table = extractor.Extract(page)[0];

    // A commented-out variant of this test wrote through a StreamWriter over a MemoryStream and
    // trimmed the trailing new line before comparing; the StringBuilder output is compared as-is.
    var buffer = new StringBuilder();
    var writer = new CSVWriter();
    writer.Write(buffer, table);
    string actualCsv = buffer.ToString();

    Assert.Equal(expectedCsv, actualCsv);
}
/// <summary>
/// Loads a fixed area of the first page of argentina_diputados_voting_record.pdf and returns the
/// first table produced by <see cref="BasicExtractionAlgorithm"/>.
/// </summary>
/// <returns>The first element of the extraction result.</returns>
private Table GetTable()
{
    // Area previously noted as 269.875f, 12.75f, 790.5f, 561f.
    var area = new PdfRectangle(12.75, 55.0, 561, 567);
    PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/argentina_diputados_voting_record.pdf", area);

    var extractor = new BasicExtractionAlgorithm();
    return extractor.Extract(page)[0];
}
/// <summary>
/// Extracts the first table from an area of the first page of schools.pdf with
/// <see cref="SpreadsheetExtractionAlgorithm"/>, writes it with <see cref="CSVWriter"/> and compares
/// it to Resources/csv/schools.csv (expected text trimmed, "\r\n" in the output replaced by "\n").
/// </summary>
public void TestCSVSerializeInfinity()
{
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/schools.csv");

    // Source coordinates were given as top, left, bottom, right = 53.74f, 16.97f, 548.74f, 762.3f
    // on a page of height 612; the two vertical values are subtracted from the page height here.
    const double pageHeight = 612;
    var area = new PdfRectangle(
        16.97,
        pageHeight - 548.74,
        762.3,
        pageHeight - 53.74 - 1); // 1 is subtracted because an empty line is added at the top otherwise
    PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/schools.pdf", area);

    var extractor = new SpreadsheetExtractionAlgorithm();
    Table table = extractor.Extract(page)[0];

    // A commented-out variant of this test wrote through a StreamWriter over a MemoryStream and
    // trimmed the trailing new line of the output before comparing.
    var buffer = new StringBuilder();
    var writer = new CSVWriter();
    writer.Write(buffer, table);
    string actualCsv = buffer.ToString();

    Assert.Equal(expectedCsv.Trim(), actualCsv.Replace("\r\n", "\n"));
}
/// <summary>
/// Extracts the first table from an area of the first page of m27.pdf with
/// <see cref="BasicExtractionAlgorithm"/> and checks the text of the second and third cells of
/// its first row.
/// </summary>
public void TestRemoveSequentialSpaces()
{
    var area = new PdfRectangle(28.28, 532 - (103.04 - 79.2), 732.6, 532);
    PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/m27.pdf", area);

    var extractor = new BasicExtractionAlgorithm();
    Table table = extractor.Extract(page)[0];

    var topRow = table.Rows[0];
    Assert.Equal("ALLEGIANT AIR", topRow[1].GetText());
    Assert.Equal("ALLEGIANT AIR LLC", topRow[2].GetText());
}
/// <summary>
/// Extracts the first table from an area of the first page of frx_2012_disclosure.pdf with
/// <see cref="SpreadsheetExtractionAlgorithm"/> and checks the text of the cell at row index 8,
/// column index 1, which is expected to contain a carriage return between its two lines.
/// </summary>
public void TestExtractColumnsCorrectly3()
{
    // Source coordinates (top, left, bottom, right): 106.01f, 48.09f, 227.31f, 551.89f
    //   bottom = 792 - 227.31 = 564.69
    //   top    = 792 - 106.01 = 685.99
    // The top edge is 684.99 rather than 685.99 because 685.99 was adding an empty row at the top.
    var area = new PdfRectangle(48.09, 564.69, 551.89, 684.99);
    PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/frx_2012_disclosure.pdf", area);

    var extractor = new SpreadsheetExtractionAlgorithm();
    Table table = extractor.Extract(page)[0];

    Assert.Equal("REGIONAL PULMONARY & SLEEP\rMEDICINE", table.Rows[8][1].GetText());
}
/// <summary>
/// Extracts the first table from an area of the first page of 12s0324.pdf with
/// <see cref="BasicExtractionAlgorithm"/> and checks the text of the very first cell and of the
/// very last cell (last cell of the last row).
/// </summary>
public void TestCheckSqueezeDoesntBreak()
{
    var area = new PdfRectangle(17.25, 342, 410.25, 560.5);
    PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/12s0324.pdf", area);

    var extractor = new BasicExtractionAlgorithm();
    Table table = extractor.Extract(page)[0];
    var allRows = table.Rows;

    // Read both cell texts before asserting anything.
    var topLeftText = allRows[0][0].GetText();
    var finalRow = allRows[allRows.Count - 1];
    var bottomRightText = finalRow[finalRow.Count - 1].GetText();

    Assert.Equal("Violent crime . . . . . . . . . . . . . . . . . .", topLeftText);
    Assert.Equal("(X)", bottomRightText);
}
/// <summary>
/// Runs <see cref="SpreadsheetExtractionAlgorithm"/> on the first page of schools.pdf, writes the
/// first extracted table with <see cref="JSONWriter"/> and checks the output against
/// Resources/json/schools.json.
/// </summary>
public void TestJSONSerializeInfinity()
{
    string expectedJson = UtilsForTesting.LoadJson("Resources/json/schools.json");

    // Area previously noted as 53.74f, 16.97f, 548.74f, 762.3f; an all-NaN rectangle is passed instead.
    // NOTE(review): presumably NaN bounds mean "do not crop" - confirm against GetAreaFromFirstPage.
    var area = new PdfRectangle(double.NaN, double.NaN, double.NaN, double.NaN);
    PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/schools.pdf", area);

    var extractor = new SpreadsheetExtractionAlgorithm();
    Table table = extractor.Extract(page)[0];

    var buffer = new StringBuilder();
    var writer = new JSONWriter();
    writer.Write(buffer, table);
    string actualJson = buffer.ToString();

    Assert.Equal(expectedJson, actualJson);
}
/// <summary>
/// Builds one <see cref="Ruling"/> per entry of a fixed position list, passes them to
/// <see cref="BasicExtractionAlgorithm"/>, extracts the first table from an area of the first page
/// of campaign_donors.pdf and checks the first two cells of the sixth row.
/// </summary>
public void TestVerticalRulingsPreventMergingOfColumns()
{
    double[] rulingsVerticalPositions = { 147, 256, 310, 375, 431, 504 };

    // Iterate the array itself instead of a hard-coded count so that the number of rulings
    // always matches the number of positions listed above.
    List<Ruling> rulings = new List<Ruling>();
    foreach (double position in rulingsVerticalPositions)
    {
        // Both end points share the same first coordinate; the second runs from 40.43 to 755.
        rulings.Add(new Ruling(new PdfPoint(position, 40.43), new PdfPoint(position, 755)));
    }

    PageArea page = UtilsForTesting.GetAreaFromFirstPage(
        "Resources/campaign_donors.pdf",
        new PdfRectangle(40.43, 755 - (398.76 - 255.57), 557.35, 755));

    BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm(rulings);
    Table table = bea.Extract(page)[0];

    var sixthRow = table.Rows[5];
    Assert.Equal("VALSANGIACOMO BLANC", sixthRow[0].GetText());
    Assert.Equal("OFERNANDO JORGE", sixthRow[1].GetText());
}
/// <summary>
/// Extracts the first table from an area of the first page of FRX_2012_DISCLOSURE_PDF with
/// <see cref="BasicExtractionAlgorithm"/>, converts it with
/// <c>UtilsForTesting.TableToArrayOfRows</c> and compares it to FRX_2012_DISCLOSURE_EXPECTED:
/// first the row count, then each row's length, then every cell.
/// </summary>
public void TestExtractColumnsCorrectly3()
{
    var area = new PdfRectangle(48.09, 563, 551.89, 685.5);
    PageArea page = UtilsForTesting.GetAreaFromFirstPage(FRX_2012_DISCLOSURE_PDF, area);

    var extractor = new BasicExtractionAlgorithm();
    Table table = extractor.Extract(page)[0];
    var actualRows = UtilsForTesting.TableToArrayOfRows(table);

    Assert.Equal(FRX_2012_DISCLOSURE_EXPECTED.Length, actualRows.Length);

    for (int row = 0; row < FRX_2012_DISCLOSURE_EXPECTED.Length; row++)
    {
        var expectedCells = FRX_2012_DISCLOSURE_EXPECTED[row];
        var actualCells = actualRows[row];

        // Check the row width before indexing so a mismatch fails as an assertion.
        Assert.Equal(expectedCells.Length, actualCells.Length);

        for (int col = 0; col < expectedCells.Length; col++)
        {
            Assert.Equal(expectedCells[col], actualCells[col]);
        }
    }
}
/// <summary>
/// Extracts the first table from an area of the first page of ARGENTINA_DIPUTADOS_VOTING_RECORD_PDF
/// with <see cref="BasicExtractionAlgorithm"/>, converts it with
/// <c>UtilsForTesting.TableToArrayOfRows</c> and compares it to
/// ARGENTINA_DIPUTADOS_VOTING_RECORD_EXPECTED: row count, each row's length, then every cell.
/// </summary>
public void TestColumnRecognition()
{
    var area = new PdfRectangle(12.75, 55, 557, 567);
    PageArea page = UtilsForTesting.GetAreaFromFirstPage(ARGENTINA_DIPUTADOS_VOTING_RECORD_PDF, area);

    var extractor = new BasicExtractionAlgorithm();
    Table table = extractor.Extract(page)[0];
    var actualRows = UtilsForTesting.TableToArrayOfRows(table);

    Assert.Equal(ARGENTINA_DIPUTADOS_VOTING_RECORD_EXPECTED.Length, actualRows.Length);

    for (int row = 0; row < ARGENTINA_DIPUTADOS_VOTING_RECORD_EXPECTED.Length; row++)
    {
        var expectedCells = ARGENTINA_DIPUTADOS_VOTING_RECORD_EXPECTED[row];
        var actualCells = actualRows[row];

        // Check the row width before indexing so a mismatch fails as an assertion.
        Assert.Equal(expectedCells.Length, actualCells.Length);

        for (int col = 0; col < expectedCells.Length; col++)
        {
            Assert.Equal(expectedCells[col], actualCells[col]);
        }
    }
}
/// <summary>
/// Runs <see cref="SpreadsheetExtractionAlgorithm"/> on an area of the first page of
/// Publication_of_award_of_Bids_for_Transport_Sector__August_2016.pdf, asserts that exactly one
/// table is found, writes it with <see cref="CSVWriter"/> and compares it to the reference CSV
/// after converting the line endings on both sides to "\r".
/// </summary>
public void TestSpreadsheetExtractionIssue656()
{
    // Notes kept from the original test:
    //   page height = 482, width 762.3
    //   612
    //   top, left, bottom, right: 56.925f, 24.255f, 549.945f, 786.555f
    var area = new PdfRectangle(24.255, 71, 786.555, 553);
    PageArea page = UtilsForTesting.GetAreaFromFirstPage(
        "Resources/Publication_of_award_of_Bids_for_Transport_Sector__August_2016.pdf",
        area);
    string expectedCsv = UtilsForTesting.LoadCsv(
        "Resources/csv/Publication_of_award_of_Bids_for_Transport_Sector__August_2016.csv");

    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = extractor.Extract(page);
    Assert.Single(tables);
    Table table = tables[0];

    var buffer = new StringBuilder();
    var writer = new CSVWriter();
    writer.Write(buffer, table);
    string result = buffer.ToString();

    // Expected: "\n" -> "\r". Actual: "\r\n" -> "\n" -> "\r", then trimmed.
    // The original author left an open question here: is there an issue with \r and \n?
    // A commented-out variant of this test wrote through a StreamWriter over a MemoryStream and
    // applied the same normalisation.
    string expectedNormalised = expectedCsv.Replace("\n", "\r");
    string actualNormalised = result.Replace("\r\n", "\n").Replace("\n", "\r").Trim();

    Assert.Equal(expectedNormalised, actualNormalised);
}