public void TestExtractColumnsCorrectly2()
{
    // Only exercised on Windows. Per the original author's note, the extraction fails on
    // Linux and macOS because PdfPig does not find the correct font; installing
    // ttf-mscorefonts-installer through apt-get was attempted, but the mscorefonts EULA
    // could not be presented. On those platforms the method returns without asserting.
    bool runningOnWindows = RuntimeInformation.IsOSPlatform(OSPlatform.Windows);
    if (!runningOnWindows)
    {
        return;
    }

    PageArea sourcePage = UtilsForTesting.GetPage(EU_017_PDF, 3);
    var extractor = new BasicExtractionAlgorithm(sourcePage.VerticalRulings);

    // Extract from a rectangular sub-area of the page only, and keep the first table found.
    var region = new PdfRectangle(148.44, 543 - (711.875 - 299.625), 452.32, 543);
    PageArea croppedPage = sourcePage.GetArea(region);
    Table firstTable = extractor.Extract(croppedPage)[0];
    var actualRows = UtilsForTesting.TableToArrayOfRows(firstTable);

    // Compare row by row, cell by cell, against the expected fixture.
    Assert.Equal(EU_017_EXPECTED.Length, actualRows.Length);
    for (int row = 0; row < EU_017_EXPECTED.Length; row++)
    {
        var expectedRow = EU_017_EXPECTED[row];
        var actualRow = actualRows[row];
        Assert.Equal(expectedRow.Length, actualRow.Length);

        for (int col = 0; col < expectedRow.Length; col++)
        {
            Assert.Equal(expectedRow[col], actualRow[col]);
        }
    }
}
private List<Table> GetTables()
{
    // Loads page 1 of the two-table fixture and runs the spreadsheet extraction over it.
    PageArea firstPage = UtilsForTesting.GetPage("Resources/twotables.pdf", 1);
    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> extracted = extractor.Extract(firstPage);
    return extracted;
}
public void TestSpanningCells()
{
    // Coordinates are floored on both sides and must then differ by less than this amount.
    const double precision = 2;

    // Expected "top"/"bottom" values are subtracted from this height before comparison
    // (presumably the expected fixture uses a top-down y axis — TODO confirm).
    const double pageHeight = 842;

    // Asserts that two coordinates agree within the tolerance after flooring.
    void AssertClose(double expectedValue, double actualValue)
    {
        double delta = Math.Abs(Math.Floor(expectedValue) - Math.Floor(actualValue));
        Assert.True(delta < precision);
    }

    PageArea page = UtilsForTesting.GetPage("Resources/spanning_cells.pdf", 1);
    string expectedJson = UtilsForTesting.LoadJson("Resources/json/spanning_cells.json");

    var algorithm = new SpreadsheetExtractionAlgorithm();
    List<Table> tables = algorithm.Extract(page);
    Assert.Equal(2, tables.Count);

    // The serialized output is not compared verbatim with the expected JSON text;
    // both documents are parsed and compared field by field instead.
    var expectedTables = (JArray)JsonConvert.DeserializeObject(expectedJson);
    var json = new StringBuilder();
    new JSONWriter().Write(json, tables);
    var actualTables = (JArray)JsonConvert.DeserializeObject(json.ToString());

    for (int t = 0; t < 2; t++)
    {
        var expectedTable = expectedTables[t];
        var actualTable = actualTables[t];

        // Table-level metadata: extraction method, then the bounding box.
        Assert.Equal(expectedTable["extraction_method"], actualTable["extraction_method"]);
        AssertClose(pageHeight - expectedTable["top"].Value<double>(), actualTable["top"].Value<double>());
        foreach (string key in new[] { "left", "width", "height", "right" })
        {
            AssertClose(expectedTable[key].Value<double>(), actualTable[key].Value<double>());
        }
        AssertClose(pageHeight - expectedTable["bottom"].Value<double>(), actualTable["bottom"].Value<double>());

        var expectedRows = (JArray)expectedTable["data"];
        var actualRows = (JArray)actualTable["data"];
        Assert.Equal(expectedRows.Count, actualRows.Count);

        for (int r = 0; r < expectedRows.Count; r++)
        {
            var expectedRow = (JArray)expectedRows[r];
            var actualRow = (JArray)actualRows[r];
            Assert.Equal(expectedRow.Count, actualRow.Count);

            for (int c = 0; c < expectedRow.Count; c++)
            {
                var expectedCell = (JObject)expectedRow[c];
                var actualCell = (JObject)actualRow[c];

                // Cells whose expected text is empty are skipped entirely; the original
                // author suspected that empty cells carry no coordinate data.
                string expectedText = expectedCell["text"].Value<string>();
                if (string.IsNullOrEmpty(expectedText))
                {
                    continue;
                }

                AssertClose(pageHeight - expectedCell["top"].Value<double>(), actualCell["top"].Value<double>());
                foreach (string key in new[] { "left", "width", "height" })
                {
                    AssertClose(expectedCell[key].Value<double>(), actualCell[key].Value<double>());
                }
                Assert.Equal(expectedText, actualCell["text"].Value<string>());
            }
        }
    }
}
public void TestIncompleteGrid()
{
    // Page 1 of the china.pdf fixture is expected to yield exactly two tables.
    PageArea firstPage = UtilsForTesting.GetPage("Resources/china.pdf", 1);
    var extractor = new SpreadsheetExtractionAlgorithm();

    List<Table> found = extractor.Extract(firstPage);

    Assert.Equal(2, found.Count);
}
public void TestSpanningCellsToCsv()
{
    PageArea firstPage = UtilsForTesting.GetPage("Resources/spanning_cells.pdf", 1);
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/spanning_cells.csv");

    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> extracted = extractor.Extract(firstPage);
    Assert.Equal(2, extracted.Count);

    // Serialize every extracted table into a single CSV document.
    var buffer = new StringBuilder();
    var writer = new CSVWriter();
    writer.Write(buffer, extracted);

    // Convert CRLF to LF and trim surrounding whitespace before comparing with the fixture.
    string actualCsv = buffer.ToString().Replace("\r\n", "\n").Trim();
    Assert.Equal(expectedCsv, actualCsv);
}
public void TestNaturalOrderOfRectanglesDoesNotBreakContract()
{
    PageArea secondPage = UtilsForTesting.GetPage("Resources/us-017.pdf", 2);
    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> extracted = extractor.Extract(secondPage);

    // Expected CSV for the first table: rows are separated by "\r\n", while line breaks
    // inside a quoted cell are a bare "\r". There is no trailing row separator.
    string expected = "Project,Agency,Institution\r\nNanotechnology and its publics,NSF,Pennsylvania State University\r\n\"Public information and deliberation in nanoscience and\rnanotechnology policy (SGER)\",Interagency,\"North Carolina State\rUniversity\"\r\n\"Social and ethical research and education in agrifood\rnanotechnology (NIRT)\",NSF,Michigan State University\r\n\"From laboratory to society: developing an informed\rapproach to nanoscale science and engineering (NIRT)\",NSF,University of South Carolina\r\nDatabase and innovation timeline for nanotechnology,NSF,UCLA\r\nSocial and ethical dimensions of nanotechnology,NSF,University of Virginia\r\n\"Undergraduate exploration of nanoscience,\rapplications and societal implications (NUE)\",NSF,\"Michigan Technological\rUniversity\"\r\n\"Ethics and belief inside the development of\rnanotechnology (CAREER)\",NSF,University of Virginia\r\n\"All centers, NNIN and NCN have a societal\rimplications components\",\"NSF, DOE,\rDOD, and NIH\",\"All nanotechnology centers\rand networks\"";

    var buffer = new StringBuilder();
    var writer = new CSVWriter();
    writer.Write(buffer, extracted[0]);
    string actual = buffer.ToString().Trim();

    // Collapse every kind of line break to a bare "\r" on both sides so the comparison
    // does not depend on which newline sequence the writer emitted.
    string normalisedExpected = expected.Replace("\r\n", "\r");
    string normalisedActual = actual.Replace("\r\n", "\n").Replace("\n", "\r");
    Assert.Equal(normalisedExpected, normalisedActual);
}
public void StreamNoGuess1()
{
    PageArea firstPage = UtilsForTesting.GetPage("Resources/data.pdf", 1);

    // Per the original note, data_stream_noguess.csv was modified for decimal precision.
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/data_stream_noguess.csv");

    var extractor = new BasicExtractionAlgorithm();
    List<Table> extracted = extractor.Extract(firstPage);

    // Only the first extracted table is serialized and compared.
    var buffer = new StringBuilder();
    var writer = new CSVWriter();
    writer.Write(buffer, extracted[0]);

    // Convert CRLF to LF before comparing; the output is not trimmed.
    string actualCsv = buffer.ToString().Replace("\r\n", "\n");
    Assert.Equal(expectedCsv, actualCsv);
}
public void TestMergeLinesCloseToEachOther()
{
    // Checks the vertical rulings reported for page 1 of 20.pdf: their number and the
    // Left coordinate of each one, in order.
    PageArea page = UtilsForTesting.GetPage("Resources/20.pdf", 1);
    IReadOnlyList<Ruling> rulings = page.VerticalRulings;

    double[] expectedRulings = new double[] { 105.554812, 107.522417, 160.57705, 377.172662, 434.963828, 488.268507 };

    // The count is checked first so the loop below never indexes past either collection.
    Assert.Equal(expectedRulings.Length, rulings.Count);

    for (int i = 0; i < expectedRulings.Length; i++)
    {
        // Left coordinates are compared to 2 decimal places.
        Assert.Equal(expectedRulings[i], rulings[i].Left, 2);
    }
}
public void StreamNoGuess1()
{
    // Counterpart of the tabula-py call noted by the original author:
    // tabula.read_pdf(pdf_path, stream=True, guess=False)
    PageArea firstPage = UtilsForTesting.GetPage("Resources/data.pdf", 1);
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/data_stream_noguess.csv");

    var extractor = new BasicExtractionAlgorithm();
    List<Table> extracted = extractor.Extract(firstPage);

    // Only the first extracted table is serialized and compared.
    var buffer = new StringBuilder();
    var writer = new CSVWriter();
    writer.Write(buffer, extracted[0]);

    // Convert CRLF to LF before comparing; the output is not trimmed.
    string actualCsv = buffer.ToString().Replace("\r\n", "\n");
    Assert.Equal(expectedCsv, actualCsv);
}
public void Latice1()
{
    PageArea firstPage = UtilsForTesting.GetPage("Resources/data.pdf", 1);

    // Per the original note, data_lattice.csv was modified to add the last row,
    // which is missing in tabula_py.
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/data_lattice.csv");

    var extractor = new SpreadsheetExtractionAlgorithm();
    List<Table> extracted = extractor.Extract(firstPage);
    Assert.Single(extracted);

    var buffer = new StringBuilder();
    var writer = new CSVWriter();
    writer.Write(buffer, extracted[0]);

    // Convert CRLF to LF before comparing; the output is not trimmed.
    string actualCsv = buffer.ToString().Replace("\r\n", "\n");
    Assert.Equal(expectedCsv, actualCsv);
}
public void TestNaturalOrderOfRectangles()
{
    // Extract from a rectangular sub-area of page 2, using that area's vertical rulings.
    var wholePage = UtilsForTesting.GetPage("Resources/us-017.pdf", 2);
    PageArea page = wholePage.GetArea(new PdfRectangle(90, 97, 532, 352));
    var extractor = new BasicExtractionAlgorithm(page.VerticalRulings);
    Table table = extractor.Extract(page)[0];
    IReadOnlyList<Cell> cells = table.Cells;

    // Dump every cell's text to the debug output to help diagnose failures.
    foreach (var cell in cells)
    {
        Debug.Print(cell.GetText());
    }

    // Expected cell texts, in the order the cells are returned.
    // Per the original note, this differs from tabula-java since PdfPig 0.1.5-alpha001.
    string[] expectedTexts =
    {
        // Column headers
        "Project",
        "Agency",
        "Institution",
        // First row
        "Nanotechnology and its publics",
        "NSF",
        "Pennsylvania State University",
        // Second row
        "Public information and deliberation in nanoscience and\rnanotechnology policy (SGER)",
        "Interagency",
        "North Carolina State\rUniversity",
        // Third row
        "Social and ethical research and education in agrifood",
        "nanotechnology (NIRT)",
        "NSF",
        "Michigan State University",
        // Fourth row
        "From laboratory to society: developing an informed",
        "approach to nanoscale science and engineering (NIRT)",
        "NSF",
        "University of South Carolina",
        // Fifth row
        "Database and innovation timeline for nanotechnology",
        "NSF",
        "UCLA",
        // Sixth row
        "Social and ethical dimensions of nanotechnology",
        "NSF",
        "University of Virginia",
        // Seventh row
        "Undergraduate exploration of nanoscience,",
        "applications and societal implications (NUE)",
        "NSF",
        "Michigan Technological\rUniversity",
        // Eighth row
        "Ethics and belief inside the development of",
        "nanotechnology (CAREER)",
        "NSF",
        "University of Virginia",
        // Ninth row
        "All centers, NNIN and NCN have a societal",
        "NSF, DOE,",
        "All nanotechnology centers",
        "implications components",
        "DOD, and NIH",
        "and networks",
    };

    // Check the leading cells one by one, in order; cells beyond the expected list are not inspected.
    for (int index = 0; index < expectedTexts.Length; index++)
    {
        Assert.Equal(expectedTexts[index], cells[index].GetText());
    }
}