/// <summary>
/// Extracts the table from an area of page 1 of <c>EU_002_PDF</c> with the basic
/// extraction algorithm and compares it cell by cell against <c>EU_002_EXPECTED</c>.
/// </summary>
public void TestExtractColumnsCorrectly()
{
    // Only exercised on Windows. The test fails on Linux and macOS, which is linked to
    // PdfPig not finding the correct font. The fonts would need
    // `apt-get -y install ttf-mscorefonts-installer`, but the mscorefonts EULA licence
    // could not be presented. On those platforms the test does nothing.
    // Previously considered: || RuntimeInformation.IsOSPlatform(OSPlatform.Linux)
    if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
    {
        return;
    }

    PageArea area = UtilsForTesting.GetAreaFromPage(
        EU_002_PDF,
        1,
        new PdfRectangle(70.0, 725 - (233 - 115), 510.0, 725));
    var extractor = new BasicExtractionAlgorithm();
    Table firstTable = extractor.Extract(area)[0];

    var actualRows = UtilsForTesting.TableToArrayOfRows(firstTable);

    // Same number of rows, then same number of cells per row, then same cell content.
    Assert.Equal(EU_002_EXPECTED.Length, actualRows.Length);
    for (int row = 0; row < EU_002_EXPECTED.Length; row++)
    {
        var expectedRow = EU_002_EXPECTED[row];
        var actualRow = actualRows[row];
        Assert.Equal(expectedRow.Length, actualRow.Length);

        for (int col = 0; col < expectedRow.Length; col++)
        {
            Assert.Equal(expectedRow[col], actualRow[col]);
        }
    }
}
/// <summary>
/// Extracts a table from an area of page 3 of <c>EU_017_PDF</c>, seeding the basic
/// extraction algorithm with the page's vertical rulings, and compares the result
/// cell by cell against <c>EU_017_EXPECTED</c>.
/// </summary>
public void TestExtractColumnsCorrectly2()
{
    // Only exercised on Windows. The test fails on Linux and macOS, which is linked to
    // PdfPig not finding the correct font. The fonts would need
    // `apt-get -y install ttf-mscorefonts-installer`, but the mscorefonts EULA licence
    // could not be presented. On those platforms the test does nothing.
    // Previously considered: || RuntimeInformation.IsOSPlatform(OSPlatform.Linux)
    if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
    {
        return;
    }

    PageArea fullPage = UtilsForTesting.GetPage(EU_017_PDF, 3);

    // The rulings come from the whole page; extraction then runs on a sub-area of it.
    var extractor = new BasicExtractionAlgorithm(fullPage.VerticalRulings);
    PageArea tableArea = fullPage.GetArea(
        new PdfRectangle(148.44, 543 - (711.875 - 299.625), 452.32, 543));
    Table firstTable = extractor.Extract(tableArea)[0];

    var actualRows = UtilsForTesting.TableToArrayOfRows(firstTable);

    Assert.Equal(EU_017_EXPECTED.Length, actualRows.Length);
    for (int row = 0; row < EU_017_EXPECTED.Length; row++)
    {
        var expectedRow = EU_017_EXPECTED[row];
        var actualRow = actualRows[row];
        Assert.Equal(expectedRow.Length, actualRow.Length);

        for (int col = 0; col < expectedRow.Length; col++)
        {
            Assert.Equal(expectedRow[col], actualRow[col]);
        }
    }
}
/// <summary>
/// Runs the basic extraction algorithm over a fixed area of the first page of
/// <c>Resources/argentina_diputados_voting_record.pdf</c>.
/// </summary>
/// <returns>The first table produced by the extraction.</returns>
private Table GetTable()
{
    // Earlier coordinates kept from the source comment: 269.875f, 12.75f, 790.5f, 561f
    // NOTE(review): presumably the coordinates of an older/other implementation - confirm.
    var region = new PdfRectangle(12.75, 55.0, 561, 567);
    PageArea area = UtilsForTesting.GetAreaFromFirstPage(
        "Resources/argentina_diputados_voting_record.pdf",
        region);

    var extractor = new BasicExtractionAlgorithm();
    var extracted = extractor.Extract(area);
    return extracted[0];
}
/// <summary>
/// Extracts from an area of page 1 of <c>Resources/indictb1h_14.pdf</c> and checks
/// that the resulting rows equal <c>EXPECTED_EMPTY_TABLE</c>.
/// </summary>
public void TestEmptyRegion()
{
    var region = new PdfRectangle(0, 700, 100.9, 800);
    PageArea area = UtilsForTesting.GetAreaFromPage("Resources/indictb1h_14.pdf", 1, region);

    var extractor = new BasicExtractionAlgorithm();
    Table firstTable = extractor.Extract(area)[0];
    var actualRows = UtilsForTesting.TableToArrayOfRows(firstTable);

    Assert.Equal(EXPECTED_EMPTY_TABLE, actualRows);
}
/// <summary>
/// Extracts a table from the first page of <c>Resources/m27.pdf</c> and checks the
/// text of the second and third cells of its first row.
/// </summary>
public void TestRemoveSequentialSpaces()
{
    var region = new PdfRectangle(28.28, 532 - (103.04 - 79.2), 732.6, 532);
    PageArea area = UtilsForTesting.GetAreaFromFirstPage("Resources/m27.pdf", region);

    var extractor = new BasicExtractionAlgorithm();
    Table firstTable = extractor.Extract(area)[0];

    var headRow = firstTable.Rows[0];
    Assert.Equal("ALLEGIANT AIR", headRow[1].GetText());
    Assert.Equal("ALLEGIANT AIR LLC", headRow[2].GetText());
}
/// <summary>
/// Extracts a table from the first page of <c>Resources/12s0324.pdf</c> and checks
/// the text of its very first cell and its very last cell.
/// </summary>
public void TestCheckSqueezeDoesntBreak()
{
    var region = new PdfRectangle(17.25, 342, 410.25, 560.5);
    PageArea area = UtilsForTesting.GetAreaFromFirstPage("Resources/12s0324.pdf", region);

    var extractor = new BasicExtractionAlgorithm();
    Table firstTable = extractor.Extract(area)[0];

    var allRows = firstTable.Rows;

    // First cell of the first row.
    var topRow = allRows[0];
    var topLeftText = topRow[0].GetText();

    // Last cell of the last row.
    var bottomRow = allRows[allRows.Count - 1];
    var bottomRightText = bottomRow[bottomRow.Count - 1].GetText();

    Assert.Equal("Violent crime . . . . . . . . . . . . . . . . . .", topLeftText);
    Assert.Equal("(X)", bottomRightText);
}
/// <summary>
/// Extracts the first table from page 1 of <c>Resources/data.pdf</c>, writes it as
/// CSV and compares it with <c>Resources/csv/data_stream_noguess.csv</c>.
/// </summary>
public void StreamNoGuess1()
{
    PageArea firstPage = UtilsForTesting.GetPage("Resources/data.pdf", 1);

    // data_stream_noguess.csv was modified for decimal precision
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/data_stream_noguess.csv");

    var extractor = new BasicExtractionAlgorithm();
    List<Table> extracted = extractor.Extract(firstPage);

    var csvBuffer = new StringBuilder();
    var csvWriter = new CSVWriter();
    csvWriter.Write(csvBuffer, extracted[0]);

    // Normalise Windows line endings before comparing.
    string actualCsv = csvBuffer.ToString().Replace("\r\n", "\n");
    Assert.Equal(expectedCsv, actualCsv);
}
/// <summary>
/// Extracts the first table from page 1 of <c>Resources/data.pdf</c>, writes it as
/// CSV and compares it with <c>Resources/csv/data_stream_noguess.csv</c>.
/// </summary>
// NOTE(review): another method with this exact name and signature is visible nearby;
// if both live in the same class this will not compile - confirm which one is intended.
public void StreamNoGuess1()
{
    // tabula.read_pdf(pdf_path, stream=True, guess=False)
    PageArea firstPage = UtilsForTesting.GetPage("Resources/data.pdf", 1);

    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/data_stream_noguess.csv");

    var extractor = new BasicExtractionAlgorithm();
    List<Table> extracted = extractor.Extract(firstPage);

    var csvBuffer = new StringBuilder();
    var csvWriter = new CSVWriter();
    csvWriter.Write(csvBuffer, extracted[0]);

    // Normalise Windows line endings before comparing.
    string actualCsv = csvBuffer.ToString().Replace("\r\n", "\n");
    Assert.Equal(expectedCsv, actualCsv);
}
/// <summary>
/// Supplies six hand-built vertical rulings to the basic extraction algorithm and
/// checks the first two cells of the sixth row extracted from the first page of
/// <c>Resources/campaign_donors.pdf</c>.
/// </summary>
public void TestVerticalRulingsPreventMergingOfColumns()
{
    // Each ruling keeps the same x for both end points, running from y = 40.43 to y = 755.
    double[] rulingXs = { 147, 256, 310, 375, 431, 504 };
    var columnRulings = new List<Ruling>();
    foreach (double x in rulingXs)
    {
        columnRulings.Add(new Ruling(new PdfPoint(x, 40.43), new PdfPoint(x, 755)));
    }

    var region = new PdfRectangle(40.43, 755 - (398.76 - 255.57), 557.35, 755);
    PageArea area = UtilsForTesting.GetAreaFromFirstPage("Resources/campaign_donors.pdf", region);

    var extractor = new BasicExtractionAlgorithm(columnRulings);
    Table firstTable = extractor.Extract(area)[0];

    var rowSix = firstTable.Rows[5];
    Assert.Equal("VALSANGIACOMO BLANC", rowSix[0].GetText());
    Assert.Equal("OFERNANDO JORGE", rowSix[1].GetText());
}
/// <summary>
/// Extracts a table from the first page of <c>FRX_2012_DISCLOSURE_PDF</c> and
/// compares it cell by cell against <c>FRX_2012_DISCLOSURE_EXPECTED</c>.
/// </summary>
public void TestExtractColumnsCorrectly3()
{
    var region = new PdfRectangle(48.09, 563, 551.89, 685.5);
    PageArea area = UtilsForTesting.GetAreaFromFirstPage(FRX_2012_DISCLOSURE_PDF, region);

    var extractor = new BasicExtractionAlgorithm();
    Table firstTable = extractor.Extract(area)[0];
    var actualRows = UtilsForTesting.TableToArrayOfRows(firstTable);

    // Same number of rows, then same number of cells per row, then same cell content.
    Assert.Equal(FRX_2012_DISCLOSURE_EXPECTED.Length, actualRows.Length);
    for (int row = 0; row < FRX_2012_DISCLOSURE_EXPECTED.Length; row++)
    {
        var expectedRow = FRX_2012_DISCLOSURE_EXPECTED[row];
        var actualRow = actualRows[row];
        Assert.Equal(expectedRow.Length, actualRow.Length);

        for (int col = 0; col < expectedRow.Length; col++)
        {
            Assert.Equal(expectedRow[col], actualRow[col]);
        }
    }
}
/// <summary>
/// Opens <c>test3.pdf</c> with path clipping enabled, detects table regions on page 1
/// and runs the basic extraction algorithm on each detected region.
/// The method makes no assertions; it only checks that nothing throws.
/// </summary>
public void TestLinesToCells()
{
    var options = new ParsingOptions() { ClipPaths = true };
    using (PdfDocument document = PdfDocument.Open("test3.pdf", options))
    {
        var objectExtractor = new ObjectExtractor(document);
        PageArea firstPage = objectExtractor.Extract(1);

        var detector = new SimpleNurminenDetectionAlgorithm();
        var detectedRegions = detector.Detect(firstPage);

        foreach (var region in detectedRegions)
        {
            // A fresh algorithm instance is used for every region.
            IExtractionAlgorithm extractor = new BasicExtractionAlgorithm();
            var regionArea = firstPage.GetArea(region.BoundingBox);

            // The extracted tables are intentionally not inspected.
            extractor.Extract(regionArea);
        }
    }
}
/// <summary>
/// Extracts a table from page 2 of <c>Resources/us-020.pdf</c>, writes it as CSV
/// through a stream and compares it with <c>Resources/csv/us-020.csv</c>.
/// </summary>
public void TestTableWithMultilineHeader()
{
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/us-020.csv");

    var region = new PdfRectangle(35.0, 151, 560, 688.5);
    PageArea area = UtilsForTesting.GetAreaFromPage("Resources/us-020.pdf", 2, region);

    var extractor = new BasicExtractionAlgorithm();
    Table firstTable = extractor.Extract(area)[0];

    using (var buffer = new MemoryStream())
    using (var writer = new StreamWriter(buffer) { AutoFlush = true })
    {
        var csvWriter = new CSVWriter();
        csvWriter.Write(writer, firstTable);

        // Rewind so the written CSV can be read back from the start.
        buffer.Position = 0;
        var reader = new StreamReader(buffer);
        string written = reader.ReadToEnd();

        // Normalise line endings; trim to remove last new line.
        string actualCsv = written.Replace("\r\n", "\n").Trim();
        Assert.Equal(expectedCsv, actualCsv);
    }
}
/// <summary>
/// Extracts a table from page 1 of <c>Resources/indictb1h_14.pdf</c>, writes it as CSV
/// through a stream and compares it with <c>Resources/csv/indictb1h_14.csv</c>.
/// </summary>
public void TestRealLifeRTL2()
{
    string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/indictb1h_14.csv");

    var region = new PdfRectangle(120.0, 842 - 622.82, 459.9, 842 - 120.0);
    PageArea area = UtilsForTesting.GetAreaFromPage("Resources/indictb1h_14.pdf", 1, region);

    var extractor = new BasicExtractionAlgorithm();
    Table firstTable = extractor.Extract(area)[0];

    using (var buffer = new MemoryStream())
    using (var writer = new StreamWriter(buffer) { AutoFlush = true })
    {
        var csvWriter = new CSVWriter();
        csvWriter.Write(writer, firstTable);

        // Rewind so the written CSV can be read back from the start.
        buffer.Position = 0;
        var reader = new StreamReader(buffer);
        string written = reader.ReadToEnd();

        // Normalise line endings; trim to remove last new line.
        string actualCsv = written.Replace("\r\n", "\n").Trim();
        Assert.Equal(expectedCsv, actualCsv);
    }
}
/// <summary>
/// Extracts a table from the first page of <c>ARGENTINA_DIPUTADOS_VOTING_RECORD_PDF</c>
/// and compares it cell by cell against <c>ARGENTINA_DIPUTADOS_VOTING_RECORD_EXPECTED</c>.
/// </summary>
public void TestColumnRecognition()
{
    var region = new PdfRectangle(12.75, 55, 557, 567);
    PageArea area = UtilsForTesting.GetAreaFromFirstPage(ARGENTINA_DIPUTADOS_VOTING_RECORD_PDF, region);

    var extractor = new BasicExtractionAlgorithm();
    Table firstTable = extractor.Extract(area)[0];
    var actualRows = UtilsForTesting.TableToArrayOfRows(firstTable);

    // Same number of rows, then same number of cells per row, then same cell content.
    Assert.Equal(ARGENTINA_DIPUTADOS_VOTING_RECORD_EXPECTED.Length, actualRows.Length);
    for (int row = 0; row < ARGENTINA_DIPUTADOS_VOTING_RECORD_EXPECTED.Length; row++)
    {
        var expectedRow = ARGENTINA_DIPUTADOS_VOTING_RECORD_EXPECTED[row];
        var actualRow = actualRows[row];
        Assert.Equal(expectedRow.Length, actualRow.Length);

        for (int col = 0; col < expectedRow.Length; col++)
        {
            Assert.Equal(expectedRow[col], actualRow[col]);
        }
    }
}
/// <summary>
/// Extracts a table from an area of page 2 of <c>Resources/us-017.pdf</c>, using the
/// area's vertical rulings, and checks the text of the first 37 cells in the order
/// the table exposes them.
/// </summary>
public void TestNaturalOrderOfRectangles()
{
    PageArea area = UtilsForTesting.GetPage("Resources/us-017.pdf", 2)
                                   .GetArea(new PdfRectangle(90, 97, 532, 352));

    var extractor = new BasicExtractionAlgorithm(area.VerticalRulings);
    Table firstTable = extractor.Extract(area)[0];

    IReadOnlyList<Cell> cells = firstTable.Cells;
    foreach (var cell in cells)
    {
        Debug.Print(cell.GetText());
    }

    // Now different from tabula-java, since PdfPig 0.1.5-alpha001
    string[] expectedTexts =
    {
        // Column headers
        "Project",
        "Agency",
        "Institution",

        // First row
        "Nanotechnology and its publics",
        "NSF",
        "Pennsylvania State University",

        // Second row
        "Public information and deliberation in nanoscience and\rnanotechnology policy (SGER)",
        "Interagency",
        "North Carolina State\rUniversity",

        // Third row
        "Social and ethical research and education in agrifood",
        "nanotechnology (NIRT)",
        "NSF",
        "Michigan State University",

        // Fourth row
        "From laboratory to society: developing an informed",
        "approach to nanoscale science and engineering (NIRT)",
        "NSF",
        "University of South Carolina",

        // Fifth row
        "Database and innovation timeline for nanotechnology",
        "NSF",
        "UCLA",

        // Sixth row
        "Social and ethical dimensions of nanotechnology",
        "NSF",
        "University of Virginia",

        // Seventh row
        "Undergraduate exploration of nanoscience,",
        "applications and societal implications (NUE)",
        "NSF",
        "Michigan Technological\rUniversity",

        // Eighth row
        "Ethics and belief inside the development of",
        "nanotechnology (CAREER)",
        "NSF",
        "University of Virginia",

        // Ninth row
        "All centers, NNIN and NCN have a societal",
        "NSF, DOE,",
        "All nanotechnology centers",
        "implications components",
        "DOD, and NIH",
        "and networks",
    };

    // Cells are checked by index, in order; the total cell count is not asserted.
    for (int index = 0; index < expectedTexts.Length; index++)
    {
        Assert.Equal(expectedTexts[index], cells[index].GetText());
    }
}