public void TestExtractColumnsCorrectly2()
        {
            // The comparison is only performed on Windows; on any other platform the test
            // returns without asserting anything.
            // It fails on Linux and macOS, linked to PdfPig not finding the correct font.
            // Installing the font needs `apt-get -y install ttf-mscorefonts-installer`, but
            // the mscorefonts EULA license could not be presented.
            if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) // && !RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
            {
                return;
            }

            PageArea sourcePage = UtilsForTesting.GetPage(EU_017_PDF, 3);
            BasicExtractionAlgorithm extractor = new BasicExtractionAlgorithm(sourcePage.VerticalRulings);

            // Restrict extraction to the rectangle that contains the table of interest.
            PageArea tableArea = sourcePage.GetArea(new PdfRectangle(148.44, 543 - (711.875 - 299.625), 452.32, 543));
            Table extracted = extractor.Extract(tableArea)[0];

            var actualRows = UtilsForTesting.TableToArrayOfRows(extracted);

            // Compare row count, then column count per row, then every cell in turn.
            Assert.Equal(EU_017_EXPECTED.Length, actualRows.Length);
            for (int row = 0; row < EU_017_EXPECTED.Length; row++)
            {
                var expectedRow = EU_017_EXPECTED[row];
                var actualRow = actualRows[row];
                Assert.Equal(expectedRow.Length, actualRow.Length);

                for (int col = 0; col < expectedRow.Length; col++)
                {
                    Assert.Equal(expectedRow[col], actualRow[col]);
                }
            }
        }
Beispiel #2
0
        private List<Table> GetTables()
        {
            // Load page 1 of the two-table fixture and run the spreadsheet extraction
            // algorithm over the whole page.
            PageArea firstPage = UtilsForTesting.GetPage("Resources/twotables.pdf", 1);

            return new SpreadsheetExtractionAlgorithm().Extract(firstPage);
        }
Beispiel #3
0
        public void TestSpanningCells()
        {
            PageArea page = UtilsForTesting.GetPage("Resources/spanning_cells.pdf", 1);
            string expectedJson = UtilsForTesting.LoadJson("Resources/json/spanning_cells.json");
            SpreadsheetExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();
            List<Table> tables = algorithm.Extract(page);

            Assert.Equal(2, tables.Count);

            var expectedTables = (JArray)JsonConvert.DeserializeObject(expectedJson);

            // Serialise the extracted tables and parse them back so both sides can be
            // compared field by field.
            StringBuilder json = new StringBuilder();
            (new JSONWriter()).Write(json, tables);
            var actualTables = (JArray)JsonConvert.DeserializeObject(json.ToString());

            double pageHeight = 842;
            double precision = 2;

            // Passes when the floors of the two values differ by less than `precision`.
            void AssertFloorsClose(double expected, double actual)
            {
                Assert.True(Math.Abs(Math.Floor(expected) - Math.Floor(actual)) < precision);
            }

            for (int t = 0; t < 2; t++)
            {
                var expectedTable = expectedTables[t];
                var actualTable = actualTables[t];

                Assert.Equal(expectedTable["extraction_method"], actualTable["extraction_method"]);

                // The expected "top" and "bottom" are subtracted from the page height
                // before being compared with the actual values.
                AssertFloorsClose(pageHeight - expectedTable["top"].Value<double>(), actualTable["top"].Value<double>());
                AssertFloorsClose(expectedTable["left"].Value<double>(), actualTable["left"].Value<double>());
                AssertFloorsClose(expectedTable["width"].Value<double>(), actualTable["width"].Value<double>());
                AssertFloorsClose(expectedTable["height"].Value<double>(), actualTable["height"].Value<double>());
                AssertFloorsClose(expectedTable["right"].Value<double>(), actualTable["right"].Value<double>());
                AssertFloorsClose(pageHeight - expectedTable["bottom"].Value<double>(), actualTable["bottom"].Value<double>());

                var expectedRows = (JArray)expectedTable["data"];
                var actualRows = (JArray)actualTable["data"];
                Assert.Equal(expectedRows.Count, actualRows.Count);

                for (int rowIndex = 0; rowIndex < expectedRows.Count; rowIndex++)
                {
                    var expectedRow = (JArray)expectedRows[rowIndex];
                    var actualRow = (JArray)actualRows[rowIndex];
                    Assert.Equal(expectedRow.Count, actualRow.Count);

                    for (int cellIndex = 0; cellIndex < expectedRow.Count; cellIndex++)
                    {
                        var expectedCell = (JObject)expectedRow[cellIndex];
                        var actualCell = (JObject)actualRow[cellIndex];

                        // Cells whose expected text is empty are skipped
                        // (original note: empty cell have no coordinate data???).
                        if (string.IsNullOrEmpty(expectedCell["text"].Value<string>()))
                        {
                            continue;
                        }

                        AssertFloorsClose(pageHeight - expectedCell["top"].Value<double>(), actualCell["top"].Value<double>());
                        AssertFloorsClose(expectedCell["left"].Value<double>(), actualCell["left"].Value<double>());
                        AssertFloorsClose(expectedCell["width"].Value<double>(), actualCell["width"].Value<double>());
                        AssertFloorsClose(expectedCell["height"].Value<double>(), actualCell["height"].Value<double>());
                        Assert.Equal(expectedCell["text"].Value<string>(), actualCell["text"].Value<string>());
                    }
                }
            }

            // A direct comparison of the raw JSON text is left disabled:
            //Assert.Equal(expectedJson, json.ToString());
        }
Beispiel #4
0
        public void TestIncompleteGrid()
        {
            // Page 1 of china.pdf is expected to yield exactly two tables.
            PageArea chinaPage = UtilsForTesting.GetPage("Resources/china.pdf", 1);
            List<Table> found = new SpreadsheetExtractionAlgorithm().Extract(chinaPage);

            Assert.Equal(2, found.Count);
        }
Beispiel #5
0
        public void TestSpanningCellsToCsv()
        {
            PageArea sourcePage = UtilsForTesting.GetPage("Resources/spanning_cells.pdf", 1);
            string wantedCsv = UtilsForTesting.LoadCsv("Resources/csv/spanning_cells.csv");
            List<Table> extracted = new SpreadsheetExtractionAlgorithm().Extract(sourcePage);

            Assert.Equal(2, extracted.Count);

            StringBuilder csvBuffer = new StringBuilder();
            new CSVWriter().Write(csvBuffer, extracted);

            // Convert CRLF to LF and strip surrounding whitespace before comparing
            // against the expected file content.
            string producedCsv = csvBuffer.ToString().Replace("\r\n", "\n").Trim();
            Assert.Equal(wantedCsv, producedCsv);
        }
Beispiel #6
0
        public void TestNaturalOrderOfRectanglesDoesNotBreakContract()
        {
            PageArea secondPage = UtilsForTesting.GetPage("Resources/us-017.pdf", 2);
            List<Table> extracted = new SpreadsheetExtractionAlgorithm().Extract(secondPage);

            // Expected CSV for the first table: rows are separated by "\r\n", and line
            // breaks inside quoted cells are bare "\r". There is no trailing "\r\n".
            string expectedCsv = "Project,Agency,Institution\r\nNanotechnology and its publics,NSF,Pennsylvania State University\r\n\"Public information and deliberation in nanoscience and\rnanotechnology policy (SGER)\",Interagency,\"North Carolina State\rUniversity\"\r\n\"Social and ethical research and education in agrifood\rnanotechnology (NIRT)\",NSF,Michigan State University\r\n\"From laboratory to society: developing an informed\rapproach to nanoscale science and engineering (NIRT)\",NSF,University of South Carolina\r\nDatabase and innovation timeline for nanotechnology,NSF,UCLA\r\nSocial and ethical dimensions of nanotechnology,NSF,University of Virginia\r\n\"Undergraduate exploration of nanoscience,\rapplications and societal implications (NUE)\",NSF,\"Michigan Technological\rUniversity\"\r\n\"Ethics and belief inside the development of\rnanotechnology (CAREER)\",NSF,University of Virginia\r\n\"All centers, NNIN and NCN have a societal\rimplications components\",\"NSF, DOE,\rDOD, and NIH\",\"All nanotechnology centers\rand networks\"";

            StringBuilder csvBuffer = new StringBuilder();
            CSVWriter csvWriter = new CSVWriter();
            csvWriter.Write(csvBuffer, extracted[0]);
            string actualCsv = csvBuffer.ToString().Trim();

            // Collapse every line-break flavour to a bare "\r" on both sides so the
            // comparison does not depend on which line terminator the writer produced.
            string expectedNormalised = expectedCsv.Replace("\r\n", "\r");
            string actualNormalised = actualCsv.Replace("\r\n", "\n").Replace("\n", "\r");

            Assert.Equal(expectedNormalised, actualNormalised);
        }
Beispiel #7
0
        public void StreamNoGuess1()
        {
            PageArea dataPage = UtilsForTesting.GetPage("Resources/data.pdf", 1);

            // data_stream_noguess.csv was modified for decimal precision
            string wantedCsv = UtilsForTesting.LoadCsv("Resources/csv/data_stream_noguess.csv");

            BasicExtractionAlgorithm basicAlgorithm = new BasicExtractionAlgorithm();
            List<Table> extracted = basicAlgorithm.Extract(dataPage);

            // Only the first extracted table is written out and compared.
            StringBuilder csvBuffer = new StringBuilder();
            new CSVWriter().Write(csvBuffer, extracted[0]);

            // CRLF is converted to LF before comparing with the expected file content.
            string producedCsv = csvBuffer.ToString().Replace("\r\n", "\n");
            Assert.Equal(wantedCsv, producedCsv);
        }
Beispiel #8
0
        public void TestMergeLinesCloseToEachOther()
        {
            // Checks the Left coordinate of each vertical ruling found on page 1 of 20.pdf
            // against a fixed list of expected values.
            PageArea page = UtilsForTesting.GetPage("Resources/20.pdf", 1);
            IReadOnlyList<Ruling> rulings = page.VerticalRulings;

            double[] expectedRulings = new double[] { 105.554812, 107.522417, 160.57705, 377.172662, 434.963828, 488.268507 };

            // The count check doubles as the bounds guard for the indexed loop below, so it
            // is tied to the expected data (6 entries) rather than a separate literal.
            Assert.Equal(expectedRulings.Length, rulings.Count);

            for (int i = 0; i < rulings.Count; i++)
            {
                // Third argument is the xUnit precision: values are compared to 2 decimal places.
                Assert.Equal(expectedRulings[i], rulings[i].Left, 2);
            }
        }
Beispiel #9
0
        public void StreamNoGuess1()
        {
            // Corresponding tabula-py call:
            // tabula.read_pdf(pdf_path, stream=True, guess=False)

            var page = UtilsForTesting.GetPage("Resources/data.pdf", 1);
            var expected = UtilsForTesting.LoadCsv("Resources/csv/data_stream_noguess.csv");

            var streamAlgorithm = new BasicExtractionAlgorithm();
            var tables = streamAlgorithm.Extract(page);

            // Write the first table as CSV, then convert CRLF to LF so the result can be
            // compared with the expected file content.
            var output = new StringBuilder();
            var writer = new CSVWriter();
            writer.Write(output, tables[0]);

            var actual = output.ToString().Replace("\r\n", "\n");
            Assert.Equal(expected, actual);
        }
Beispiel #10
0
        public void Latice1()
        {
            PageArea dataPage = UtilsForTesting.GetPage("Resources/data.pdf", 1);

            // data_lattice.csv was modified to add the last row, missing in tabula_py
            string wantedCsv = UtilsForTesting.LoadCsv("Resources/csv/data_lattice.csv");

            SpreadsheetExtractionAlgorithm latticeAlgorithm = new SpreadsheetExtractionAlgorithm();
            List<Table> extracted = latticeAlgorithm.Extract(dataPage);

            // Exactly one table is expected on this page.
            Assert.Single(extracted);

            StringBuilder csvBuffer = new StringBuilder();
            new CSVWriter().Write(csvBuffer, extracted[0]);

            // CRLF is converted to LF before comparing with the expected file content.
            string producedCsv = csvBuffer.ToString().Replace("\r\n", "\n");
            Assert.Equal(wantedCsv, producedCsv);
        }
        public void TestNaturalOrderOfRectangles()
        {
            PageArea wholePage = UtilsForTesting.GetPage("Resources/us-017.pdf", 2);
            PageArea tableArea = wholePage.GetArea(new PdfRectangle(90, 97, 532, 352));
            BasicExtractionAlgorithm extractor = new BasicExtractionAlgorithm(tableArea.VerticalRulings);
            Table table = extractor.Extract(tableArea)[0];

            IReadOnlyList<Cell> cells = table.Cells;

            // Dump every cell's text to the debug output to help diagnose a failing run.
            foreach (Cell cell in cells)
            {
                Debug.Print(cell.GetText());
            }

            // Now different from tabula-java, since PdfPig 0.1.5-alpha001

            // Expected text of each cell, in the order the cells appear in table.Cells.
            string[] expectedTexts =
            {
                // Column headers
                "Project",
                "Agency",
                "Institution",

                // First row
                "Nanotechnology and its publics",
                "NSF",
                "Pennsylvania State University",

                // Second row
                "Public information and deliberation in nanoscience and\rnanotechnology policy (SGER)",
                "Interagency",
                "North Carolina State\rUniversity",

                // Third row
                "Social and ethical research and education in agrifood",
                "nanotechnology (NIRT)",
                "NSF",
                "Michigan State University",

                // Fourth row
                "From laboratory to society: developing an informed",
                "approach to nanoscale science and engineering (NIRT)",
                "NSF",
                "University of South Carolina",

                // Fifth row
                "Database and innovation timeline for nanotechnology",
                "NSF",
                "UCLA",

                // Sixth row
                "Social and ethical dimensions of nanotechnology",
                "NSF",
                "University of Virginia",

                // Seventh row
                "Undergraduate exploration of nanoscience,",
                "applications and societal implications (NUE)",
                "NSF",
                "Michigan Technological\rUniversity",

                // Eighth row
                "Ethics and belief inside the development of",
                "nanotechnology (CAREER)",
                "NSF",
                "University of Virginia",

                // Ninth row
                "All centers, NNIN and NCN have a societal",
                "NSF, DOE,",
                "All nanotechnology centers",
                "implications components",
                "DOD, and NIH",
                "and networks",
            };

            // Assert cell by cell, in index order; only the first expectedTexts.Length
            // cells are checked.
            for (int index = 0; index < expectedTexts.Length; index++)
            {
                Assert.Equal(expectedTexts[index], cells[index].GetText());
            }
        }