Exemplo n.º 1
        public void TestExtractColumnsCorrectly()
        {
            // Extracts the table found in a fixed rectangle on page 1 of EU_002_PDF with the basic
            // algorithm and compares it, cell by cell, with EU_002_EXPECTED.
            //
            // The check only runs on Windows. It fails on Linux and macOS, which is linked to PdfPig
            // not finding the correct font. Installing ttf-mscorefonts-installer through apt-get was
            // tried, but the mscorefonts EULA license could not be presented.
            if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
            {
                return;
            }

            PdfRectangle region = new PdfRectangle(70.0, 725 - (233 - 115), 510.0, 725);
            PageArea area = UtilsForTesting.GetAreaFromPage(EU_002_PDF, 1, region);
            BasicExtractionAlgorithm algorithm = new BasicExtractionAlgorithm();
            Table extracted = algorithm.Extract(area)[0];

            var actualRows = UtilsForTesting.TableToArrayOfRows(extracted);
            Assert.Equal(EU_002_EXPECTED.Length, actualRows.Length);

            for (int row = 0; row < EU_002_EXPECTED.Length; row++)
            {
                var expectedCells = EU_002_EXPECTED[row];
                var actualCells   = actualRows[row];
                Assert.Equal(expectedCells.Length, actualCells.Length);

                for (int col = 0; col < expectedCells.Length; col++)
                {
                    Assert.Equal(expectedCells[col], actualCells[col]);
                }
            }
        }
Exemplo n.º 2
        public void TestExtractColumnsCorrectly2()
        {
            // Extracts a table from page 3 of EU_017_PDF, passing the page's vertical rulings to the
            // basic algorithm, and compares it, cell by cell, with EU_017_EXPECTED.
            //
            // The check only runs on Windows. It fails on Linux and macOS, which is linked to PdfPig
            // not finding the correct font. Installing ttf-mscorefonts-installer through apt-get was
            // tried, but the mscorefonts EULA license could not be presented.
            if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
            {
                return;
            }

            PageArea fullPage = UtilsForTesting.GetPage(EU_017_PDF, 3);
            BasicExtractionAlgorithm algorithm = new BasicExtractionAlgorithm(fullPage.VerticalRulings);
            PdfRectangle region = new PdfRectangle(148.44, 543 - (711.875 - 299.625), 452.32, 543);
            Table extracted = algorithm.Extract(fullPage.GetArea(region))[0];

            var actualRows = UtilsForTesting.TableToArrayOfRows(extracted);
            Assert.Equal(EU_017_EXPECTED.Length, actualRows.Length);

            for (int row = 0; row < EU_017_EXPECTED.Length; row++)
            {
                var expectedCells = EU_017_EXPECTED[row];
                var actualCells   = actualRows[row];
                Assert.Equal(expectedCells.Length, actualCells.Length);

                for (int col = 0; col < expectedCells.Length; col++)
                {
                    Assert.Equal(expectedCells[col], actualCells[col]);
                }
            }
        }
Exemplo n.º 3
        public void TestExtractSpreadsheetWithinAnArea()
            // Extracts the tables found in a fixed area of page 1 of puertos1.pdf with the spreadsheet
            // algorithm, checks that the first table has 15 rows and serialises that table to CSV.
            PageArea page = UtilsForTesting.GetAreaFromPage("Resources/puertos1.pdf", 1, new PdfRectangle(30.32142857142857, 793 - 554.8821428571429, 546.7964285714286, 793 - 273.9035714285714)); // 273.9035714285714f, 30.32142857142857f, 554.8821428571429f, 546.7964285714286f);
            SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
            List <Table> tables = se.Extract(page);
            Table        table  = tables[0];

            Assert.Equal(15, table.Rows.Count);

            // Expected CSV rendering of the table; multi-line cells are quoted.
            // NOTE(review): `expected` is not referenced by any statement visible in this method.
            const string expected = "\"\",TM,M.U$S,TM,M.U$S,TM,M.U$S,TM,M.U$S,TM,M.U$S,TM,M.U$S,TM\n" +
                                    "Peces vivos,1,25,1,23,2,38,1,37,2,67,2,89,1\n" +
                                    "\"Pescado fresco\n" +
                                    "o refrigerado.\n" +
                                    "exc. filetes\",7.704,7.175,8.931,6.892,12.635,10.255,16.742,13.688,14.357,11.674,13.035,13.429,9.727\n" +
                                    "\"Pescado congelado\n" +
                                    "exc. filetes\",90.560,105.950,112.645,108.416,132.895,115.874,152.767,133.765,148.882,134.847,156.619,165.134,137.179\n" +
                                    "\"Filetes y demás car-\n" +
                                    "nes de pescado\",105.434,200.563,151.142,218.389,152.174,227.780,178.123,291.863,169.422,313.735,176.427,381.640,144.814\n" +
                                    "\"Pescado sec./sal./\n" +
                                    "en salm. har./pol./\n" +
                                    "pell. aptos\n" +
                                    "p/c humano\",6.837,14.493,6.660,9.167,14.630,17.579,18.150,21.302,18.197,25.739,13.460,23.549,11.709\n" +
                                    "Crustáceos,61.691,375.798,52.488,251.043,47.635,387.783,27.815,217.443,7.123,86.019,39.488,373.583,45.191\n" +
                                    "Moluscos,162.027,174.507,109.436,111.443,90.834,104.741,57.695,109.141,98.182,206.304,187.023,251.352,157.531\n" +
                                    "\"Prod. no exp. en\n" +
                                    "otros capítulos.\n" +
                                    "No apto p/c humano\",203,328,7,35,521,343,\"1,710\",\"1,568\",125,246,124,263,131\n" +
                                    "\"Grasas y aceites de\n" +
                                    "pescado y mamíferos\n" +
                                    "marinos\",913,297,\"1,250\",476,\"1,031\",521,\"1,019\",642,690,483,489,710,959\n" +
                                    "\"Extractos y jugos de\n" +
                                    "pescado y mariscos\",5,25,1,3,4,4,31,93,39,117,77,230,80\n" +
                                    "\"Preparaciones y con-\n" +
                                    "servas de pescado\",846,\"3,737\",\"1,688\",\"4,411\",\"1,556\",\"3,681\",\"2,292\",\"5,474\",\"2,167\",\"7,494\",\"2,591\",\"8,833\",\"2,795\"\n" +
                                    "\"Preparaciones y con-\n" +
                                    "servas de mariscos\",348,\"3,667\",345,\"1,771\",738,\"3,627\",561,\"2,620\",607,\"3,928\",314,\"2,819\",250\n" +
                                    "\"Harina, polvo y pe-\n" +
                                    "llets de pescado.No\n" +
                                    "aptos p/c humano\",\"16,947\",\"8,547\",\"11,867\",\"6,315\",\"32,528\",\"13,985\",\"37,313\",\"18,989\",\"35,787\",\"19,914\",\"37,821\",\"27,174\",\"30,000\"\n" +

            // TODO add better assertions
            StringBuilder sb = new StringBuilder();

            // Serialise the first extracted table to CSV text.
            (new CSVWriter()).Write(sb, tables[0]);
            string result = sb.ToString();

            // Java original, kept for reference: both CSV texts were parsed into records.
            //List<CSVRecord> parsedExpected = org.apache.commons.csv.CSVParser.parse(expected, CSVFormat.EXCEL).getRecords();
            //List<CSVRecord> parsedResult = org.apache.commons.csv.CSVParser.parse(result, CSVFormat.EXCEL).getRecords();
            // NOTE(review): Encoding.ASCII cannot represent the accented characters that appear in
            // `expected` (e.g. "Crustáceos"); if `result` contains them they are replaced when encoded.
            // Encoding.UTF8 looks like the safer choice - TODO confirm.
            using (var csv = new CsvReader(new StreamReader(new MemoryStream(Encoding.ASCII.GetBytes(result))), CultureInfo.InvariantCulture))
                 * Assert.Equal(parsedResult.Count, parsedExpected.Count);
                 * for (int i = 0; i < parsedResult.Count; i++)
                 * {
                 *  Assert.Equal(parsedResult[i].size(), parsedExpected[i].size());
                 * }
Exemplo n.º 4
        public void TestCSVSerializeInfinity()
        {
            // Serialises the first table extracted from schools.pdf to CSV and compares it with schools.csv.
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/schools.csv");

            // Source coordinates were top = 53.74, left = 16.97, bottom = 548.74, right = 762.3 for a
            // page height of 612. One extra unit is taken off the upper edge, otherwise an empty
            // line is added at the top of the table.
            PdfRectangle region = new PdfRectangle(16.97, 612 - 548.74, 762.3, 612 - 53.74 - 1);
            PageArea area = UtilsForTesting.GetAreaFromFirstPage("Resources/schools.pdf", region);
            SpreadsheetExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();
            Table extracted = algorithm.Extract(area)[0];

            StringBuilder csv = new StringBuilder();
            new CSVWriter().Write(csv, extracted);

            // Line endings are normalised to "\n" before comparing with the trimmed reference file.
            string actualCsv = csv.ToString().Replace("\r\n", "\n");
            Assert.Equal(expectedCsv.Trim(), actualCsv);
        }
Exemplo n.º 5
        public void TestCSVWriter()
        {
            // Writes the table returned by GetTable() as CSV through a StreamWriter, then checks both
            // the first line and the complete output against the reference data.
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/argentina_diputados_voting_record.csv");
            Table  source      = this.GetTable();

            using (var buffer = new MemoryStream())
            using (var writer = new StreamWriter(buffer) { AutoFlush = true })
            {
                new CSVWriter().Write(writer, source);

                // Rewind so the bytes just written can be read back.
                var reader = new StreamReader(buffer);
                buffer.Position = 0;

                // Trim removes the last new line.
                string written = reader.ReadToEnd().Trim();
                string[] writtenLines = written.Split("\r\n");

                Assert.Equal(EXPECTED_CSV_WRITER_OUTPUT, writtenLines[0]);
                Assert.Equal(expectedCsv, written.Replace("\r\n", "\n"));
            }
        }
Exemplo n.º 6
        public void TestRealLifeRTL()
        {
            // Checks right-to-left (Arabic) text extracted from the first table on page 1 of mednine.pdf.
            PageArea page = UtilsForTesting.GetPage("Resources/mednine.pdf", 1);
            SpreadsheetExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();
            List<Table> found = algorithm.Extract(page);

            var rows = found[0].Rows;

            // The doubled space might be an implementation bug (missing space or wrong word order).
            Assert.Equal("الانتخابات التشريعية  2014", rows[0][0].GetText());
            Assert.Equal("ورقة كشف نتائج دائرة مدنين", rows[1][0].GetText());

            // Columns 0 to 9 of row 4 hold numbers; column 10 holds the list name.
            string[] expectedNumbers = { "426", "63", "43", "56", "58", "49", "55", "33", "32", "37" };
            for (int col = 0; col < expectedNumbers.Length; col++)
            {
                Assert.Equal(expectedNumbers[col], rows[4][col].GetText());
            }

            Assert.Equal("قائمة من أجل تحقيق سلطة الشعب", rows[4][10].GetText());

            // One remaining problem is not yet addressed: diacritics (e.g. Arabic's tanwinً and
            // probably Hebrew nekudot) are put in the wrong place. This should get fixed, but this
            // is a good first stab at the problem.
            //
            // The commented-out assertion below reflects the theoretically correct answer, which is
            // not currently achievable because of that problem.
            //Assert.Equal("مرحباً", rows[0][0].getText()); // really ought to be ً, but this is forgiveable for now
        }
Exemplo n.º 7
        public void TestCSVMultilineRow()
        {
            // Serialises the first table extracted from frx_2012_disclosure.pdf to CSV and compares
            // it, unmodified, with the reference file.
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/frx_2012_disclosure.csv");

            // The area is requested with NaN coordinates; the earlier explicit bounds were
            // 53.0f, 49.0f, 735.0f, 550.0f.
            PdfRectangle region = new PdfRectangle(double.NaN, double.NaN, double.NaN, double.NaN);
            PageArea area = UtilsForTesting.GetAreaFromFirstPage("Resources/frx_2012_disclosure.pdf", region);
            SpreadsheetExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();
            Table extracted = algorithm.Extract(area)[0];

            StringBuilder csv = new StringBuilder();
            new CSVWriter().Write(csv, extracted);

            Assert.Equal(expectedCsv, csv.ToString());
        }
Exemplo n.º 8
        private List<Table> GetTables()
        {
            // Extracts every table found on page 1 of twotables.pdf with the spreadsheet algorithm.
            PageArea page = UtilsForTesting.GetPage("Resources/twotables.pdf", 1);
            SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();

            // The method is declared to return the tables, so hand the extraction result to the caller.
            return sea.Extract(page);
        }
Exemplo n.º 9
        public void TestCSVSerializeTwoTables()
        {
            // Writes the tables returned by GetTables() as CSV through a StreamWriter and compares
            // the trimmed output with twotables.csv.
            string      expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/twotables.csv");
            List<Table> source      = this.GetTables();

            using (var buffer = new MemoryStream())
            using (var writer = new StreamWriter(buffer) { AutoFlush = true })
            {
                new CSVWriter().Write(writer, source);

                // Rewind so the bytes just written can be read back.
                var reader = new StreamReader(buffer);
                buffer.Position = 0;

                // Trim removes the last new line.
                string actualCsv = reader.ReadToEnd().Trim();
                Assert.Equal(expectedCsv, actualCsv);
            }
        }
Exemplo n.º 10
        public void TestJSONSerializeTwoTables()
        {
            // Serialises the tables returned by GetTables() to JSON and compares the text, unmodified,
            // with twotables.json.
            string      expectedJson = UtilsForTesting.LoadJson("Resources/json/twotables.json");
            List<Table> source       = this.GetTables();

            StringBuilder json = new StringBuilder();
            new JSONWriter().Write(json, source);

            Assert.Equal(expectedJson, json.ToString());
        }
Exemplo n.º 11
        private Table GetTable()
        {
            // Extracts the first table from a fixed area on the first page of
            // argentina_diputados_voting_record.pdf with the basic algorithm.
            PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/argentina_diputados_voting_record.pdf", new PdfRectangle(12.75, 55.0, 561, 567)); // 269.875f, 12.75f, 790.5f, 561f);
            BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm();

            // The method is declared to return a single table, so return the first extracted one.
            return bea.Extract(page)[0];
        }
Exemplo n.º 12
        public void TestExtractTableWithExternallyDefinedRulings()
        {
            // Extracts page 1 of us-007.pdf using EXTERNALLY_DEFINED_RULINGS, then checks the cell
            // count and the text of the first two columns of rows 0 to 8.
            PageArea page = UtilsForTesting.GetPage("Resources/us-007.pdf", 1);
            SpreadsheetExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();
            List<Table> found = algorithm.Extract(page, EXTERNALLY_DEFINED_RULINGS.ToList());

            Table first = found[0];
            Assert.Equal(18, first.Cells.Count);

            var rows = first.Rows;

            // One entry per table row: { column 0 text, column 1 text }.
            string[][] expectedRows =
            {
                new[] { "Payroll Period", "One Withholding\rAllowance" },
                new[] { "Weekly", "$71.15" },
                new[] { "Biweekly", "142.31" },
                new[] { "Semimonthly", "154.17" },
                new[] { "Monthly", "308.33" },
                new[] { "Quarterly", "925.00" },
                new[] { "Semiannually", "1,850.00" },
                new[] { "Annually", "3,700.00" },
                new[] { "Daily or Miscellaneous\r(each day of the payroll period)", "14.23" },
            };

            for (int r = 0; r < expectedRows.Length; r++)
            {
                for (int c = 0; c < expectedRows[r].Length; c++)
                {
                    Assert.Equal(expectedRows[r][c], rows[r][c].GetText());
                }
            }
        }
Exemplo n.º 13
        public void TestRTL()
        {
            // Checks right-to-left (Arabic) and mixed-direction text extracted from the first table
            // on page 1 of arabic.pdf.
            PageArea page = UtilsForTesting.GetPage("Resources/arabic.pdf", 1);
            SpreadsheetExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();
            List<Table> found = algorithm.Extract(page);
            // Assert.Equal(1, tables.size());

            var rows = found[0].Rows;

            // { row, column } of each checked cell; expectedTexts holds the matching text at the same index.
            int[][] cellPositions =
            {
                new[] { 1, 1 },
                new[] { 2, 1 },
                new[] { 3, 0 },
                new[] { 4, 0 },
                new[] { 2, 0 },
                new[] { 4, 1 },
                new[] { 3, 1 },
                new[] { 5, 0 },
                new[] { 1, 0 },
            };

            string[] expectedTexts =
            {
                "اسمي سلطان",
                "من اين انت؟",
                "1234",
                "هل انت شباك؟",
                "انا من ولاية كارولينا الشمال", // conjoined lam-alif gets missed
                "اسمي Jeremy في الانجليزية",    // conjoined lam-alif gets missed
                "عندي 47 قطط",                  // the real right answer is 47.
                "Jeremy is جرمي in Arabic",     // original note repeated the "47" remark; presumably a copy/paste leftover
                "مرحباً",                       // really ought to be ً, but this is forgiveable for now
            };

            for (int k = 0; k < expectedTexts.Length; k++)
            {
                int r = cellPositions[k][0];
                int c = cellPositions[k][1];
                Assert.Equal(expectedTexts[k], rows[r][c].GetText());
            }

            // One remaining problem is not yet addressed: diacritics (e.g. Arabic's tanwinً and
            // probably Hebrew nekudot) are put in the wrong place. This should get fixed, but this
            // is a good first stab at the problem.
            //
            // The commented-out assertion below reflects the theoretically correct answer, which is
            // not currently achievable because of that problem.
            // Assert.Equal("مرحباً",                       table.getRows()[0][0].getText()); // really ought to be ً, but this is forgiveable for now
        }
Exemplo n.º 14
        public void TestSpanningCells()
        {
            // Extracts the two tables of spanning_cells.pdf, serialises them to JSON and compares the
            // geometry and text with spanning_cells.json. Coordinates are floored and must differ by
            // less than the tolerance. The reference "top" and "bottom" values are subtracted from
            // the page height before being compared.
            PageArea page         = UtilsForTesting.GetPage("Resources/spanning_cells.pdf", 1);
            string   expectedJson = UtilsForTesting.LoadJson("Resources/json/spanning_cells.json");
            SpreadsheetExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();
            List<Table> found = algorithm.Extract(page);

            Assert.Equal(2, found.Count);

            var expectedTables = (JArray)JsonConvert.DeserializeObject(expectedJson);

            StringBuilder json = new StringBuilder();
            new JSONWriter().Write(json, found);
            var actualTables = (JArray)JsonConvert.DeserializeObject(json.ToString());

            double pageHeight = 842;
            double tolerance  = 2;

            // Keys compared directly (no vertical flip), in assertion order.
            string[] tableKeys = { "left", "width", "height", "right" };
            string[] cellKeys  = { "left", "width", "height" };

            for (int t = 0; t < 2; t++)
            {
                var expectedTable = expectedTables[t];
                var actualTable   = actualTables[t];

                Assert.Equal(expectedTable["extraction_method"], actualTable["extraction_method"]);

                Assert.True(Math.Abs(Math.Floor(pageHeight - expectedTable["top"].Value<double>()) - Math.Floor(actualTable["top"].Value<double>())) < tolerance);
                foreach (string key in tableKeys)
                {
                    Assert.True(Math.Abs(Math.Floor(expectedTable[key].Value<double>()) - Math.Floor(actualTable[key].Value<double>())) < tolerance);
                }
                Assert.True(Math.Abs(Math.Floor(pageHeight - expectedTable["bottom"].Value<double>()) - Math.Floor(actualTable["bottom"].Value<double>())) < tolerance);

                var expectedData = (JArray)expectedTable["data"];
                var actualData   = (JArray)actualTable["data"];
                Assert.Equal(expectedData.Count, actualData.Count);

                for (int r = 0; r < expectedData.Count; r++)
                {
                    var expectedRow = (JArray)expectedData[r];
                    var actualRow   = (JArray)actualData[r];
                    Assert.Equal(expectedRow.Count, actualRow.Count);

                    for (int c = 0; c < expectedRow.Count; c++)
                    {
                        var expectedCell = (JObject)expectedRow[c];
                        var actualCell   = (JObject)actualRow[c];

                        // Cells with empty reference text are skipped (empty cells have no coordinate data?).
                        if (string.IsNullOrEmpty(expectedCell["text"].Value<string>()))
                        {
                            continue;
                        }

                        Assert.True(Math.Abs(Math.Floor(pageHeight - expectedCell["top"].Value<double>()) - Math.Floor(actualCell["top"].Value<double>())) < tolerance);
                        foreach (string key in cellKeys)
                        {
                            Assert.True(Math.Abs(Math.Floor(expectedCell[key].Value<double>()) - Math.Floor(actualCell[key].Value<double>())) < tolerance);
                        }
                        Assert.Equal(expectedCell["text"].Value<string>(), actualCell["text"].Value<string>());
                    }
                }
            }
            //Assert.Equal(expectedJson, sb.ToString());
        }
Exemplo n.º 15
        public void TestIncompleteGrid()
        {
            // The spreadsheet algorithm is expected to find two tables on page 1 of china.pdf.
            PageArea firstPage = UtilsForTesting.GetPage("Resources/china.pdf", 1);
            SpreadsheetExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();
            List<Table> found = algorithm.Extract(firstPage);

            Assert.Equal(2, found.Count);
        }
Exemplo n.º 16
        public void TestShouldDetectASingleSpreadsheet()
        {
            // Runs the spreadsheet algorithm on a fixed area of page 1 of offense.pdf and checks that
            // exactly one table is detected.
            PageArea page = UtilsForTesting.GetAreaFromPage("Resources/offense.pdf", 1, new PdfRectangle(16.44, 792 - 680.85, 597.84, 792 - 16.44)); // 68.08f, 16.44f, 680.85f, 597.84f);
            SpreadsheetExtractionAlgorithm bea = new SpreadsheetExtractionAlgorithm();
            List<Table> tables = bea.Extract(page);

            // Without this assertion the test could only fail by throwing, so the "single
            // spreadsheet" expectation in its name was never actually verified.
            Assert.Single(tables);
        }
Exemplo n.º 17
        public void TestEmptyRegion()
        {
            // Extracting from this region of indictb1h_14.pdf must yield EXPECTED_EMPTY_TABLE.
            PdfRectangle region = new PdfRectangle(0, 700, 100.9, 800);
            PageArea area = UtilsForTesting.GetAreaFromPage("Resources/indictb1h_14.pdf", 1, region);
            BasicExtractionAlgorithm algorithm = new BasicExtractionAlgorithm();
            Table extracted = algorithm.Extract(area)[0];

            var actualRows = UtilsForTesting.TableToArrayOfRows(extracted);
            Assert.Equal(EXPECTED_EMPTY_TABLE, actualRows);
        }
Exemplo n.º 18
        public void TestRemoveSequentialSpaces()
        {
            // Checks the text of the second and third cells in the first row extracted from m27.pdf.
            PdfRectangle region = new PdfRectangle(28.28, 532 - (103.04 - 79.2), 732.6, 532);
            PageArea area = UtilsForTesting.GetAreaFromFirstPage("Resources/m27.pdf", region);
            BasicExtractionAlgorithm algorithm = new BasicExtractionAlgorithm();
            Table extracted = algorithm.Extract(area)[0];

            var headRow = extracted.Rows[0];
            Assert.Equal("ALLEGIANT AIR", headRow[1].GetText());
            Assert.Equal("ALLEGIANT AIR LLC", headRow[2].GetText());
        }
Exemplo n.º 19
        public void TestExtractColumnsCorrectly3()
        {
            // Checks one multi-line cell of the table extracted from frx_2012_disclosure.pdf.
            //
            // Source coordinates: top = 106.01, left = 48.09, bottom = 227.31, right = 551.89
            //   bottom = 792 - 227.31 = 564.69
            //   top    = 792 - 106.01 = 685.99
            // The top is set to 684.99 instead of 685.99 because the latter added an empty row at the top.
            PdfRectangle region = new PdfRectangle(48.09, 564.69, 551.89, 684.99);
            PageArea area = UtilsForTesting.GetAreaFromFirstPage("Resources/frx_2012_disclosure.pdf", region);
            SpreadsheetExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();
            Table extracted = algorithm.Extract(area)[0];

            string cellText = extracted.Rows[8][1].GetText();
            Assert.Equal("REGIONAL PULMONARY & SLEEP\rMEDICINE", cellText);
        }
Exemplo n.º 20
        public void TestDontStackOverflowQuicksort()
        {
            // Extraction of failing_sort.pdf must complete, and each table's Top must be greater than
            // or equal to the Top of the table that follows it.
            PageArea firstPage = UtilsForTesting.GetPage("Resources/failing_sort.pdf", 1);

            SpreadsheetExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();
            List<Table> found = algorithm.Extract(firstPage);

            // The Java original compared with <=: tables[i - 1].getTop() <= tables[i].getTop().
            for (int next = 1; next < found.Count; next++)
            {
                Table previous = found[next - 1];
                Table current  = found[next];
                Assert.True(previous.Top >= current.Top);
            }
        }
Exemplo n.º 21
        public void TestAnotherExtractTableWithExternallyDefinedRulings()
        {
            // Extracts page 1 of us-024.pdf using EXTERNALLY_DEFINED_RULINGS2 and spot-checks two cells.
            PageArea firstPage = UtilsForTesting.GetPage("Resources/us-024.pdf", 1);
            SpreadsheetExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();
            List<Table> found = algorithm.Extract(firstPage, EXTERNALLY_DEFINED_RULINGS2.ToList());

            Table first = found[0];

            string labelCell = first.Rows[4][0].GetText();
            Assert.Equal("Total Supply", labelCell);

            string valueCell = first.Rows[6][2].GetText();
            Assert.Equal("6.6", valueCell);
        }
Exemplo n.º 22
        public void TestSpreadsheetsSortedByTopAndRight()
        {
            // Each table extracted from sydney_disclosure_contract.pdf must have a Top greater than or
            // equal to the Top of the table that follows it. Only Top is checked here.
            PageArea firstPage = UtilsForTesting.GetPage("Resources/sydney_disclosure_contract.pdf", 1);

            SpreadsheetExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();
            List<Table> found = algorithm.Extract(firstPage);

            // The Java original compared with <=: tables[i - 1].getTop() <= tables[i].getTop().
            for (int next = 1; next < found.Count; next++)
            {
                Table previous = found[next - 1];
                Table current  = found[next];
                Assert.True(previous.Top >= current.Top);
            }
        }
Exemplo n.º 23
        public void TestJSONSerializeInfinity()
        {
            // Serialises the first table extracted from schools.pdf to JSON and compares the text,
            // unmodified, with schools.json.
            string expectedJson = UtilsForTesting.LoadJson("Resources/json/schools.json");

            // The area is requested with NaN coordinates; the earlier explicit bounds were
            // 53.74f, 16.97f, 548.74f, 762.3f.
            PdfRectangle region = new PdfRectangle(double.NaN, double.NaN, double.NaN, double.NaN);
            PageArea area = UtilsForTesting.GetAreaFromFirstPage("Resources/schools.pdf", region);
            SpreadsheetExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();
            Table extracted = algorithm.Extract(area)[0];

            StringBuilder json = new StringBuilder();
            new JSONWriter().Write(json, extracted);

            Assert.Equal(expectedJson, json.ToString());
        }
Exemplo n.º 24
        public void TestCheckSqueezeDoesntBreak()
        {
            // Checks the very first and very last cell of the table extracted from 12s0324.pdf.
            PdfRectangle region = new PdfRectangle(17.25, 342, 410.25, 560.5);
            PageArea area = UtilsForTesting.GetAreaFromFirstPage("Resources/12s0324.pdf", region);
            BasicExtractionAlgorithm algorithm = new BasicExtractionAlgorithm();
            Table extracted = algorithm.Extract(area)[0];

            var allRows = extracted.Rows;

            var topRow = allRows[0];
            string topLeftText = topRow[0].GetText();

            var bottomRow = allRows[allRows.Count - 1];
            string bottomRightText = bottomRow[bottomRow.Count - 1].GetText();

            Assert.Equal("Violent crime  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .", topLeftText);
            Assert.Equal("(X)", bottomRightText);
        }
Exemplo n.º 25
        public void TestSpanningCellsToCsv()
        {
            // Extracts the two tables of spanning_cells.pdf and compares their CSV serialisation with
            // spanning_cells.csv.
            PageArea firstPage   = UtilsForTesting.GetPage("Resources/spanning_cells.pdf", 1);
            string   expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/spanning_cells.csv");
            SpreadsheetExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();
            List<Table> found = algorithm.Extract(firstPage);

            Assert.Equal(2, found.Count);

            StringBuilder csv = new StringBuilder();
            new CSVWriter().Write(csv, found);

            // Line endings are normalised to "\n" and surrounding whitespace is trimmed before comparing.
            string actualCsv = csv.ToString().Replace("\r\n", "\n").Trim();
            Assert.Equal(expectedCsv, actualCsv);
        }
Exemplo n.º 26
        public void StreamNoGuess1()
        {
            // Extracts page 1 of data.pdf with the basic algorithm and compares the CSV of the first
            // table with data_stream_noguess.csv (that file was modified for decimal precision).
            PageArea firstPage   = UtilsForTesting.GetPage("Resources/data.pdf", 1);
            string   expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/data_stream_noguess.csv");

            BasicExtractionAlgorithm algorithm = new BasicExtractionAlgorithm();
            List<Table> found = algorithm.Extract(firstPage);

            StringBuilder csv = new StringBuilder();
            new CSVWriter().Write(csv, found[0]);

            // Line endings are normalised to "\n" before comparing.
            string actualCsv = csv.ToString().Replace("\r\n", "\n");
            Assert.Equal(expectedCsv, actualCsv);
        }
Exemplo n.º 27
        public void TestNaturalOrderOfRectanglesDoesNotBreakContract()
        {
            // Extracts page 2 of us-017.pdf and compares the CSV of the first table with the inline
            // reference text, after turning every line break on both sides into a single "\r".
            PageArea secondPage = UtilsForTesting.GetPage("Resources/us-017.pdf", 2);
            SpreadsheetExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();
            List<Table> found = algorithm.Extract(secondPage);

            string expected = "Project,Agency,Institution\r\nNanotechnology and its publics,NSF,Pennsylvania State University\r\n\"Public information and deliberation in nanoscience and\rnanotechnology policy (SGER)\",Interagency,\"North Carolina State\rUniversity\"\r\n\"Social and ethical research and education in agrifood\rnanotechnology (NIRT)\",NSF,Michigan State University\r\n\"From laboratory to society: developing an informed\rapproach to nanoscale science and engineering (NIRT)\",NSF,University of South Carolina\r\nDatabase and innovation timeline for nanotechnology,NSF,UCLA\r\nSocial and ethical dimensions of nanotechnology,NSF,University of Virginia\r\n\"Undergraduate exploration of nanoscience,\rapplications and societal implications (NUE)\",NSF,\"Michigan Technological\rUniversity\"\r\n\"Ethics and belief inside the development of\rnanotechnology (CAREER)\",NSF,University of Virginia\r\n\"All centers, NNIN and NCN have a societal\rimplications components\",\"NSF, DOE,\rDOD, and NIH\",\"All nanotechnology centers\rand networks\""; // \r\n

            StringBuilder csv = new StringBuilder();
            new CSVWriter().Write(csv, found[0]);
            string written = csv.ToString().Trim();

            string normalisedExpected = expected.Replace("\r\n", "\r");
            string normalisedWritten  = written.Replace("\r\n", "\n").Replace("\n", "\r");
            Assert.Equal(normalisedExpected, normalisedWritten);
        }
Exemplo n.º 28
        public void TestSpreadsheetWithNoBoundingFrameShouldBeSpreadsheet()
        {
            // A table without a bounding frame must still be reported as tabular by the spreadsheet
            // algorithm, and its CSV serialisation must match spreadsheet_no_bounding_frame.csv.
            PageArea page        = UtilsForTesting.GetAreaFromPage("Resources/spreadsheet_no_bounding_frame.pdf", 1, new PdfRectangle(58.9, 842 - 654.7, 536.12, 842 - 150.56)); // 842 - 150.56)); // 150.56f, 58.9f, 654.7f, 536.12f);
            string   expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/spreadsheet_no_bounding_frame.csv");

            SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();

            // The detection result was previously computed but never checked, so the behaviour named
            // by this test was not verified.
            bool isTabular = se.IsTabular(page);
            Assert.True(isTabular);

            List<Table> tables = se.Extract(page);

            StringBuilder sb = new StringBuilder();

            (new CSVWriter()).Write(sb, tables[0]);
            Assert.Equal(expectedCsv, sb.ToString());
        }
Exemplo n.º 29
        public void TestMergeLinesCloseToEachOther()
        {
            // Page 1 of 20.pdf must expose six vertical rulings whose Left positions match the
            // reference values to 2 decimal places.
            PageArea page = UtilsForTesting.GetPage("Resources/20.pdf", 1);
            IReadOnlyList<Ruling> rulings = page.VerticalRulings;

            Assert.Equal(6, rulings.Count);

            double[] expectedRulings = new double[] { 105.554812, 107.522417, 160.57705, 377.172662, 434.963828, 488.268507 };

            // The unused `lefts` projection was removed; positions are read directly from each ruling.
            for (int i = 0; i < rulings.Count; i++)
            {
                Assert.Equal(expectedRulings[i], rulings[i].Left, 2);
            }
        }
Exemplo n.º 30
        public void StreamNoGuess1()
        {
            // Mirrors tabula.read_pdf(pdf_path, stream=True, guess=False): extracts page 1 of data.pdf
            // with the basic algorithm and compares the CSV of the first table with
            // data_stream_noguess.csv.
            PageArea firstPage   = UtilsForTesting.GetPage("Resources/data.pdf", 1);
            string   expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/data_stream_noguess.csv");

            BasicExtractionAlgorithm algorithm = new BasicExtractionAlgorithm();
            List<Table> found = algorithm.Extract(firstPage);

            StringBuilder csv = new StringBuilder();
            new CSVWriter().Write(csv, found[0]);

            // Line endings are normalised to "\n" before comparing.
            string actualCsv = csv.ToString().Replace("\r\n", "\n");
            Assert.Equal(expectedCsv, actualCsv);
        }