Пример #1
0
        // Extracts the table inside a fixed rectangle of page 1 of puertos1.pdf with
        // SpreadsheetExtractionAlgorithm, asserts that it has 15 rows and renders it to CSV.
        // The record-by-record comparison against `expected` is still pending (see the TODO below).
        public void TestExtractSpreadsheetWithinAnArea()
        {
            PageArea page = UtilsForTesting.GetAreaFromPage("Resources/puertos1.pdf", 1, new PdfRectangle(30.32142857142857, 793 - 554.8821428571429, 546.7964285714286, 793 - 273.9035714285714)); // 273.9035714285714f, 30.32142857142857f, 554.8821428571429f, 546.7964285714286f);
            SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
            List<Table> tables = se.Extract(page);
            Table table = tables[0];

            Assert.Equal(15, table.Rows.Count);

            // Reference CSV for the extracted area; kept for the pending comparison described below.
            const string expected = "\"\",TM,M.U$S,TM,M.U$S,TM,M.U$S,TM,M.U$S,TM,M.U$S,TM,M.U$S,TM\n" +
                                    "Peces vivos,1,25,1,23,2,38,1,37,2,67,2,89,1\n" +
                                    "\"Pescado fresco\n" +
                                    "o refrigerado.\n" +
                                    "exc. filetes\",7.704,7.175,8.931,6.892,12.635,10.255,16.742,13.688,14.357,11.674,13.035,13.429,9.727\n" +
                                    "\"Pescado congelado\n" +
                                    "exc. filetes\",90.560,105.950,112.645,108.416,132.895,115.874,152.767,133.765,148.882,134.847,156.619,165.134,137.179\n" +
                                    "\"Filetes y demás car-\n" +
                                    "nes de pescado\",105.434,200.563,151.142,218.389,152.174,227.780,178.123,291.863,169.422,313.735,176.427,381.640,144.814\n" +
                                    "\"Pescado sec./sal./\n" +
                                    "en salm. har./pol./\n" +
                                    "pell. aptos\n" +
                                    "p/c humano\",6.837,14.493,6.660,9.167,14.630,17.579,18.150,21.302,18.197,25.739,13.460,23.549,11.709\n" +
                                    "Crustáceos,61.691,375.798,52.488,251.043,47.635,387.783,27.815,217.443,7.123,86.019,39.488,373.583,45.191\n" +
                                    "Moluscos,162.027,174.507,109.436,111.443,90.834,104.741,57.695,109.141,98.182,206.304,187.023,251.352,157.531\n" +
                                    "\"Prod. no exp. en\n" +
                                    "otros capítulos.\n" +
                                    "No apto p/c humano\",203,328,7,35,521,343,\"1,710\",\"1,568\",125,246,124,263,131\n" +
                                    "\"Grasas y aceites de\n" +
                                    "pescado y mamíferos\n" +
                                    "marinos\",913,297,\"1,250\",476,\"1,031\",521,\"1,019\",642,690,483,489,710,959\n" +
                                    "\"Extractos y jugos de\n" +
                                    "pescado y mariscos\",5,25,1,3,4,4,31,93,39,117,77,230,80\n" +
                                    "\"Preparaciones y con-\n" +
                                    "servas de pescado\",846,\"3,737\",\"1,688\",\"4,411\",\"1,556\",\"3,681\",\"2,292\",\"5,474\",\"2,167\",\"7,494\",\"2,591\",\"8,833\",\"2,795\"\n" +
                                    "\"Preparaciones y con-\n" +
                                    "servas de mariscos\",348,\"3,667\",345,\"1,771\",738,\"3,627\",561,\"2,620\",607,\"3,928\",314,\"2,819\",250\n" +
                                    "\"Harina, polvo y pe-\n" +
                                    "llets de pescado.No\n" +
                                    "aptos p/c humano\",\"16,947\",\"8,547\",\"11,867\",\"6,315\",\"32,528\",\"13,985\",\"37,313\",\"18,989\",\"35,787\",\"19,914\",\"37,821\",\"27,174\",\"30,000\"\n" +
                                    "TOTAL,\"453,515\",\"895,111\",\"456,431\",\"718,382\",\"487,183\",\"886,211\",\"494,220\",\"816,623\",\"495,580\",\"810,565\",\"627,469\",\"1,248,804\",\"540,367\"\n";

            // TODO add better assertions
            StringBuilder sb = new StringBuilder();

            (new CSVWriter()).Write(sb, table);
            string result = sb.ToString();

            //List<CSVRecord> parsedExpected = org.apache.commons.csv.CSVParser.parse(expected, CSVFormat.EXCEL).getRecords();
            //List<CSVRecord> parsedResult = org.apache.commons.csv.CSVParser.parse(result, CSVFormat.EXCEL).getRecords();

            // Hand the CSV text to the reader as-is. Encoding it as ASCII first would replace the
            // accented characters in the data (á, é, í) with '?' before any comparison is made.
            using (var csv = new CsvReader(new StringReader(result), CultureInfo.InvariantCulture))
            {
                /*
                 * Assert.Equal(parsedResult.Count, parsedExpected.Count);
                 * for (int i = 0; i < parsedResult.Count; i++)
                 * {
                 *  Assert.Equal(parsedResult[i].size(), parsedExpected[i].size());
                 * }
                 */
            }
        }
Пример #2
0
        // Checks that BasicExtractionAlgorithm turns the selected area of page 1 of EU_002_PDF into
        // the rows and cells listed in EU_002_EXPECTED. The comparison only runs on Windows.
        public void TestExtractColumnsCorrectly()
        {
            if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) // && !RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
            {
                // fails on linux and mac os. Linked to PdfPig not finding the correct font.
                // need to use apt-get -y install ttf-mscorefonts-installer
                // still have mscorefonts - eula license could not be presented
                return;
            }

            var area = new PdfRectangle(70.0, 725 - (233 - 115), 510.0, 725);
            PageArea page = UtilsForTesting.GetAreaFromPage(EU_002_PDF, 1, area);
            var extractor = new BasicExtractionAlgorithm();
            Table firstTable = extractor.Extract(page)[0];

            var actualRows = UtilsForTesting.TableToArrayOfRows(firstTable);
            Assert.Equal(EU_002_EXPECTED.Length, actualRows.Length);

            // Compare row by row, then cell by cell.
            for (int row = 0; row < EU_002_EXPECTED.Length; row++)
            {
                var expectedCells = EU_002_EXPECTED[row];
                var actualCells = actualRows[row];
                Assert.Equal(expectedCells.Length, actualCells.Length);

                for (int col = 0; col < expectedCells.Length; col++)
                {
                    Assert.Equal(expectedCells[col], actualCells[col]);
                }
            }
        }
Пример #3
0
        // Runs SpreadsheetExtractionAlgorithm over the selected area of page 1 of offense.pdf and
        // expects exactly one table to come back.
        public void TestShouldDetectASingleSpreadsheet()
        {
            // 68.08f, 16.44f, 680.85f, 597.84f
            var area = new PdfRectangle(16.44, 792 - 680.85, 597.84, 792 - 16.44);
            PageArea page = UtilsForTesting.GetAreaFromPage("Resources/offense.pdf", 1, area);

            var extractor = new SpreadsheetExtractionAlgorithm();
            List<Table> tables = extractor.Extract(page);

            Assert.Single(tables);
        }
Пример #4
0
        // Extracts the first table from a fixed area of page 1 of indictb1h_14.pdf with
        // BasicExtractionAlgorithm and expects its rows to equal EXPECTED_EMPTY_TABLE.
        public void TestEmptyRegion()
        {
            var area = new PdfRectangle(0, 700, 100.9, 800);
            PageArea page = UtilsForTesting.GetAreaFromPage("Resources/indictb1h_14.pdf", 1, area);

            var extractor = new BasicExtractionAlgorithm();
            Table firstTable = extractor.Extract(page)[0];
            var rows = UtilsForTesting.TableToArrayOfRows(firstTable);

            Assert.Equal(EXPECTED_EMPTY_TABLE, rows);
        }
Пример #5
0
        // Checks that SpreadsheetExtractionAlgorithm reports the selected area of
        // spreadsheet_no_bounding_frame.pdf as tabular, and that the first extracted table,
        // written out as CSV, equals the stored reference CSV.
        public void TestSpreadsheetWithNoBoundingFrameShouldBeSpreadsheet()
        {
            // 150.56f, 58.9f, 654.7f, 536.12f
            var area = new PdfRectangle(58.9, 842 - 654.7, 536.12, 842 - 150.56);
            PageArea page = UtilsForTesting.GetAreaFromPage("Resources/spreadsheet_no_bounding_frame.pdf", 1, area);
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/spreadsheet_no_bounding_frame.csv");

            var extractor = new SpreadsheetExtractionAlgorithm();
            bool looksTabular = extractor.IsTabular(page);
            Assert.True(looksTabular);

            List<Table> tables = extractor.Extract(page);
            var csvText = new StringBuilder();
            new CSVWriter().Write(csvText, tables[0]);

            Assert.Equal(expectedCsv, csvText.ToString());
        }
Пример #6
0
        // Extracts the first table from a fixed area of page 2 of us-020.pdf with
        // BasicExtractionAlgorithm, writes it as CSV and compares the text, with line endings
        // normalised and surrounding whitespace trimmed, to the stored us-020.csv.
        public void TestTableWithMultilineHeader()
        {
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/us-020.csv");
            var area = new PdfRectangle(35.0, 151, 560, 688.5);
            PageArea page = UtilsForTesting.GetAreaFromPage("Resources/us-020.pdf", 2, area);
            var extractor = new BasicExtractionAlgorithm();
            Table firstTable = extractor.Extract(page)[0];

            using (var buffer = new MemoryStream())
            using (var writer = new StreamWriter(buffer) { AutoFlush = true })
            {
                new CSVWriter().Write(writer, firstTable);

                // AutoFlush is on, so the written text is already in the buffer; rewind to read it back.
                buffer.Position = 0;
                var reader = new StreamReader(buffer);
                string written = reader.ReadToEnd();
                string actualCsv = written.Replace("\r\n", "\n").Trim(); // trim to remove last new line

                Assert.Equal(expectedCsv, actualCsv);
            }
        }
Пример #7
0
        // Extracts the first table from a fixed area of page 1 of indictb1h_14.pdf with
        // BasicExtractionAlgorithm, writes it as CSV and compares the text, with line endings
        // normalised and surrounding whitespace trimmed, to the stored indictb1h_14.csv.
        public void TestRealLifeRTL2()
        {
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/indictb1h_14.csv");
            var area = new PdfRectangle(120.0, 842 - 622.82, 459.9, 842 - 120.0);
            PageArea page = UtilsForTesting.GetAreaFromPage("Resources/indictb1h_14.pdf", 1, area);
            var extractor = new BasicExtractionAlgorithm();
            Table firstTable = extractor.Extract(page)[0];

            using (var buffer = new MemoryStream())
            using (var writer = new StreamWriter(buffer) { AutoFlush = true })
            {
                new CSVWriter().Write(writer, firstTable);

                // AutoFlush is on, so the written text is already in the buffer; rewind to read it back.
                buffer.Position = 0;
                var reader = new StreamReader(buffer);
                string written = reader.ReadToEnd();
                string actualCsv = written.Replace("\r\n", "\n").Trim(); // trim to remove last new line

                Assert.Equal(expectedCsv, actualCsv);
            }
        }