Esempio n. 1
0
        // Serializes the table returned by GetTable() through a StreamWriter-backed CSVWriter and
        // checks both the first output line and the whole output against the expected values.
        public void TestCSVWriter()
        {
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/argentina_diputados_voting_record.csv");
            Table  table       = this.GetTable();

            using (var stream = new MemoryStream())
            using (var writer = new StreamWriter(stream) { AutoFlush = true })
            {
                new CSVWriter().Write(writer, table);

                // AutoFlush has already pushed every write into the stream, so it can be
                // rewound and read back while the writer is still open.
                var reader = new StreamReader(stream);
                stream.Position = 0;
                string s = reader.ReadToEnd().Trim(); // trim to remove last new line

                // Normalize CRLF to LF once and split on LF. This accepts both line-ending
                // styles (the "\r?\n" pattern noted in the original port), so the first-line
                // check no longer depends on which newline sequence the writer emits.
                string   normalized = s.Replace("\r\n", "\n");
                string[] lines      = normalized.Split('\n');

                Assert.Equal(EXPECTED_CSV_WRITER_OUTPUT, lines[0]);
                Assert.Equal(expectedCsv, normalized);
            }
        }
Esempio n. 2
0
        // Extracts the first table that SpreadsheetExtractionAlgorithm finds in
        // frx_2012_disclosure.pdf, serializes it with CSVWriter into a StringBuilder and
        // compares the text with the expected CSV.
        public void TestCSVMultilineRow()
        {
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/frx_2012_disclosure.csv");

            // The area is built from NaN coordinates; the values kept from the original
            // trailing comment are 53.0f, 49.0f, 735.0f, 550.0f.
            var area = new PdfRectangle(double.NaN, double.NaN, double.NaN, double.NaN);
            PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/frx_2012_disclosure.pdf", area);

            var extractor = new SpreadsheetExtractionAlgorithm();
            Table table = extractor.Extract(page)[0];

            var csv = new StringBuilder();
            new CSVWriter().Write(csv, table);

            Assert.Equal(expectedCsv, csv.ToString());
        }
Esempio n. 3
0
        // Writes the list of tables returned by GetTables() as CSV through a StreamWriter and
        // compares the trimmed output with the contents of twotables.csv.
        public void TestCSVSerializeTwoTables()
        {
            string       expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/twotables.csv");
            List <Table> tables      = this.GetTables();

            using (MemoryStream memory = new MemoryStream())
            using (StreamWriter writer = new StreamWriter(memory) { AutoFlush = true })
            {
                new CSVWriter().Write(writer, tables);

                // Rewind the stream so the freshly written CSV can be read back.
                StreamReader reader = new StreamReader(memory);
                memory.Position = 0;
                string actual = reader.ReadToEnd().Trim(); // trim to remove last new line

                Assert.Equal(expectedCsv, actual);
            }
        }
Esempio n. 4
0
        // Extracts the first spreadsheet table from an area of schools.pdf, writes it as CSV
        // into a StringBuilder and compares it (CRLF converted to LF) with the trimmed
        // contents of schools.csv.
        public void TestCSVSerializeInfinity()
        {
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/schools.csv");

            // Source coordinates (top, left, bottom, right): 53.74f, 16.97f, 548.74f, 762.3f
            // page height = 612
            const double pageHeight = 612;

            // 1 is subtracted from the last coordinate because an empty line is added at the top otherwise.
            var area = new PdfRectangle(16.97, pageHeight - 548.74, 762.3, pageHeight - 53.74 - 1);
            PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/schools.pdf", area);

            var extractor = new SpreadsheetExtractionAlgorithm();
            Table table = extractor.Extract(page)[0];

            var csv = new StringBuilder();
            new CSVWriter().Write(csv, table);
            string actual = csv.ToString().Replace("\r\n", "\n");

            Assert.Equal(expectedCsv.Trim(), actual);
        }
Esempio n. 5
0
        // Runs SpreadsheetExtractionAlgorithm on page 1 of spanning_cells.pdf, expects exactly
        // two tables, and compares their combined CSV output with spanning_cells.csv.
        public void TestSpanningCellsToCsv()
        {
            PageArea page        = UtilsForTesting.GetPage("Resources/spanning_cells.pdf", 1);
            string   expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/spanning_cells.csv");

            var extractor = new SpreadsheetExtractionAlgorithm();
            List <Table> tables = extractor.Extract(page);
            Assert.Equal(2, tables.Count);

            var csv = new StringBuilder();
            new CSVWriter().Write(csv, tables);

            // CRLF is converted to LF and surrounding whitespace trimmed before comparing.
            string actual = csv.ToString().Replace("\r\n", "\n").Trim();
            Assert.Equal(expectedCsv, actual);
        }
Esempio n. 6
0
        // Runs BasicExtractionAlgorithm on page 1 of data.pdf and compares the CSV of the first
        // extracted table (CRLF converted to LF) with data_stream_noguess.csv.
        public void StreamNoGuess1()
        {
            PageArea page = UtilsForTesting.GetPage("Resources/data.pdf", 1);

            // data_stream_noguess.csv was modified for decimal precision
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/data_stream_noguess.csv");

            var extractor = new BasicExtractionAlgorithm();
            List <Table> tables = extractor.Extract(page);

            var csv = new StringBuilder();
            new CSVWriter().Write(csv, tables[0]);
            string actual = csv.ToString().Replace("\r\n", "\n");

            Assert.Equal(expectedCsv, actual);
        }
Esempio n. 7
0
        // Checks that SpreadsheetExtractionAlgorithm reports the selected area of
        // spreadsheet_no_bounding_frame.pdf as tabular, then compares the CSV of the first
        // extracted table with spreadsheet_no_bounding_frame.csv.
        public void TestSpreadsheetWithNoBoundingFrameShouldBeSpreadsheet()
        {
            // Values kept from the original trailing comment: 150.56f, 58.9f, 654.7f, 536.12f
            var area = new PdfRectangle(58.9, 842 - 654.7, 536.12, 842 - 150.56);
            PageArea page        = UtilsForTesting.GetAreaFromPage("Resources/spreadsheet_no_bounding_frame.pdf", 1, area);
            string   expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/spreadsheet_no_bounding_frame.csv");

            var extractor = new SpreadsheetExtractionAlgorithm();
            Assert.True(extractor.IsTabular(page));

            List <Table> tables = extractor.Extract(page);

            var csv = new StringBuilder();
            new CSVWriter().Write(csv, tables[0]);

            Assert.Equal(expectedCsv, csv.ToString());
        }
Esempio n. 8
0
        // Extracts tables from page 1 of data.pdf with BasicExtractionAlgorithm and compares the
        // first table's CSV (CRLF converted to LF) with data_stream_noguess.csv.
        // Reference call noted in the original: tabula.read_pdf(pdf_path, stream=True, guess=False)
        public void StreamNoGuess1()
        {
            PageArea page        = UtilsForTesting.GetPage("Resources/data.pdf", 1);
            string   expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/data_stream_noguess.csv");

            BasicExtractionAlgorithm algorithm = new BasicExtractionAlgorithm();
            List <Table> extracted = algorithm.Extract(page);

            StringBuilder output = new StringBuilder();
            CSVWriter writer = new CSVWriter();
            writer.Write(output, extracted[0]);

            Assert.Equal(expectedCsv, output.ToString().Replace("\r\n", "\n"));
        }
Esempio n. 9
0
        // Runs SpreadsheetExtractionAlgorithm on page 1 of data.pdf, expects a single table, and
        // compares its CSV (CRLF converted to LF) with data_lattice.csv.
        public void Latice1()
        {
            PageArea page = UtilsForTesting.GetPage("Resources/data.pdf", 1);

            // data_lattice.csv was modified to add the last row, missing in tabula_py
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/data_lattice.csv");

            var extractor = new SpreadsheetExtractionAlgorithm();
            List <Table> tables = extractor.Extract(page);
            Assert.Single(tables);

            var csv = new StringBuilder();
            new CSVWriter().Write(csv, tables[0]);
            string actual = csv.ToString().Replace("\r\n", "\n");

            Assert.Equal(expectedCsv, actual);
        }
Esempio n. 10
0
        // Extracts the first table from an area on page 2 of us-020.pdf with
        // BasicExtractionAlgorithm, writes it as CSV through a StreamWriter and compares the
        // normalized output with us-020.csv.
        public void TestTableWithMultilineHeader()
        {
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/us-020.csv");

            var area = new PdfRectangle(35.0, 151, 560, 688.5);
            PageArea page = UtilsForTesting.GetAreaFromPage("Resources/us-020.pdf", 2, area);

            var extractor = new BasicExtractionAlgorithm();
            Table table = extractor.Extract(page)[0];

            using (MemoryStream memory = new MemoryStream())
            using (StreamWriter writer = new StreamWriter(memory) { AutoFlush = true })
            {
                new CSVWriter().Write(writer, table);

                // Rewind and read back what was written; CRLF becomes LF and the result is
                // trimmed to remove the last new line.
                StreamReader reader = new StreamReader(memory);
                memory.Position = 0;
                string written = reader.ReadToEnd();
                string actual  = written.Replace("\r\n", "\n").Trim();

                Assert.Equal(expectedCsv, actual);
            }
        }
Esempio n. 11
0
        // Extracts the first table from an area on page 1 of indictb1h_14.pdf with
        // BasicExtractionAlgorithm, writes it as CSV through a StreamWriter and compares the
        // normalized output with indictb1h_14.csv.
        public void TestRealLifeRTL2()
        {
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/indictb1h_14.csv");

            var area = new PdfRectangle(120.0, 842 - 622.82, 459.9, 842 - 120.0);
            PageArea page = UtilsForTesting.GetAreaFromPage("Resources/indictb1h_14.pdf", 1, area);

            BasicExtractionAlgorithm algorithm = new BasicExtractionAlgorithm();
            Table table = algorithm.Extract(page)[0];

            using (MemoryStream buffer = new MemoryStream())
            using (StreamWriter output = new StreamWriter(buffer) { AutoFlush = true })
            {
                CSVWriter csvWriter = new CSVWriter();
                csvWriter.Write(output, table);

                StreamReader input = new StreamReader(buffer);
                buffer.Position = 0;

                // CRLF becomes LF, then the text is trimmed to remove the last new line.
                string actual = input.ReadToEnd().Replace("\r\n", "\n").Trim();

                Assert.Equal(expectedCsv, actual);
            }
        }
Esempio n. 12
0
        // Runs SpreadsheetExtractionAlgorithm on an area of the first page of the
        // "Publication of award of Bids" PDF, expects a single table, and compares its CSV with
        // the expected file after converting every line ending on both sides to a bare '\r'.
        public void TestSpreadsheetExtractionIssue656()
        {
            // page height = 482, width 762.3 // 612
            // Source coordinates (top, left, bottom, right): 56.925f, 24.255f, 549.945f, 786.555f
            var area = new PdfRectangle(24.255, 71, 786.555, 553);
            PageArea page        = UtilsForTesting.GetAreaFromFirstPage("Resources/Publication_of_award_of_Bids_for_Transport_Sector__August_2016.pdf", area);
            string   expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/Publication_of_award_of_Bids_for_Transport_Sector__August_2016.csv");

            var extractor = new SpreadsheetExtractionAlgorithm();
            List <Table> tables = extractor.Extract(page);
            Assert.Single(tables);

            var csv = new StringBuilder();
            new CSVWriter().Write(csv, tables[0]);

            // Expected: LF -> CR. Actual: CRLF -> LF -> CR, then trimmed.
            string expected = expectedCsv.Replace("\n", "\r");
            string actual   = csv.ToString().Replace("\r\n", "\n").Replace("\n", "\r").Trim();

            Assert.Equal(expected, actual);
        }