Beispiel #1
0
        public void TestCSVMultilineRow()
        {
            // Reference output the generated CSV must match exactly.
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/frx_2012_disclosure.csv");

            // The area is built from NaN coordinates; the values noted next to the
            // original call were 53.0f, 49.0f, 735.0f, 550.0f.
            // NOTE(review): presumably NaN bounds mean "no explicit crop" - confirm in UtilsForTesting.
            var area = new PdfRectangle(double.NaN, double.NaN, double.NaN, double.NaN);
            PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/frx_2012_disclosure.pdf", area);

            // Only the first table found by the spreadsheet algorithm is checked.
            var extractor = new SpreadsheetExtractionAlgorithm();
            var tables = extractor.Extract(page);
            Table firstTable = tables[0];

            // Serialize into an in-memory buffer and compare the raw text.
            var csvBuffer = new StringBuilder();
            var writer = new CSVWriter();
            writer.Write(csvBuffer, firstTable);
            string actualCsv = csvBuffer.ToString();

            Assert.Equal(expectedCsv, actualCsv);
        }
Beispiel #2
0
        /// <summary>
        /// Runs the basic extraction algorithm on a fixed area of the first page of
        /// the Argentina voting-record PDF and returns the first table it yields.
        /// </summary>
        /// <returns>The first table produced by <c>BasicExtractionAlgorithm.Extract</c>.</returns>
        private Table GetTable()
        {
            // Coordinates noted next to the original call: 269.875f, 12.75f, 790.5f, 561f.
            var area = new PdfRectangle(12.75, 55.0, 561, 567);
            PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/argentina_diputados_voting_record.pdf", area);

            var extractor = new BasicExtractionAlgorithm();
            var tables = extractor.Extract(page);

            return tables[0];
        }
Beispiel #3
0
        public void TestCSVSerializeInfinity()
        {
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/schools.csv");

            // Source area as (top, left, bottom, right) = (53.74f, 16.97f, 548.74f, 762.3f),
            // with a page height of 612. The rectangle below is built as
            // (left, pageHeight - bottom, right, pageHeight - top - 1); the extra 1 is removed
            // from the top edge because otherwise an empty line is added at the top.
            const double pageHeight = 612;
            var area = new PdfRectangle(16.97, pageHeight - 548.74, 762.3, pageHeight - 53.74 - 1);
            PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/schools.pdf", area);

            // Only the first table found by the spreadsheet algorithm is checked.
            var extractor = new SpreadsheetExtractionAlgorithm();
            var tables = extractor.Extract(page);
            Table firstTable = tables[0];

            var csvBuffer = new StringBuilder();
            var writer = new CSVWriter();
            writer.Write(csvBuffer, firstTable);
            string actualCsv = csvBuffer.ToString();

            // The expected text is trimmed and CRLF in the output is folded to LF before comparing.
            string normalizedActual = actualCsv.Replace("\r\n", "\n");
            Assert.Equal(expectedCsv.Trim(), normalizedActual);
        }
        public void TestRemoveSequentialSpaces()
        {
            // Narrow strip of the first page of m27.pdf; the height is written as the
            // difference 103.04 - 79.2 measured down from y = 532.
            var area = new PdfRectangle(28.28, 532 - (103.04 - 79.2), 732.6, 532);
            PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/m27.pdf", area);

            var extractor = new BasicExtractionAlgorithm();
            var tables = extractor.Extract(page);
            Table firstTable = tables[0];
            var topRow = firstTable.Rows[0];

            // Second and third cells of the first row must carry these exact texts.
            Assert.Equal("ALLEGIANT AIR", topRow[1].GetText());
            Assert.Equal("ALLEGIANT AIR LLC", topRow[2].GetText());
        }
Beispiel #5
0
        public void TestExtractColumnsCorrectly3()
        {
            // Source area as (top, left, bottom, right) = (106.01f, 48.09f, 227.31f, 551.89f).
            //   bottom = 792 - 227.31 = 564.69
            //   top    = 792 - 106.01 = 685.99
            // The top edge is passed as 684.99 instead of 685.99 because the larger value
            // was adding an empty row at the top.
            var area = new PdfRectangle(48.09, 564.69, 551.89, 684.99);
            PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/frx_2012_disclosure.pdf", area);

            var extractor = new SpreadsheetExtractionAlgorithm();
            var tables = extractor.Extract(page);
            Table firstTable = tables[0];

            // The checked cell (row index 8, column index 1) contains a '\r' between its two lines.
            string cellText = firstTable.Rows[8][1].GetText();
            Assert.Equal("REGIONAL PULMONARY & SLEEP\rMEDICINE", cellText);
        }
        public void TestCheckSqueezeDoesntBreak()
        {
            var area = new PdfRectangle(17.25, 342, 410.25, 560.5);
            PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/12s0324.pdf", area);

            var extractor = new BasicExtractionAlgorithm();
            var tables = extractor.Extract(page);
            Table firstTable = tables[0];
            var allRows = firstTable.Rows;

            // Read both corner cells (top-left and bottom-right) before asserting on either.
            var topRow = allRows[0];
            var topLeftText = topRow[0].GetText();
            var bottomRow = allRows[allRows.Count - 1];
            var bottomRightText = bottomRow[bottomRow.Count - 1].GetText();

            Assert.Equal("Violent crime  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .", topLeftText);
            Assert.Equal("(X)", bottomRightText);
        }
Beispiel #7
0
        public void TestJSONSerializeInfinity()
        {
            string expectedJson = UtilsForTesting.LoadJson("Resources/json/schools.json");

            // The area is built from NaN coordinates; the values noted next to the
            // original call were 53.74f, 16.97f, 548.74f, 762.3f.
            // NOTE(review): presumably NaN bounds mean "no explicit crop" - confirm in UtilsForTesting.
            var area = new PdfRectangle(double.NaN, double.NaN, double.NaN, double.NaN);
            PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/schools.pdf", area);

            // Only the first table found by the spreadsheet algorithm is checked.
            var extractor = new SpreadsheetExtractionAlgorithm();
            var tables = extractor.Extract(page);
            Table firstTable = tables[0];

            // Serialize to JSON in memory and compare the raw text with the reference file.
            var jsonBuffer = new StringBuilder();
            var writer = new JSONWriter();
            writer.Write(jsonBuffer, firstTable);
            string actualJson = jsonBuffer.ToString();

            Assert.Equal(expectedJson, actualJson);
        }
        public void TestVerticalRulingsPreventMergingOfColumns()
        {
            // One vertical ruling per x position, each spanning y = 40.43 to y = 755.
            double[] rulingXPositions = { 147, 256, 310, 375, 431, 504 };
            List <Ruling> verticalRulings = new List <Ruling>();
            foreach (double x in rulingXPositions)
            {
                var from = new PdfPoint(x, 40.43);
                var to = new PdfPoint(x, 755);
                verticalRulings.Add(new Ruling(from, to));
            }

            var area = new PdfRectangle(40.43, 755 - (398.76 - 255.57), 557.35, 755);
            PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/campaign_donors.pdf", area);

            // The rulings are handed to the basic algorithm through its constructor.
            var extractor = new BasicExtractionAlgorithm(verticalRulings);
            var tables = extractor.Extract(page);
            Table firstTable = tables[0];
            var rowSix = firstTable.Rows[5];

            // The first two cells of the sixth row must keep these separate texts.
            Assert.Equal("VALSANGIACOMO BLANC", rowSix[0].GetText());
            Assert.Equal("OFERNANDO JORGE", rowSix[1].GetText());
        }
        public void TestExtractColumnsCorrectly3()
        {
            var area = new PdfRectangle(48.09, 563, 551.89, 685.5);
            PageArea page = UtilsForTesting.GetAreaFromFirstPage(FRX_2012_DISCLOSURE_PDF, area);

            var extractor = new BasicExtractionAlgorithm();
            var tables = extractor.Extract(page);
            Table firstTable = tables[0];
            var actualRows = UtilsForTesting.TableToArrayOfRows(firstTable);

            // Compare shape first, then every cell, so a failure points at one row/column.
            Assert.Equal(FRX_2012_DISCLOSURE_EXPECTED.Length, actualRows.Length);
            for (int rowIndex = 0; rowIndex < FRX_2012_DISCLOSURE_EXPECTED.Length; rowIndex++)
            {
                var expectedCells = FRX_2012_DISCLOSURE_EXPECTED[rowIndex];
                var actualCells   = actualRows[rowIndex];

                Assert.Equal(expectedCells.Length, actualCells.Length);
                for (int columnIndex = 0; columnIndex < expectedCells.Length; columnIndex++)
                {
                    Assert.Equal(expectedCells[columnIndex], actualCells[columnIndex]);
                }
            }
        }
        public void TestColumnRecognition()
        {
            var area = new PdfRectangle(12.75, 55, 557, 567);
            PageArea page = UtilsForTesting.GetAreaFromFirstPage(ARGENTINA_DIPUTADOS_VOTING_RECORD_PDF, area);

            var extractor = new BasicExtractionAlgorithm();
            var tables = extractor.Extract(page);
            Table firstTable = tables[0];
            var actualRows = UtilsForTesting.TableToArrayOfRows(firstTable);

            // Row count must match before any per-cell comparison is attempted.
            Assert.Equal(ARGENTINA_DIPUTADOS_VOTING_RECORD_EXPECTED.Length, actualRows.Length);

            for (int rowIndex = 0; rowIndex < ARGENTINA_DIPUTADOS_VOTING_RECORD_EXPECTED.Length; rowIndex++)
            {
                var expectedCells = ARGENTINA_DIPUTADOS_VOTING_RECORD_EXPECTED[rowIndex];
                var actualCells   = actualRows[rowIndex];

                // Each row must have the expected width, then match cell by cell.
                Assert.Equal(expectedCells.Length, actualCells.Length);
                for (int columnIndex = 0; columnIndex < expectedCells.Length; columnIndex++)
                {
                    Assert.Equal(expectedCells[columnIndex], actualCells[columnIndex]);
                }
            }
        }
Beispiel #11
0
        public void TestSpreadsheetExtractionIssue656()
        {
            // Notes kept from the original call site:
            //   page height = 482, width 762.3 // 612
            //   (top, left, bottom, right) = (56.925f, 24.255f, 549.945f, 786.555f)
            var area = new PdfRectangle(24.255, 71, 786.555, 553);
            PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/Publication_of_award_of_Bids_for_Transport_Sector__August_2016.pdf", area);
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/Publication_of_award_of_Bids_for_Transport_Sector__August_2016.csv");

            var extractor = new SpreadsheetExtractionAlgorithm();
            List <Table> tables = extractor.Extract(page);

            // Exactly one table is expected for this area.
            Assert.Single(tables);
            Table onlyTable = tables[0];

            var csvBuffer = new StringBuilder();
            var writer = new CSVWriter();
            writer.Write(csvBuffer, onlyTable);
            string rawCsv = csvBuffer.ToString();

            // Both sides are reduced to '\r' line breaks before comparing: the expected text
            // has '\n' replaced by '\r'; the output has "\r\n" folded to '\n', then '\n'
            // replaced by '\r', and is finally trimmed.
            // NOTE(review): the original author asked whether there is an issue with \r and \n here.
            string expectedNormalized = expectedCsv.Replace("\n", "\r");
            string actualNormalized   = rawCsv.Replace("\r\n", "\n").Replace("\n", "\r").Trim();

            Assert.Equal(expectedNormalized, actualNormalized);
        }