예제 #1
0
        /// <summary>
        /// Loads page 1 of the two-table test document and runs spreadsheet extraction on it.
        /// </summary>
        private List<Table> GetTables()
        {
            PageArea firstPage = UtilsForTesting.GetPage("Resources/twotables.pdf", 1);
            var algorithm = new SpreadsheetExtractionAlgorithm();
            return algorithm.Extract(firstPage);
        }
예제 #2
0
        /// <summary>
        /// A table whose rows contain multi-line cells must serialize to CSV exactly as the
        /// stored reference file.
        /// </summary>
        public void TestCSVMultilineRow()
        {
            string expected = UtilsForTesting.LoadCsv("Resources/csv/frx_2012_disclosure.csv");

            // NaN bounds mean "use the whole first page" (original area: 53.0f, 49.0f, 735.0f, 550.0f).
            PageArea area = UtilsForTesting.GetAreaFromFirstPage("Resources/frx_2012_disclosure.pdf", new PdfRectangle(double.NaN, double.NaN, double.NaN, double.NaN));

            var algorithm = new SpreadsheetExtractionAlgorithm();
            Table firstTable = algorithm.Extract(area)[0];

            var output = new StringBuilder();
            new CSVWriter().Write(output, firstTable);

            Assert.Equal(expected, output.ToString());
        }
예제 #3
0
        /// <summary>
        /// Extraction of a right-to-left (Arabic) table. The expected strings pin the
        /// extractor's CURRENT output, which is known to be imperfect in places (see the
        /// trailing comments), not the theoretically correct text.
        /// </summary>
        public void TestRTL()
        {
            PageArea page = UtilsForTesting.GetPage("Resources/arabic.pdf", 1);
            SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
            List <Table> tables = sea.Extract(page);
            // Assert.Equal(1, tables.size());
            Table table = tables[0];

            var rows = table.Rows;

            // Spot-check individual cells (row, column) against the current extractor output.
            Assert.Equal("اسمي سلطان", rows[1][1].GetText());
            Assert.Equal("من اين انت؟", rows[2][1].GetText());
            Assert.Equal("1234", rows[3][0].GetText());
            Assert.Equal("هل انت شباك؟", rows[4][0].GetText());
            Assert.Equal("انا من ولاية كارولينا الشمال", rows[2][0].GetText()); // conjoined lam-alif gets missed
            Assert.Equal("اسمي Jeremy في الانجليزية", rows[4][1].GetText());    // conjoined lam-alif gets missed
            Assert.Equal("عندي 47 قطط", rows[3][1].GetText());                  // the real right answer is 47.
            Assert.Equal("Jeremy is جرمي in Arabic", rows[5][0].GetText());     // the real right answer is 47.
            Assert.Equal("مرحباً", rows[1][0].GetText());                       // really ought to be ً, but this is forgiveable for now

            // There is one remaining problem that is not yet addressed:
            // - diacritics (e.g. Arabic's tanwinً and probably Hebrew nekudot) are put in the wrong place.
            // This should get fixed, but this is a good first stab at the problem.

            // These (commented-out) tests reflect the theoretical correct answer,
            // which is not currently possible because of the problem listed above.
            // Assert.Equal("مرحباً",                       table.getRows()[0][0].getText()); // really ought to be ً, but this is forgiveable for now
        }
예제 #4
0
        /// <summary>
        /// Extracts a spreadsheet restricted to an explicit page area and checks the row count.
        /// NOTE(review): the large <c>expected</c> CSV below is never actually compared against
        /// <c>result</c>, and the trailing <c>using</c> block is empty — the CSV-level
        /// assertions are still TODO (see the commented-out code).
        /// </summary>
        public void TestExtractSpreadsheetWithinAnArea()
        {
            // Area converted to a bottom-left-origin rectangle; original (top, left, bottom, right)
            // was 273.9035714285714f, 30.32142857142857f, 554.8821428571429f, 546.7964285714286f on a 793pt-high page.
            PageArea page = UtilsForTesting.GetAreaFromPage("Resources/puertos1.pdf", 1, new PdfRectangle(30.32142857142857, 793 - 554.8821428571429, 546.7964285714286, 793 - 273.9035714285714));
            SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
            List <Table> tables = se.Extract(page);
            Table        table  = tables[0];

            Assert.Equal(15, table.Rows.Count);

            // Reference CSV for the extracted table (currently unused — see note above).
            const string expected = "\"\",TM,M.U$S,TM,M.U$S,TM,M.U$S,TM,M.U$S,TM,M.U$S,TM,M.U$S,TM\n" +
                                    "Peces vivos,1,25,1,23,2,38,1,37,2,67,2,89,1\n" +
                                    "\"Pescado fresco\n" +
                                    "o refrigerado.\n" +
                                    "exc. filetes\",7.704,7.175,8.931,6.892,12.635,10.255,16.742,13.688,14.357,11.674,13.035,13.429,9.727\n" +
                                    "\"Pescado congelado\n" +
                                    "exc. filetes\",90.560,105.950,112.645,108.416,132.895,115.874,152.767,133.765,148.882,134.847,156.619,165.134,137.179\n" +
                                    "\"Filetes y demás car-\n" +
                                    "nes de pescado\",105.434,200.563,151.142,218.389,152.174,227.780,178.123,291.863,169.422,313.735,176.427,381.640,144.814\n" +
                                    "\"Pescado sec./sal./\n" +
                                    "en salm. har./pol./\n" +
                                    "pell. aptos\n" +
                                    "p/c humano\",6.837,14.493,6.660,9.167,14.630,17.579,18.150,21.302,18.197,25.739,13.460,23.549,11.709\n" +
                                    "Crustáceos,61.691,375.798,52.488,251.043,47.635,387.783,27.815,217.443,7.123,86.019,39.488,373.583,45.191\n" +
                                    "Moluscos,162.027,174.507,109.436,111.443,90.834,104.741,57.695,109.141,98.182,206.304,187.023,251.352,157.531\n" +
                                    "\"Prod. no exp. en\n" +
                                    "otros capítulos.\n" +
                                    "No apto p/c humano\",203,328,7,35,521,343,\"1,710\",\"1,568\",125,246,124,263,131\n" +
                                    "\"Grasas y aceites de\n" +
                                    "pescado y mamíferos\n" +
                                    "marinos\",913,297,\"1,250\",476,\"1,031\",521,\"1,019\",642,690,483,489,710,959\n" +
                                    "\"Extractos y jugos de\n" +
                                    "pescado y mariscos\",5,25,1,3,4,4,31,93,39,117,77,230,80\n" +
                                    "\"Preparaciones y con-\n" +
                                    "servas de pescado\",846,\"3,737\",\"1,688\",\"4,411\",\"1,556\",\"3,681\",\"2,292\",\"5,474\",\"2,167\",\"7,494\",\"2,591\",\"8,833\",\"2,795\"\n" +
                                    "\"Preparaciones y con-\n" +
                                    "servas de mariscos\",348,\"3,667\",345,\"1,771\",738,\"3,627\",561,\"2,620\",607,\"3,928\",314,\"2,819\",250\n" +
                                    "\"Harina, polvo y pe-\n" +
                                    "llets de pescado.No\n" +
                                    "aptos p/c humano\",\"16,947\",\"8,547\",\"11,867\",\"6,315\",\"32,528\",\"13,985\",\"37,313\",\"18,989\",\"35,787\",\"19,914\",\"37,821\",\"27,174\",\"30,000\"\n" +
                                    "TOTAL,\"453,515\",\"895,111\",\"456,431\",\"718,382\",\"487,183\",\"886,211\",\"494,220\",\"816,623\",\"495,580\",\"810,565\",\"627,469\",\"1,248,804\",\"540,367\"\n";

            // TODO add better assertions
            StringBuilder sb = new StringBuilder();

            (new CSVWriter()).Write(sb, tables[0]);
            string result = sb.ToString();

            //List<CSVRecord> parsedExpected = org.apache.commons.csv.CSVParser.parse(expected, CSVFormat.EXCEL).getRecords();
            //List<CSVRecord> parsedResult = org.apache.commons.csv.CSVParser.parse(result, CSVFormat.EXCEL).getRecords();
            // NOTE(review): this using block constructs and disposes a CsvReader but performs
            // no assertions — dead code kept as a placeholder for the commented-out checks below.
            using (var csv = new CsvReader(new StreamReader(new MemoryStream(Encoding.ASCII.GetBytes(result))), CultureInfo.InvariantCulture))
            {
                /*
                 * Assert.Equal(parsedResult.Count, parsedExpected.Count);
                 * for (int i = 0; i < parsedResult.Count; i++)
                 * {
                 *  Assert.Equal(parsedResult[i].size(), parsedExpected[i].size());
                 * }
                 */
            }
        }
예제 #5
0
        /// <summary>
        /// Extraction driven by externally supplied ruling lines must yield exactly one
        /// 18-cell table with the expected text in every cell.
        /// </summary>
        public void TestExtractTableWithExternallyDefinedRulings()
        {
            PageArea page = UtilsForTesting.GetPage("Resources/us-007.pdf", 1);
            var algorithm = new SpreadsheetExtractionAlgorithm();
            List<Table> tables = algorithm.Extract(page, EXTERNALLY_DEFINED_RULINGS.ToList());

            Assert.Single(tables);
            Table table = tables[0];

            Assert.Equal(18, table.Cells.Count);

            // Expected cell text per row: { column 0, column 1 }.
            string[][] expectedRows =
            {
                new[] { "Payroll Period", "One Withholding\rAllowance" },
                new[] { "Weekly", "$71.15" },
                new[] { "Biweekly", "142.31" },
                new[] { "Semimonthly", "154.17" },
                new[] { "Monthly", "308.33" },
                new[] { "Quarterly", "925.00" },
                new[] { "Semiannually", "1,850.00" },
                new[] { "Annually", "3,700.00" },
                new[] { "Daily or Miscellaneous\r(each day of the payroll period)", "14.23" },
            };

            var rows = table.Rows;
            for (int r = 0; r < expectedRows.Length; r++)
            {
                Assert.Equal(expectedRows[r][0], rows[r][0].GetText());
                Assert.Equal(expectedRows[r][1], rows[r][1].GetText());
            }
        }
예제 #6
0
        /// <summary>
        /// CSV serialization of the schools table must match the stored reference file
        /// (line endings normalized).
        /// </summary>
        public void TestCSVSerializeInfinity()
        {
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/schools.csv");

            // Original area (top, left, bottom, right) = 53.74f, 16.97f, 548.74f, 762.3f on a
            // 612pt-high page, converted to a bottom-left-origin rectangle. One point is shaved
            // off the top because the full height added an empty row at the top.
            PageArea area = UtilsForTesting.GetAreaFromFirstPage("Resources/schools.pdf", new PdfRectangle(16.97, 612 - 548.74, 762.3, 612 - 53.74 - 1));

            var algorithm = new SpreadsheetExtractionAlgorithm();
            Table table = algorithm.Extract(area)[0];

            var output = new StringBuilder();
            new CSVWriter().Write(output, table);

            // Normalize Windows line endings before comparing against the stored CSV.
            Assert.Equal(expectedCsv.Trim(), output.ToString().Replace("\r\n", "\n"));
        }
예제 #7
0
        /// <summary>
        /// Real-world right-to-left (Arabic) extraction: election results from mednine.pdf.
        /// The expected strings pin the extractor's current output, which is known to be
        /// imperfect (see the trailing comments), not the theoretically correct text.
        /// </summary>
        public void TestRealLifeRTL()
        {
            PageArea page = UtilsForTesting.GetPage("Resources/mednine.pdf", 1);
            SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
            List <Table> tables = sea.Extract(page);

            Assert.Single(tables);
            Table table = tables[0];
            var   rows  = table.Rows;

            // Header rows, then a full sweep of row 4 (numeric cells plus the Arabic list name).
            Assert.Equal("الانتخابات التشريعية  2014", rows[0][0].GetText()); // the doubled spaces might be a bug in my implementation. // bobld: missing space or wrong word order
            Assert.Equal("ورقة كشف نتائج دائرة مدنين", rows[1][0].GetText());
            Assert.Equal("426", rows[4][0].GetText());
            Assert.Equal("63", rows[4][1].GetText());
            Assert.Equal("43", rows[4][2].GetText());
            Assert.Equal("56", rows[4][3].GetText());
            Assert.Equal("58", rows[4][4].GetText());
            Assert.Equal("49", rows[4][5].GetText());
            Assert.Equal("55", rows[4][6].GetText());
            Assert.Equal("33", rows[4][7].GetText());
            Assert.Equal("32", rows[4][8].GetText());
            Assert.Equal("37", rows[4][9].GetText());
            Assert.Equal("قائمة من أجل تحقيق سلطة الشعب", rows[4][10].GetText());

            // There is one remaining problem that is not yet addressed:
            // - diacritics (e.g. Arabic's tanwinً and probably Hebrew nekudot) are put in the wrong place.
            // This should get fixed, but this is a good first stab at the problem.

            // These (commented-out) tests reflect the theoretical correct answer,
            // which is not currently possible because of the problem listed above.
            //Assert.Equal("مرحباً", rows[0][0].getText()); // really ought to be ً, but this is forgiveable for now
        }
예제 #8
0
        /// <summary>
        /// Smoke test for ICDAR eu-004: with clip paths honored, detect a table region on
        /// page 3 and run spreadsheet extraction on it without throwing.
        /// </summary>
        public void Eu004()
        {
            using (PdfDocument document = PdfDocument.Open("Resources/icdar2013-dataset/competition-dataset-eu/eu-004.pdf", new ParsingOptions()
            {
                ClipPaths = true
            }))
            {
                var extractor = new ObjectExtractor(document);
                PageArea page = extractor.Extract(3);

                var detector = new SimpleNurminenDetectionAlgorithm();
                var regions = detector.Detect(page);

                var firstRegion = page.GetArea(regions[0].BoundingBox);

                // No assertions: the test only verifies this pipeline completes cleanly.
                var algorithm = new SpreadsheetExtractionAlgorithm();
                var tables = algorithm.Extract(firstRegion);
            }
        }
예제 #9
0
        /// <summary>
        /// Extracts both tables from the spanning-cells fixture, serializes them to JSON,
        /// and compares the result against a stored reference — table geometry, then every
        /// non-empty cell's geometry and text.
        /// </summary>
        public void TestSpanningCells()
        {
            PageArea page                     = UtilsForTesting.GetPage("Resources/spanning_cells.pdf", 1);
            string   expectedJson             = UtilsForTesting.LoadJson("Resources/json/spanning_cells.json");
            SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
            List <Table> tables               = se.Extract(page);

            Assert.Equal(2, tables.Count);

            var expectedJObject = (JArray)JsonConvert.DeserializeObject(expectedJson);

            StringBuilder sb = new StringBuilder();

            (new JSONWriter()).Write(sb, tables);
            var actualJObject = (JArray)JsonConvert.DeserializeObject(sb.ToString());

            // The reference JSON stores vertical coordinates in a top-left-origin system,
            // hence the (pageHeight - value) conversion on "top" and "bottom".
            double pageHeight = 842;
            double precision  = 2;

            // True when the floored coordinates agree within `precision` points.
            // Extracted to avoid repeating the same comparison ten times below.
            bool CloseEnough(double expectedValue, double actualValue)
                => Math.Abs(Math.Floor(expectedValue) - Math.Floor(actualValue)) < precision;

            for (int i = 0; i < 2; i++)
            {
                Assert.Equal(expectedJObject[i]["extraction_method"], actualJObject[i]["extraction_method"]);

                // Table-level bounding box.
                Assert.True(CloseEnough(pageHeight - expectedJObject[i]["top"].Value <double>(), actualJObject[i]["top"].Value <double>()));
                Assert.True(CloseEnough(expectedJObject[i]["left"].Value <double>(), actualJObject[i]["left"].Value <double>()));
                Assert.True(CloseEnough(expectedJObject[i]["width"].Value <double>(), actualJObject[i]["width"].Value <double>()));
                Assert.True(CloseEnough(expectedJObject[i]["height"].Value <double>(), actualJObject[i]["height"].Value <double>()));
                Assert.True(CloseEnough(expectedJObject[i]["right"].Value <double>(), actualJObject[i]["right"].Value <double>()));
                Assert.True(CloseEnough(pageHeight - expectedJObject[i]["bottom"].Value <double>(), actualJObject[i]["bottom"].Value <double>()));

                var expectedData = (JArray)expectedJObject[i]["data"];
                var actualData   = (JArray)actualJObject[i]["data"];
                Assert.Equal(expectedData.Count, actualData.Count);

                for (int r = 0; r < expectedData.Count; r++)
                {
                    var rowExpected = (JArray)expectedData[r];
                    var rowActual   = (JArray)actualData[r];
                    Assert.Equal(rowExpected.Count, rowActual.Count);

                    for (int c = 0; c < rowExpected.Count; c++)
                    {
                        var cellExpected = (JObject)rowExpected[c];
                        var cellActual   = (JObject)rowActual[c];

                        if (string.IsNullOrEmpty(cellExpected["text"].Value <string>()))
                        {
                            continue; // empty cells carry no coordinate data in the reference JSON
                        }

                        // Cell-level geometry and text.
                        Assert.True(CloseEnough(pageHeight - cellExpected["top"].Value <double>(), cellActual["top"].Value <double>()));
                        Assert.True(CloseEnough(cellExpected["left"].Value <double>(), cellActual["left"].Value <double>()));
                        Assert.True(CloseEnough(cellExpected["width"].Value <double>(), cellActual["width"].Value <double>()));
                        Assert.True(CloseEnough(cellExpected["height"].Value <double>(), cellActual["height"].Value <double>()));
                        Assert.Equal(cellExpected["text"].Value <string>(), cellActual["text"].Value <string>());
                    }
                }
            }
        }
예제 #10
0
        /// <summary>
        /// china.pdf has tables whose ruling grid is not fully closed; both tables must
        /// still be recovered.
        /// </summary>
        public void TestIncompleteGrid()
        {
            PageArea page = UtilsForTesting.GetPage("Resources/china.pdf", 1);
            var algorithm = new SpreadsheetExtractionAlgorithm();

            Assert.Equal(2, algorithm.Extract(page).Count);
        }
예제 #11
0
        /// <summary>
        /// Two independent single-cell ruling sets must yield exactly two non-overlapping cells.
        /// </summary>
        public void TestDetectTwoSingleCells()
        {
            List<Cell> found = SpreadsheetExtractionAlgorithm.FindCells(
                TWO_SINGLE_CELL_RULINGS[0].ToList(),
                TWO_SINGLE_CELL_RULINGS[1].ToList());

            Assert.Equal(2, found.Count);

            // The two detected cells must not intersect each other.
            Assert.False(found[0].Intersects(found[1]));
        }
예제 #12
0
        /// <summary>
        /// The offense.pdf area must be detected as exactly one spreadsheet.
        /// </summary>
        public void TestShouldDetectASingleSpreadsheet()
        {
            // Converted to a bottom-left-origin rectangle; original was 68.08f, 16.44f, 680.85f, 597.84f.
            PageArea area = UtilsForTesting.GetAreaFromPage("Resources/offense.pdf", 1, new PdfRectangle(16.44, 792 - 680.85, 597.84, 792 - 16.44));
            var algorithm = new SpreadsheetExtractionAlgorithm();

            Assert.Single(algorithm.Extract(area));
        }
예제 #13
0
        /// <summary>
        /// Detects the tables in the page.
        /// </summary>
        /// <param name="page">The page where to detect the tables.</param>
        public List<TableRectangle> Detect(PageArea page)
        {
            // Build candidate cells from the page's ruling lines, then merge them into
            // spreadsheet rectangles.
            var cells = SpreadsheetExtractionAlgorithm.FindCells(page.HorizontalRulings, page.VerticalRulings);
            var spreadsheets = SpreadsheetExtractionAlgorithm.FindSpreadsheetsFromCells(cells.Cast<TableRectangle>().ToList());

            // Callers expect tables ordered from top to bottom on the page.
            Utils.Sort(spreadsheets, new TableRectangle.ILL_DEFINED_ORDER());
            return spreadsheets;
        }
예제 #14
0
        /// <summary>
        /// A second externally-ruled extraction: one table with two spot-checked cells.
        /// </summary>
        public void TestAnotherExtractTableWithExternallyDefinedRulings()
        {
            PageArea page = UtilsForTesting.GetPage("Resources/us-024.pdf", 1);
            var algorithm = new SpreadsheetExtractionAlgorithm();
            List<Table> tables = algorithm.Extract(page, EXTERNALLY_DEFINED_RULINGS2.ToList());

            Assert.Single(tables);

            // Spot-check two cells of the single extracted table.
            var rows = tables[0].Rows;
            Assert.Equal("Total Supply", rows[4][0].GetText());
            Assert.Equal("6.6", rows[6][2].GetText());
        }
예제 #15
0
        /// <summary>
        /// Extracted tables must come back ordered top-to-bottom on the page.
        /// </summary>
        public void TestSpreadsheetsSortedByTopAndRight()
        {
            PageArea page = UtilsForTesting.GetPage("Resources/sydney_disclosure_contract.pdf", 1);

            var algorithm = new SpreadsheetExtractionAlgorithm();
            List<Table> tables = algorithm.Extract(page);

            // Each table's Top must be >= its successor's (top-to-bottom ordering).
            for (int i = 1; i < tables.Count; i++)
            {
                Assert.True(tables[i - 1].Top >= tables[i].Top);
            }
        }
예제 #16
0
        /// <summary>
        /// A single-cell ruling set must yield exactly one cell with the expected geometry.
        /// </summary>
        public void TestDetectSingleCell()
        {
            List<Cell> found = SpreadsheetExtractionAlgorithm.FindCells(
                SINGLE_CELL_RULINGS[0].ToList(),
                SINGLE_CELL_RULINGS[1].ToList());

            Assert.Single(found);

            // Verify the detected cell's geometry (Utils.Feq = float-tolerant equality).
            Cell cell = found[0];
            Assert.True(Utils.Feq(151.65355, cell.Left));
            Assert.True(Utils.Feq(185.6693, cell.Bottom)); // .getTop()
            Assert.True(Utils.Feq(229.08083, cell.Width));
            Assert.True(Utils.Feq(128.97636, cell.Height));
        }
예제 #17
0
        /// <summary>
        /// Columns of a cropped frx_2012_disclosure area must be split correctly; checks a
        /// known multi-line cell.
        /// </summary>
        public void TestExtractColumnsCorrectly3()
        {
            // Original area (top, left, bottom, right) = 106.01f, 48.09f, 227.31f, 551.89f on a
            // 792pt-high page: bottom = 792 - 227.31 = 564.69, top = 792 - 106.01 = 685.99.
            // Top is lowered to 684.99 because the full height added an empty row at the top.
            PageArea area = UtilsForTesting.GetAreaFromFirstPage("Resources/frx_2012_disclosure.pdf", new PdfRectangle(48.09, 564.69, 551.89, 684.99));
            var algorithm = new SpreadsheetExtractionAlgorithm();
            Table table = algorithm.Extract(area)[0];

            Assert.Equal("REGIONAL PULMONARY & SLEEP\rMEDICINE", table.Rows[8][1].GetText());
        }
예제 #18
0
        /// <summary>
        /// Regression test: sorting the tables of this document used to overflow the stack.
        /// </summary>
        public void TestDontStackOverflowQuicksort()
        {
            PageArea page = UtilsForTesting.GetPage("Resources/failing_sort.pdf", 1);

            var algorithm = new SpreadsheetExtractionAlgorithm();
            List<Table> tables = algorithm.Extract(page);

            // Results must still come back ordered top-to-bottom.
            for (int i = 1; i < tables.Count; i++)
            {
                Assert.True(tables[i - 1].Top >= tables[i].Top);
            }
        }
예제 #19
0
        /// <summary>
        /// Tables with spanning cells must serialize to CSV exactly as the stored reference.
        /// </summary>
        public void TestSpanningCellsToCsv()
        {
            PageArea page = UtilsForTesting.GetPage("Resources/spanning_cells.pdf", 1);
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/spanning_cells.csv");
            var algorithm = new SpreadsheetExtractionAlgorithm();
            List<Table> tables = algorithm.Extract(page);

            Assert.Equal(2, tables.Count);

            var output = new StringBuilder();
            new CSVWriter().Write(output, tables);

            // Normalize line endings and trailing whitespace before comparing.
            Assert.Equal(expectedCsv, output.ToString().Replace("\r\n", "\n").Trim());
        }
예제 #20
0
        /// <summary>
        /// JSON serialization of the schools table must match the stored reference file.
        /// </summary>
        public void TestJSONSerializeInfinity()
        {
            string expectedJson = UtilsForTesting.LoadJson("Resources/json/schools.json");

            // NaN bounds mean "use the whole first page" (original area: 53.74f, 16.97f, 548.74f, 762.3f).
            PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/schools.pdf", new PdfRectangle(double.NaN, double.NaN, double.NaN, double.NaN));
            var algorithm = new SpreadsheetExtractionAlgorithm();
            Table table = algorithm.Extract(page)[0];

            var output = new StringBuilder();
            new JSONWriter().Write(output, table);

            Assert.Equal(expectedJson, output.ToString());
        }
예제 #21
0
        /// <summary>
        /// The rectangle ordering used during extraction must keep cells in reading order;
        /// verified by comparing the CSV serialization of us-017.pdf (page 2) against a
        /// known-good string, after normalizing all line endings to '\r' on both sides.
        /// </summary>
        public void TestNaturalOrderOfRectanglesDoesNotBreakContract()
        {
            PageArea page = UtilsForTesting.GetPage("Resources/us-017.pdf", 2);
            SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
            List <Table> tables = se.Extract(page);

            // Known-good CSV for the first extracted table; cell-internal line breaks are '\r'.
            string expected = "Project,Agency,Institution\r\nNanotechnology and its publics,NSF,Pennsylvania State University\r\n\"Public information and deliberation in nanoscience and\rnanotechnology policy (SGER)\",Interagency,\"North Carolina State\rUniversity\"\r\n\"Social and ethical research and education in agrifood\rnanotechnology (NIRT)\",NSF,Michigan State University\r\n\"From laboratory to society: developing an informed\rapproach to nanoscale science and engineering (NIRT)\",NSF,University of South Carolina\r\nDatabase and innovation timeline for nanotechnology,NSF,UCLA\r\nSocial and ethical dimensions of nanotechnology,NSF,University of Virginia\r\n\"Undergraduate exploration of nanoscience,\rapplications and societal implications (NUE)\",NSF,\"Michigan Technological\rUniversity\"\r\n\"Ethics and belief inside the development of\rnanotechnology (CAREER)\",NSF,University of Virginia\r\n\"All centers, NNIN and NCN have a societal\rimplications components\",\"NSF, DOE,\rDOD, and NIH\",\"All nanotechnology centers\rand networks\""; // \r\n

            StringBuilder sb = new StringBuilder();

            (new CSVWriter()).Write(sb, tables[0]);
            string result = sb.ToString().Trim();

            // Collapse "\r\n" to "\r" in the expectation and "\r\n"/"\n" to "\r" in the result
            // so only the logical row/line structure is compared.
            Assert.Equal(expected.Replace("\r\n", "\r"), result.Replace("\r\n", "\n").Replace("\n", "\r"));
        }
예제 #22
0
        /// <summary>
        /// A spreadsheet without an outer bounding frame must still be classified as tabular
        /// and serialize to the reference CSV.
        /// </summary>
        public void TestSpreadsheetWithNoBoundingFrameShouldBeSpreadsheet()
        {
            // Converted to a bottom-left-origin rectangle; original was 150.56f, 58.9f, 654.7f, 536.12f.
            PageArea area = UtilsForTesting.GetAreaFromPage("Resources/spreadsheet_no_bounding_frame.pdf", 1, new PdfRectangle(58.9, 842 - 654.7, 536.12, 842 - 150.56));
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/spreadsheet_no_bounding_frame.csv");

            var algorithm = new SpreadsheetExtractionAlgorithm();

            // The region should be recognized as tabular even without a closed frame.
            Assert.True(algorithm.IsTabular(area));

            List<Table> tables = algorithm.Extract(area);

            var output = new StringBuilder();
            new CSVWriter().Write(output, tables[0]);
            Assert.Equal(expectedCsv, output.ToString());
        }
예제 #23
0
        /// <summary>
        /// Lattice-mode extraction of data.pdf must produce one table matching the
        /// reference CSV.
        /// </summary>
        public void Latice1()
        {
            PageArea page = UtilsForTesting.GetPage("Resources/data.pdf", 1);

            // data_lattice.csv was modified to add the last row, which is missing in tabula_py.
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/data_lattice.csv");

            var algorithm = new SpreadsheetExtractionAlgorithm();
            List<Table> tables = algorithm.Extract(page);

            Assert.Single(tables);

            var output = new StringBuilder();
            new CSVWriter().Write(output, tables[0]);

            // Normalize Windows line endings before comparing.
            Assert.Equal(expectedCsv, output.ToString().Replace("\r\n", "\n"));
        }
예제 #24
0
        /// <summary>
        /// Parses a PDF through an already-opened PDF document object: extracts the plain
        /// text page by page and attaches the tables found on each page.
        /// </summary>
        /// <param name="pdfDocument">The PDF document to parse.</param>
        /// <param name="tableContainType">How extracted tables are embedded in the output text.</param>
        /// <returns>The formatted model, or null when the document yields no page text.</returns>
        public static PDFModel Parser(PDDocument pdfDocument, TableContainType tableContainType)
        {
            ObjectExtractor extractor    = new ObjectExtractor(pdfDocument);
            PageIterator    pageIterator = extractor.extract();
            SpreadsheetExtractionAlgorithm tableExtractor = new SpreadsheetExtractionAlgorithm();

            PDFModel pdfModel = new PDFModel();

            PDFTextStripper pdfStripper = new PDFTextStripper();

            // Insert an explicit page-end marker so the stripped text can be split back into pages.
            pdfStripper.setPageEnd(pageEndMark);
            //pdfStripper.setParagraphEnd(paragraphEndMark);
            string[] strs = Regex.Split(pdfStripper.getText(pdfDocument), pageEndMark, RegexOptions.IgnoreCase);
            if (strs != null && strs.Length > 0) // NOTE(review): Regex.Split never returns null, so only the Length check matters
            {
                pdfModel.Pages = new List <PdfPageModel>();
                int cp = 0; // zero-based page index; assumed to stay aligned with strs[] — TODO confirm the iterator and split produce the same page count

                while (pageIterator.hasNext())
                {
                    PdfPageModel pdfPage = new PdfPageModel();
                    pdfPage.CurrentPage = cp + 1;
                    pdfPage.Text        = strs[cp];

                    // Run table extraction on the current page and collect the results.
                    List <Table> tables     = new List <Table>();
                    Page         page       = pageIterator.next();
                    var          pageTables = tableExtractor.extract(page).toArray();
                    if (pageTables != null && pageTables.Length > 0)
                    {
                        for (int i = 0; i < pageTables.Length; i++)
                        {
                            tables.Add(pageTables[i] as Table);
                        }
                    }
                    pdfPage.Tables = tables;
                    pdfModel.Pages.Add(pdfPage);
                    cp++;
                }

                pdfModel.PageNumber = pdfModel.Pages.Count;

                // Post-process so table text is embedded according to the requested style.
                return(PdfTextFormater(pdfModel, tableContainType));
            }

            return(null);
        }
예제 #25
0
        /// <summary>
        /// Cells built from the fixture ruling lines must match EXPECTED_CELLS exactly
        /// (compared under the same ordering).
        /// </summary>
        public void TestLinesToCells()
        {
            List<Cell> actual = SpreadsheetExtractionAlgorithm.FindCells(
                HORIZONTAL_RULING_LINES.ToList(),
                VERTICAL_RULING_LINES.ToList());
            Utils.Sort(actual, new TableRectangle.ILL_DEFINED_ORDER());

            List<Cell> expected = EXPECTED_CELLS.ToList();
            Utils.Sort(expected, new TableRectangle.ILL_DEFINED_ORDER());

            Assert.Equal(expected.Count, actual.Count);

            // Element-wise comparison first, for a precise failure location...
            for (int i = 0; i < expected.Count; i++)
            {
                Assert.Equal(expected[i], actual[i]);
            }

            // ...then the whole-sequence comparison, as in the original test.
            Assert.Equal(expected, actual);
        }
예제 #26
0
        /// <summary>
        /// Cells loaded from a CSV fixture must merge into the expected spreadsheet
        /// rectangles (compared under the same ordering).
        /// </summary>
        [Fact]//(Skip = "TODO")]
        public void TestFindSpreadsheetsFromCells()
        {
            var         parse = UtilsForTesting.LoadCsvLines("Resources/csv/TestSpreadsheetExtractor-CELLS.csv");
            List <Cell> cells = new List <Cell>();

            foreach (var record in parse)
            {
                // Parse with the invariant culture: the fixture always uses '.' as the decimal
                // separator, so culture-sensitive parsing would break under e.g. a de-DE locale.
                var top    = double.Parse(record[0], System.Globalization.CultureInfo.InvariantCulture);   // top
                var left   = double.Parse(record[1], System.Globalization.CultureInfo.InvariantCulture);   // left
                var width  = double.Parse(record[2], System.Globalization.CultureInfo.InvariantCulture);   // width
                var height = double.Parse(record[3], System.Globalization.CultureInfo.InvariantCulture);   // height
                cells.Add(new Cell(new PdfRectangle(left, top, left + width, top + height)));
            }

            List <TableRectangle> expected = EXPECTED_RECTANGLES.ToList();

            Utils.Sort(expected, new TableRectangle.ILL_DEFINED_ORDER());
            List <TableRectangle> foundRectangles = SpreadsheetExtractionAlgorithm.FindSpreadsheetsFromCells(cells.Cast <TableRectangle>().ToList());

            Utils.Sort(foundRectangles, new TableRectangle.ILL_DEFINED_ORDER());

            // xUnit convention is Assert.Equal(expected, actual); the original had the
            // arguments reversed, which produces misleading failure messages.
            Assert.Equal(expected, foundRectangles);
        }
예제 #27
0
        /// <summary>
        /// Regression test for issue 656: extraction of the transport-bids document must
        /// produce one table matching the reference CSV (line endings normalized to '\r').
        /// </summary>
        public void TestSpreadsheetExtractionIssue656()
        {
            // Page width 762.3. Original area (top, left, bottom, right) was
            // 56.925f, 24.255f, 549.945f, 786.555f.
            PageArea area = UtilsForTesting.GetAreaFromFirstPage("Resources/Publication_of_award_of_Bids_for_Transport_Sector__August_2016.pdf", new PdfRectangle(24.255, 71, 786.555, 553));
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/Publication_of_award_of_Bids_for_Transport_Sector__August_2016.csv");

            var algorithm = new SpreadsheetExtractionAlgorithm();
            List<Table> tables = algorithm.Extract(area);

            Assert.Single(tables);

            var output = new StringBuilder();
            new CSVWriter().Write(output, tables[0]);

            // Normalize every line ending to '\r' on both sides so only the logical
            // row/line structure is compared.
            string actual = output.ToString().Replace("\r\n", "\n").Replace("\n", "\r").Trim();
            Assert.Equal(expectedCsv.Replace("\n", "\r"), actual);
        }
예제 #28
0
        /// <summary>
        /// Detects the tables in the page using a Nurminen-style heuristic:
        /// ruling lines are snapped and merged into a cell grid to find bordered
        /// tables, then text-row alignment edges are used to grow those tables
        /// and to guess additional, borderless ones.
        /// </summary>
        /// <param name="page">The page area to search for tables.</param>
        /// <returns>The detected table bounding boxes, deduplicated via a sorted set.</returns>
        public List <TableRectangle> Detect(PageArea page)
        {
            // get horizontal & vertical lines
            // we get these from an image of the PDF and not the PDF itself because sometimes there are invisible PDF
            // instructions that are interpreted incorrectly as visible elements - we really want to capture what a
            // person sees when they look at the PDF
            // BobLd: hack here, we don't convert to an image
            var           pageRulings       = page.GetRulings();
            List <Ruling> horizontalRulings = this.getHorizontalRulings(pageRulings);
            List <Ruling> verticalRulings   = this.getVerticalRulings(pageRulings);
            // end hack here

            List <Ruling> allEdges = new List <Ruling>(horizontalRulings);

            allEdges.AddRange(verticalRulings);

            List <TableRectangle> tableAreas = new List <TableRectangle>();

            // if we found some edges, try to find some tables based on them
            if (allEdges.Count > 0)
            {
                // now we need to snap edge endpoints to a grid
                // (mutates the Ruling instances shared with horizontalRulings/verticalRulings)
                Utils.SnapPoints(allEdges, POINT_SNAP_DISTANCE_THRESHOLD, POINT_SNAP_DISTANCE_THRESHOLD);

                // normalize the rulings to make sure snapping didn't create any wacky non-horizontal/vertical rulings
                foreach (List <Ruling> rulings in new[] { horizontalRulings, verticalRulings }) //Arrays.asList(horizontalRulings, verticalRulings))
                {
                    //for (Iterator<Ruling> iterator = rulings.iterator(); iterator.hasNext();)
                    foreach (var ruling in rulings.ToList()) // use ToList to be able to remove
                    {
                        ruling.Normalize();
                        if (ruling.IsOblique)
                        {
                            rulings.Remove(ruling);
                        }
                    }
                }

                // merge the edge lines into rulings - this makes finding edges between crossing points in the next step easier
                // we use a larger pixel expansion than the normal spreadsheet extraction method to cover gaps in the
                // edge detection/pixel snapping steps
                horizontalRulings = Ruling.CollapseOrientedRulings(horizontalRulings, 5);
                verticalRulings   = Ruling.CollapseOrientedRulings(verticalRulings, 5);

                // use the rulings and points to find cells
                List <TableRectangle> cells = SpreadsheetExtractionAlgorithm.FindCells(horizontalRulings, verticalRulings).Cast <TableRectangle>().ToList();

                // then use those cells to make table areas
                tableAreas = getTableAreasFromCells(cells);
            }

            // next find any vertical rulings that intersect tables - sometimes these won't have completely been captured as
            // cells if there are missing horizontal lines (which there often are)
            // let's assume though that these lines should be part of the table
            // NOTE(review): each ruling extends at most one table (first intersecting match wins, then break)
            foreach (Ruling verticalRuling in verticalRulings) // Line2D.Float
            {
                foreach (TableRectangle tableArea in tableAreas)
                {
                    if (verticalRuling.Intersects(tableArea) &&
                        !(tableArea.Contains(verticalRuling.P1) && tableArea.Contains(verticalRuling.P2)))
                    {
                        tableArea.SetTop(Math.Ceiling(Math.Max(tableArea.Top, verticalRuling.Y2)));     // bobld: Floor and Min, Y1
                        tableArea.SetBottom(Math.Floor(Math.Min(tableArea.Bottom, verticalRuling.Y1))); // bobld: Ceiling and Max, Y2
                        break;
                    }
                }
            }

            /* BobLd: not sure this is the case in tabula-sharp/PdfPig
             * // the tabula Page coordinate space is half the size of the PDFBox image coordinate space
             * // so halve the table area size before proceeding and add a bit of padding to make sure we capture everything
             * foreach (TableRectangle area in tableAreas)
             * {
             *  area.x = (float)Math.floor(area.x / 2) - TABLE_PADDING_AMOUNT;
             *  area.y = (float)Math.floor(area.y / 2) - TABLE_PADDING_AMOUNT;
             *  area.width = (float)Math.ceil(area.width / 2) + TABLE_PADDING_AMOUNT;
             *  area.height = (float)Math.ceil(area.height / 2) + TABLE_PADDING_AMOUNT;
             * }
             *
             * // we're going to want halved horizontal lines later too
             * foreach (Ruling ruling in horizontalRulings) // Line2D.Float
             * {
             *  ruling.x1 = ruling.x1 / 2;
             *  ruling.y1 = ruling.y1 / 2;
             *  ruling.x2 = ruling.x2 / 2;
             *  ruling.y2 = ruling.y2 / 2;
             * }
             */

            // now look at text rows to help us find more tables and flesh out existing ones
            List <TextChunk> textChunks = TextElement.MergeWords(page.GetText());
            List <TableLine> lines      = TextChunk.GroupByLines(textChunks);

            // first look for text rows that intersect an existing table - those lines should probably be part of the table
            // (only the horizontal extent is widened here; vertical growth came from the rulings above)
            foreach (TableLine textRow in lines)
            {
                foreach (TableRectangle tableArea in tableAreas)
                {
                    if (!tableArea.Contains(textRow) && textRow.Intersects(tableArea))
                    {
                        tableArea.SetLeft(Math.Floor(Math.Min(textRow.Left, tableArea.Left)));
                        tableArea.SetRight(Math.Ceiling(Math.Max(textRow.Right, tableArea.Right)));
                    }
                }
            }

            // get rid of tables that DO NOT intersect any text areas - these are likely graphs or some sort of graphic
            //for (Iterator<Rectangle> iterator = tableAreas.iterator(); iterator.hasNext();)
            foreach (TableRectangle table in tableAreas.ToList()) // use tolist to be able to remove
            {
                bool intersectsText = false;
                foreach (TableLine textRow in lines)
                {
                    if (table.Intersects(textRow))
                    {
                        intersectsText = true;
                        break;
                    }
                }

                if (!intersectsText)
                {
                    tableAreas.Remove(table);
                }
            }

            // lastly, there may be some tables that don't have any vertical rulings at all
            // we'll use text edges we've found to try and guess which text rows are part of a table

            // in his thesis nurminen goes through every row to try to assign a probability that the line is in a table
            // we're going to try a general heuristic instead, trying to find what type of edge (left/right/mid) intersects
            // the most text rows, and then use that magic number of "relevant" edges to decide what text rows should be
            // part of a table.

            bool foundTable;

            // repeat until a pass produces no new table: each found table removes its
            // text lines from consideration on the next iteration
            do
            {
                foundTable = false;

                // get rid of any text lines contained within existing tables, this allows us to find more tables
                //for (Iterator<TableLine> iterator = lines.iterator(); iterator.hasNext();)
                foreach (var textRow in lines.ToList())
                {
                    foreach (TableRectangle table in tableAreas)
                    {
                        if (table.Contains(textRow))
                        {
                            lines.Remove(textRow);
                            break;
                        }
                    }
                }

                // get text edges from remaining lines in the document
                TextEdges textEdges = getTextEdges(lines);
                //List<TextEdge> leftTextEdges = textEdges[TextEdge.LEFT];
                //List<TextEdge> midTextEdges = textEdges[TextEdge.MID];
                //List<TextEdge> rightTextEdges = textEdges[TextEdge.RIGHT];

                // find the relevant text edges (the ones we think define where a table is)
                RelevantEdges relevantEdgeInfo = getRelevantEdges(textEdges, lines);

                // we found something relevant so let's look for rows that fit our criteria
                if (relevantEdgeInfo.edgeType != -1)
                {
                    List <TextEdge> relevantEdges = null;
                    switch (relevantEdgeInfo.edgeType)
                    {
                    case TextEdge.LEFT:
                        relevantEdges = textEdges[TextEdge.LEFT];       // leftTextEdges;
                        break;

                    case TextEdge.MID:
                        relevantEdges = textEdges[TextEdge.MID];        // midTextEdges;
                        break;

                    case TextEdge.RIGHT:
                        relevantEdges = textEdges[TextEdge.RIGHT];      // rightTextEdges;
                        break;
                    }

                    TableRectangle table = getTableFromText(lines, relevantEdges, relevantEdgeInfo.edgeCount, horizontalRulings);

                    if (table != null)
                    {
                        foundTable = true;
                        tableAreas.Add(table);
                    }
                }
            } while (foundTable);

            // create a set of our current tables that will eliminate duplicate tables
            SortedSet <TableRectangle> tableSet = new SortedSet <TableRectangle>(new TreeSetComparer()); //Set<Rectangle> tableSet = new TreeSet<>(new Comparator<Rectangle>() {...

            // insert largest-area tables first — presumably so that when the comparer treats
            // overlapping rectangles as equal, the larger one is the survivor; TODO confirm
            // TreeSetComparer semantics
            foreach (var table in tableAreas.OrderByDescending(t => t.Area))
            {
                tableSet.Add(table);
            }

            return(tableSet.ToList());
        }