コード例 #1
0
        // Extracts the first table from a fixed region on page 1 of EU_002_PDF and
        // compares it, row by row and cell by cell, against EU_002_EXPECTED.
        // The assertions only run on Windows; on any other OS the method returns
        // without asserting anything.
        public void TestExtractColumnsCorrectly()
        {
            // Linux is not treated as supported here (the extra OSPlatform.Linux check is disabled).
            if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
            {
                // Fails on Linux and macOS. Linked to PdfPig not finding the correct font.
                // Need to use: apt-get -y install ttf-mscorefonts-installer
                // Still have: mscorefonts - EULA license could not be presented
                return;
            }

            var region = new PdfRectangle(70.0, 725 - (233 - 115), 510.0, 725);
            PageArea area = UtilsForTesting.GetAreaFromPage(EU_002_PDF, 1, region);
            Table extracted = new BasicExtractionAlgorithm().Extract(area)[0];
            var actualRows = UtilsForTesting.TableToArrayOfRows(extracted);

            // Row counts must agree before individual rows are compared.
            Assert.Equal(EU_002_EXPECTED.Length, actualRows.Length);

            for (int row = 0; row < EU_002_EXPECTED.Length; row++)
            {
                var expectedRow = EU_002_EXPECTED[row];
                var actualRow = actualRows[row];

                Assert.Equal(expectedRow.Length, actualRow.Length);
                for (int col = 0; col < expectedRow.Length; col++)
                {
                    Assert.Equal(expectedRow[col], actualRow[col]);
                }
            }
        }
コード例 #2
0
        // Extracts the first table from a fixed region on page 3 of EU_017_PDF, using
        // the page's own vertical rulings as column separators, and compares it cell
        // by cell against EU_017_EXPECTED.
        // The assertions only run on Windows; on any other OS the method returns
        // without asserting anything.
        public void TestExtractColumnsCorrectly2()
        {
            // Linux is not treated as supported here (the extra OSPlatform.Linux check is disabled).
            if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
            {
                // Fails on Linux and macOS. Linked to PdfPig not finding the correct font.
                // Need to use: apt-get -y install ttf-mscorefonts-installer
                // Still have: mscorefonts - EULA license could not be presented
                return;
            }

            PageArea fullPage = UtilsForTesting.GetPage(EU_017_PDF, 3);
            var algorithm = new BasicExtractionAlgorithm(fullPage.VerticalRulings);
            var region = new PdfRectangle(148.44, 543 - (711.875 - 299.625), 452.32, 543);
            Table extracted = algorithm.Extract(fullPage.GetArea(region))[0];
            var actualRows = UtilsForTesting.TableToArrayOfRows(extracted);

            // Row counts must agree before individual rows are compared.
            Assert.Equal(EU_017_EXPECTED.Length, actualRows.Length);

            for (int row = 0; row < EU_017_EXPECTED.Length; row++)
            {
                var expectedRow = EU_017_EXPECTED[row];
                var actualRow = actualRows[row];

                Assert.Equal(expectedRow.Length, actualRow.Length);
                for (int col = 0; col < expectedRow.Length; col++)
                {
                    Assert.Equal(expectedRow[col], actualRow[col]);
                }
            }
        }
コード例 #3
0
        // Returns the first table that a default BasicExtractionAlgorithm extracts from
        // a fixed region on the first page of the Argentina voting-record PDF.
        private Table GetTable()
        {
            // Earlier coordinates kept from the original comment: 269.875f, 12.75f, 790.5f, 561f
            // NOTE(review): presumably the tabula-java rectangle for the same area - confirm.
            var region = new PdfRectangle(12.75, 55.0, 561, 567);
            PageArea area = UtilsForTesting.GetAreaFromFirstPage("Resources/argentina_diputados_voting_record.pdf", region);

            var algorithm = new BasicExtractionAlgorithm();
            return algorithm.Extract(area)[0];
        }
コード例 #4
0
        // Extracts the first table from a fixed region on page 1 of indictb1h_14.pdf and
        // checks that its row array equals EXPECTED_EMPTY_TABLE.
        public void TestEmptyRegion()
        {
            var region = new PdfRectangle(0, 700, 100.9, 800);
            PageArea area = UtilsForTesting.GetAreaFromPage("Resources/indictb1h_14.pdf", 1, region);

            Table extracted = new BasicExtractionAlgorithm().Extract(area)[0];
            var actualRows = UtilsForTesting.TableToArrayOfRows(extracted);

            Assert.Equal(EXPECTED_EMPTY_TABLE, actualRows);
        }
コード例 #5
0
        // Extracts the first table from a fixed region on the first page of m27.pdf and
        // checks the text of the second and third cells of its first row.
        public void TestRemoveSequentialSpaces()
        {
            var region = new PdfRectangle(28.28, 532 - (103.04 - 79.2), 732.6, 532);
            PageArea area = UtilsForTesting.GetAreaFromFirstPage("Resources/m27.pdf", region);

            Table extracted = new BasicExtractionAlgorithm().Extract(area)[0];
            var headRow = extracted.Rows[0];

            Assert.Equal("ALLEGIANT AIR", headRow[1].GetText());
            Assert.Equal("ALLEGIANT AIR LLC", headRow[2].GetText());
        }
コード例 #6
0
        // Extracts the first table from a fixed region on the first page of 12s0324.pdf and
        // checks the text of its two corner cells: first cell of the first row and
        // last cell of the last row.
        public void TestCheckSqueezeDoesntBreak()
        {
            var region = new PdfRectangle(17.25, 342, 410.25, 560.5);
            PageArea area = UtilsForTesting.GetAreaFromFirstPage("Resources/12s0324.pdf", region);

            Table extracted = new BasicExtractionAlgorithm().Extract(area)[0];
            var allRows = extracted.Rows;

            // Top-left cell.
            var topRow = allRows[0];
            var topLeftText = topRow[0].GetText();

            // Bottom-right cell.
            var bottomRow = allRows[allRows.Count - 1];
            var bottomRightText = bottomRow[bottomRow.Count - 1].GetText();

            Assert.Equal("Violent crime  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .", topLeftText);
            Assert.Equal("(X)", bottomRightText);
        }
コード例 #7
0
        // Extracts tables from page 1 of data.pdf with a default BasicExtractionAlgorithm,
        // writes the first one as CSV and compares it with the stored expected CSV.
        public void StreamNoGuess1()
        {
            PageArea page = UtilsForTesting.GetPage("Resources/data.pdf", 1);

            // data_stream_noguess.csv was modified for decimal precision
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/data_stream_noguess.csv");

            var algorithm = new BasicExtractionAlgorithm();
            List<Table> extracted = algorithm.Extract(page);

            var csv = new StringBuilder();
            var writer = new CSVWriter();
            writer.Write(csv, extracted[0]);

            // Normalise Windows line endings before comparing.
            string actualCsv = csv.ToString().Replace("\r\n", "\n");
            Assert.Equal(expectedCsv, actualCsv);
        }
コード例 #8
0
        // Extracts tables from page 1 of data.pdf with a default BasicExtractionAlgorithm,
        // writes the first one as CSV and compares it with the stored expected CSV.
        // Reference call noted by the original author:
        //   tabula.read_pdf(pdf_path, stream=True, guess=False)
        public void StreamNoGuess1()
        {
            PageArea wholePage = UtilsForTesting.GetPage("Resources/data.pdf", 1);
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/data_stream_noguess.csv");

            List<Table> extracted = new BasicExtractionAlgorithm().Extract(wholePage);

            // Render only the first extracted table.
            var output = new StringBuilder();
            new CSVWriter().Write(output, extracted[0]);

            // Normalise Windows line endings before comparing.
            string actualCsv = output.ToString().Replace("\r\n", "\n");
            Assert.Equal(expectedCsv, actualCsv);
        }
コード例 #9
0
        // Builds one vertical ruling per x position, passes them to the extraction
        // algorithm, and checks the first two cells of the sixth row of the table
        // extracted from a fixed region on the first page of campaign_donors.pdf.
        public void TestVerticalRulingsPreventMergingOfColumns()
        {
            List<Ruling> rulings = new List<Ruling>();

            double[] rulingsVerticalPositions = { 147, 256, 310, 375, 431, 504 };

            // Iterate the array itself rather than a hard-coded count, so that editing
            // the position list can never leave the loop bound out of sync.
            foreach (double x in rulingsVerticalPositions)
            {
                // Each ruling is a vertical segment at x, running from y = 40.43 to y = 755.
                rulings.Add(new Ruling(new PdfPoint(x, 40.43), new PdfPoint(x, 755)));
            }

            PageArea page = UtilsForTesting.GetAreaFromFirstPage("Resources/campaign_donors.pdf", new PdfRectangle(40.43, 755 - (398.76 - 255.57), 557.35, 755));
            BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm(rulings);
            Table table = bea.Extract(page)[0];
            var sixthRow = table.Rows[5];

            Assert.Equal("VALSANGIACOMO BLANC", sixthRow[0].GetText());
            Assert.Equal("OFERNANDO JORGE", sixthRow[1].GetText());
        }
コード例 #10
0
        // Extracts the first table from a fixed region on the first page of
        // FRX_2012_DISCLOSURE_PDF and compares it, row by row and cell by cell,
        // against FRX_2012_DISCLOSURE_EXPECTED.
        public void TestExtractColumnsCorrectly3()
        {
            var region = new PdfRectangle(48.09, 563, 551.89, 685.5);
            PageArea area = UtilsForTesting.GetAreaFromFirstPage(FRX_2012_DISCLOSURE_PDF, region);

            Table extracted = new BasicExtractionAlgorithm().Extract(area)[0];
            var actualRows = UtilsForTesting.TableToArrayOfRows(extracted);

            // Row counts must agree before individual rows are compared.
            Assert.Equal(FRX_2012_DISCLOSURE_EXPECTED.Length, actualRows.Length);

            for (int row = 0; row < FRX_2012_DISCLOSURE_EXPECTED.Length; row++)
            {
                var expectedRow = FRX_2012_DISCLOSURE_EXPECTED[row];
                var actualRow = actualRows[row];

                Assert.Equal(expectedRow.Length, actualRow.Length);
                for (int col = 0; col < expectedRow.Length; col++)
                {
                    Assert.Equal(expectedRow[col], actualRow[col]);
                }
            }
        }
コード例 #11
0
        // Opens test3.pdf with path clipping enabled, detects table regions on page 1
        // and runs a fresh BasicExtractionAlgorithm over each detected region.
        // No assertions are made on the extracted tables, so this test only fails
        // when one of the calls throws.
        public void TestLinesToCells()
        {
            var options = new ParsingOptions()
            {
                ClipPaths = true
            };

            using (PdfDocument pdf = PdfDocument.Open("test3.pdf", options))
            {
                var extractor = new ObjectExtractor(pdf);
                PageArea firstPage = extractor.Extract(1);

                var detection = new SimpleNurminenDetectionAlgorithm();
                var detectedRegions = detection.Detect(firstPage);

                foreach (var region in detectedRegions)
                {
                    // A new algorithm instance is created for every region, as in the original.
                    IExtractionAlgorithm algorithm = new BasicExtractionAlgorithm();
                    var regionArea = firstPage.GetArea(region.BoundingBox);

                    // The result is intentionally not inspected.
                    algorithm.Extract(regionArea);
                }
            }
        }
コード例 #12
0
        // Extracts the first table from a fixed region on page 2 of us-020.pdf, writes it
        // as CSV into an in-memory stream, and compares the text read back with the
        // stored expected CSV.
        public void TestTableWithMultilineHeader()
        {
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/us-020.csv");

            var region = new PdfRectangle(35.0, 151, 560, 688.5);
            PageArea area = UtilsForTesting.GetAreaFromPage("Resources/us-020.pdf", 2, region);
            Table extracted = new BasicExtractionAlgorithm().Extract(area)[0];

            using (var buffer = new MemoryStream())
            using (var csvOut = new StreamWriter(buffer) { AutoFlush = true })
            {
                new CSVWriter().Write(csvOut, extracted);

                // Rewind the buffer so the CSV that was just written can be read back.
                var csvIn = new StreamReader(buffer);
                buffer.Position = 0;

                // Normalise line endings; trim to remove the last new line.
                string actualCsv = csvIn.ReadToEnd().Replace("\r\n", "\n").Trim();

                Assert.Equal(expectedCsv, actualCsv);
            }
        }
コード例 #13
0
        // Extracts the first table from a fixed region on page 1 of indictb1h_14.pdf,
        // writes it as CSV into an in-memory stream, and compares the text read back
        // with the stored expected CSV.
        public void TestRealLifeRTL2()
        {
            string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/indictb1h_14.csv");

            var region = new PdfRectangle(120.0, 842 - 622.82, 459.9, 842 - 120.0);
            PageArea area = UtilsForTesting.GetAreaFromPage("Resources/indictb1h_14.pdf", 1, region);
            Table extracted = new BasicExtractionAlgorithm().Extract(area)[0];

            using (var buffer = new MemoryStream())
            using (var csvOut = new StreamWriter(buffer) { AutoFlush = true })
            {
                new CSVWriter().Write(csvOut, extracted);

                // Rewind the buffer so the CSV that was just written can be read back.
                var csvIn = new StreamReader(buffer);
                buffer.Position = 0;

                // Normalise line endings; trim to remove the last new line.
                string actualCsv = csvIn.ReadToEnd().Replace("\r\n", "\n").Trim();

                Assert.Equal(expectedCsv, actualCsv);
            }
        }
コード例 #14
0
        // Extracts the first table from a fixed region on the first page of
        // ARGENTINA_DIPUTADOS_VOTING_RECORD_PDF and compares it, row by row and cell by
        // cell, against ARGENTINA_DIPUTADOS_VOTING_RECORD_EXPECTED.
        public void TestColumnRecognition()
        {
            var region = new PdfRectangle(12.75, 55, 557, 567);
            PageArea area = UtilsForTesting.GetAreaFromFirstPage(ARGENTINA_DIPUTADOS_VOTING_RECORD_PDF, region);

            Table extracted = new BasicExtractionAlgorithm().Extract(area)[0];
            var actualRows = UtilsForTesting.TableToArrayOfRows(extracted);

            // Row counts must agree before individual rows are compared.
            Assert.Equal(ARGENTINA_DIPUTADOS_VOTING_RECORD_EXPECTED.Length, actualRows.Length);

            for (int row = 0; row < ARGENTINA_DIPUTADOS_VOTING_RECORD_EXPECTED.Length; row++)
            {
                var expectedRow = ARGENTINA_DIPUTADOS_VOTING_RECORD_EXPECTED[row];
                var actualRow = actualRows[row];

                Assert.Equal(expectedRow.Length, actualRow.Length);
                for (int col = 0; col < expectedRow.Length; col++)
                {
                    Assert.Equal(expectedRow[col], actualRow[col]);
                }
            }
        }
コード例 #15
0
        // Extracts the first table from a fixed region on page 2 of us-017.pdf, using the
        // region's vertical rulings, and checks the text of the first 37 cells in the
        // order in which table.Cells returns them.
        public void TestNaturalOrderOfRectangles()
        {
            PageArea fullPage = UtilsForTesting.GetPage("Resources/us-017.pdf", 2);
            PageArea area = fullPage.GetArea(new PdfRectangle(90, 97, 532, 352));

            var algorithm = new BasicExtractionAlgorithm(area.VerticalRulings);
            Table extracted = algorithm.Extract(area)[0];

            IReadOnlyList<Cell> cells = extracted.Cells;

            // Dump every cell's text to the debug output to help diagnose failures.
            foreach (var cell in cells)
            {
                Debug.Print(cell.GetText());
            }

            // Now different from tabula-java, since PdfPig 0.1.5-alpha001
            string[] expectedTexts =
            {
                // Column headers
                "Project",
                "Agency",
                "Institution",

                // First row
                "Nanotechnology and its publics",
                "NSF",
                "Pennsylvania State University",

                // Second row
                "Public information and deliberation in nanoscience and\rnanotechnology policy (SGER)",
                "Interagency",
                "North Carolina State\rUniversity",

                // Third row
                "Social and ethical research and education in agrifood",
                "nanotechnology (NIRT)",
                "NSF",
                "Michigan State University",

                // Fourth row
                "From laboratory to society: developing an informed",
                "approach to nanoscale science and engineering (NIRT)",
                "NSF",
                "University of South Carolina",

                // Fifth row
                "Database and innovation timeline for nanotechnology",
                "NSF",
                "UCLA",

                // Sixth row
                "Social and ethical dimensions of nanotechnology",
                "NSF",
                "University of Virginia",

                // Seventh row
                "Undergraduate exploration of nanoscience,",
                "applications and societal implications (NUE)",
                "NSF",
                "Michigan Technological\rUniversity",

                // Eighth row
                "Ethics and belief inside the development of",
                "nanotechnology (CAREER)",
                "NSF",
                "University of Virginia",

                // Ninth row
                "All centers, NNIN and NCN have a societal",
                "NSF, DOE,",
                "All nanotechnology centers",
                "implications components",
                "DOD, and NIH",
                "and networks",
            };

            // Compare in index order; the total number of cells is deliberately not asserted,
            // matching the original sequence of checks.
            for (int i = 0; i < expectedTexts.Length; i++)
            {
                Assert.Equal(expectedTexts[i], cells[i].GetText());
            }
        }