예제 #1
0
        /// <summary>
        /// Extracts a fixed region of page 3 of EU_017_PDF with <c>BasicExtractionAlgorithm</c>,
        /// seeded with the page's vertical rulings, and checks every cell against
        /// EU_017_EXPECTED. The check only runs on Windows; elsewhere the method returns
        /// without asserting anything.
        /// </summary>
        public void TestExtractColumnsCorrectly2()
        {
            // Fails on Linux and macOS (|| RuntimeInformation.IsOSPlatform(OSPlatform.Linux) was
            // tried and disabled). Linked to PdfPig not finding the correct font.
            // Would need: apt-get -y install ttf-mscorefonts-installer
            // but the mscorefonts EULA license still could not be presented.
            if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
            {
                return;
            }

            PageArea thirdPage = UtilsForTesting.GetPage(EU_017_PDF, 3);
            BasicExtractionAlgorithm extractor = new BasicExtractionAlgorithm(thirdPage.VerticalRulings);

            // Region of interest, expressed exactly as in the original coordinates.
            var region = new PdfRectangle(148.44, 543 - (711.875 - 299.625), 452.32, 543);
            Table extracted = extractor.Extract(thirdPage.GetArea(region))[0];

            var actualRows = UtilsForTesting.TableToArrayOfRows(extracted);

            // Compare the shape first, then each cell, so a failure points at the exact mismatch.
            Assert.Equal(EU_017_EXPECTED.Length, actualRows.Length);
            for (int row = 0; row < EU_017_EXPECTED.Length; row++)
            {
                var expectedRow = EU_017_EXPECTED[row];
                var actualRow   = actualRows[row];
                Assert.Equal(expectedRow.Length, actualRow.Length);

                for (int col = 0; col < expectedRow.Length; col++)
                {
                    Assert.Equal(expectedRow[col], actualRow[col]);
                }
            }
        }
예제 #2
0
        /// <summary>
        /// Smoke test on page 3 of eu-004.pdf: detects table regions with
        /// <c>SimpleNurminenDetectionAlgorithm</c>, then runs
        /// <c>SpreadsheetExtractionAlgorithm</c> on the first detected region.
        /// Contains no assertions; it passes as long as nothing throws.
        /// </summary>
        public void Eu004()
        {
            var options = new ParsingOptions()
            {
                ClipPaths = true
            };

            using (PdfDocument document = PdfDocument.Open("Resources/icdar2013-dataset/competition-dataset-eu/eu-004.pdf", options))
            {
                var      extractor = new ObjectExtractor(document);
                PageArea thirdPage = extractor.Extract(3);

                var detected = new SimpleNurminenDetectionAlgorithm().Detect(thirdPage);

                // Only the first detected region is extracted.
                var firstRegion = thirdPage.GetArea(detected[0].BoundingBox);
                var tables      = new SpreadsheetExtractionAlgorithm().Extract(firstRegion);

                // Alternative previously kept here as commented-out code: loop over every
                // detected region and extract each with BasicExtractionAlgorithm instead.
            }
        }
예제 #3
0
        /// <summary>
        /// Smoke test on page 1 of test3.pdf: detects table regions with
        /// <c>SimpleNurminenDetectionAlgorithm</c> and runs <c>BasicExtractionAlgorithm</c>
        /// on each of them. Contains no assertions; it passes as long as nothing throws.
        /// </summary>
        public void TestLinesToCells()
        {
            var options = new ParsingOptions()
            {
                ClipPaths = true
            };

            using (PdfDocument document = PdfDocument.Open("test3.pdf", options))
            {
                var      extractor = new ObjectExtractor(document);
                PageArea firstPage = extractor.Extract(1);

                var detector = new SimpleNurminenDetectionAlgorithm();

                foreach (var region in detector.Detect(firstPage))
                {
                    // A fresh algorithm instance is created for every region.
                    IExtractionAlgorithm algorithm = new BasicExtractionAlgorithm();
                    var regionArea = firstPage.GetArea(region.BoundingBox);
                    List<Table> extracted = algorithm.Extract(regionArea);
                }
            }
        }
예제 #4
0
        /// <summary>
        /// Heuristic check comparing two extractions of the same text-bounded region:
        /// the row/column counts of the first table found by <c>SpreadsheetExtractionAlgorithm</c>
        /// against those of the first table found by <c>BasicExtractionAlgorithm</c>.
        /// The averaged column and row ratios must lie strictly between
        /// MAGIC_HEURISTIC_NUMBER and its reciprocal.
        /// </summary>
        /// <param name="page">The page area to inspect.</param>
        /// <returns>
        /// <c>true</c> when the averaged ratio falls inside the accepted band; <c>false</c> when
        /// the page has no text, when either extraction yields no table, or when the ratio is
        /// outside the band.
        /// </returns>
        public bool IsTabular(PageArea page)
        {
            // if there's no text at all on the page, it's not a table
            // (we won't be able to do anything with it though)
            var texts = page.GetText();
            if (texts.Count == 0)
            {
                return false;
            }

            // get minimal region of page that contains every character (in effect,
            // removes white "margins")
            PageArea minimalRegion = page.GetArea(Utils.Bounds(texts.Select(t => t.BoundingBox).ToList()));

            List<Table> tables = new SpreadsheetExtractionAlgorithm().Extract(minimalRegion);
            if (tables.Count == 0)
            {
                return false;
            }

            Table table = tables[0];
            int rowsDefinedByLines = table.RowCount;
            int colsDefinedByLines = table.ColumnCount;

            tables = new BasicExtractionAlgorithm().Extract(minimalRegion);
            if (tables.Count == 0)
            {
                // Without a second table there is nothing to compare against. Previously the
                // code fell through to tables[0] and threw; report "not tabular" instead.
                System.Diagnostics.Debug.Write("SpreadsheetExtractionAlgorithm.isTabular(): no table found.");
                return false;
            }

            table = tables[0];
            int rowsDefinedWithoutLines = table.RowCount;
            int colsDefinedWithoutLines = table.ColumnCount;

            // Float division: a zero divisor yields Infinity/NaN rather than throwing, and
            // such a ratio fails the band check below.
            float ratio = (((float)colsDefinedByLines / colsDefinedWithoutLines) + ((float)rowsDefinedByLines / rowsDefinedWithoutLines)) / 2.0f;

            return ratio > MAGIC_HEURISTIC_NUMBER && ratio < (1 / MAGIC_HEURISTIC_NUMBER);
        }