Exemplo n.º 1
0
        /// <summary>
        /// Searches one page of a PDF document for a bordered table and returns its location.
        /// </summary>
        /// <param name="inputFile">Path of the PDF document to load.</param>
        /// <param name="pageIndex">Index of the page handed to the table search; defaults to 0.</param>
        /// <returns>
        /// The location the detector reports for the first table found,
        /// or <c>null</c> when the search finds no table on that page.
        /// </returns>
        private static RectangleF?GetTableExtractionArea(string inputFile, int pageIndex = 0)
        {
            RectangleF? extractionArea = null;

            using (TableDetector detector = new TableDetector("demo", "demo"))
            {
                // "Bordered tables" mode suits tables drawn with closed solid borders.
                detector.ColumnDetectionMode = ColumnDetectionMode.BorderedTables;

                // A candidate must have at least 2 columns and 2 rows to count as a table.
                detector.DetectionMinNumberOfColumns = 2;
                detector.DetectionMinNumberOfRows = 2;

                detector.LoadDocumentFromFile(inputFile);

                if (detector.FindTable(pageIndex))
                {
                    // Remember where the first matching table sits on the page.
                    extractionArea = detector.FoundTableLocation;
                }
            }

            return extractionArea;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Detects bordered tables on every page of ".\sample3.pdf", saves each detected table to
        /// its own JSON file ("page-{page}-table-{n}.json") and then opens the first output file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            const string inputFile = @".\sample3.pdf";
            const string firstOutputFile = "page-0-table-1.json";

            // The using statements release both SDK objects even if loading or extraction throws.
            using (JSONExtractor jsonExtractor = new JSONExtractor())
            using (TableDetector tableDetector = new TableDetector())
            {
                jsonExtractor.RegistrationName = "demo";
                jsonExtractor.RegistrationKey  = "demo";

                tableDetector.RegistrationKey  = "demo";
                tableDetector.RegistrationName = "demo";

                // Set table detection mode to "bordered tables" - best for tables with closed solid borders.
                tableDetector.ColumnDetectionMode = ColumnDetectionMode.BorderedTables;

                // Only report tables that have at least 3 columns and 3 rows.
                tableDetector.DetectionMinNumberOfColumns = 3;
                tableDetector.DetectionMinNumberOfRows = 3;

                // Load sample PDF document into both the extractor and the detector
                jsonExtractor.LoadDocumentFromFile(inputFile);
                tableDetector.LoadDocumentFromFile(inputFile);

                int pageCount = tableDetector.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    // Table numbering restarts at 1 on every page
                    int t = 1;

                    // Find the first table on the page, then keep asking for the next one
                    bool found = tableDetector.FindTable(i);
                    while (found)
                    {
                        // Set extraction area for JSON extractor to rectangle received from the table detector
                        jsonExtractor.SetExtractionArea(tableDetector.FoundTableLocation);
                        // Export the table to JSON file
                        jsonExtractor.SaveJSONToFile(i, "page-" + i + "-table-" + t + ".json");
                        t++;

                        found = tableDetector.FindNextTable();
                    }
                }
            }

            // Open first output file in default associated application (for demo purposes).
            // The file only exists when a table was detected on the first page, so check first
            // instead of letting Process.Start fail on a missing file.
            if (System.IO.File.Exists(firstOutputFile))
            {
                ProcessStartInfo processStartInfo = new ProcessStartInfo(firstOutputFile);

                processStartInfo.UseShellExecute = true;
                Process.Start(processStartInfo);
            }
            else
            {
                System.Console.WriteLine("Output file '" + firstOutputFile + "' was not created; nothing to open.");
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Detects tables on every page of "sample3.pdf", saves each detected table to its own
        /// CSV file ("page-{page}-table-{n}.csv") and then opens the first output file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            const string inputFile = "sample3.pdf";
            const string firstOutputFile = "page-0-table-1.csv";

            // The using statements release both SDK objects, including when an exception is thrown.
            using (CSVExtractor extractor = new CSVExtractor())
            using (TableDetector tdetector = new TableDetector())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                tdetector.RegistrationKey  = "demo";
                tdetector.RegistrationName = "demo";

                // Only report tables that have at least 3 columns ...
                tdetector.DetectionMinNumberOfColumns = 3;

                // ... and at least 3 rows
                tdetector.DetectionMinNumberOfRows = 3;

                // Load sample PDF document into both the extractor and the detector
                extractor.LoadDocumentFromFile(inputFile);
                tdetector.LoadDocumentFromFile(inputFile);

                int pageCount = tdetector.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    // Table numbering restarts at 1 on every page
                    int j = 1;

                    // Find the first table on the page, then keep asking for the next one
                    bool found = tdetector.FindTable(i);
                    while (found)
                    {
                        // Set extraction area for CSV extractor to rectangle given by table detector
                        extractor.SetExtractionArea(tdetector.GetFoundTableRectangle_Left(),
                                                    tdetector.GetFoundTableRectangle_Top(),
                                                    tdetector.GetFoundTableRectangle_Width(),
                                                    tdetector.GetFoundTableRectangle_Height()
                                                    );

                        // Save the table into CSV file
                        extractor.SavePageCSVToFile(i, "page-" + i + "-table-" + j + ".csv");
                        j++;

                        found = tdetector.FindNextTable();
                    }
                }
            }

            // Open first output file in default associated application.
            // The file only exists when a table was detected on the first page, so check first
            // instead of letting Process.Start fail on a missing file.
            if (System.IO.File.Exists(firstOutputFile))
            {
                System.Diagnostics.ProcessStartInfo startInfo = new System.Diagnostics.ProcessStartInfo(firstOutputFile);

                // Opening a document (rather than an executable) needs the OS shell; the default
                // for this flag differs between .NET Framework and .NET Core, so set it explicitly.
                startInfo.UseShellExecute = true;
                System.Diagnostics.Process.Start(startInfo);
            }
            else
            {
                System.Console.WriteLine("Output file '" + firstOutputFile + "' was not created; nothing to open.");
            }
        }
Exemplo n.º 4
0
        /// <summary>
        /// Detects tables on every page of ".\sample3.pdf", saves each detected table to its own
        /// XML file ("page-{page}-table-{n}.xml") and then opens the first output file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            const string inputFile = @".\sample3.pdf";
            const string firstOutputFile = "page-0-table-1.xml";

            // The using statements release both SDK objects even if loading or extraction throws.
            using (XMLExtractor xmlExtractor = new XMLExtractor())
            using (TableDetector tableDetector = new TableDetector())
            {
                xmlExtractor.RegistrationName = "demo";
                xmlExtractor.RegistrationKey  = "demo";

                tableDetector.RegistrationKey  = "demo";
                tableDetector.RegistrationName = "demo";

                // Only report tables that have at least 3 columns and 3 rows.
                tableDetector.DetectionMinNumberOfColumns = 3;
                tableDetector.DetectionMinNumberOfRows = 3;

                // Load sample PDF document into both the extractor and the detector
                xmlExtractor.LoadDocumentFromFile(inputFile);
                tableDetector.LoadDocumentFromFile(inputFile);

                int pageCount = tableDetector.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    // Table numbering restarts at 1 on every page
                    int t = 1;

                    // Find the first table on the page, then keep asking for the next one
                    bool found = tableDetector.FindTable(i);
                    while (found)
                    {
                        // Set extraction area for XML extractor to rectangle received from the table detector
                        xmlExtractor.SetExtractionArea(tableDetector.FoundTableLocation);
                        // Export the table to XML file
                        xmlExtractor.SavePageXMLToFile(i, "page-" + i + "-table-" + t + ".xml");
                        t++;

                        found = tableDetector.FindNextTable();
                    }
                }
            }

            // Open first output file in default associated application (for demo purposes).
            // The file only exists when a table was detected on the first page, so check first
            // instead of letting Process.Start fail on a missing file.
            if (System.IO.File.Exists(firstOutputFile))
            {
                System.Diagnostics.ProcessStartInfo startInfo = new System.Diagnostics.ProcessStartInfo(firstOutputFile);

                // Opening a document (rather than an executable) needs the OS shell; the default
                // for this flag differs between .NET Framework and .NET Core, so set it explicitly.
                startInfo.UseShellExecute = true;
                System.Diagnostics.Process.Start(startInfo);
            }
            else
            {
                System.Console.WriteLine("Output file '" + firstOutputFile + "' was not created; nothing to open.");
            }
        }
Exemplo n.º 5
0
        //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        /// <summary>
        /// Reads the first bordered table detected on the last page of a PDF document and
        /// converts its CSV text into invoice line fields.
        /// </summary>
        /// <param name="pdfPath">Path of the PDF document to load.</param>
        /// <returns>
        /// The value produced by <c>GetFieldsFromCSV</c> for the table's CSV text,
        /// or <c>null</c> when no table is found on the last page.
        /// </returns>
        internal static List <Dictionary <string, string> > ExtractInvoiceLineFields(string pdfPath)
        {
            using (TableDetector detector = new TableDetector("demo", "demo"))
            using (CSVExtractor csvSource = new CSVExtractor("demo", "demo"))
            {
                // "Bordered tables" mode suits tables drawn with closed solid borders.
                detector.ColumnDetectionMode = ColumnDetectionMode.BorderedTables;

                // A candidate needs at least 2 columns, but a single row is enough.
                detector.DetectionMinNumberOfColumns = 2;
                detector.DetectionMinNumberOfRows = 1;

                // Both objects work on the same document.
                detector.LoadDocumentFromFile(pdfPath);
                csvSource.LoadDocumentFromFile(pdfPath);

                // Only the last page is searched.
                int lastPageIndex = detector.GetPageCount() - 1;

                if (!detector.FindTable(lastPageIndex))
                {
                    return null;
                }

                // Restrict CSV extraction to the rectangle reported by the detector.
                csvSource.SetExtractionArea(detector.FoundTableLocation);

                var csvText = csvSource.GetCSV();

                return GetFieldsFromCSV(csvText);
            }
        }
        /// <summary>
        /// Reads the first bordered table detected on the first page of a PDF document and
        /// converts its CSV text into a <see cref="DataTable"/>.
        /// </summary>
        /// <param name="fileName">Path of the PDF document to load.</param>
        /// <returns>
        /// The value produced by <c>GetDataTableFromCSV</c> for the table's CSV text,
        /// or <c>null</c> when no table is found on the first page.
        /// </returns>
        private static DataTable GetDataTableFromDocument(string fileName)
        {
            DataTable oDataTable = null;

            // Initialise table detector and CSV extractor; both are disposed when the block exits
            using (TableDetector tableDetector = new TableDetector("demo", "demo"))
            {
                using (CSVExtractor csvExtractor = new CSVExtractor("demo", "demo"))
                {
                    // Set table detection mode to "bordered tables" - best for tables with closed solid borders.
                    tableDetector.ColumnDetectionMode = ColumnDetectionMode.BorderedTables;

                    // Only report tables that have at least 2 columns and 2 rows.
                    tableDetector.DetectionMinNumberOfColumns = 2;
                    tableDetector.DetectionMinNumberOfRows = 2;

                    // Load PDF document into both the detector and the extractor
                    tableDetector.LoadDocumentFromFile(fileName);
                    csvExtractor.LoadDocumentFromFile(fileName);

                    // Only the first page (index 0) is searched, so the page count is not needed.
                    if (tableDetector.FindTable(0))
                    {
                        // Set extraction area for CSV extractor to rectangle received from the table detector
                        csvExtractor.SetExtractionArea(tableDetector.FoundTableLocation);

                        // Generate CSV data
                        var allCsvData = csvExtractor.GetCSV();

                        // Generate Datatable
                        oDataTable = GetDataTableFromCSV(allCsvData);
                    }
                }
            }

            return oDataTable;
        }