/// <summary>
/// Exports every page of <c>sample3.pdf</c> to its own CSV file
/// (<c>page0.csv</c>, <c>page1.csv</c>, ...) and waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
        {
            // The using statement guarantees the extractor is disposed even if
            // loading the document or saving a page throws.
            using (CSVExtractor extractor = new CSVExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample3.pdf");

                // If needed (e.g. for non-US locales), the CSV separator can be changed
                // from "," to another symbol via extractor.CSVSeparatorSymbol.

                // Get page count
                int pageCount = extractor.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    string fileName = "page" + i + ".csv";

                    // Save the page content as CSV to its own file
                    extractor.SavePageCSVToFile(i, fileName);
                }
            }

            Console.WriteLine();
            Console.WriteLine("Data has been extracted to separate files for pages.");
            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadKey();
        }
Example #2
0
        /// <summary>
        /// Detects bordered tables on every page of <c>sample3.pdf</c>, exports each table
        /// to a separate CSV file named <c>page-{page}-table-{n}.csv</c>, and then opens the
        /// first exported file in its associated application.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Name of the first CSV file actually written; stays null when no table is found,
            // so we never try to open a file that does not exist.
            string firstOutputFile = null;

            // Create Bytescout.PDFExtractor.CSVExtractor instance; the using statement
            // disposes it even if detection or export throws.
            using (CSVExtractor csvExtractor = new CSVExtractor())
            {
                csvExtractor.RegistrationName = "demo";
                csvExtractor.RegistrationKey  = "demo";

                // Create Bytescout.PDFExtractor.TableDetector instance
                TableDetector tableDetector = new TableDetector();
                try
                {
                    tableDetector.RegistrationKey  = "demo";
                    tableDetector.RegistrationName = "demo";

                    // Set table detection mode to "bordered tables" - best for tables with closed solid borders.
                    tableDetector.ColumnDetectionMode = ColumnDetectionMode.BorderedTables;

                    // We should define what kind of tables we should detect.
                    // So we set min required number of columns to 3 ...
                    tableDetector.DetectionMinNumberOfColumns = 3;
                    // ... and we set min required number of rows to 3
                    tableDetector.DetectionMinNumberOfRows = 3;

                    // Load sample PDF document into both the extractor and the detector
                    csvExtractor.LoadDocumentFromFile(@".\sample3.pdf");
                    tableDetector.LoadDocumentFromFile(@".\sample3.pdf");

                    // Get page count
                    int pageCount = tableDetector.GetPageCount();

                    for (int i = 0; i < pageCount; i++)
                    {
                        // 1-based table counter, restarted for every page
                        int t = 1;

                        // Find first table and continue if found
                        if (tableDetector.FindTable(i))
                        {
                            do
                            {
                                // Set extraction area for CSV extractor to rectangle received from the table detector
                                csvExtractor.SetExtractionArea(tableDetector.FoundTableLocation);

                                // Export the table to CSV file
                                string fileName = "page-" + i + "-table-" + t + ".csv";
                                csvExtractor.SavePageCSVToFile(i, fileName);

                                if (firstOutputFile == null)
                                {
                                    firstOutputFile = fileName;
                                }

                                t++;
                            } while (tableDetector.FindNextTable()); // search next table
                        }
                    }
                }
                finally
                {
                    // Cleanup
                    tableDetector.Dispose();
                }
            }

            // Open first output file in default associated application (for demo purposes),
            // but only if at least one table was exported.
            if (firstOutputFile != null)
            {
                ProcessStartInfo processStartInfo = new ProcessStartInfo(firstOutputFile);

                processStartInfo.UseShellExecute = true;
                Process.Start(processStartInfo);
            }
        }
        /// <summary>
        /// Detects tables on every page of <c>sample3.pdf</c>, exports each table to a
        /// separate CSV file named <c>page-{page}-table-{n}.csv</c>, and then opens the
        /// first exported file in its associated application.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Name of the first CSV file actually written; stays null when no table is found,
            // so we never try to open a file that does not exist.
            string firstOutputFile = null;

            // Create Bytescout.PDFExtractor.CSVExtractor instance; the using statement
            // disposes it even if detection or export throws.
            using (CSVExtractor extractor = new CSVExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Create Bytescout.PDFExtractor.TableDetector instance
                TableDetector tdetector = new TableDetector();
                try
                {
                    tdetector.RegistrationKey  = "demo";
                    tdetector.RegistrationName = "demo";

                    // we should define what kind of tables we should detect
                    // so we set min required number of columns to 3
                    tdetector.DetectionMinNumberOfColumns = 3;

                    // and we set min required number of rows to 3
                    tdetector.DetectionMinNumberOfRows = 3;

                    // Load sample PDF document into both the extractor and the detector
                    extractor.LoadDocumentFromFile("sample3.pdf");
                    tdetector.LoadDocumentFromFile("sample3.pdf");

                    // Get page count
                    int pageCount = tdetector.GetPageCount();

                    for (int i = 0; i < pageCount; i++)
                    {
                        // 1-based table counter, restarted for every page
                        int j = 1;

                        // find first table and continue if found
                        if (tdetector.FindTable(i))
                        {
                            do
                            {
                                // set extraction area for CSV extractor to rectangle given by table detector
                                extractor.SetExtractionArea(tdetector.GetFoundTableRectangle_Left(),
                                                            tdetector.GetFoundTableRectangle_Top(),
                                                            tdetector.GetFoundTableRectangle_Width(),
                                                            tdetector.GetFoundTableRectangle_Height()
                                                            );

                                // and finally save the table into CSV file
                                string fileName = "page-" + i + "-table-" + j + ".csv";
                                extractor.SavePageCSVToFile(i, fileName);

                                if (firstOutputFile == null)
                                {
                                    firstOutputFile = fileName;
                                }

                                j++;
                            } while (tdetector.FindNextTable()); // search next table
                        }
                    }
                }
                finally
                {
                    // Cleanup
                    tdetector.Dispose();
                }
            }

            // Open first output file in default associated application,
            // but only if at least one table was exported.
            if (firstOutputFile != null)
            {
                System.Diagnostics.Process.Start(firstOutputFile);
            }
        }
        /// <summary>
        /// Click handler that exports the viewer's current page (or its first selected
        /// area, when a selection exists) to <c>.\result.csv</c> with OCR enabled, then
        /// opens the resulting file.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void tsbExportToCSV_Click(object sender, EventArgs e)
        {
            // Selection rectangles currently reported by the viewer
            RectangleF[] selectedAreas = pdfViewerControl1.SelectionInPoints;

            const string csvPath = @".\result.csv";

            using (CSVExtractor extractor = new CSVExtractor("demo", "demo"))
            {
                // Feed the extractor the same document the viewer is showing
                extractor.LoadDocumentFromFile(pdfViewerControl1.InputFile);

                // Turn on OCR so that text inside images is recognized as well
                extractor.OCRMode               = OCRMode.Auto;
                extractor.OCRResolution         = 300;
                extractor.OCRLanguage           = "eng";
                extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

                // Some words in the document are separated by double spaces; raising the
                // space ratio to 2 keeps such gaps from being treated as column breaks.
                extractor.DetectNewColumnBySpacesRatio = 2;

                // Removing horizontal lines may improve recognition quality in some cases.
                // A gamma-correction filter (AddGammaCorrection) is another option that
                // can help, but it is not enabled here.
                extractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();

                // Restrict extraction to the first selection when there is one;
                // otherwise the whole page is extracted.
                if (selectedAreas.Length != 0)
                {
                    extractor.SetExtractionArea(selectedAreas[0]);
                }

                // Write the extraction result for the page currently shown in the viewer
                extractor.SavePageCSVToFile(pdfViewerControl1.CurrentPageIndex, csvPath);
            }

            Process.Start(csvPath);
        }
        /// <summary>
        /// Detects tables on every page of <c>sample_borderless.pdf</c> with
        /// <c>TableDetector2</c>, exports each table to a CSV file named
        /// <c>page-{page}-table-{n}.csv</c> (both numbers 1-based), and prints a summary
        /// of the exported files to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Names of all CSV files written, in export order
            var extractedCsvFiles = new List <string>();

            // Create Bytescout.PDFExtractor.CSVExtractor instance; the using statement
            // disposes it even if detection or export throws.
            using (CSVExtractor csvExtractor = new CSVExtractor())
            {
                csvExtractor.RegistrationName = "demo";
                csvExtractor.RegistrationKey  = "demo";

                // Create Bytescout.PDFExtractor.TableDetector2 instance
                TableDetector2 tableDetector = new TableDetector2();
                try
                {
                    tableDetector.RegistrationKey  = "demo";
                    tableDetector.RegistrationName = "demo";

                    // Load sample PDF document into both the extractor and the detector
                    csvExtractor.LoadDocumentFromFile(@".\sample_borderless.pdf");
                    tableDetector.LoadDocumentFromFile(@".\sample_borderless.pdf");

                    // Get page count
                    int pageCount = tableDetector.GetPageCount();

                    for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
                    {
                        // All tables detected on this page; the loop below simply
                        // does nothing when the array is empty.
                        var foundTables = tableDetector.FindTables(pageIndex).ToArray();

                        for (int indexTable = 0; indexTable < foundTables.Length; indexTable++)
                        {
                            // Set extraction area for CSV extractor to rectangle received from the table detector
                            csvExtractor.SetExtractionArea(foundTables[indexTable].Bounds);

                            // Result CSV file name (page and table numbers are 1-based)
                            var outputCsvName = $"page-{pageIndex + 1}-table-{indexTable + 1}.csv";

                            // Export the table to CSV file
                            csvExtractor.SavePageCSVToFile(pageIndex, outputCsvName);
                            extractedCsvFiles.Add(outputCsvName);
                        }
                    }
                }
                finally
                {
                    // Cleanup
                    tableDetector.Dispose();
                }
            }

            // Show Summary
            Console.Clear();
            if (extractedCsvFiles.Count > 0)
            {
                Console.WriteLine($"Total {extractedCsvFiles.Count} tables found!");
                Console.WriteLine("--------------------------");
                Console.WriteLine(string.Join("\n", extractedCsvFiles));
            }
            else
            {
                Console.WriteLine("No Table Found!");
            }

            Console.ReadLine();
        }