CSVExtractor.LoadDocumentFromFile C# (CSharp) 코드 예제들

예제 #1

0

파일 보기

파일: Program.cs 프로젝트: jboddiford/ByteScout-SDK-SourceCode

        /// <summary>
        /// Extracts the contents of "sample3.pdf" to "output.csv", waits for a key press,
        /// and then opens the CSV in the application associated with that file type.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // The using block releases the extractor even if loading or saving throws.
            using (CSVExtractor extractor = new CSVExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample3.pdf");

                // Optionally change the CSV separator from "," to another symbol (e.g. for non-US locales):
                //extractor.CSVSeparatorSymbol = ",";

                // Save extracted CSV data
                extractor.SaveCSVToFile("output.csv");
            }

            Console.WriteLine();
            Console.WriteLine("Data has been extracted to 'output.csv' file.");
            Console.WriteLine();
            Console.WriteLine("Press any key to continue and open CSV in default CSV viewer (or Excel)...");
            Console.ReadKey();

            // Open result document in default associated application (for demo purpose)
            ProcessStartInfo processStartInfo = new ProcessStartInfo("output.csv");

            // Shell execution is required to open a document rather than an executable.
            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }

예제 #2

0

파일 보기

        /// <summary>
        /// Extracts a borderless table from "borderless_table.pdf" to "output.csv" using
        /// explicitly specified column positions, then opens the CSV in its associated application.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // The using block releases the extractor even if loading or saving throws.
            using (CSVExtractor extractor = new CSVExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("borderless_table.pdf");

                // Set extraction columns explicitly.
                // Coordinates in CustomExtractionColumns must match the left edges of the columns.
                // To get coordinates in PDF points you can use PDF Multitool application
                // installed with the SDK. It shows mouse cursor coordinates in PDF points in the toolbar.
                extractor.CustomExtractionColumns = new double[] { 0, 124.5, 185, 241 };

                // Save extracted CSV data
                extractor.SaveCSVToFile("output.csv");
            }

            Console.WriteLine();
            Console.WriteLine("Data has been extracted to 'output.csv' file.");
            Console.WriteLine();
            Console.WriteLine("Press any key to continue and open CSV in default CSV viewer (or Excel)...");
            Console.ReadKey();

            // Open result document in default associated application (for demo purpose)
            ProcessStartInfo processStartInfo = new ProcessStartInfo("output.csv");

            // Shell execution is required to open a document rather than an executable.
            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }

예제 #3

0

파일 보기

파일: Program.cs 프로젝트: babylon3389/ByteScout-SDK-SourceCode

        /// <summary>
        /// Extracts the contents of "sample3.pdf" to "output.csv", waits for a key press,
        /// and then opens the CSV in the application associated with that file type.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // The using block releases the extractor even if loading or saving throws.
            using (CSVExtractor extractor = new CSVExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample3.pdf");

                // Optionally change the CSV separator from "," to another symbol (e.g. for non-US locales):
                //extractor.CSVSeparatorSymbol = ",";

                extractor.SaveCSVToFile("output.csv");
            }

            Console.WriteLine();
            Console.WriteLine("Data has been extracted to 'output.csv' file.");
            Console.WriteLine();
            Console.WriteLine("Press any key to continue and open CSV in default CSV viewer (or Excel)...");
            Console.ReadKey();

            // Open the result through the shell. Process.Start(string) only opens documents when
            // UseShellExecute is true, which is not the default on .NET Core / .NET 5+.
            ProcessStartInfo processStartInfo = new ProcessStartInfo("output.csv");

            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }

예제 #4

0

파일 보기

파일: Program.cs 프로젝트: repohoarder/ByteScout-SDK-SourceCode

        /// <summary>
        /// Extracts each page of "sample3.pdf" to its own CSV file named "page{index}.csv",
        /// where the index is zero-based.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // The using block releases the extractor even if loading or saving throws.
            using (CSVExtractor extractor = new CSVExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample3.pdf");

                // Optionally change the CSV separator from "," to another symbol (e.g. for non-US locales):
                //extractor.CSVSeparatorSymbol = ",";

                // Get page count
                int pageCount = extractor.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    string fileName = "page" + i + ".csv";

                    // Save CSV data extracted from this page to its own file
                    extractor.SavePageCSVToFile(i, fileName);
                }
            }

            Console.WriteLine();
            Console.WriteLine("Data has been extracted to separate files for pages.");
            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadKey();
        }

예제 #5

0

파일 보기

파일: Program.cs 프로젝트: bytescout/pdf-extractor-sdk-samples-c-sharp

        /// <summary>
        /// Extracts a fixed rectangular area of "SampleGroupDisabilityForm.pdf" to "result.csv"
        /// with row-based line grouping, then opens the CSV in its associated application.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            var outputFile = "result.csv";

            // The using block releases the extractor even if loading or saving throws.
            using (CSVExtractor extractor = new CSVExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\SampleGroupDisabilityForm.pdf");

                // Restrict extraction to the given rectangle
                extractor.SetExtractionArea(new System.Drawing.RectangleF(27F, 324.8F, 554.3F, 358.5F));

                // Select the row-based line grouping mode
                extractor.LineGroupingMode = LineGroupingMode.GroupByRows;

                // Save extraction results
                extractor.SaveCSVToFile(outputFile);
            }

            // Open with default associated program
            ProcessStartInfo processStartInfo = new ProcessStartInfo(outputFile);

            // Shell execution is required to open a document rather than an executable.
            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }

예제 #6

0

파일 보기

        /// <summary>
        /// Detects bordered tables (at least 3 columns and 3 rows) on every page of "sample3.pdf",
        /// exports each one to "page-{pageIndex}-table-{n}.csv", and opens the first exported file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Name of the first CSV actually written; stays null when no table is detected.
            string firstOutputFile = null;

            // The using blocks release both SDK objects even if detection or export throws.
            using (CSVExtractor csvExtractor = new CSVExtractor())
            using (TableDetector tableDetector = new TableDetector())
            {
                csvExtractor.RegistrationName = "demo";
                csvExtractor.RegistrationKey  = "demo";

                tableDetector.RegistrationKey  = "demo";
                tableDetector.RegistrationName = "demo";

                // Set table detection mode to "bordered tables" - best for tables with closed solid borders.
                tableDetector.ColumnDetectionMode = ColumnDetectionMode.BorderedTables;

                // We should define what kind of tables we should detect.
                // So we set min required number of columns to 3 ...
                tableDetector.DetectionMinNumberOfColumns = 3;
                // ... and we set min required number of rows to 3
                tableDetector.DetectionMinNumberOfRows = 3;

                // Load sample PDF document
                csvExtractor.LoadDocumentFromFile(@".\sample3.pdf");
                tableDetector.LoadDocumentFromFile(@".\sample3.pdf");

                // Get page count
                int pageCount = tableDetector.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    // Table numbering restarts at 1 on every page.
                    int t = 1;
                    // Find first table and continue if found
                    if (tableDetector.FindTable(i))
                    {
                        do
                        {
                            string outputFile = "page-" + i + "-table-" + t + ".csv";

                            // Set extraction area for CSV extractor to rectangle received from the table detector
                            csvExtractor.SetExtractionArea(tableDetector.FoundTableLocation);
                            // Export the table to CSV file
                            csvExtractor.SavePageCSVToFile(i, outputFile);

                            if (firstOutputFile == null)
                            {
                                firstOutputFile = outputFile;
                            }

                            t++;
                        }
                        while (tableDetector.FindNextTable()); // search next table
                    }
                }
            }

            if (firstOutputFile == null)
            {
                // Nothing was exported, so there is no file to open.
                System.Console.WriteLine("No tables were found in the document.");
                return;
            }

            // Open first output file in default associated application (for demo purposes)
            ProcessStartInfo processStartInfo = new ProcessStartInfo(firstOutputFile);

            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }

예제 #7

0

파일 보기

파일: ConvertPdftoCsv.cs 프로젝트: Dartasien/ExtractBatchReports

    /// <summary>
    /// Converts the PDF document at <paramref name="filename"/> to CSV and saves it under the
    /// same path with the extension changed to ".csv".
    /// </summary>
    /// <param name="filename">Path of the PDF document to convert.</param>
    /// <remarks>
    /// Uses the <c>CsvExtractor</c> member declared outside this method; it is presumably an
    /// already-registered CSVExtractor instance (verify against the enclosing class).
    /// </remarks>
    public void Convert(string filename)
    {
        // Load the PDF document
        CsvExtractor.LoadDocumentFromFile(filename);

        // Optionally change the CSV separator from "," to another symbol (e.g. for non-US locales):
        //CsvExtractor.CSVSeparatorSymbol = ",";

        // Change only the extension. A plain string Replace(".pdf", ".csv") would also rewrite
        // ".pdf" inside directory names and would leave the path unchanged for ".PDF" or a
        // missing extension, which would make the CSV overwrite the input file.
        CsvExtractor.SaveCSVToFile(System.IO.Path.ChangeExtension(filename, ".csv"));
    }

예제 #8

0

파일 보기

파일: Program.cs 프로젝트: remlex/ByteScout-SDK-SourceCode

        /// <summary>
        /// Detects tables (at least 3 columns and 3 rows) on every page of "sample3.pdf",
        /// exports each one to "page-{pageIndex}-table-{n}.csv", and opens the first exported file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Name of the first CSV actually written; stays null when no table is detected.
            string firstOutputFile = null;

            // The using blocks release both SDK objects, including when detection or export throws.
            using (CSVExtractor extractor = new CSVExtractor())
            using (TableDetector tdetector = new TableDetector())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                tdetector.RegistrationKey  = "demo";
                tdetector.RegistrationName = "demo";

                // we should define what kind of tables we should detect
                // so we set min required number of columns to 3
                tdetector.DetectionMinNumberOfColumns = 3;

                // and we set min required number of rows to 3
                tdetector.DetectionMinNumberOfRows = 3;

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample3.pdf");
                tdetector.LoadDocumentFromFile("sample3.pdf");

                // Get page count
                int pageCount = tdetector.GetPageCount();

                for (int i = 0; i < pageCount; i++)
                {
                    // Table numbering restarts at 1 on every page.
                    int j = 1;
                    // find first table and continue if found
                    if (tdetector.FindTable(i))
                    {
                        do
                        {
                            string outputFile = "page-" + i + "-table-" + j + ".csv";

                            // set extraction area for CSV extractor to rectangle given by table detector
                            extractor.SetExtractionArea(tdetector.GetFoundTableRectangle_Left(),
                                                        tdetector.GetFoundTableRectangle_Top(),
                                                        tdetector.GetFoundTableRectangle_Width(),
                                                        tdetector.GetFoundTableRectangle_Height()
                                                        );

                            // and finally save the table into CSV file
                            extractor.SavePageCSVToFile(i, outputFile);

                            if (firstOutputFile == null)
                            {
                                firstOutputFile = outputFile;
                            }

                            j++;
                        } while (tdetector.FindNextTable()); // search next table
                    }
                }
            }

            if (firstOutputFile == null)
            {
                // Nothing was exported, so there is no file to open.
                System.Console.WriteLine("No tables were found in the document.");
                return;
            }

            // Open first output file in default associated application.
            // UseShellExecute must be true to open a document; it is not the default on .NET Core / .NET 5+.
            System.Diagnostics.ProcessStartInfo processStartInfo = new System.Diagnostics.ProcessStartInfo(firstOutputFile);

            processStartInfo.UseShellExecute = true;
            System.Diagnostics.Process.Start(processStartInfo);
        }

예제 #9

0

파일 보기

        //------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        /// <summary>
        /// Finds the first bordered table on the last page of a PDF, extracts it as CSV, and
        /// converts that CSV into field dictionaries via <c>GetFieldsFromCSV</c>.
        /// </summary>
        /// <param name="pdfPath">Path of the PDF document to read.</param>
        /// <returns>
        /// The result of <c>GetFieldsFromCSV</c> for the detected table, or <c>null</c> when the
        /// document has no pages or no matching table is found on its last page.
        /// </returns>
        internal static List <Dictionary <string, string> > ExtractInvoiceLineFields(string pdfPath)
        {
            List <Dictionary <string, string> > invoiceLineFields = null;

            // Initialise table detector and CSV extractor; both are released when the block exits
            using (TableDetector tableDetector = new TableDetector("demo", "demo"))
            using (CSVExtractor csvExtractor = new CSVExtractor("demo", "demo"))
            {
                // Set table detection mode to "bordered tables" - best for tables with closed solid borders.
                tableDetector.ColumnDetectionMode = ColumnDetectionMode.BorderedTables;

                // We should define what kind of tables we should detect.
                // So we set min required number of columns to 2 ...
                tableDetector.DetectionMinNumberOfColumns = 2;
                // ... and we set min required number of rows to 1
                tableDetector.DetectionMinNumberOfRows = 1;

                // Load PDF document
                tableDetector.LoadDocumentFromFile(pdfPath);
                csvExtractor.LoadDocumentFromFile(pdfPath);

                // Get page count
                int pageCount = tableDetector.GetPageCount();

                // Only the last page is searched; skip the search for an empty document so
                // FindTable is never called with a negative page index.
                if (pageCount > 0 && tableDetector.FindTable(pageCount - 1))
                {
                    // Set extraction area for CSV extractor to rectangle received from the table detector
                    csvExtractor.SetExtractionArea(tableDetector.FoundTableLocation);

                    // Generate CSV data
                    var allCsvData = csvExtractor.GetCSV();

                    // Convert the CSV text into field dictionaries
                    invoiceLineFields = GetFieldsFromCSV(allCsvData);
                }
            }

            return(invoiceLineFields);
        }

예제 #10

0

파일 보기

파일: Program.cs 프로젝트: jboddiford/ByteScout-SDK-SourceCode

        /// <summary>
        /// Extracts "sample.pdf" as CSV text, masks SSN-like and phone-number-like digit groups,
        /// writes the result to "output.csv", and opens it in the associated application.
        /// Any exception is reported by printing its message to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            try
            {
                // Generate CSVExtractor instance
                using (CSVExtractor extractor = new CSVExtractor("demo", "demo"))
                {
                    // Load PDF document
                    extractor.LoadDocumentFromFile("sample.pdf");

                    // Get all data
                    string allData = extractor.GetCSV();

                    // Regular expressions and replacements.
                    // The (?<!\d) / (?!\d) lookarounds stop a pattern from matching inside a longer
                    // run of digits. Without them the 9-digit SSN pattern matches the first nine
                    // digits of an undashed 10-digit phone number and leaves the last digit exposed.
                    string ssnRegex   = @"(?<!\d)\d{3}[-]?\d{2}[-]?\d{4}(?!\d)";
                    string ssnReplace = "***-**-****";

                    string phoneRegex   = @"(?<!\d)\d{3}[-]?\d{3}[-]?\d{4}(?!\d)";
                    string phoneReplace = "***-***-****";

                    // Find and mask SSN and phone numbers
                    allData = Regex.Replace(allData, ssnRegex, ssnReplace);
                    allData = Regex.Replace(allData, phoneRegex, phoneReplace);

                    // Write as CSV
                    File.WriteAllText("output.csv", allData);

                    // Open file
                    ProcessStartInfo processStartInfo = new ProcessStartInfo("output.csv");
                    processStartInfo.UseShellExecute = true;
                    Process.Start(processStartInfo);
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }

            Console.WriteLine("Press enter key to close...");
            Console.ReadLine();
        }

예제 #11

0

파일 보기

        /// <summary>
        /// Extracts the contents of "AV42104576.pdf" to "output1.csv", waits for a key press,
        /// and then opens that CSV in the application associated with the file type.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Single source of truth for the output name, so the file that is written, the file
            // that is reported and the file that is opened cannot drift apart.
            string outputFile = "output1.csv";

            // The using block releases the extractor even if loading or saving throws.
            using (CSVExtractor extractor = new CSVExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("AV42104576.pdf");

                // Optionally change the CSV separator from "," to another symbol (e.g. for non-US locales):
                //extractor.CSVSeparatorSymbol = ",";

                extractor.SaveCSVToFile(outputFile);
            }

            Console.WriteLine();
            Console.WriteLine("Data has been extracted to '" + outputFile + "' file.");
            Console.WriteLine();
            Console.WriteLine("Press any key to continue and open CSV in default CSV viewer (or Excel)...");
            Console.ReadKey();

            // Open the result through the shell. UseShellExecute must be true to open a document;
            // it is not the default on .NET Core / .NET 5+.
            ProcessStartInfo processStartInfo = new ProcessStartInfo(outputFile);

            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }

예제 #12

0

파일 보기

파일: Program.cs 프로젝트: repohoarder/ByteScout-SDK-SourceCode

        /// <summary>
        /// Finds the first bordered table on the first page of a PDF, extracts it as CSV, and
        /// converts that CSV into a <see cref="DataTable"/> via <c>GetDataTableFromCSV</c>.
        /// </summary>
        /// <param name="fileName">Path of the PDF document to read.</param>
        /// <returns>
        /// The result of <c>GetDataTableFromCSV</c> for the detected table, or <c>null</c> when the
        /// document has no pages or no matching table is found on its first page.
        /// </returns>
        private static DataTable GetDataTableFromDocument(string fileName)
        {
            DataTable oDataTable = null;

            // Initialise table detector and CSV extractor; both are released when the block exits
            using (TableDetector tableDetector = new TableDetector("demo", "demo"))
            using (CSVExtractor csvExtractor = new CSVExtractor("demo", "demo"))
            {
                // Set table detection mode to "bordered tables" - best for tables with closed solid borders.
                tableDetector.ColumnDetectionMode = ColumnDetectionMode.BorderedTables;

                // We should define what kind of tables we should detect.
                // So we set min required number of columns to 2 ...
                tableDetector.DetectionMinNumberOfColumns = 2;
                // ... and we set min required number of rows to 2
                tableDetector.DetectionMinNumberOfRows = 2;

                // Load PDF document
                tableDetector.LoadDocumentFromFile(fileName);
                csvExtractor.LoadDocumentFromFile(fileName);

                // Get page count
                int pageCount = tableDetector.GetPageCount();

                // Only the first page is searched; skip the search entirely for an empty document.
                if (pageCount > 0 && tableDetector.FindTable(0))
                {
                    // Set extraction area for CSV extractor to rectangle received from the table detector
                    csvExtractor.SetExtractionArea(tableDetector.FoundTableLocation);

                    // Generate CSV data
                    var allCsvData = csvExtractor.GetCSV();

                    // Generate Datatable
                    oDataTable = GetDataTableFromCSV(allCsvData);
                }
            }

            return(oDataTable);
        }

예제 #13

0

파일 보기

파일: Form1.cs 프로젝트: bytescout/bytescout-showcases

        /// <summary>
        /// Click handler that exports the viewer's current page to ".\result.csv" with OCR enabled
        /// and opens the resulting file. When the viewer reports at least one selection, only the
        /// first selected rectangle is extracted.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void tsbExportToCSV_Click(object sender, EventArgs e)
        {
            // Get selections from viewer
            RectangleF[] selections = pdfViewerControl1.SelectionInPoints;

            string outputFile = @".\result.csv";

            using (CSVExtractor csvExtractor = new CSVExtractor("demo", "demo"))
            {
                // Load document into extractor
                csvExtractor.LoadDocumentFromFile(pdfViewerControl1.InputFile);

                // Enable OCR to recognize text from images
                csvExtractor.OCRMode               = OCRMode.Auto;
                csvExtractor.OCRResolution         = 300;
                csvExtractor.OCRLanguage           = "eng";
                csvExtractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

                // There are double spaces between some words in your document.
                // To keep such words from breaking the column structure, increase the space ratio to 2.
                csvExtractor.DetectNewColumnBySpacesRatio = 2;

                // FYI, removing horizontal lines may increase the text recognition quality in some cases
                csvExtractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();
                // Another filter able to improve the recognition
                //csvExtractor.OCRImagePreprocessingFilters.AddGammaCorrection();

                // If a selection exists, set the extraction area.
                // Otherwise the whole page is extracted.
                // The null check is defensive: whether the viewer can return null here is not
                // visible from this method.
                if (selections != null && selections.Length > 0)
                {
                    csvExtractor.SetExtractionArea(selections[0]);
                }

                // Save extraction results to CSV files
                csvExtractor.SavePageCSVToFile(pdfViewerControl1.CurrentPageIndex, outputFile);
            }

            // Open the result through the shell. UseShellExecute must be true to open a document;
            // it is not the default on .NET Core / .NET 5+.
            ProcessStartInfo processStartInfo = new ProcessStartInfo(outputFile);

            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }

예제 #14

0

파일 보기

파일: Program.cs 프로젝트: bytescout/pdf-extractor-sdk-samples-c-sharp

        /// <summary>
        /// Extracts "UnicodeSample.pdf" to a CSV file next to it, then loads that CSV into a
        /// <see cref="DataTable"/> through the Jet OLE DB text driver and prints the row count.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Resolve the input, the CSV path beside it, and the folder/file parts the text driver needs.
            string pdfPath   = Path.GetFullPath(@".\UnicodeSample.pdf");
            string csvPath   = Path.ChangeExtension(pdfPath, ".csv");
            string csvFolder = Path.GetDirectoryName(Path.GetFullPath(csvPath));
            string csvName   = Path.GetFileName(csvPath);

            // Extract the document to CSV text
            using (CSVExtractor pdfToCsv = new CSVExtractor("demo", "demo"))
            {
                pdfToCsv.LoadDocumentFromFile(inputDocument: pdfPath);
                pdfToCsv.CSVSeparatorSymbol = ",";

                // File.WriteAllText without an explicit encoding writes UTF-8 without BOM (byte order mark)
                File.WriteAllText(csvPath, pdfToCsv.GetCSV());
            }

            // Please Note: Target the project to x86 because Microsoft.Jet.OLEDB.4.0 driver is 32-bit only.
            string connectionString = $@"Provider=Microsoft.Jet.OLEDB.4.0;Data Source=""{csvFolder}"";Extended Properties=""Text;FMT=$;HDR=No;CharacterSet=65001""";
            string query = $"select * from [{csvName}]";

            using (OleDbConnection jetConnection = new OleDbConnection(connectionString))
            using (OleDbCommand selectAll = new OleDbCommand(query, jetConnection))
            using (OleDbDataAdapter filler = new OleDbDataAdapter(selectAll))
            {
                DataTable loaded = new DataTable();
                loaded.Locale = CultureInfo.CurrentCulture;
                filler.Fill(loaded);

                Console.WriteLine($"Loaded {loaded.Rows.Count} lines.");
            }

            Console.ReadKey();
        }

예제 #15

0

파일 보기

파일: Program.cs 프로젝트: bytescout/data-extraction-suite-samples-c-sharp

        /// <summary>
        /// Extracts "sample_ocr.pdf" to "output.csv" with OCR enabled in automatic mode, then
        /// opens the CSV in the application associated with that file type.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // The using block releases the extractor even if loading, OCR or saving throws.
            using (CSVExtractor extractor = new CSVExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample_ocr.pdf");

                // Enable Optical Character Recognition (OCR)
                // in .Auto mode (SDK automatically checks if needs to use OCR or not)
                extractor.OCRMode = OCRMode.Auto;

                // Set the location of OCR language data files
                extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

                // Set OCR language
                extractor.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
                // Find more language files at https://github.com/bytescout/ocrdata

                // Set PDF document rendering resolution
                extractor.OCRResolution = 300;


                // You can also apply various preprocessing filters
                // to improve the recognition on low-quality scans.

                // Automatically deskew skewed scans
                //extractor.OCRImagePreprocessingFilters.AddDeskew();

                // Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors)
                //extractor.OCRImagePreprocessingFilters.AddVerticalLinesRemover();
                //extractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();

                // Repair broken letters
                //extractor.OCRImagePreprocessingFilters.AddDilate();

                // Remove noise
                //extractor.OCRImagePreprocessingFilters.AddMedian();

                // Apply Gamma Correction
                //extractor.OCRImagePreprocessingFilters.AddGammaCorrection();

                // Add Contrast
                //extractor.OCRImagePreprocessingFilters.AddContrast(20);


                // (!) You can use new OCRAnalyser class to find an optimal set of image preprocessing
                // filters for your specific document.
                // See "OCR Analyser" example.


                // Save extracted CSV data to file
                extractor.SaveCSVToFile("output.csv");
            }

            // Open result document in default associated application (for demo purpose)
            ProcessStartInfo processStartInfo = new ProcessStartInfo("output.csv");

            // Shell execution is required to open a document rather than an executable.
            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }

예제 #16

0

파일 보기

파일: Program.cs 프로젝트: bytescout/pdf-extractor-sdk-samples-c-sharp

        /// <summary>
        /// Detects tables on every page of "sample_borderless.pdf" with <c>TableDetector2</c>,
        /// exports each one to "page-{page}-table-{n}.csv" (both numbers 1-based), and prints a
        /// summary listing the files that were written.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            var extractedCsvFiles = new List <string>();

            // The using block releases the extractor even if detection or export throws.
            using (CSVExtractor csvExtractor = new CSVExtractor())
            {
                csvExtractor.RegistrationName = "demo";
                csvExtractor.RegistrationKey  = "demo";

                // Create Bytescout.PDFExtractor.TableDetector2 instance.
                // try/finally is used so that Dispose() is called on every path.
                TableDetector2 tableDetector = new TableDetector2();

                try
                {
                    tableDetector.RegistrationKey  = "demo";
                    tableDetector.RegistrationName = "demo";

                    // Load sample PDF document
                    csvExtractor.LoadDocumentFromFile(@".\sample_borderless.pdf");
                    tableDetector.LoadDocumentFromFile(@".\sample_borderless.pdf");

                    // Get page count
                    int pageCount = tableDetector.GetPageCount();

                    for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
                    {
                        // Materialize once so the detection result is not enumerated repeatedly.
                        var foundTables = tableDetector.FindTables(pageIndex).ToArray();

                        // The loop body simply does not run when no tables were found on the page.
                        for (int indexTable = 0; indexTable < foundTables.Length; indexTable++)
                        {
                            // Set extraction area for CSV extractor to rectangle received from the table detector
                            csvExtractor.SetExtractionArea(foundTables[indexTable].Bounds);

                            // Result CSV file name
                            var outputCsvName = $"page-{pageIndex + 1}-table-{indexTable + 1}.csv";

                            // Export the table to CSV file
                            csvExtractor.SavePageCSVToFile(pageIndex, outputCsvName);
                            extractedCsvFiles.Add(outputCsvName);
                        }
                    }
                }
                finally
                {
                    tableDetector.Dispose();
                }
            }

            // Show Summary
            Console.Clear();
            if (extractedCsvFiles.Count > 0)
            {
                Console.WriteLine($"Total {extractedCsvFiles.Count} tables found!");
                Console.WriteLine("--------------------------");
                Console.WriteLine(string.Join("\n", extractedCsvFiles));
            }
            else
            {
                Console.WriteLine("No Table Found!");
            }

            Console.ReadLine();
        }

C# (CSharp) CSVExtractor.LoadDocumentFromFile 예제들