// Exports every page of "sample3.pdf" to its own CSV file ("page0.csv", "page1.csv", ...)
// using Bytescout.PDFExtractor.CSVExtractor, then waits for a key press.
static void Main(string[] args)
{
    // The using block guarantees the extractor is disposed even if loading or saving throws.
    using (CSVExtractor extractor = new CSVExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample3.pdf");

        // The CSV separator symbol can be changed from "," if needed (e.g. for non-US locales):
        // extractor.CSVSeparatorSymbol = ",";

        // Get page count
        int pageCount = extractor.GetPageCount();

        for (int i = 0; i < pageCount; i++)
        {
            string fileName = "page" + i + ".csv";

            // Save the CSV extracted from page i to its own file
            extractor.SavePageCSVToFile(i, fileName);
        }
    }

    Console.WriteLine();
    Console.WriteLine("Data has been extracted to separate files for pages.");
    Console.WriteLine();
    Console.WriteLine("Press any key to continue...");
    Console.ReadKey();
}
// Detects tables on every page of "sample3.pdf" with TableDetector (bordered-table mode),
// exports each detected table to "page-<page>-table-<n>.csv" with CSVExtractor, and finally
// opens the first exported file in its associated application (for demo purposes).
static void Main(string[] args)
{
    const string inputFile = @".\sample3.pdf";

    // Name of the first CSV file written during this run; stays null when no table is detected.
    string firstOutputFile = null;

    // Both SDK objects are released even if detection or extraction throws.
    using (CSVExtractor csvExtractor = new CSVExtractor())
    {
        csvExtractor.RegistrationName = "demo";
        csvExtractor.RegistrationKey = "demo";

        TableDetector tableDetector = new TableDetector();
        try
        {
            tableDetector.RegistrationKey = "demo";
            tableDetector.RegistrationName = "demo";

            // Set table detection mode to "bordered tables" - best for tables with closed solid borders.
            tableDetector.ColumnDetectionMode = ColumnDetectionMode.BorderedTables;

            // Define what kind of tables should be detected:
            // at least 3 columns ...
            tableDetector.DetectionMinNumberOfColumns = 3;
            // ... and at least 3 rows.
            tableDetector.DetectionMinNumberOfRows = 3;

            // Load sample PDF document into both the extractor and the detector
            csvExtractor.LoadDocumentFromFile(inputFile);
            tableDetector.LoadDocumentFromFile(inputFile);

            // Get page count
            int pageCount = tableDetector.GetPageCount();

            for (int i = 0; i < pageCount; i++)
            {
                // Table numbering restarts at 1 on every page
                int t = 1;

                // Find first table and continue if found
                if (tableDetector.FindTable(i))
                {
                    do
                    {
                        // Set extraction area for CSV extractor to rectangle received from the table detector
                        csvExtractor.SetExtractionArea(tableDetector.FoundTableLocation);

                        // Export the table to CSV file
                        string outputFile = "page-" + i + "-table-" + t + ".csv";
                        csvExtractor.SavePageCSVToFile(i, outputFile);

                        if (firstOutputFile == null)
                        {
                            firstOutputFile = outputFile;
                        }

                        t++;
                    }
                    while (tableDetector.FindNextTable()); // search next table
                }
            }
        }
        finally
        {
            tableDetector.Dispose();
        }
    }

    // Open first output file in default associated application (for demo purposes).
    // Skipped when nothing was exported: starting a non-existent file would throw.
    if (firstOutputFile != null)
    {
        ProcessStartInfo processStartInfo = new ProcessStartInfo(firstOutputFile);
        processStartInfo.UseShellExecute = true;
        Process.Start(processStartInfo);
    }
}
// Detects tables on every page of "sample3.pdf" with TableDetector, exports each detected
// table to "page-<page>-table-<n>.csv" with CSVExtractor, and finally opens the first
// exported file in its associated application.
static void Main(string[] args)
{
    // Name of the first CSV file written during this run; stays null when no table is detected.
    string firstOutputFile = null;

    // Create Bytescout.PDFExtractor.CSVExtractor instance.
    // Both SDK objects are released when done, even if detection or extraction throws.
    using (CSVExtractor extractor = new CSVExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        TableDetector tdetector = new TableDetector();
        try
        {
            tdetector.RegistrationKey = "demo";
            tdetector.RegistrationName = "demo";

            // Define what kind of tables should be detected:
            // set min required number of columns to 3 ...
            tdetector.DetectionMinNumberOfColumns = 3;
            // ... and min required number of rows to 3
            tdetector.DetectionMinNumberOfRows = 3;

            // Load sample PDF document into both the extractor and the detector
            extractor.LoadDocumentFromFile("sample3.pdf");
            tdetector.LoadDocumentFromFile("sample3.pdf");

            // Get page count
            int pageCount = tdetector.GetPageCount();

            for (int i = 0; i < pageCount; i++)
            {
                // Table numbering restarts at 1 on every page
                int j = 1;

                // Find first table and continue if found
                if (tdetector.FindTable(i))
                {
                    do
                    {
                        // Set extraction area for CSV extractor to rectangle given by table detector
                        extractor.SetExtractionArea(
                            tdetector.GetFoundTableRectangle_Left(),
                            tdetector.GetFoundTableRectangle_Top(),
                            tdetector.GetFoundTableRectangle_Width(),
                            tdetector.GetFoundTableRectangle_Height());

                        // and finally save the table into CSV file
                        string outputFile = "page-" + i + "-table-" + j + ".csv";
                        extractor.SavePageCSVToFile(i, outputFile);

                        if (firstOutputFile == null)
                        {
                            firstOutputFile = outputFile;
                        }

                        j++;
                    }
                    while (tdetector.FindNextTable()); // search next table
                }
            }
        }
        finally
        {
            tdetector.Dispose();
        }
    }

    // Open first output file in default associated application.
    // Skipped when nothing was exported: starting a non-existent file would throw.
    // UseShellExecute is set explicitly so the file is opened through its shell association
    // regardless of the platform default.
    if (firstOutputFile != null)
    {
        System.Diagnostics.ProcessStartInfo startInfo = new System.Diagnostics.ProcessStartInfo(firstOutputFile);
        startInfo.UseShellExecute = true;
        System.Diagnostics.Process.Start(startInfo);
    }
}
// Click handler for tsbExportToCSV: extracts the viewer's current page (or only the first
// selected area, when a selection exists) to ".\result.csv" with OCR enabled, then starts
// the resulting file.
private void tsbExportToCSV_Click(object sender, EventArgs e)
{
    // Areas currently selected in the viewer
    RectangleF[] selectedAreas = pdfViewerControl1.SelectionInPoints;

    const string resultPath = @".\result.csv";

    using (var ocrExtractor = new CSVExtractor("demo", "demo"))
    {
        // Load the document shown in the viewer into the extractor
        ocrExtractor.LoadDocumentFromFile(pdfViewerControl1.InputFile);

        // Enable OCR to recognize text from images
        ocrExtractor.OCRMode = OCRMode.Auto;
        ocrExtractor.OCRResolution = 300;
        ocrExtractor.OCRLanguage = "eng";
        ocrExtractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

        // Some words in the document are separated by double spaces.
        // Raising the space ratio to 2 keeps such gaps from breaking the column structure.
        ocrExtractor.DetectNewColumnBySpacesRatio = 2;

        // Removing horizontal lines may increase the text recognition quality in some cases.
        ocrExtractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();

        // Another filter that may improve recognition (disabled here):
        // ocrExtractor.OCRImagePreprocessingFilters.AddGammaCorrection();

        // Restrict extraction to the first selection when there is one;
        // otherwise the whole page is extracted.
        bool hasSelection = selectedAreas.Length > 0;
        if (hasSelection)
        {
            RectangleF firstArea = selectedAreas[0];
            ocrExtractor.SetExtractionArea(firstArea);
        }

        // Save the extraction result for the page currently displayed
        int currentPage = pdfViewerControl1.CurrentPageIndex;
        ocrExtractor.SavePageCSVToFile(currentPage, resultPath);
    }

    Process.Start(resultPath);
}
// Detects tables on every page of "sample_borderless.pdf" with TableDetector2, exports each
// detected table to "page-<page>-table-<n>.csv" (both numbers 1-based) with CSVExtractor,
// then prints a summary of the exported files and waits for Enter.
static void Main(string[] args)
{
    const string inputFile = @".\sample_borderless.pdf";

    // Names of all CSV files written during this run, in export order
    var extractedCsvFiles = new List<string>();

    // Both SDK objects are released even if detection or extraction throws.
    using (CSVExtractor csvExtractor = new CSVExtractor())
    {
        csvExtractor.RegistrationName = "demo";
        csvExtractor.RegistrationKey = "demo";

        TableDetector2 tableDetector = new TableDetector2();
        try
        {
            tableDetector.RegistrationKey = "demo";
            tableDetector.RegistrationName = "demo";

            // Load sample PDF document into both the extractor and the detector
            csvExtractor.LoadDocumentFromFile(inputFile);
            tableDetector.LoadDocumentFromFile(inputFile);

            // Get page count
            int pageCount = tableDetector.GetPageCount();

            for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
            {
                // All tables detected on this page; the loop below simply does nothing when there are none
                var foundTables = tableDetector.FindTables(pageIndex).ToArray();

                for (int indexTable = 0; indexTable < foundTables.Length; indexTable++)
                {
                    // Set extraction area for CSV extractor to rectangle received from the table detector
                    csvExtractor.SetExtractionArea(foundTables[indexTable].Bounds);

                    // Result CSV file name
                    var outputCsvName = $"page-{pageIndex + 1}-table-{indexTable + 1}.csv";

                    // Export the table to CSV file
                    csvExtractor.SavePageCSVToFile(pageIndex, outputCsvName);

                    extractedCsvFiles.Add(outputCsvName);
                }
            }
        }
        finally
        {
            tableDetector.Dispose();
        }
    }

    // Show summary
    Console.Clear();

    if (extractedCsvFiles.Count > 0)
    {
        Console.WriteLine($"Total {extractedCsvFiles.Count} tables found!");
        Console.WriteLine("--------------------------");
        Console.WriteLine(string.Join("\n", extractedCsvFiles));
    }
    else
    {
        Console.WriteLine("No Table Found!");
    }

    Console.ReadLine();
}