/// <summary>
/// Extracts the data of "sample3.pdf" into "output.csv" and then opens the CSV
/// in the application associated with that file type.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The using block disposes the extractor even when loading or saving throws.
    using (CSVExtractor extractor = new CSVExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample3.pdf");

        // You can change the CSV separator symbol from "," to another one
        // if needed for non-US locales.
        //extractor.CSVSeparatorSymbol = ",";

        // Save extracted CSV data
        extractor.SaveCSVToFile("output.csv");
    }

    Console.WriteLine();
    Console.WriteLine("Data has been extracted to 'output.csv' file.");
    Console.WriteLine();
    Console.WriteLine("Press any key to continue and open CSV in default CSV viewer (or Excel)...");
    Console.ReadKey();

    // Open result document in default associated application (for demo purpose).
    // UseShellExecute must be true so the shell resolves the associated viewer.
    ProcessStartInfo processStartInfo = new ProcessStartInfo("output.csv");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Extracts a borderless table from "borderless_table.pdf" into "output.csv"
/// using explicitly defined column positions, then opens the CSV.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The using block disposes the extractor even when loading or saving throws.
    using (CSVExtractor extractor = new CSVExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("borderless_table.pdf");

        // Set extraction columns explicitly.
        // Coordinates in CustomExtractionColumns must match the left edges of the columns.
        // To get coordinates in PDF points you can use the PDF Multitool application
        // installed with the SDK. It shows mouse cursor coordinates in PDF points in the toolbar.
        extractor.CustomExtractionColumns = new double[] { 0, 124.5, 185, 241 };

        // Save extracted CSV data
        extractor.SaveCSVToFile("output.csv");
    }

    Console.WriteLine();
    Console.WriteLine("Data has been extracted to 'output.csv' file.");
    Console.WriteLine();
    Console.WriteLine("Press any key to continue and open CSV in default CSV viewer (or Excel)...");
    Console.ReadKey();

    // Open result document in default associated application (for demo purpose)
    ProcessStartInfo processStartInfo = new ProcessStartInfo("output.csv");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Extracts the data of "sample3.pdf" into "output.csv" and then opens the CSV
/// in the application associated with that file type.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The using block disposes the extractor even when loading or saving throws.
    using (CSVExtractor extractor = new CSVExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample3.pdf");

        // You can change the CSV separator symbol from "," to another one
        // if needed for non-US locales.
        //extractor.CSVSeparatorSymbol = ",";

        extractor.SaveCSVToFile("output.csv");
    }

    Console.WriteLine();
    Console.WriteLine("Data has been extracted to 'output.csv' file.");
    Console.WriteLine();
    Console.WriteLine("Press any key to continue and open CSV in default CSV viewer (or Excel)...");
    Console.ReadKey();

    // Opening a document (not an executable) requires shell execution. Set it
    // explicitly because the default of UseShellExecute differs between
    // .NET Framework (true) and .NET Core / .NET 5+ (false).
    ProcessStartInfo processStartInfo = new ProcessStartInfo("output.csv");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Extracts every page of "sample3.pdf" into its own CSV file
/// ("page0.csv", "page1.csv", ...), using zero-based page indexes in the names.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The using block disposes the extractor even when an extraction call throws.
    using (CSVExtractor extractor = new CSVExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample3.pdf");

        // You can change the CSV separator symbol from "," to another one
        // if needed for non-US locales.
        //extractor.CSVSeparatorSymbol = ",";

        // Get page count
        int pageCount = extractor.GetPageCount();

        for (int i = 0; i < pageCount; i++)
        {
            string fileName = "page" + i + ".csv";

            // Save the CSV data extracted from this page to its own file
            extractor.SavePageCSVToFile(i, fileName);
        }
    }

    Console.WriteLine();
    Console.WriteLine("Data has been extracted to separate files for pages.");
    Console.WriteLine();
    Console.WriteLine("Press any key to continue...");
    Console.ReadKey();
}
/// <summary>
/// Extracts a fixed rectangular area of "SampleGroupDisabilityForm.pdf" into
/// "result.csv" with row-based line grouping, then opens the CSV.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    var outputFile = "result.csv";

    // The using block disposes the extractor even when loading or saving throws.
    using (CSVExtractor extractor = new CSVExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(@".\SampleGroupDisabilityForm.pdf");

        // Set extraction area
        extractor.SetExtractionArea(new System.Drawing.RectangleF(27F, 324.8F, 554.3F, 358.5F));

        // Select the row-based line grouping mode
        extractor.LineGroupingMode = LineGroupingMode.GroupByRows;

        // Extract results
        extractor.SaveCSVToFile(outputFile);
    }

    // Open with default associated program
    ProcessStartInfo processStartInfo = new ProcessStartInfo(outputFile);
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Detects bordered tables on every page of "sample3.pdf", exports each one to
/// "page-{pageIndex}-table-{tableNumber}.csv" and opens the first exported file.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Name of the first CSV actually written; stays null when no table is found.
    string firstOutputFile = null;

    // The using blocks dispose both SDK objects even when detection or export throws.
    using (CSVExtractor csvExtractor = new CSVExtractor())
    using (TableDetector tableDetector = new TableDetector())
    {
        csvExtractor.RegistrationName = "demo";
        csvExtractor.RegistrationKey = "demo";

        tableDetector.RegistrationKey = "demo";
        tableDetector.RegistrationName = "demo";

        // Set table detection mode to "bordered tables" - best for tables with closed solid borders.
        tableDetector.ColumnDetectionMode = ColumnDetectionMode.BorderedTables;

        // We should define what kind of tables we should detect.
        // So we set min required number of columns to 3 ...
        tableDetector.DetectionMinNumberOfColumns = 3;
        // ... and we set min required number of rows to 3
        tableDetector.DetectionMinNumberOfRows = 3;

        // Load sample PDF document
        csvExtractor.LoadDocumentFromFile(@".\sample3.pdf");
        tableDetector.LoadDocumentFromFile(@".\sample3.pdf");

        // Get page count
        int pageCount = tableDetector.GetPageCount();

        for (int i = 0; i < pageCount; i++)
        {
            int t = 1;

            // Find first table and continue if found
            if (tableDetector.FindTable(i))
            {
                do
                {
                    // Set extraction area for CSV extractor to rectangle received from the table detector
                    csvExtractor.SetExtractionArea(tableDetector.FoundTableLocation);

                    // Export the table to CSV file
                    string outputFile = "page-" + i + "-table-" + t + ".csv";
                    csvExtractor.SavePageCSVToFile(i, outputFile);

                    if (firstOutputFile == null)
                    {
                        firstOutputFile = outputFile;
                    }

                    t++;
                }
                while (tableDetector.FindNextTable()); // search next table
            }
        }
    }

    // Open first output file in default associated application (for demo purposes).
    // Only a file that was really produced is opened: "page-0-table-1.csv" does not
    // exist when the first page contains no matching table.
    if (firstOutputFile != null)
    {
        ProcessStartInfo processStartInfo = new ProcessStartInfo(firstOutputFile);
        processStartInfo.UseShellExecute = true;
        Process.Start(processStartInfo);
    }
    else
    {
        System.Console.WriteLine("No tables were found in the document.");
    }
}
/// <summary>
/// Converts the given PDF document to a CSV file stored next to it, with the
/// same name and a ".csv" extension.
/// </summary>
/// <param name="filename">Path of the PDF document to convert.</param>
public void Convert(string filename)
{
    // Load the PDF document into the shared extractor instance
    CsvExtractor.LoadDocumentFromFile(filename);

    // You can change the CSV separator symbol from "," to another one
    // if needed for non-US locales.
    //CsvExtractor.CSVSeparatorSymbol = ",";

    // Path.ChangeExtension swaps only the real extension. The previous
    // Replace(".pdf", ".csv") was case-sensitive and rewrote every ".pdf"
    // occurrence in the path, so "x.PDF" or an extension-less name produced
    // an output path identical to the input document.
    string csvFileName = System.IO.Path.ChangeExtension(filename, ".csv");
    CsvExtractor.SaveCSVToFile(csvFileName);
}
/// <summary>
/// Detects tables on every page of "sample3.pdf", exports each one to
/// "page-{pageIndex}-table-{tableNumber}.csv" and opens the first exported file.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Name of the first CSV actually written; stays null when no table is found.
    string firstOutputFile = null;

    // The using blocks release both SDK objects, which were previously never disposed.
    using (CSVExtractor extractor = new CSVExtractor())
    using (TableDetector tdetector = new TableDetector())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        tdetector.RegistrationKey = "demo";
        tdetector.RegistrationName = "demo";

        // We should define what kind of tables we should detect,
        // so we set min required number of columns to 3 ...
        tdetector.DetectionMinNumberOfColumns = 3;
        // ... and we set min required number of rows to 3
        tdetector.DetectionMinNumberOfRows = 3;

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample3.pdf");
        tdetector.LoadDocumentFromFile("sample3.pdf");

        // Get page count
        int pageCount = tdetector.GetPageCount();

        for (int i = 0; i < pageCount; i++)
        {
            int j = 1;

            // Find first table and continue if found
            if (tdetector.FindTable(i))
            {
                do
                {
                    // Set extraction area for CSV extractor to rectangle given by table detector
                    extractor.SetExtractionArea(
                        tdetector.GetFoundTableRectangle_Left(),
                        tdetector.GetFoundTableRectangle_Top(),
                        tdetector.GetFoundTableRectangle_Width(),
                        tdetector.GetFoundTableRectangle_Height());

                    // And finally save the table into CSV file
                    string outputFile = "page-" + i + "-table-" + j + ".csv";
                    extractor.SavePageCSVToFile(i, outputFile);

                    if (firstOutputFile == null)
                    {
                        firstOutputFile = outputFile;
                    }

                    j++;
                }
                while (tdetector.FindNextTable()); // search next table
            }
        }
    }

    // Open first output file in default associated application.
    // Only a file that was really produced is opened: "page-0-table-1.csv" does not
    // exist when the first page contains no matching table.
    if (firstOutputFile != null)
    {
        System.Diagnostics.ProcessStartInfo processStartInfo =
            new System.Diagnostics.ProcessStartInfo(firstOutputFile);
        processStartInfo.UseShellExecute = true;
        System.Diagnostics.Process.Start(processStartInfo);
    }
    else
    {
        System.Console.WriteLine("No tables were found in the document.");
    }
}
//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
/// <summary>
/// Extracts the invoice line fields from the first bordered table detected on
/// the last page of the given PDF document.
/// </summary>
/// <param name="pdfPath">Path of the PDF document to read.</param>
/// <returns>
/// The result of <c>GetFieldsFromCSV</c> for the detected table, or <c>null</c>
/// when the document has no pages or no table is found on its last page.
/// </returns>
internal static List<Dictionary<string, string>> ExtractInvoiceLineFields(string pdfPath)
{
    List<Dictionary<string, string>> invoiceLineFields = null;

    // Initialise table detector and CSV extractor
    using (TableDetector tableDetector = new TableDetector("demo", "demo"))
    using (CSVExtractor csvExtractor = new CSVExtractor("demo", "demo"))
    {
        // Set table detection mode to "bordered tables" - best for tables with closed solid borders.
        tableDetector.ColumnDetectionMode = ColumnDetectionMode.BorderedTables;

        // We should define what kind of tables we should detect.
        // So we set min required number of columns to 2 ...
        tableDetector.DetectionMinNumberOfColumns = 2;
        // ... and we set min required number of rows to 1
        tableDetector.DetectionMinNumberOfRows = 1;

        // Load PDF document
        tableDetector.LoadDocumentFromFile(pdfPath);
        csvExtractor.LoadDocumentFromFile(pdfPath);

        // Get page count
        int pageCount = tableDetector.GetPageCount();

        // Guard against an empty document: without it the last-page index would be -1.
        if (pageCount > 0 && tableDetector.FindTable(pageCount - 1))
        {
            // Set extraction area for CSV extractor to rectangle received from the table detector
            csvExtractor.SetExtractionArea(tableDetector.FoundTableLocation);

            // Generate CSV data
            var allCsvData = csvExtractor.GetCSV();

            // Convert the CSV data into field dictionaries
            invoiceLineFields = GetFieldsFromCSV(allCsvData);
        }
    }

    return invoiceLineFields;
}
/// <summary>
/// Extracts the CSV data of "sample.pdf", masks social security numbers and
/// phone numbers in it, writes the result to "output.csv" and opens that file.
/// Any exception is reported on the console by its message.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    try
    {
        // Generate CSVExtractor instance
        using (CSVExtractor extractor = new CSVExtractor("demo", "demo"))
        {
            // Load PDF document
            extractor.LoadDocumentFromFile("sample.pdf");

            // Get all data
            string allData = extractor.GetCSV();

            // Regular expressions and replacements.
            // The (?<!\d) / (?!\d) lookarounds make each pattern match a complete digit
            // group only. Without them the SSN pattern consumed the first nine digits of
            // an undashed ten-digit phone number and produced "***-**-****0".
            string ssnRegex = @"(?<!\d)\d{3}-?\d{2}-?\d{4}(?!\d)";
            string ssnReplace = "***-**-****";

            string phoneRegex = @"(?<!\d)\d{3}-?\d{3}-?\d{4}(?!\d)";
            string phoneReplace = "***-***-****";

            // Find and mask SSN and phone numbers
            allData = Regex.Replace(allData, ssnRegex, ssnReplace);
            allData = Regex.Replace(allData, phoneRegex, phoneReplace);

            // Write as CSV
            File.WriteAllText("output.csv", allData);

            // Open file
            ProcessStartInfo processStartInfo = new ProcessStartInfo("output.csv");
            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
    }

    Console.WriteLine("Press enter key to close...");
    Console.ReadLine();
}
/// <summary>
/// Extracts the data of "AV42104576.pdf" into "output1.csv" and then opens the
/// CSV in the application associated with that file type.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // One name for the file that is written, announced and opened. Previously the
    // data was saved to "output1.csv" while the message and Process.Start referred
    // to "output.csv", a file this program never creates.
    string outputFile = "output1.csv";

    // The using block releases the extractor, which was previously never disposed.
    using (CSVExtractor extractor = new CSVExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("AV42104576.pdf");

        // You can change the CSV separator symbol from "," to another one
        // if needed for non-US locales.
        //extractor.CSVSeparatorSymbol = ",";

        extractor.SaveCSVToFile(outputFile);
    }

    Console.WriteLine();
    Console.WriteLine("Data has been extracted to '" + outputFile + "' file.");
    Console.WriteLine();
    Console.WriteLine("Press any key to continue and open CSV in default CSV viewer (or Excel)...");
    Console.ReadKey();

    // Opening a document (not an executable) requires shell execution.
    ProcessStartInfo processStartInfo = new ProcessStartInfo(outputFile);
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Get DataTable from Document: detects the first bordered table on the first
/// page of the document and converts its CSV data into a DataTable.
/// </summary>
/// <param name="fileName">Path of the PDF document to read.</param>
/// <returns>
/// The result of <c>GetDataTableFromCSV</c> for the detected table, or <c>null</c>
/// when the document has no pages or no table is found on its first page.
/// </returns>
private static DataTable GetDataTableFromDocument(string fileName)
{
    DataTable oDataTable = null;

    // Initialise table detector and CSV extractor
    using (TableDetector tableDetector = new TableDetector("demo", "demo"))
    using (CSVExtractor csvExtractor = new CSVExtractor("demo", "demo"))
    {
        // Set table detection mode to "bordered tables" - best for tables with closed solid borders.
        tableDetector.ColumnDetectionMode = ColumnDetectionMode.BorderedTables;

        // We should define what kind of tables we should detect.
        // So we set min required number of columns to 2 ...
        tableDetector.DetectionMinNumberOfColumns = 2;
        // ... and we set min required number of rows to 2
        tableDetector.DetectionMinNumberOfRows = 2;

        // Load PDF document
        tableDetector.LoadDocumentFromFile(fileName);
        csvExtractor.LoadDocumentFromFile(fileName);

        // Get page count
        int pageCount = tableDetector.GetPageCount();

        // Only look at the first page when the document actually has one.
        if (pageCount > 0 && tableDetector.FindTable(0))
        {
            // Set extraction area for CSV extractor to rectangle received from the table detector
            csvExtractor.SetExtractionArea(tableDetector.FoundTableLocation);

            // Generate CSV data
            var allCsvData = csvExtractor.GetCSV();

            // Generate Datatable
            oDataTable = GetDataTableFromCSV(allCsvData);
        }
    }

    return oDataTable;
}
/// <summary>
/// Exports the current viewer page (or its first selected area) to
/// "result.csv" using OCR-enabled extraction, then opens the CSV.
/// </summary>
/// <param name="sender">Event source.</param>
/// <param name="e">Event data (not used).</param>
private void tsbExportToCSV_Click(object sender, EventArgs e)
{
    // Get selections from viewer
    RectangleF[] selections = pdfViewerControl1.SelectionInPoints;

    string outputFile = @".\result.csv";

    using (CSVExtractor csvExtractor = new CSVExtractor("demo", "demo"))
    {
        // Load document into extractor
        csvExtractor.LoadDocumentFromFile(pdfViewerControl1.InputFile);

        // Enable OCR to recognize text from images
        csvExtractor.OCRMode = OCRMode.Auto;
        csvExtractor.OCRResolution = 300;
        csvExtractor.OCRLanguage = "eng";
        csvExtractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

        // There are double spaces between some words in your document.
        // To keep such words from breaking the column structure, increase the space ratio to 2.
        csvExtractor.DetectNewColumnBySpacesRatio = 2;

        // FYI, removing horizontal lines may increase the text recognition quality in some cases
        csvExtractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();
        // Another filter able to improve the recognition
        //csvExtractor.OCRImagePreprocessingFilters.AddGammaCorrection();

        // If a selection exists, set the extraction area.
        // Otherwise the whole page is extracted.
        if (selections.Length > 0)
        {
            csvExtractor.SetExtractionArea(selections[0]);
        }

        // Save extraction results to CSV file
        csvExtractor.SavePageCSVToFile(pdfViewerControl1.CurrentPageIndex, outputFile);
    }

    // Opening a document (not an executable) requires shell execution. Set it
    // explicitly because the default of UseShellExecute differs between
    // .NET Framework (true) and .NET Core / .NET 5+ (false).
    ProcessStartInfo processStartInfo = new ProcessStartInfo(outputFile);
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Converts "UnicodeSample.pdf" to a CSV file stored next to it, then loads that
/// CSV into a DataTable through the Jet OLE DB text driver and prints the row count.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Resolve the input document and derive the CSV location from it
    string inputDocument = Path.GetFullPath(@".\UnicodeSample.pdf");
    string csvFilePath = Path.ChangeExtension(inputDocument, ".csv");
    string csvDirectory = Path.GetDirectoryName(Path.GetFullPath(csvFilePath));
    string csvFileName = Path.GetFileName(csvFilePath);

    // Create Bytescout.PDFExtractor.CSVExtractor instance
    using (CSVExtractor extractor = new CSVExtractor("demo", "demo"))
    {
        extractor.LoadDocumentFromFile(inputDocument);
        extractor.CSVSeparatorSymbol = ",";

        // Save csv text in UTF-8 encoding without BOM (byte order mark):
        File.WriteAllText(csvFilePath, extractor.GetCSV());
    }

    // The text driver treats the directory as the data source and the file as a table.
    // NOTE(review): "FMT=$" is kept exactly as in the original - confirm that this is
    // the intended format value for the text driver.
    string connectionString = $@"Provider=Microsoft.Jet.OLEDB.4.0;Data Source=""{csvDirectory}"";Extended Properties=""Text;FMT=$;HDR=No;CharacterSet=65001""";
    string query = $"select * from [{csvFileName}]";

    // Please Note: Target the project to x86 because Microsoft.Jet.OLEDB.4.0 driver is 32-bit only.
    using (OleDbConnection connection = new OleDbConnection(connectionString))
    using (OleDbCommand command = new OleDbCommand(query, connection))
    using (OleDbDataAdapter adapter = new OleDbDataAdapter(command))
    {
        DataTable table = new DataTable();
        table.Locale = CultureInfo.CurrentCulture;

        adapter.Fill(table);

        Console.WriteLine($"Loaded {table.Rows.Count} lines.");
    }

    Console.ReadKey();
}
/// <summary>
/// Extracts the data of the scanned document "sample_ocr.pdf" into "output.csv"
/// with OCR enabled, then opens the CSV in the associated application.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The using block disposes the extractor even when loading or saving throws.
    using (CSVExtractor extractor = new CSVExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample_ocr.pdf");

        // Enable Optical Character Recognition (OCR)
        // in .Auto mode (SDK automatically checks if needs to use OCR or not)
        extractor.OCRMode = OCRMode.Auto;

        // Set the location of OCR language data files
        extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

        // Set OCR language:
        // "eng" for English, "deu" for German, "fra" for French, "spa" for Spanish etc -
        // according to files in "ocrdata" folder.
        // Find more language files at https://github.com/bytescout/ocrdata
        extractor.OCRLanguage = "eng";

        // Set PDF document rendering resolution
        extractor.OCRResolution = 300;

        // You can also apply various preprocessing filters
        // to improve the recognition on low-quality scans.

        // Automatically deskew skewed scans
        //extractor.OCRImagePreprocessingFilters.AddDeskew();

        // Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors)
        //extractor.OCRImagePreprocessingFilters.AddVerticalLinesRemover();
        //extractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();

        // Repair broken letters
        //extractor.OCRImagePreprocessingFilters.AddDilate();

        // Remove noise
        //extractor.OCRImagePreprocessingFilters.AddMedian();

        // Apply Gamma Correction
        //extractor.OCRImagePreprocessingFilters.AddGammaCorrection();

        // Add Contrast
        //extractor.OCRImagePreprocessingFilters.AddContrast(20);

        // (!) You can use new OCRAnalyser class to find an optimal set of image preprocessing
        // filters for your specific document.
        // See "OCR Analyser" example.

        // Save extracted CSV data to file
        extractor.SaveCSVToFile("output.csv");
    }

    // Open result document in default associated application (for demo purpose)
    ProcessStartInfo processStartInfo = new ProcessStartInfo("output.csv");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Detects tables on every page of "sample_borderless.pdf" with TableDetector2,
/// exports each one to "page-{pageNumber}-table-{tableNumber}.csv" (both numbers
/// one-based) and prints a summary of the generated files.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    var extractedCsvFiles = new List<string>();

    // Create Bytescout.PDFExtractor.CSVExtractor instance
    CSVExtractor csvExtractor = new CSVExtractor();
    // Create Bytescout.PDFExtractor.TableDetector2 instance
    TableDetector2 tableDetector = new TableDetector2();

    try
    {
        csvExtractor.RegistrationName = "demo";
        csvExtractor.RegistrationKey = "demo";

        tableDetector.RegistrationKey = "demo";
        tableDetector.RegistrationName = "demo";

        // Load sample PDF document
        csvExtractor.LoadDocumentFromFile(@".\sample_borderless.pdf");
        tableDetector.LoadDocumentFromFile(@".\sample_borderless.pdf");

        // Get page count
        int pageCount = tableDetector.GetPageCount();

        for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
        {
            var foundTables = tableDetector.FindTables(pageIndex).ToArray();

            // Export every table found on this page (the loop is a no-op when there are none)
            for (int indexTable = 0; indexTable < foundTables.Length; indexTable++)
            {
                // Set extraction area for CSV extractor to rectangle received from the table detector
                csvExtractor.SetExtractionArea(foundTables[indexTable].Bounds);

                // Result CSV file name
                var outputCsvName = $"page-{pageIndex + 1}-table-{indexTable + 1}.csv";

                // Export the table to CSV file
                csvExtractor.SavePageCSVToFile(pageIndex, outputCsvName);

                extractedCsvFiles.Add(outputCsvName);
            }
        }
    }
    finally
    {
        // Cleanup runs even when loading, detection or export throws
        csvExtractor.Dispose();
        tableDetector.Dispose();
    }

    // Show Summary
    Console.Clear();

    if (extractedCsvFiles.Count > 0)
    {
        Console.WriteLine($"Total {extractedCsvFiles.Count} tables found!");
        Console.WriteLine("--------------------------");
        Console.WriteLine(string.Join("\n", extractedCsvFiles));
    }
    else
    {
        Console.WriteLine("No Table Found!");
    }

    Console.ReadLine();
}