//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
/// <summary>
/// Looks for a bordered table on the last page of a PDF document and converts
/// the CSV text extracted from that table's area into invoice line fields.
/// </summary>
/// <param name="pdfPath">Path of the PDF document to load.</param>
/// <returns>
/// The value produced by <c>GetFieldsFromCSV</c> for the detected table,
/// or <c>null</c> when the detector reports no table.
/// </returns>
internal static List<Dictionary<string, string>> ExtractInvoiceLineFields(string pdfPath)
{
    using (TableDetector detector = new TableDetector("demo", "demo"))
    using (CSVExtractor csvExtractor = new CSVExtractor("demo", "demo"))
    {
        // "Bordered tables" mode - best for tables with closed solid borders.
        detector.ColumnDetectionMode = ColumnDetectionMode.BorderedTables;

        // Only accept tables with at least 2 columns and at least 1 row.
        detector.DetectionMinNumberOfColumns = 2;
        detector.DetectionMinNumberOfRows = 1;

        // The detector and the extractor each load the same document.
        detector.LoadDocumentFromFile(pdfPath);
        csvExtractor.LoadDocumentFromFile(pdfPath);

        // Search the page at (page count - 1), i.e. the last page
        // (assumes FindTable takes a zero-based page index - confirm against the SDK).
        int lastPageIndex = detector.GetPageCount() - 1;
        if (!detector.FindTable(lastPageIndex))
        {
            return null;
        }

        // Restrict CSV extraction to the rectangle reported by the table detector.
        csvExtractor.SetExtractionArea(detector.FoundTableLocation);
        string tableCsv = csvExtractor.GetCSV();

        return GetFieldsFromCSV(tableCsv);
    }
}
/// <summary>
/// Builds a <see cref="DataTable"/> from the bordered table detected on page index 0
/// of the given document.
/// </summary>
/// <param name="fileName">Path of the PDF document to load.</param>
/// <returns>
/// The value produced by <c>GetDataTableFromCSV</c> for the detected table,
/// or <c>null</c> when the detector reports no table on page index 0.
/// </returns>
private static DataTable GetDataTableFromDocument(string fileName)
{
    DataTable oDataTable = null;

    // Initialise table detector and CSV extractor
    using (TableDetector tableDetector = new TableDetector("demo", "demo"))
    {
        using (CSVExtractor CSVExtractor = new CSVExtractor("demo", "demo"))
        {
            // Set table detection mode to "bordered tables" - best for tables with closed solid borders.
            tableDetector.ColumnDetectionMode = ColumnDetectionMode.BorderedTables;

            // Define what kind of tables should be detected:
            // at least 2 columns ...
            tableDetector.DetectionMinNumberOfColumns = 2;
            // ... and at least 2 rows.
            tableDetector.DetectionMinNumberOfRows = 2;

            // Load PDF document into both the detector and the extractor
            tableDetector.LoadDocumentFromFile(fileName);
            CSVExtractor.LoadDocumentFromFile(fileName);

            // Only page index 0 is searched, so the page count is not needed here.
            if (tableDetector.FindTable(0))
            {
                // Set extraction area for CSV extractor to rectangle received from the table detector
                CSVExtractor.SetExtractionArea(tableDetector.FoundTableLocation);

                // Generate CSV data
                var allCsvData = CSVExtractor.GetCSV();

                // Generate DataTable
                oDataTable = GetDataTableFromCSV(allCsvData);
            }
        }
    }

    return oDataTable;
}
/// <summary>
/// Extracts "sample.pdf" as CSV text, masks SSN-like and phone-like digit groups,
/// writes the result to "output.csv" and opens that file with the shell.
/// Any exception is reported by printing its message to the console.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    try
    {
        // Generate CSVExtractor instance
        using (CSVExtractor extractor = new CSVExtractor("demo", "demo"))
        {
            // Load PDF document
            extractor.LoadDocumentFromFile("sample.pdf");

            // Get all data
            string allData = extractor.GetCSV();

            // Regular expressions and replacements
            string ssnRegex = @"\d{3}[-]?\d{2}[-]?\d{4}";
            string ssnReplace = "***-**-****";

            string phoneRegex = @"\d{3}[-]?\d{3}[-]?\d{4}";
            string phoneReplace = "***-***-****";

            // Mask phone numbers BEFORE SSNs. The SSN pattern needs only nine digits,
            // so running it first would consume the first nine digits of an undashed
            // ten-digit phone number, leaving its last digit exposed and preventing
            // the phone pattern from ever matching it.
            // Both patterns are unanchored, so longer digit runs may still be masked
            // only partially.
            allData = Regex.Replace(allData, phoneRegex, phoneReplace);
            allData = Regex.Replace(allData, ssnRegex, ssnReplace);

            // Write as CSV
            File.WriteAllText("output.csv", allData);

            // Open file with the shell's default handler
            ProcessStartInfo processStartInfo = new ProcessStartInfo("output.csv");
            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
    }

    Console.WriteLine("Press enter key to close...");
    Console.ReadLine();
}
/// <summary>
/// Converts "UnicodeSample.pdf" to a CSV file with the same base name, then reads
/// that file back through the Jet OLE DB text provider into a <see cref="DataTable"/>
/// and prints the number of loaded rows.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Resolve the input PDF and derive the CSV path, directory and file name from it.
    string pdfPath = Path.GetFullPath(@".\UnicodeSample.pdf");
    string csvFilePath = Path.ChangeExtension(pdfPath, ".csv");
    string csvDirectory = Path.GetDirectoryName(Path.GetFullPath(csvFilePath));
    string csvFileName = Path.GetFileName(csvFilePath);

    // Extract the document as comma-separated text with Bytescout.PDFExtractor.CSVExtractor.
    using (CSVExtractor extractor = new CSVExtractor("demo", "demo"))
    {
        extractor.LoadDocumentFromFile(pdfPath);
        extractor.CSVSeparatorSymbol = ",";

        // File.WriteAllText without an encoding argument writes UTF-8 without a BOM (byte order mark).
        File.WriteAllText(csvFilePath, extractor.GetCSV());
    }

    // Please note: target the project to x86 because the Microsoft.Jet.OLEDB.4.0 driver is 32-bit only.
    // The text provider takes the directory as its data source and the file name as the table name.
    string connectionString = $@"Provider=Microsoft.Jet.OLEDB.4.0;Data Source=""{csvDirectory}"";Extended Properties=""Text;FMT=$;HDR=No;CharacterSet=65001""";
    string query = $"select * from [{csvFileName}]";

    using (OleDbConnection connection = new OleDbConnection(connectionString))
    using (OleDbCommand command = new OleDbCommand(query, connection))
    using (OleDbDataAdapter adapter = new OleDbDataAdapter(command))
    {
        DataTable table = new DataTable { Locale = CultureInfo.CurrentCulture };
        adapter.Fill(table);

        Console.WriteLine($"Loaded {table.Rows.Count} lines.");
    }

    Console.ReadKey();
}