/// <summary>
/// Prints the value of every structured cell found on every page of
/// ".\sample3.pdf" to the console, one cell per line.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Create Bytescout.PDFExtractor.StructuredExtractor instance.
    // The extractor is disposable, so release it as soon as extraction is finished.
    using (StructuredExtractor extractor = new StructuredExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(@".\sample3.pdf");

        // The page count does not change while iterating, so query it once.
        int pageCount = extractor.GetPageCount();

        for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
        {
            Console.WriteLine("Starting extraction from page #" + pageIndex);
            Console.WriteLine();

            // The structure has to be prepared per page before rows/columns are queried.
            extractor.PrepareStructure(pageIndex);

            int rowCount = extractor.GetRowCount(pageIndex);

            for (int row = 0; row < rowCount; row++)
            {
                // Column count is queried per row because it is a per-row value in this API.
                int columnCount = extractor.GetColumnCount(pageIndex, row);

                for (int col = 0; col < columnCount; col++)
                {
                    Console.WriteLine(extractor.GetCellValue(pageIndex, row, col));
                }
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key..");
    Console.ReadKey();
}
/// <summary>
/// Prints the value of every structured cell found on every page of
/// "sample3.pdf" to the console, one cell per line.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Create Bytescout.PDFExtractor.StructuredExtractor instance (former TableExtractor).
    // The extractor is disposable, so release it as soon as extraction is finished.
    using (StructuredExtractor extractor = new StructuredExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample3.pdf");

        // The page count does not change while iterating, so query it once.
        int pageCount = extractor.GetPageCount();

        for (int ipage = 0; ipage < pageCount; ipage++)
        {
            Console.WriteLine("starting extraction from page #" + ipage);

            // The structure has to be prepared per page before rows/columns are queried.
            extractor.PrepareStructure(ipage);

            int rowCount = extractor.GetRowCount(ipage);

            for (int row = 0; row < rowCount; row++)
            {
                int columnCount = extractor.GetColumnCount(ipage, row);

                for (int col = 0; col < columnCount; col++)
                {
                    Console.WriteLine(extractor.GetCellValue(ipage, row, col));
                }
            }
        }
    }

    Console.WriteLine("Press any key..");
    Console.ReadKey();
}
/// <summary>
/// Extracts the table rows (title row excluded) from the first page of each
/// sample invoice PDF and appends them, prefixed with the invoice file name,
/// below the existing rows of a Google spreadsheet.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    var allInputFiles = new string[] { "Sample_Files\\InvoiceMar.pdf", "Sample_Files\\InvoiceApr.pdf" };

    // Google Sheet data to write
    var reqRowData = new List<RowData>();

    // Set page index
    var pageIndex = 0;

    // Loop through all input files
    foreach (var itmFile in allInputFiles)
    {
        var invName = Path.GetFileNameWithoutExtension(itmFile);

        // Create StructuredExtractor instance
        using (var tableExtractor = new StructuredExtractor("demo", "demo"))
        {
            var tableExtractionArea = GetTableExtractionArea(itmFile);

            // Files for which no table area is reported are skipped.
            if (!tableExtractionArea.HasValue)
            {
                continue;
            }

            // Load document from file
            tableExtractor.LoadDocumentFromFile(itmFile);

            // Set extraction area
            tableExtractor.SetExtractionArea(tableExtractionArea.Value);

            // Prepare table structure
            tableExtractor.PrepareStructure(pageIndex);

            int rowCount = tableExtractor.GetRowCount(pageIndex);

            // Ignoring first title row
            for (int row = 1; row < rowCount; row++)
            {
                int columnCount = tableExtractor.GetColumnCount(pageIndex, row);

                // Google Sheet CellData input; the first cell of every row is the invoice name
                var lstCellData = new List<CellData>
                {
                    new CellData { UserEnteredValue = new ExtendedValue() { StringValue = invName } }
                };

                for (int col = 0; col < columnCount; col++)
                {
                    // Get table cell value
                    var tableCellValue = tableExtractor.GetCellValue(pageIndex, row, col);

                    // Add to CellData collection
                    lstCellData.Add(new CellData { UserEnteredValue = new ExtendedValue() { StringValue = tableCellValue } });
                }

                // Add to Google Sheet RowData request
                reqRowData.Add(new RowData { Values = lstCellData });
            }
        }
    }

    // Proceed with writing to Google Sheets
    UserCredential credential;

    // Follow article "https://developers.google.com/sheets/api/quickstart/dotnet" to generate credentials.json
    using (var stream = new FileStream("credentials.json", FileMode.Open, FileAccess.Read))
    {
        // The file token.json stores the user's access and refresh tokens, and is created
        // automatically when the authorization flow completed for the first time.
        string credPath = "token.json";

        credential = GoogleWebAuthorizationBroker.AuthorizeAsync(
            GoogleClientSecrets.Load(stream).Secrets,
            Scopes,
            "user",
            CancellationToken.None,
            new FileDataStore(credPath, true)).Result;

        Console.WriteLine($"Credential file saved to : {credPath}");
    }

    // Create Google Sheets API Service
    var service = new SheetsService(new Google.Apis.Services.BaseClientService.Initializer()
    {
        HttpClientInitializer = credential,
        ApplicationName = ApplicationName
    });

    // Fill following parameter with your google spreadsheet Id
    // Usually find at url of spreadsheet: https://docs.google.com/spreadsheets/d/...ThisIsSpreadsheetId.../edit
    var spreadsheetId = "xxxxxxSpreadsheetIdxxxxxxxxxx";

    // Get existing row count.
    // The API omits "values" for an empty range, so Values is null on a blank sheet;
    // treat that as zero existing rows instead of failing with a NullReferenceException.
    var existingValues = service.Spreadsheets.Values.Get(spreadsheetId, range: "A1:D").Execute().Values;
    var existingRowCount = existingValues?.Count ?? 0;

    var lstRequsts = new List<Request>
    {
        new Request
        {
            // New data to be written directly below the existing rows
            UpdateCells = new UpdateCellsRequest()
            {
                Range = new GridRange { StartRowIndex = existingRowCount, StartColumnIndex = 0 },
                Rows = reqRowData,
                Fields = "*"
            }
        }
    };

    // Prepare batch update spreadsheet request
    var batchUpdateSpreadsheetRequest = new BatchUpdateSpreadsheetRequest() { Requests = lstRequsts };
    var batchUpdateRequest = service.Spreadsheets.BatchUpdate(batchUpdateSpreadsheetRequest, spreadsheetId);

    // Perform google sheet update
    batchUpdateRequest.Execute();

    Console.WriteLine("Google spreadsheet updated!");
    Console.ReadLine();
}
/// <summary>
/// Reads the table on the first page of "Prices.pdf", merges "superscript"
/// rows (rows whose first two cells are empty) into the row that follows them,
/// and writes the result as CSV to the console and to "outputTable.csv".
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // string used to delimit cells in a row
    const string delimChar = ",";
    // string used as decimal separator between a value and its superscript part
    const string precChar = ".";
    // output filename
    const string outputCSVFileName = "outputTable.csv";
    // page to process (zero index = first page)
    int PageIndex = 0;

    StringBuilder finalOutput = new StringBuilder();

    // Create Bytescout.PDFExtractor.StructuredExtractor object and dispose it when done
    using (StructuredExtractor structuredExtractor = new StructuredExtractor())
    {
        // set the registration key
        structuredExtractor.RegistrationName = "demo";
        structuredExtractor.RegistrationKey = "demo";

        // Load sample PDF document
        structuredExtractor.LoadDocumentFromFile("Prices.pdf");

        // prepare structure of the first page
        structuredExtractor.PrepareStructure(PageIndex);

        // get total number of rows in a table
        int iRowCount = structuredExtractor.GetRowCount(PageIndex);

        // index of the row with the header (-1 = not found)
        int iHeaderRowIndex = -1;

        // the header row is the first row whose 1st cell starts with "Date"
        // and whose 2nd cell starts with "Product"
        for (int y = 0; y < iRowCount; y++)
        {
            if (structuredExtractor.GetCellValue(PageIndex, y, 0).StartsWith("Date", StringComparison.Ordinal) &&
                structuredExtractor.GetCellValue(PageIndex, y, 1).StartsWith("Product", StringComparison.Ordinal))
            {
                iHeaderRowIndex = y;
                break;
            }
        }

        // Without a header there is no reference row for the table; stop here
        // instead of querying the extractor with row index -1.
        if (iHeaderRowIndex < 0)
        {
            Console.WriteLine("Table header (Date / Product) was not found. Press any key to exit...");
            Console.ReadKey();
            return;
        }

        // number of columns in the header is used as the reference for the whole table
        int iHeaderColumnCount = structuredExtractor.GetColumnCount(PageIndex, iHeaderRowIndex);

        // writing the header to the console
        for (int jj = 0; jj < iHeaderColumnCount; jj++)
        {
            Console.Write(structuredExtractor.GetCellValue(PageIndex, iHeaderRowIndex, jj) + " | ");
        }
        Console.Write("\n"); // add line break

        // A superscript row is kept here and merged into the next regular row.
        string[] superScriptRow = null;

        // iterate through rows after the header until the end of the table
        for (int y = iHeaderRowIndex + 1; y < iRowCount; y++)
        {
            // fill up row array with values
            string[] currentRow = new string[iHeaderColumnCount];
            for (int x = 0; x < iHeaderColumnCount; x++)
            {
                currentRow[x] = structuredExtractor.GetCellValue(PageIndex, y, x);
            }

            // No values in 1st and 2nd column means this row only carries superscript
            // values that belong to the cells of the row below.
            if (string.IsNullOrEmpty(currentRow[0]) && string.IsNullOrEmpty(currentRow[1]))
            {
                // currentRow is a fresh array every iteration, so it can be kept as-is
                superScriptRow = currentRow;
                continue;
            }

            // if previous row was superscript then append its values to the current row
            if (superScriptRow != null)
            {
                for (int i = 0; i < iHeaderColumnCount; i++)
                {
                    if (!string.IsNullOrEmpty(superScriptRow[i]))
                    {
                        currentRow[i] = currentRow[i] + precChar + superScriptRow[i];
                    }
                }

                // a superscript row applies to exactly one following row
                superScriptRow = null;
            }

            // Build the CSV record: every field is quoted, embedded quotes are doubled,
            // and fields are joined without a trailing delimiter.
            string[] quotedCells = new string[currentRow.Length];
            for (int i = 0; i < currentRow.Length; i++)
            {
                // NOTE(review): null cells are written as empty fields; whether the
                // extractor can return null is not visible here.
                quotedCells[i] = "\"" + (currentRow[i] ?? string.Empty).Replace("\"", "\"\"") + "\"";
            }

            finalOutput.AppendLine(string.Join(delimChar, quotedCells));
        }
    }

    // write the generated csv into the console
    Console.WriteLine(finalOutput.ToString());

    // save the generated csv text into a file
    System.IO.File.WriteAllText(outputCSVFileName, finalOutput.ToString());

    Console.WriteLine("Done! Press any key to exit...");
    Console.ReadKey();

    // open the generated file with the associated program
    Process.Start(outputCSVFileName);
}
/// <summary>
/// Collects the table rows (title row excluded) from the first page of each
/// sample invoice PDF into an "Invoices" worksheet and saves it as "output.xlsx",
/// then opens the generated file.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    string[] inputFiles = { "Sample_Files\\InvoiceMar.pdf", "Sample_Files\\InvoiceApr.pdf" };
    string outputPath = "output.xlsx";

    // Only the first page of every invoice is read
    int firstPage = 0;

    // Create output Spreadsheet
    using (var workbookDoc = new Spreadsheet())
    {
        // Add new worksheet
        Worksheet sheet = workbookDoc.Workbook.Worksheets.Add("Invoices");

        // Bold header cells in the top row, left to right
        string[] headerTitles = { "Invoice", "Product Name", "Price" };
        int headerRow = 0;
        for (int h = 0; h < headerTitles.Length; h++)
        {
            sheet.Cell(headerRow, h).Font = new Font("Arial", 11, FontStyle.Bold);
            sheet.Cell(headerRow, h).Value = headerTitles[h];
        }

        // Data starts right below the header
        int targetRow = headerRow + 1;

        // Loop through all input files
        foreach (var pdfPath in inputFiles)
        {
            var invoiceName = Path.GetFileNameWithoutExtension(pdfPath);

            // Create StructuredExtractor instance
            using (var extractor = new StructuredExtractor("demo", "demo"))
            {
                var area = GetTableExtractionArea(pdfPath);

                // Nothing to extract when no table area is reported for this file
                if (!area.HasValue)
                {
                    continue;
                }

                // Load the document, restrict extraction to the table area, prepare structure
                extractor.LoadDocumentFromFile(pdfPath);
                extractor.SetExtractionArea(area.Value);
                extractor.PrepareStructure(firstPage);

                int totalRows = extractor.GetRowCount(firstPage);

                // Row 0 is the title row and is ignored
                for (int sourceRow = 1; sourceRow < totalRows; sourceRow++, targetRow++)
                {
                    int cellsInRow = extractor.GetColumnCount(firstPage, sourceRow);

                    // First column carries the invoice name, table cells follow it
                    sheet.Cell(targetRow, 0).Value = invoiceName;

                    for (int sourceCol = 0; sourceCol < cellsInRow; sourceCol++)
                    {
                        var cellText = extractor.GetCellValue(firstPage, sourceRow, sourceCol);
                        sheet.Cell(targetRow, sourceCol + 1).Value = cellText;
                    }
                }
            }
        }

        // Save document
        workbookDoc.SaveAs(outputPath, Bytescout.Spreadsheet.Constants.SpreadsheetFormatType.XLSX);
    }

    // Open generated XLSX file in default program
    Process.Start(outputPath);
}
/// <summary>
/// Walks the structured table of every page of a PDF and stores one
/// <c>MatchData</c> record per processed row through <c>Db</c>. The first cell
/// of a row is expected to look like "Team name (number)"; a team name that is
/// wrapped over several rows is joined from up to five following rows.
/// </summary>
/// <param name="file">PDF file to parse. Its name is split on '-' into match name and match id.</param>
/// <param name="sw">Writer that receives details of rows (or of the whole file) that failed.</param>
/// <returns>
/// <c>true</c> when all pages were walked (row-level failures are only logged to
/// <paramref name="sw"/>); <c>false</c> when an exception escaped the row-level handling.
/// </returns>
public static bool ParsePdf(FileInfo file, StreamWriter sw)
{
    try
    {
        // Strip the extension regardless of its casing before splitting the name.
        var names = Path.GetFileNameWithoutExtension(file.Name).Split('-');
        string matchName = names[0], matchId = names[1];

        using (var extractor = new StructuredExtractor { RegistrationName = "demo", RegistrationKey = "demo" })
        {
            // Load sample PDF document
            extractor.LoadDocumentFromFile(file.FullName);

            for (var pageIndex = 0; pageIndex < extractor.GetPageCount(); pageIndex++)
            {
                Console.WriteLine("Starting extraction from page #" + pageIndex);

                extractor.PrepareStructure(pageIndex);

                int rowCount = extractor.GetRowCount(pageIndex);

                // The first two rows of every page are skipped.
                for (var row = 2; row < rowCount; row++)
                {
                    try
                    {
                        // Reset both values per row so nothing leaks from the previous row.
                        string teamName = "";
                        int teamNo = 0;

                        int columnCount = extractor.GetColumnCount(pageIndex, row);

                        // Only the first column carries the team cell.
                        string value = columnCount > 0 ? extractor.GetCellValue(pageIndex, row, 0) : null;

                        if (!string.IsNullOrEmpty(value))
                        {
                            bool completeOnFirstRow = value.Contains("(") && value.Contains(")");

                            if (!completeOnFirstRow)
                            {
                                // The cell is wrapped: append the first cell of the following rows
                                // (at most five, never past the last row) until a parenthesis shows up.
                                // Consumed rows are skipped by the outer loop because `row` advances.
                                int joinedRows = 0;
                                while (!value.Contains("(") && !value.Contains(")") && joinedRows < 5 && row + 1 < rowCount)
                                {
                                    value += $" {extractor.GetCellValue(pageIndex, ++row, 0)}";
                                    // NOTE(review): as written this replaces a space with a space (no-op);
                                    // presumably meant to collapse doubled spaces — confirm against the original file.
                                    value = value.Replace(" ", " ");
                                    joinedRows++;
                                }
                            }

                            string namePart;
                            string numberPart;
                            SplitTeamCell(value, out namePart, out numberPart);

                            // Names joined from several rows are trimmed; single-row names are stored as extracted.
                            teamName = completeOnFirstRow ? namePart : namePart.Trim();
                            teamNo = numberPart.ToInt();
                        }

                        // NOTE(review): rows with an empty first cell are still stored
                        // (empty TeamName, TeamNo 0) — confirm this is intended.
                        Db.MatchDatas.Add(new MatchData()
                        {
                            LeagueId = matchId.ToInt(),
                            MatchName = matchName,
                            TeamName = teamName,
                            TeamNo = teamNo
                        });
                        Db.SaveChanges();
                    }
                    catch (Exception e)
                    {
                        // A failing row is logged and processing continues with the next row.
                        sw.WriteLine("--------------------------------------------------------------------------------------");
                        sw.WriteLine(extractor.GetCellValue(pageIndex, row, 0));
                        sw.WriteLine("Error in file roe no :" + row + "--------" + file.Name);
                        sw.WriteLine(e.GetBaseException().Message);
                        sw.WriteLine("--------------------------------------------------------------------------------------");
                    }
                }
            }
        }

        return true;
    }
    catch (Exception e)
    {
        Console.WriteLine(e.GetBaseException().Message);
        sw.WriteLine(e.GetBaseException().Message);
    }

    return false;
}

// Splits "Name (123)" into the text before the last "(" and the text between that "(" and the next ")".
private static void SplitTeamCell(string cell, out string namePart, out string numberPart)
{
    int open = cell.LastIndexOf('(');
    int close = open < 0 ? -1 : cell.IndexOf(')', open + 1);

    if (close < 0)
    {
        throw new FormatException("Team cell does not contain a \"(number)\" part: " + cell);
    }

    namePart = cell.Substring(0, open);
    numberPart = cell.Substring(open + 1, close - open - 1);
}