Ejemplo n.º 1
0
        /// <summary>
        /// Loads ".\sample3.pdf" and prints the value of every structured cell of every page
        /// to the console, one cell per line.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create the Bytescout.PDFExtractor.StructuredExtractor instance inside a using block
            // so it is disposed even when loading or extraction throws
            using (StructuredExtractor extractor = new StructuredExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\sample3.pdf");

                // Query the page count once instead of on every loop iteration
                int pageCount = extractor.GetPageCount();

                for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
                {
                    Console.WriteLine("Starting extraction from page #" + pageIndex);
                    Console.WriteLine();

                    // The structure of a page is prepared before its rows and columns are queried
                    extractor.PrepareStructure(pageIndex);

                    int rowCount = extractor.GetRowCount(pageIndex);

                    for (int row = 0; row < rowCount; row++)
                    {
                        // The column count is queried per row
                        int columnCount = extractor.GetColumnCount(pageIndex, row);

                        for (int col = 0; col < columnCount; col++)
                        {
                            Console.WriteLine(extractor.GetCellValue(pageIndex, row, col));
                        }
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key..");
            Console.ReadKey();
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Loads "sample3.pdf" and prints the value of every structured cell of every page
        /// to the console, one cell per line.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create the Bytescout.PDFExtractor.StructuredExtractor instance (former TableExtractor)
            // inside a using block so it is disposed even when extraction throws
            using (StructuredExtractor extractor = new StructuredExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample3.pdf");

                // Query the page count once instead of on every loop iteration
                int pageCount = extractor.GetPageCount();

                for (int ipage = 0; ipage < pageCount; ipage++)
                {
                    Console.WriteLine("starting extraction from page #" + ipage);

                    // The structure of a page is prepared before its rows and columns are queried
                    extractor.PrepareStructure(ipage);

                    int rowCount = extractor.GetRowCount(ipage);

                    for (int row = 0; row < rowCount; row++)
                    {
                        // The column count is queried per row
                        int columnCount = extractor.GetColumnCount(ipage, row);

                        for (int col = 0; col < columnCount; col++)
                        {
                            Console.WriteLine(extractor.GetCellValue(ipage, row, col));
                        }
                    }
                }
            }

            Console.WriteLine("Press any key..");
            Console.ReadKey();
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Extracts the table rows of the sample invoice PDFs and writes them to a Google spreadsheet,
        /// starting at the first row after the rows already present in range "A1:D".
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            var allInputFiles = new string[] { "Sample_Files\\InvoiceMar.pdf", "Sample_Files\\InvoiceApr.pdf" };

            // Set page index
            var pageIndex = 0;

            // Google Sheet data to write
            var reqRowData = ExtractInvoiceRows(allInputFiles, pageIndex);

            // Nothing was extracted: do not start the authorization flow or send an update request
            if (reqRowData.Count == 0)
            {
                Console.WriteLine("No table rows were extracted; nothing to write.");
                Console.ReadLine();
                return;
            }

            // Proceed with writing to Google Sheets
            UserCredential credential;

            // Follow article "https://developers.google.com/sheets/api/quickstart/dotnet" to generate credentials.json
            using (var stream = new FileStream("credentials.json", FileMode.Open, FileAccess.Read))
            {
                // The file token.json stores the user's access and refresh tokens, and is created
                // automatically when the authorization flow completed for the first time.
                string credPath = "token.json";

                // NOTE(review): .Result blocks on the asynchronous authorization; kept because Main is synchronous.
                credential = GoogleWebAuthorizationBroker.AuthorizeAsync(
                    GoogleClientSecrets.Load(stream).Secrets,
                    Scopes,
                    "user",
                    CancellationToken.None,
                    new FileDataStore(credPath, true)).Result;

                Console.WriteLine($"Credential file saved to : {credPath}");
            }

            // Create Google Sheets API Service; disposed when the update is finished
            using (var service = new SheetsService(new Google.Apis.Services.BaseClientService.Initializer()
            {
                HttpClientInitializer = credential,
                ApplicationName       = ApplicationName
            }))
            {
                // Fill following parameter with your google spreadsheet Id
                // Usually find at url of spreadsheet: https://docs.google.com/spreadsheets/d/...ThisIsSpreadsheetId.../edit
                var spreadsheetId = "xxxxxxSpreadsheetIdxxxxxxxxxx";

                // Get existing row count. The response carries no values for an empty range,
                // so a null collection is treated as zero existing rows.
                var existingValues   = service.Spreadsheets.Values.Get(spreadsheetId, range: "A1:D").Execute().Values;
                var existingRowCount = existingValues?.Count ?? 0;

                var lstRequsts = new List <Request>
                {
                    new Request
                    {
                        // New data to be written below the existing rows
                        UpdateCells = new UpdateCellsRequest()
                        {
                            Range = new GridRange {
                                StartRowIndex = existingRowCount, StartColumnIndex = 0
                            },
                            Rows   = reqRowData,
                            Fields = "*"
                        }
                    }
                };

                // Prepare batch update spreadsheet request
                var batchUpdateSpreadsheetRequest = new BatchUpdateSpreadsheetRequest()
                {
                    Requests = lstRequsts
                };
                var batchUpdateRequest = service.Spreadsheets.BatchUpdate(batchUpdateSpreadsheetRequest, spreadsheetId);

                // Perform the Google Sheet update
                batchUpdateRequest.Execute();
            }

            Console.WriteLine("Google spreadsheet updated!");
            Console.ReadLine();
        }

        /// <summary>
        /// Builds one Google Sheets row per table row found in the given PDF files. Each row starts with the
        /// file name (without extension) followed by the cell values of that table row. The first table row
        /// of every file is treated as a title row and skipped. Files for which
        /// GetTableExtractionArea returns no value are ignored.
        /// </summary>
        /// <param name="inputFiles">Paths of the PDF files to read.</param>
        /// <param name="pageIndex">Index of the page whose table is extracted.</param>
        /// <returns>The collected rows; empty when nothing was extracted.</returns>
        private static List <RowData> ExtractInvoiceRows(IEnumerable <string> inputFiles, int pageIndex)
        {
            var rows = new List <RowData>();

            foreach (var itmFile in inputFiles)
            {
                var invName = Path.GetFileNameWithoutExtension(itmFile);

                // Create StructuredExtractor instance
                using (var tableExtractor = new StructuredExtractor("demo", "demo"))
                {
                    var tableExtractionArea = GetTableExtractionArea(itmFile);
                    if (!tableExtractionArea.HasValue)
                    {
                        continue;
                    }

                    // Load document from file
                    tableExtractor.LoadDocumentFromFile(itmFile);

                    // Set extraction area
                    tableExtractor.SetExtractionArea(tableExtractionArea.Value);

                    // Prepare table structure
                    tableExtractor.PrepareStructure(pageIndex);

                    int rowCount = tableExtractor.GetRowCount(pageIndex);

                    // Ignoring first title row
                    for (int row = 1; row < rowCount; row++)
                    {
                        int columnCount = tableExtractor.GetColumnCount(pageIndex, row);

                        // First cell of every sheet row is the invoice name
                        var lstCellData = new List <CellData>
                        {
                            new CellData {
                                UserEnteredValue = new ExtendedValue()
                                {
                                    StringValue = invName
                                }
                            }
                        };

                        for (int col = 0; col < columnCount; col++)
                        {
                            // Remaining cells are the table cell values, in column order
                            lstCellData.Add(new CellData {
                                UserEnteredValue = new ExtendedValue()
                                {
                                    StringValue = tableExtractor.GetCellValue(pageIndex, row, col)
                                }
                            });
                        }

                        rows.Add(new RowData {
                            Values = lstCellData
                        });
                    }
                }
            }

            return rows;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Reads the table on the first page of "Prices.pdf", merges "superscript" rows into the row
        /// below them, and writes the result as CSV to the console and to "outputTable.csv".
        /// </summary>
        /// <remarks>
        /// The table is located by searching for a header row whose first cell starts with "Date" and
        /// whose second cell starts with "Product". A row whose first two cells are empty is treated as
        /// a superscript row: its non-empty cells are appended to the matching cells of the next regular
        /// row, separated by the decimal separator. If no header row is found, nothing is written.
        /// </remarks>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // string that delimits cells in a row
            const string delimChar = ",";
            // string that serves as decimal separator when merging superscript values
            const string precChar = ".";

            // output filename
            const string outputCSVFileName = "outputTable.csv";

            // page to process
            const int PageIndex = 0;

            StringBuilder finalOutput = new StringBuilder();

            // Create Bytescout.PDFExtractor.StructuredExtractor object; disposed when extraction is finished
            using (StructuredExtractor structuredExtractor = new StructuredExtractor())
            {
                // set the registration key
                structuredExtractor.RegistrationName = "demo";
                structuredExtractor.RegistrationKey  = "demo";

                // Load sample PDF document
                structuredExtractor.LoadDocumentFromFile("Prices.pdf");

                // prepare structure of the first page (zero index)
                structuredExtractor.PrepareStructure(PageIndex);

                // get total number of rows in a table
                int iRowCount = structuredExtractor.GetRowCount(PageIndex);

                // index of the row with the header (-1 while not found)
                int iHeaderRowIndex = -1;

                // iterate through rows to find the header row
                for (int y = 0; y < iRowCount; y++)
                {
                    // the header row is the first row whose 1st cell starts with "Date"
                    // and whose 2nd cell starts with "Product"
                    if (structuredExtractor.GetCellValue(PageIndex, y, 0).StartsWith("Date", StringComparison.Ordinal) &&
                        structuredExtractor.GetCellValue(PageIndex, y, 1).StartsWith("Product", StringComparison.Ordinal))
                    {
                        iHeaderRowIndex = y;
                        break;
                    }
                }

                // without a header there is no table to read; do not query the extractor with index -1
                if (iHeaderRowIndex < 0)
                {
                    Console.WriteLine("Header row (\"Date\", \"Product\") was not found. Press any key to exit...");
                    Console.ReadKey();
                    return;
                }

                // save number of columns in the header (so we are using the header row as a reference for the whole table)
                int iHeaderColumnCount = structuredExtractor.GetColumnCount(PageIndex, iHeaderRowIndex);

                // writing the header to the console
                for (int jj = 0; jj < iHeaderColumnCount; jj++)
                {
                    Console.Write(structuredExtractor.GetCellValue(PageIndex, iHeaderRowIndex, jj) + " | ");
                }
                Console.Write("\n"); // add line break

                // pending superscript row waiting to be merged into the next regular row (null when none)
                string[] superScriptRow = null;

                // iterate through all rows after the header
                for (int y = iHeaderRowIndex + 1; y < iRowCount; y++)
                {
                    // current row, read with the header's column count
                    string[] currentRow = new string[iHeaderColumnCount];
                    for (int x = 0; x < iHeaderColumnCount; x++)
                    {
                        currentRow[x] = structuredExtractor.GetCellValue(PageIndex, y, x);
                    }

                    // no values in 1st and 2nd column: this row holds superscript values that belong
                    // to the cells of the row below, so keep it and move on
                    if (string.IsNullOrEmpty(currentRow[0]) && string.IsNullOrEmpty(currentRow[1]))
                    {
                        superScriptRow = currentRow;
                        continue;
                    }

                    // previous row was a superscript row: append its non-empty values to the
                    // current row's values, separated by the decimal separator
                    if (superScriptRow != null)
                    {
                        for (int i = 0; i < iHeaderColumnCount; i++)
                        {
                            if (!string.IsNullOrEmpty(superScriptRow[i]))
                            {
                                currentRow[i] = currentRow[i] + precChar + superScriptRow[i];
                            }
                        }

                        // a superscript row is merged only once
                        superScriptRow = null;
                    }

                    // quote every cell (doubling embedded quotes) and join without a trailing delimiter
                    string[] quotedCells = new string[currentRow.Length];
                    for (int i = 0; i < currentRow.Length; i++)
                    {
                        quotedCells[i] = "\"" + (currentRow[i] ?? string.Empty).Replace("\"", "\"\"") + "\"";
                    }

                    finalOutput.AppendLine(string.Join(delimChar, quotedCells));
                }
            }

            // write the generated csv into the console
            Console.WriteLine(finalOutput.ToString());

            // save the generated csv text into a file
            System.IO.File.WriteAllText(outputCSVFileName, finalOutput.ToString());

            Console.WriteLine("Done! Press any key to exit...");
            Console.ReadKey();

            // open the generated file
            Process.Start(outputCSVFileName);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Collects the table rows of the sample invoice PDFs into an "Invoices" worksheet
        /// (one header row, then one worksheet row per table row prefixed with the invoice file name),
        /// saves it as "output.xlsx" and opens the saved file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string[] sourcePdfs   = { "Sample_Files\\InvoiceMar.pdf", "Sample_Files\\InvoiceApr.pdf" };
            string[] headerTitles = { "Invoice", "Product Name", "Price" };

            const string resultFileName = "output.xlsx";

            // Page whose table is read from every PDF
            const int pageIndex = 0;

            using (var document = new Spreadsheet())
            {
                Worksheet worksheet = document.Workbook.Worksheets.Add("Invoices");

                // Header occupies worksheet row 0: bold font first, then the caption, cell by cell
                for (int headerColumn = 0; headerColumn < headerTitles.Length; headerColumn++)
                {
                    worksheet.Cell(0, headerColumn).Font  = new Font("Arial", 11, FontStyle.Bold);
                    worksheet.Cell(0, headerColumn).Value = headerTitles[headerColumn];
                }

                // Data starts right below the header
                int nextRow = 1;

                foreach (var itmFile in sourcePdfs)
                {
                    var invName = Path.GetFileNameWithoutExtension(itmFile);

                    using (var tableExtractor = new StructuredExtractor("demo", "demo"))
                    {
                        var area = GetTableExtractionArea(itmFile);

                        // Files without a known table area contribute no rows
                        if (!area.HasValue)
                        {
                            continue;
                        }

                        tableExtractor.LoadDocumentFromFile(itmFile);
                        tableExtractor.SetExtractionArea(area.Value);
                        tableExtractor.PrepareStructure(pageIndex);

                        int tableRows = tableExtractor.GetRowCount(pageIndex);

                        // Table row 0 is the title row and is not copied
                        for (int tableRow = 1; tableRow < tableRows; tableRow++, nextRow++)
                        {
                            int tableColumns = tableExtractor.GetColumnCount(pageIndex, tableRow);

                            // Column 0 carries the invoice name; table cells follow, shifted right by one
                            worksheet.Cell(nextRow, 0).Value = invName;

                            for (int tableColumn = 0; tableColumn < tableColumns; tableColumn++)
                            {
                                worksheet.Cell(nextRow, tableColumn + 1).Value =
                                    tableExtractor.GetCellValue(pageIndex, tableRow, tableColumn);
                            }
                        }
                    }
                }

                // Persist the workbook as XLSX
                document.SaveAs(resultFileName, Bytescout.Spreadsheet.Constants.SpreadsheetFormatType.XLSX);
            }

            // Open generated XLSX file in default program
            Process.Start(resultFileName);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Reads the structured table of a PDF and stores one MatchData record per team cell found.
        /// </summary>
        /// <remarks>
        /// The file name without extension is split on '-': the first part is used as the match name and
        /// the second part as the league id. On every page the first two rows are skipped. For each
        /// remaining row the first cell is expected to look like "Team name (number)"; when the cell does
        /// not yet contain both parentheses, the first cells of up to five following rows are appended to
        /// it. Rows whose first cell is empty are skipped. A row that cannot be parsed or saved is written
        /// to <paramref name="sw"/> and processing continues with the next row.
        /// </remarks>
        /// <param name="file">PDF file to parse.</param>
        /// <param name="sw">Writer that receives error details.</param>
        /// <returns>
        /// <c>true</c> when the document was processed (individual row errors are only logged);
        /// <c>false</c> when an exception outside row processing aborted the parse.
        /// </returns>
        public static bool ParsePdf(FileInfo file, StreamWriter sw)
        {
            // Upper bound on how many following rows may be merged into one team cell
            const int maxContinuationRows = 5;

            try
            {
                var    names     = Path.GetFileNameWithoutExtension(file.Name).Split('-');
                string matchName = names[0], matchId = names[1];

                // Dispose the extractor when parsing ends, whether it succeeds or fails
                using (var extractor = new StructuredExtractor())
                {
                    extractor.RegistrationName = "demo";
                    extractor.RegistrationKey  = "demo";

                    // Load PDF document
                    extractor.LoadDocumentFromFile(file.FullName);

                    var pageCount = extractor.GetPageCount();

                    for (var pageIndex = 0; pageIndex < pageCount; pageIndex++)
                    {
                        Console.WriteLine("Starting extraction from page #" + pageIndex);
                        extractor.PrepareStructure(pageIndex);
                        var rowCount = extractor.GetRowCount(pageIndex);

                        // The first two rows of every page are skipped
                        for (var row = 2; row < rowCount; row++)
                        {
                            try
                            {
                                // Only the first cell of a row carries team data
                                if (extractor.GetColumnCount(pageIndex, row) == 0)
                                {
                                    continue;
                                }

                                var value = extractor.GetCellValue(pageIndex, row, 0);
                                if (string.IsNullOrEmpty(value))
                                {
                                    // No team in this row: do not store a record with stale data
                                    continue;
                                }

                                // A team cell may be split across rows: keep appending the first cell of the
                                // following rows until both parentheses are present, without leaving the page
                                var merged = 0;
                                while (!(value.Contains("(") && value.Contains(")")) &&
                                       merged < maxContinuationRows &&
                                       row + 1 < rowCount)
                                {
                                    value += $" {extractor.GetCellValue(pageIndex, ++row, 0)}";
                                    value  = value.Replace("  ", " ");
                                    merged++;
                                }

                                // The number is the text between the last "(" and the ")" that follows it
                                var open  = value.LastIndexOf('(');
                                var close = open < 0 ? -1 : value.IndexOf(')', open + 1);
                                if (close < 0)
                                {
                                    throw new FormatException("Team cell does not contain \"(number)\": " + value);
                                }

                                var teamName = value.Substring(0, open).Trim();
                                var teamNo   = value.Substring(open + 1, close - open - 1).Trim().ToInt();

                                Db.MatchDatas.Add(new MatchData()
                                {
                                    LeagueId  = matchId.ToInt(),
                                    MatchName = matchName,
                                    TeamName  = teamName,
                                    TeamNo    = teamNo
                                });
                                Db.SaveChanges();
                            }
                            catch (Exception e)
                            {
                                // Log the failing row and continue with the next one
                                sw.WriteLine("--------------------------------------------------------------------------------------");
                                sw.WriteLine(extractor.GetCellValue(pageIndex, row, 0));
                                sw.WriteLine("Error in file roe no :" + row + "--------" + file.Name);
                                sw.WriteLine(e.GetBaseException().Message);
                                sw.WriteLine("--------------------------------------------------------------------------------------");
                            }
                        }
                    }
                }

                return true;
            }
            catch (Exception e)
            {
                // Anything that fails outside row processing aborts the whole file
                Console.WriteLine(e.GetBaseException().Message);
                sw.WriteLine(e.GetBaseException().Message);
            }
            return false;
        }