/// <summary>
/// Loads "sample3.pdf" with the Bytescout StructuredExtractor and prints the
/// value of every cell, page by page and row by row, one value per console line.
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
static void Main(string[] args)
{
    // Create Bytescout.PDFExtractor.StructuredExtractor instance (former TableExtractor).
    // The using block disposes the extractor when extraction finishes or throws.
    using (StructuredExtractor extractor = new StructuredExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample3.pdf");

        // The page count is queried once instead of on every loop iteration.
        int pageCount = extractor.GetPageCount();

        for (int ipage = 0; ipage < pageCount; ipage++)
        {
            Console.WriteLine("starting extraction from page #" + ipage);

            // The structure of a page is prepared before its rows and cells are queried.
            extractor.PrepareStructure(ipage);

            int rowCount = extractor.GetRowCount(ipage);

            for (int row = 0; row < rowCount; row++)
            {
                // The column count is queried per row, so rows may differ in width.
                int columnCount = extractor.GetColumnCount(ipage, row);

                for (int col = 0; col < columnCount; col++)
                {
                    Console.WriteLine(extractor.GetCellValue(ipage, row, col));
                }
            }
        }
    }

    Console.WriteLine("Press any key..");
    Console.ReadKey();
}
/// <summary>
/// Loads ".\sample3.pdf" with the Bytescout StructuredExtractor and writes every
/// cell value of every page to the console, one value per line.
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
static void Main(string[] args)
{
    // Create Bytescout.PDFExtractor.StructuredExtractor instance.
    // Wrapping it in a using block disposes it even when extraction throws.
    using (StructuredExtractor extractor = new StructuredExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(@".\sample3.pdf");

        // Query the page count once rather than on each loop test.
        int pageCount = extractor.GetPageCount();

        for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
        {
            Console.WriteLine("Starting extraction from page #" + pageIndex);
            Console.WriteLine();

            // Prepare the page structure before asking for row/column counts.
            extractor.PrepareStructure(pageIndex);

            int rowCount = extractor.GetRowCount(pageIndex);

            for (int row = 0; row < rowCount; row++)
            {
                int columnCount = extractor.GetColumnCount(pageIndex, row);

                for (int col = 0; col < columnCount; col++)
                {
                    Console.WriteLine(extractor.GetCellValue(pageIndex, row, col));
                }
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press any key..");
    Console.ReadKey();
}
/// <summary>
/// Reads the first column of each table row of a PDF file, extracts a team name and a
/// number written as "name (number)", and stores one <c>MatchData</c> record per row.
/// </summary>
/// <remarks>
/// The file name without its extension is split on '-': the first part is stored as the
/// match name and the second part is converted to the league id. The first two rows of
/// every page are skipped. A failure while handling a single row is written to
/// <paramref name="sw"/> and processing continues with the next row.
/// </remarks>
/// <param name="file">PDF file to parse.</param>
/// <param name="sw">Writer that receives the error details.</param>
/// <returns>
/// <c>true</c> when every page was walked to the end (individual rows may still have
/// failed and been logged); <c>false</c> when an error outside the per-row handling
/// stopped the run.
/// </returns>
public static bool ParsePdf(FileInfo file, StreamWriter sw)
{
    // Rows 0 and 1 of each page are never read (presumably header rows - confirm).
    const int firstDataRow = 2;

    // Upper bound on how many following rows may be appended to a cell that has no parentheses yet.
    const int maxContinuationRows = 5;

    try
    {
        // GetFileNameWithoutExtension strips only the real extension, regardless of its casing.
        var names = Path.GetFileNameWithoutExtension(file.Name).Split('-');
        if (names.Length < 2)
        {
            throw new FormatException("File name '" + file.Name + "' does not match '<match name>-<match id>.pdf'.");
        }

        string matchName = names[0], matchId = names[1];

        // The using block disposes the extractor on every exit path.
        using (var extractor = new StructuredExtractor { RegistrationName = "demo", RegistrationKey = "demo" })
        {
            // Load sample PDF document
            extractor.LoadDocumentFromFile(file.FullName);

            var pageCount = extractor.GetPageCount();

            for (var pageIndex = 0; pageIndex < pageCount; pageIndex++)
            {
                Console.WriteLine("Starting extraction from page #" + pageIndex);
                extractor.PrepareStructure(pageIndex);

                var rowCount = extractor.GetRowCount(pageIndex);

                for (var row = firstDataRow; row < rowCount; row++)
                {
                    try
                    {
                        // Both values are reset for every row so a row without a usable
                        // first cell cannot inherit the previous row's team number.
                        var teamName = "";
                        var teamNo = 0;

                        // Only column 0 ever contributes to the record.
                        var value = extractor.GetColumnCount(pageIndex, row) > 0
                            ? extractor.GetCellValue(pageIndex, row, 0)
                            : null;

                        if (!string.IsNullOrEmpty(value))
                        {
                            // A cell without any parenthesis is treated as the start of a name that
                            // continues on the following rows; those rows are appended and consumed.
                            // The row + 1 < rowCount bound keeps the merge inside the page.
                            // NOTE(review): merging stops as soon as EITHER parenthesis is present;
                            // confirm whether it should continue until BOTH are present.
                            var merged = 0;
                            while (!value.Contains("(") && !value.Contains(")")
                                   && merged < maxContinuationRows && row + 1 < rowCount)
                            {
                                value += $" {extractor.GetCellValue(pageIndex, ++row, 0)}";
                                // NOTE(review): as written this replaces a single space with a single
                                // space (a no-op); it may have been meant to collapse double spaces - confirm.
                                value = value.Replace(" ", " ");
                                merged++;
                            }

                            string teamNoText;
                            SplitTeamCell(value, out teamName, out teamNoText);
                            teamNo = teamNoText.ToInt();
                        }

                        // NOTE(review): a row whose first cell is empty still produces a record
                        // with an empty team name - confirm this is wanted.
                        Db.MatchDatas.Add(new MatchData()
                        {
                            LeagueId = matchId.ToInt(),
                            MatchName = matchName,
                            TeamName = teamName,
                            TeamNo = teamNo
                        });
                        Db.SaveChanges();
                    }
                    catch (Exception e)
                    {
                        // Row-level failure: log it and carry on with the next row.
                        sw.WriteLine("--------------------------------------------------------------------------------------");
                        sw.WriteLine(extractor.GetCellValue(pageIndex, row, 0));
                        sw.WriteLine("Error in file roe no :" + row + "--------" + file.Name);
                        sw.WriteLine(e.GetBaseException().Message);
                        sw.WriteLine("--------------------------------------------------------------------------------------");
                    }
                }
            }
        }

        return true;
    }
    catch (Exception e)
    {
        // Anything that escapes the per-row handling aborts the whole file.
        Console.WriteLine(e.GetBaseException().Message);
        sw.WriteLine(e.GetBaseException().Message);
    }

    return false;
}

/// <summary>
/// Splits a cell of the form "name (number)" into the trimmed name and the raw text
/// between the last '(' and the first ')' that follows it.
/// </summary>
/// <param name="value">Cell text to split.</param>
/// <param name="teamName">Text before the last '(' with surrounding whitespace removed.</param>
/// <param name="teamNoText">Text between the parentheses, not yet converted to a number.</param>
/// <exception cref="FormatException">The text has no '(' followed by a ')'.</exception>
private static void SplitTeamCell(string value, out string teamName, out string teamNoText)
{
    var open = value.LastIndexOf('(');
    var close = open < 0 ? -1 : value.IndexOf(')', open);

    if (close < 0)
    {
        throw new FormatException("Expected 'name (number)' but found '" + value + "'.");
    }

    teamName = value.Substring(0, open).Trim();

    // Substring takes a length, not an end index: the characters strictly between '(' and ')'.
    teamNoText = value.Substring(open + 1, close - open - 1);
}