예제 #1
0
        /// <summary>
        /// Loads "sample3.pdf" and prints the value of every table cell on every page
        /// to the console, one value per line.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.StructuredExtractor instance (former TableExtractor).
            // The using statement disposes the extractor even if extraction throws.
            using (StructuredExtractor extractor = new StructuredExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample3.pdf");

                // Query the page count once instead of on every loop iteration.
                int pageCount = extractor.GetPageCount();

                for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
                {
                    Console.WriteLine("starting extraction from page #" + pageIndex);

                    // The structure of a page is prepared before its rows and cells are queried.
                    extractor.PrepareStructure(pageIndex);

                    int rowCount = extractor.GetRowCount(pageIndex);

                    for (int row = 0; row < rowCount; row++)
                    {
                        // The column count is requested per row, so rows may differ in width.
                        int columnCount = extractor.GetColumnCount(pageIndex, row);

                        for (int col = 0; col < columnCount; col++)
                        {
                            Console.WriteLine(extractor.GetCellValue(pageIndex, row, col));
                        }
                    }
                }
            }

            Console.WriteLine("Press any key..");
            Console.ReadKey();
        }
예제 #2
0
        /// <summary>
        /// Loads ".\sample3.pdf" and prints the value of every table cell on every page
        /// to the console, one value per line, then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.StructuredExtractor instance.
            // The using statement disposes the extractor even if extraction throws.
            using (StructuredExtractor extractor = new StructuredExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\sample3.pdf");

                // Query the page count once instead of on every loop iteration.
                int pageCount = extractor.GetPageCount();

                for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
                {
                    Console.WriteLine("Starting extraction from page #" + pageIndex);
                    Console.WriteLine();

                    // The structure of a page is prepared before its rows and cells are queried.
                    extractor.PrepareStructure(pageIndex);

                    int rowCount = extractor.GetRowCount(pageIndex);

                    for (int row = 0; row < rowCount; row++)
                    {
                        // The column count is requested per row, so rows may differ in width.
                        int columnCount = extractor.GetColumnCount(pageIndex, row);

                        for (int col = 0; col < columnCount; col++)
                        {
                            Console.WriteLine(extractor.GetCellValue(pageIndex, row, col));
                        }
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine("Press any key..");
            Console.ReadKey();
        }
예제 #3
0
        /// <summary>
        /// Reads the team entries found in the first column of every page of a PDF and stores
        /// one <c>MatchData</c> record per entry through <c>Db</c>.
        /// </summary>
        /// <remarks>
        /// The file name without its extension is split on '-': the first part is used as the
        /// match name, the second as the league id. The first two rows of every page are skipped.
        /// A team cell is expected to look like <c>Team name (number)</c>. When the cell does not
        /// yet contain a complete <c>(number)</c> suffix, the first-column text of up to five
        /// following rows is appended (those rows are consumed and not processed again).
        /// Rows with an empty first column are ignored. Rows that cannot be parsed or saved are
        /// reported to <paramref name="sw"/> and processing continues with the next row.
        /// </remarks>
        /// <param name="file">PDF file to parse; its name must look like <c>matchName-leagueId.pdf</c>.</param>
        /// <param name="sw">Writer that receives the error reports.</param>
        /// <returns>
        /// <c>true</c> when the whole document was walked (individual rows may still have been
        /// reported as errors); <c>false</c> when the file name is malformed or processing was
        /// aborted by an exception.
        /// </returns>
        public static bool ParsePdf(FileInfo file, StreamWriter sw)
        {
            const int firstDataRow        = 2; // rows 0 and 1 of every page are skipped
            const int maxContinuationRows = 5; // upper bound on rows merged into one team cell

            try
            {
                // Strip only the real extension. Replace(".pdf", "") was case-sensitive and would
                // also remove ".pdf" from the middle of a name.
                var names = Path.GetFileNameWithoutExtension(file.Name).Split('-');
                if (names.Length < 2)
                {
                    var nameError = "File name does not match <matchName>-<leagueId>: " + file.Name;
                    Console.WriteLine(nameError);
                    sw.WriteLine(nameError);
                    return(false);
                }

                string matchName = names[0], matchId = names[1];

                // Dispose the extractor on every exit path, including exceptions.
                using (var extractor = new StructuredExtractor())
                {
                    extractor.RegistrationName = "demo";
                    extractor.RegistrationKey  = "demo";

                    // Load sample PDF document
                    extractor.LoadDocumentFromFile(file.FullName);

                    var pageCount = extractor.GetPageCount();

                    for (var pageIndex = 0; pageIndex < pageCount; pageIndex++)
                    {
                        Console.WriteLine("Starting extraction from page #" + pageIndex);
                        extractor.PrepareStructure(pageIndex);
                        var rowCount = extractor.GetRowCount(pageIndex);

                        for (var row = firstDataRow; row < rowCount; row++)
                        {
                            // Kept outside the try block so the error report can show what was read
                            // without querying the extractor again (which could itself throw).
                            var cellText = string.Empty;

                            try
                            {
                                // Only the first column carries the team entry.
                                if (extractor.GetColumnCount(pageIndex, row) == 0)
                                {
                                    continue;
                                }

                                cellText = extractor.GetCellValue(pageIndex, row, 0);
                                if (string.IsNullOrEmpty(cellText))
                                {
                                    // Nothing to store; previously this saved an empty team name
                                    // together with the team number of the preceding row.
                                    continue;
                                }

                                string teamName, teamNoText;
                                var parsed = TrySplitTeamCell(cellText, out teamName, out teamNoText);

                                // The entry may be wrapped over several rows: keep appending the next
                                // row's first cell until "(number)" is complete, without running
                                // past the last row of the page.
                                for (var appended = 0;
                                     !parsed && appended < maxContinuationRows && row + 1 < rowCount;
                                     appended++)
                                {
                                    row++;
                                    cellText += $" {extractor.GetCellValue(pageIndex, row, 0)}";
                                    cellText  = cellText.Replace("  ", " ");
                                    parsed    = TrySplitTeamCell(cellText, out teamName, out teamNoText);
                                }

                                if (!parsed)
                                {
                                    WriteRowError(sw, cellText, row, file.Name, "Team cell does not contain a \"(number)\" part.");
                                    continue;
                                }

                                Db.MatchDatas.Add(new MatchData()
                                {
                                    LeagueId  = matchId.ToInt(),
                                    MatchName = matchName,
                                    TeamName  = teamName,
                                    TeamNo    = teamNoText.ToInt()
                                });
                                Db.SaveChanges();
                            }
                            catch (Exception e)
                            {
                                // A bad row must not abort the remaining rows of the file.
                                WriteRowError(sw, cellText, row, file.Name, e.GetBaseException().Message);
                            }
                        }
                    }
                }
                return(true);
            }
            catch (Exception e)
            {
                Console.WriteLine(e.GetBaseException().Message);
                sw.WriteLine(e.GetBaseException().Message);
            }
            return(false);
        }

        /// <summary>
        /// Splits a cell of the form <c>Team name (number)</c> at its last opening parenthesis.
        /// </summary>
        /// <param name="text">Cell text to split.</param>
        /// <param name="teamName">Trimmed text before the last '('; empty when the split fails.</param>
        /// <param name="teamNoText">Trimmed text between that '(' and the next ')'; empty when the split fails.</param>
        /// <returns><c>true</c> when a '(' followed by a ')' was found; otherwise <c>false</c>.</returns>
        private static bool TrySplitTeamCell(string text, out string teamName, out string teamNoText)
        {
            teamName   = string.Empty;
            teamNoText = string.Empty;

            var open = text.LastIndexOf('(');
            if (open < 0)
            {
                return(false);
            }

            // Search for ')' only after the chosen '(' so both parts refer to the same group.
            var close = text.IndexOf(')', open + 1);
            if (close < 0)
            {
                return(false);
            }

            teamName   = text.Substring(0, open).Trim();
            teamNoText = text.Substring(open + 1, close - open - 1).Trim();
            return(true);
        }

        /// <summary>
        /// Writes one framed error report for a row that could not be processed.
        /// </summary>
        /// <param name="sw">Writer that receives the report.</param>
        /// <param name="cellText">Cell text read so far for the row (may be empty).</param>
        /// <param name="row">Index of the row being processed when the error occurred.</param>
        /// <param name="fileName">Name of the PDF file being parsed.</param>
        /// <param name="message">Description of the error.</param>
        private static void WriteRowError(StreamWriter sw, string cellText, int row, string fileName, string message)
        {
            sw.WriteLine("--------------------------------------------------------------------------------------");
            sw.WriteLine(cellText);
            sw.WriteLine("Error in file roe no :" + row + "--------" + fileName);
            sw.WriteLine(message);
            sw.WriteLine("--------------------------------------------------------------------------------------");
        }