Esempio n. 1
0
        /// <summary>
        /// Parses a PDF document object into a <see cref="PDFModel"/>, collecting the text and
        /// the extracted tables of every page.
        /// </summary>
        /// <param name="pdfDocument">The PDF document to parse.</param>
        /// <param name="tableContainType">
        /// Table containment style; passed through unchanged to <c>PdfTextFormater</c>.
        /// </param>
        /// <returns>The result of <c>PdfTextFormater</c> applied to the populated model.</returns>
        public static PDFModel Parser(PDDocument pdfDocument, TableContainType tableContainType)
        {
            ObjectExtractor extractor = new ObjectExtractor(pdfDocument);
            PageIterator pageIterator = extractor.extract();
            SpreadsheetExtractionAlgorithm tableExtractor = new SpreadsheetExtractionAlgorithm();

            // Configure the stripper to use pageEndMark as the page-end separator so the
            // full document text can be cut back into per-page chunks below.
            PDFTextStripper pdfStripper = new PDFTextStripper();
            pdfStripper.setPageEnd(pageEndMark);

            // The marker is literal text, not a pattern: escape it so that any regex
            // metacharacters it contains cannot break or distort the split.
            // Regex.Split always returns at least one element, so no null/empty check is needed.
            string[] pageTexts = Regex.Split(
                pdfStripper.getText(pdfDocument),
                Regex.Escape(pageEndMark),
                RegexOptions.IgnoreCase);

            PDFModel pdfModel = new PDFModel();
            pdfModel.Pages = new List<PdfPageModel>();

            int pageIndex = 0;
            while (pageIterator.hasNext())
            {
                Page page = pageIterator.next();

                PdfPageModel pdfPage = new PdfPageModel();
                pdfPage.CurrentPage = pageIndex + 1;

                // Guard the lookup: if the iterator yields more pages than text chunks were
                // produced, fall back to empty text instead of throwing IndexOutOfRangeException.
                pdfPage.Text = pageIndex < pageTexts.Length ? pageTexts[pageIndex] : string.Empty;

                List<Table> tables = new List<Table>();
                var pageTables = tableExtractor.extract(page).toArray();
                if (pageTables != null)
                {
                    foreach (var candidate in pageTables)
                    {
                        // Only keep real Table instances; an unchecked "as" cast would
                        // silently add null entries for anything else.
                        Table table = candidate as Table;
                        if (table != null)
                        {
                            tables.Add(table);
                        }
                    }
                }

                pdfPage.Tables = tables;
                pdfModel.Pages.Add(pdfPage);
                pageIndex++;
            }

            pdfModel.PageNumber = pdfModel.Pages.Count;

            return PdfTextFormater(pdfModel, tableContainType);
        }