/// <summary> /// 通过PDF文档对象解析PDF /// </summary> /// <param name="pdfDocument">PDF文档</param> /// <param name="tableContainType">表格包含样式</param> /// <returns></returns> public static PDFModel Parser(PDDocument pdfDocument, TableContainType tableContainType) { ObjectExtractor extractor = new ObjectExtractor(pdfDocument); PageIterator pageIterator = extractor.extract(); SpreadsheetExtractionAlgorithm tableExtractor = new SpreadsheetExtractionAlgorithm(); PDFModel pdfModel = new PDFModel(); PDFTextStripper pdfStripper = new PDFTextStripper(); pdfStripper.setPageEnd(pageEndMark); //pdfStripper.setParagraphEnd(paragraphEndMark); string[] strs = Regex.Split(pdfStripper.getText(pdfDocument), pageEndMark, RegexOptions.IgnoreCase); if (strs != null && strs.Length > 0) { pdfModel.Pages = new List <PdfPageModel>(); int cp = 0; while (pageIterator.hasNext()) { PdfPageModel pdfPage = new PdfPageModel(); pdfPage.CurrentPage = cp + 1; pdfPage.Text = strs[cp]; List <Table> tables = new List <Table>(); Page page = pageIterator.next(); var pageTables = tableExtractor.extract(page).toArray(); if (pageTables != null && pageTables.Length > 0) { for (int i = 0; i < pageTables.Length; i++) { tables.Add(pageTables[i] as Table); } } pdfPage.Tables = tables; pdfModel.Pages.Add(pdfPage); cp++; } pdfModel.PageNumber = pdfModel.Pages.Count; return(PdfTextFormater(pdfModel, tableContainType)); } return(null); }
/// <summary> /// 对PDF解析出的文字进行格式化,去掉页眉,页脚,页码,可识别的表格,并尝试在对分页后的数据进行拼接; /// 对常用的指代与冒号分割的进行表格化或Json化; /// 表格提供方便显示的CSV格式载入和方便机器计算的Json加载模式; /// </summary> /// <param name="pdf">结构化后的PDF</param> /// <returns></returns> protected static PDFModel PdfTextFormater(PDFModel pdf, TableContainType tableContainType) { StringBuilder sbFileContent = new StringBuilder(); string fileContent = string.Empty; //猜测非空行的最大长度区间, 统计所有行的字数,去除小于5的部分,获取平均数作为可能的最小非换行字数 int minLineCount = 25; List <int> countList = new List <int>(); if (pdf.PageNumber > 0 && pdf.Pages != null) { //1. 循环所有的页,提取去除前后空格后的文字,使用分隔符将文字分隔为数组 Dictionary <string, ContentRemoveTag> maybeBeginText = new Dictionary <string, ContentRemoveTag>(); Dictionary <string, ContentRemoveTag> maybeEndText = new Dictionary <string, ContentRemoveTag>(); List <TagPosition> needRemovePage = new List <TagPosition>(); for (int page = 1; page <= pdf.Pages.Count; page++) { PdfPageModel pdfPageModel = pdf.Pages[page - 1]; //使用换行符拆分字符串 string[] pageTexts = lineRegex.Split(pdfPageModel.Text); List <int> tempCountList = pageTexts.Select(f => f.Replace(" ", "").Replace(" ", "").Replace("\r", "").Replace("\n", "").Length).Where(f => f > 15).ToList(); if (tempCountList != null && tempCountList.Count > 0) { countList.AddRange(tempCountList); } //获取非空字符串的前三行和后三行的文字部分 int beginGetCount = 0; int endGetCount = 0; for (int i = 0; i < pageTexts.Length; i++) { if (beginGetCount < 3) { TagPosition tagPosition = new TagPosition(); tagPosition.PageNumber = page; tagPosition.LineNumber = i; string _cleanText = pageTexts[i].Replace(" ", "").Replace(" ", "").Replace("\r", "").Replace("\n", ""); if (!string.IsNullOrWhiteSpace(_cleanText)) { int numberCount = NumberOfDigits(_cleanText); //去掉单行单个数字的行,同时去除分页前后的换行。 if (numberCount == _cleanText.Length && numberCount < 10 && numberCount >= 1) { needRemovePage.Add(tagPosition); } else { if (maybeBeginText.ContainsKey(_cleanText)) { maybeBeginText[_cleanText].OccurCount++; maybeBeginText[_cleanText].tagPositions.Add(tagPosition); } else { maybeBeginText[_cleanText] = new ContentRemoveTag(); maybeBeginText[_cleanText].Content = _cleanText; maybeBeginText[_cleanText].OccurCount++; maybeBeginText[_cleanText].tagPositions = new List <TagPosition>(); maybeBeginText[_cleanText].tagPositions.Add(tagPosition); } beginGetCount++; } } else if (beginGetCount == 0) { needRemovePage.Add(tagPosition); } } if (endGetCount < 3) { TagPosition tagPosition = new TagPosition(); tagPosition.PageNumber = page; tagPosition.LineNumber = pageTexts.Length - i - 1; string _cleanText = pageTexts[pageTexts.Length - i - 1].Replace(" ", "").Replace(" ", "").Replace("\r", "").Replace("\n", ""); if (!string.IsNullOrWhiteSpace(_cleanText)) { int numberCount = NumberOfDigits(_cleanText); //去掉单行单个数字的行,同时去除分页前后的换行。 if (numberCount == _cleanText.Length && numberCount < 10 && numberCount >= 1) { needRemovePage.Add(tagPosition); } else { if (maybeEndText.ContainsKey(_cleanText)) { maybeEndText[_cleanText].OccurCount++; maybeEndText[_cleanText].tagPositions.Add(tagPosition); } else { maybeEndText[_cleanText] = new ContentRemoveTag(); maybeEndText[_cleanText].Content = _cleanText; maybeEndText[_cleanText].OccurCount++; maybeEndText[_cleanText].tagPositions = new List <TagPosition>(); maybeEndText[_cleanText].tagPositions.Add(tagPosition); } endGetCount++; } } else if (endGetCount == 0) { needRemovePage.Add(tagPosition); } } } } //比较和记录出现的频率 foreach (var _beginItem in maybeBeginText.Values) { if (_beginItem.OccurCount > 2 && _beginItem.OccurCount >= (pdf.Pages.Count - 2)) { needRemovePage.AddRange(_beginItem.tagPositions); } } foreach (var _endItem in maybeEndText.Values) { if (_endItem.OccurCount > 2 && _endItem.OccurCount >= (pdf.Pages.Count - 2)) { needRemovePage.AddRange(_endItem.tagPositions); } } if (countList != null && countList.Count > 0) { minLineCount = Math.Min(countList.Sum() / countList.Count, 30); } //2. 对段落进行合并和返回 int currentTablePos = 0; bool isTableStarted = false; bool lastIsEnd = true; for (int page = 1; page <= pdf.Pages.Count; page++) { //处理上一页遗留的表格数据 if (isTableStarted && page > 1) { PdfPageModel lastPdfPageModel = pdf.Pages[page - 2]; if (lastPdfPageModel.Tables != null && lastPdfPageModel.Tables.Count > currentTablePos) { string lastTableStr = TableWriter.ToString(lastPdfPageModel.Tables[currentTablePos], tableContainType); //sbFileContent.AppendLine(tableStartMark); sbFileContent.AppendLine(lastTableStr.Replace("\r", "").Replace("\n", "\r\n")); //sbFileContent.AppendLine(tableEndMark); } } PdfPageModel pdfPageModel = pdf.Pages[page - 1]; string[] pageTexts = lineRegex.Split(pdfPageModel.Text); //对表格进行结构化 List <string> tableStrs = new List <string>(); if (pdfPageModel.Tables != null && pdfPageModel.Tables.Count > 0) { foreach (Table table in pdfPageModel.Tables) { try { tableStrs.Add(TableWriter.ToString(table, tableContainType)); } catch { } } } currentTablePos = 0; isTableStarted = false; //bool needCleanMenu = false; //清理需要清理的行,并进行合并 for (int i = 0; i < pageTexts.Length; i++) { //忽略页码行数据 if (needRemovePage.Any(f => f.PageNumber == page && f.LineNumber == i)) { lastIsEnd = true; continue; } string cleanText = pageTexts[i]; bool isMatchTable = false; //判断当前页的表格是否包含,存在的情况将表格列替换为表格位置标识的形式,后续替换为CSV或JSON tableGuess: if (tableStrs != null && tableStrs.Count > currentTablePos) { if (!string.IsNullOrWhiteSpace(cleanText)) { string tableStr = tableStrs[currentTablePos]; string[] words = Regex.Split(cleanText, @"[^\u4e00-\u9fa5a-zA-z0-9]+"); if (words != null && words.Length > 0) { foreach (var word in words) { if (string.IsNullOrWhiteSpace(word)) { continue; } if (tableStr.Contains(word)) { isMatchTable = true; continue; } else { isMatchTable = false; break; } } } if (isMatchTable) { isTableStarted = true; continue; } if (isTableStarted && !isMatchTable) { //sbFileContent.AppendLine(tableStartMark); sbFileContent.AppendLine(tableStr.Replace("\r", "").Replace("\n", "\r\n")); //sbFileContent.AppendLine(tableEndMark); lastIsEnd = true; isTableStarted = false; currentTablePos++; goto tableGuess; } } } //忽略目录部分的数据 string onlyText = cleanText.Replace(" ", "").Replace(" ", "").Replace("\r", "").Replace("\n", ""); if (onlyText == "目录" || onlyText.ToUpper() == "MENU") { //needCleanMenu = true; continue; } //if (needCleanMenu) { if (string.IsNullOrWhiteSpace(onlyText) || Regex.IsMatch(onlyText, @".*?(\.{6,}\s*\d+)\s*")) { continue; } //else // needCleanMenu = false; } //判断是否以正常中止标点符号结尾 bool endWithStopFlag = cleanText.EndsWith("。") || cleanText.EndsWith("!") || cleanText.EndsWith(":") || cleanText.EndsWith(";"); //判断该行是否包含正文常见标点符号 bool includeNormalFlag = Regex.IsMatch(cleanText, @"[!;,。“]"); //统计非空格字数 int _lineCount = onlyText.Length; //判断该行字数是否大于最小行字数 bool isLenThanMinLineCount = _lineCount >= minLineCount; bool firstException = false; //特例一:存在明显的排序性质的行,如 ◆,(一),■ 等 if (cleanText.StartsWith("◆") || cleanText.StartsWith("■") || cleanText.StartsWith("(")) { firstException = true; } //特例二:该行存在:的情况,较大可能是一段的开始 if (!endWithStopFlag && !includeNormalFlag && !isLenThanMinLineCount && (cleanText.Contains(":") || cleanText.Contains(":"))) { firstException = true; } //情景一:该行是一段的结尾,加上段落的文字后换行 if (!firstException && endWithStopFlag) { sbFileContent.Append(cleanText); lastIsEnd = true; } //情景二:该行是普通的一行,并未结束 else if (!firstException && !endWithStopFlag && isLenThanMinLineCount) { sbFileContent.Append(cleanText); lastIsEnd = false; } //情景三:该行是独立行 else if (lastIsEnd && (firstException || (!isLenThanMinLineCount && !endWithStopFlag && !includeNormalFlag))) { sbFileContent.AppendLine(cleanText); lastIsEnd = true; } //情景四:该行为独立的段落 else { sbFileContent.AppendLine(cleanText); lastIsEnd = true; } } } } //去掉首尾的换行 fileContent = sbFileContent.ToString().Trim('\r', '\n', ' ', '\t'); pdf.Text = fileContent; return(pdf); }