/// <summary>
/// Builds the guessed table name and the cell contents from an OCR result.
/// Words of "text" regions that match <c>regex_ch</c> are concatenated into the table name.
/// Every block of a "table" region becomes one dictionary entry whose key is
/// "tableIndex_row_column" (three parts joined by "_"): the 1-based ordinal of the
/// table region, the block's first row number, and the number of cells already
/// counted on that row when the block is reached.
/// </summary>
/// <param name="oTable">OCR result whose <c>result.regions</c> are walked in order.</param>
public OCRTable(OTable oTable)
{
    valueDict = new Dictionary<string, string>();
    var tableName = new StringBuilder();

    // Cells seen so far per row number; the current count is used as the column
    // index of the next cell on that row.
    // NOTE(review): this map is shared by all table regions, so column numbering of a
    // later table continues from the earlier ones - confirm this is intended.
    var colCountDict = new Dictionary<int, int>();

    // Stays int.MinValue when no table block carries a row number.
    int maxRow = int.MinValue;
    int tableIndex = 0;

    foreach (var region in oTable.result.regions)
    {
        if (region.type == "text")
        {
            foreach (var blk in region.Blocks)
            {
                // Line breaks and spaces are stripped before matching.
                var word = blk.Words.Replace("\n", "").Replace(" ", "");
                if (regex_ch.IsMatch(word))
                {
                    tableName.Append(word);
                }
            }
        }
        else if (region.type == "table")
        {
            tableIndex++;
            foreach (var blk in region.Blocks)
            {
                var word = blk.Words.Replace("\n", "").Replace(" ", "");

                // A block without row information is filed under row 0.
                int curRow = 0;
                if (blk.Rows.Count > 0)
                {
                    curRow = blk.Rows[0];
                }

                // Column index = cells already counted on the block's first row (0 if none).
                int curCol;
                if (!colCountDict.TryGetValue(curRow, out curCol))
                {
                    curCol = 0;
                }

                // A block may span several rows; it occupies one column slot in each of them.
                foreach (var row in blk.Rows)
                {
                    int seen;
                    colCountDict.TryGetValue(row, out seen); // seen is 0 when the row is new
                    colCountDict[row] = seen + 1;

                    if (maxRow < row)
                    {
                        maxRow = row;
                    }
                }

                // NOTE(review): Add throws ArgumentException on a duplicate key, which can
                // happen when a block has no rows (its column slot is never consumed).
                valueDict.Add(string.Format("{0}_{1}_{2}", tableIndex, curRow, curCol), word);
            }
        }
    }

    GuessTableName = tableName.ToString();
    RowCount = maxRow;
}
/// <summary>
/// Recursively processes a folder: sub-folders first, then every "*.pdf" directly inside
/// <paramref name="pdfPath"/>. Each PDF whose table name matches <paramref name="regex"/>
/// is rendered to images, each image is sent to the OCR service, and the corrected table
/// text is written to a .txt file in the same folder.
/// </summary>
/// <param name="pdfPath">Folder to scan.</param>
/// <param name="regex">Filter applied to the table name (the name looked up from "目录.doc", or the PDF file name when there is no entry).</param>
/// <param name="adjustStrDict">Dictionary passed to <c>OCRTable.AdjustStringByDict</c> to correct the recognized text.</param>
/// <param name="stepFunc">Callback invoked once for every PDF found, whether or not it matches.</param>
private void processPdf(string pdfPath, System.Text.RegularExpressions.Regex regex, Dictionary<string, string> adjustStrDict, Action stepFunc)
{
    var subFolders = Directory.GetDirectories(pdfPath);
    foreach (var subFolder in subFolders)
    {
        processPdf(subFolder, regex, adjustStrDict, stepFunc);
    }

    var pdfFiles = Directory.GetFiles(pdfPath, "*.pdf");

    // The folder's catalogue file supplies a file-name -> table-name lookup and the
    // value "dh" that prefixes every output file name.
    string dh;
    var dict = getFileDict(Path.Combine(pdfPath, "目录.doc"), out dh);

    foreach (var pdf in pdfFiles)
    {
        stepFunc();

        string originName = Path.GetFileNameWithoutExtension(pdf);
        string tableName = originName;
        if (dict.ContainsKey(tableName))
        {
            tableName = dict[tableName];
        }

        if (regex.IsMatch(tableName))
        {
            // Convert the PDF into a list of images.
            var imgs = ConvertPdf2Image.Convert(pdf, definition: ConvertPdf2Image.Definition.Four);
            var index = 1;
            foreach (var img in imgs)
            {
                // Encode the image as a base64 PNG so it can be passed to the Huawei OCR API.
                string imgBase64;
                try
                {
                    using (var stream = new MemoryStream())
                    {
                        img.Save(stream, System.Drawing.Imaging.ImageFormat.Png);
                        var bytes = stream.ToArray();
                        imgBase64 = Convert.ToBase64String(bytes);
                    }
                }
                finally
                {
                    // Release the image even when encoding fails, and before the OCR call.
                    img.Dispose();
                }

                // Call the Huawei OCR interface; it returns the result as a JSON string.
                var jsonString = OCRParser.GetTableJsonStringByBase64(imgBase64);

                // Deserialize into an OTable object.
                OTable table = JsonConvert.DeserializeObject<OTable>(jsonString);
                OCRTable oCRTable = new OCRTable(table);

                // Correct the recognized text.
                oCRTable.AdjustStringByDict(adjustStrDict);
                var ocrTableString = oCRTable.ToString();

                // Build the text file name; a page suffix is added only for multi-image PDFs.
                string txtFileName = Path.Combine(pdfPath, string.Format("{0}_{1}{2}", dh, originName, tableName));
                if (imgs.Count > 1)
                {
                    txtFileName += "_" + index++;
                }
                txtFileName += ".txt";

                File.WriteAllText(txtFileName, ocrTableString);
            }
        }
    }
}