예제 #1
0
        /// <summary>
        /// Builds the guessed table name and the cell contents from an OTable object.
        /// Every table cell is stored in a dictionary whose key is made of the table's
        /// sequence number, the row number and the column number within that row,
        /// joined by "_" (for example "1_0_2").
        /// </summary>
        /// <param name="oTable">OCR result whose <c>result.regions</c> are parsed.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="oTable"/> is null.</exception>
        public OCRTable(OTable oTable)
        {
            if (oTable == null)
            {
                throw new System.ArgumentNullException("oTable");
            }

            valueDict = new Dictionary<string, string>();
            var tableName = new StringBuilder();
            // row number -> number of cells already seen in that row of the CURRENT table
            var colCountDict = new Dictionary<int, int>();
            int maxRow       = int.MinValue;
            int tableIndex   = 0;

            foreach (var region in oTable.result.regions)
            {
                if (region.type == "text")
                {
                    // Text regions contribute to the guessed table name when they match regex_ch.
                    foreach (var blk in region.Blocks)
                    {
                        var word = blk.Words.Replace("\n", "").Replace(" ", "");
                        if (regex_ch.IsMatch(word))
                        {
                            tableName.Append(word);
                        }
                    }
                }
                else if (region.type == "table")
                {
                    tableIndex++;
                    // Column ordinals are per table: start counting from zero again so the
                    // second and later tables do not continue the previous table's numbering.
                    colCountDict.Clear();

                    foreach (var blk in region.Blocks)
                    {
                        var word = blk.Words.Replace("\n", "").Replace(" ", "");

                        // A cell is keyed by the first row it occupies (0 when it lists none).
                        int curRow = 0;
                        if (blk.Rows.Count > 0)
                        {
                            curRow = blk.Rows[0];
                        }

                        // The cell's column is the number of cells seen so far in that row;
                        // TryGetValue leaves curCol at 0 when the row has not been seen yet.
                        int curCol;
                        colCountDict.TryGetValue(curRow, out curCol);

                        // Count this cell against every row it occupies and track the largest row number.
                        foreach (var row in blk.Rows)
                        {
                            int seen;
                            colCountDict.TryGetValue(row, out seen);
                            colCountDict[row] = seen + 1;
                            if (maxRow < row)
                            {
                                maxRow = row;
                            }
                        }

                        valueDict.Add(string.Format("{0}_{1}_{2}", tableIndex, curRow, curCol), word);
                    }
                }
            }

            GuessTableName = tableName.ToString();
            // NOTE(review): RowCount is the largest row number seen across all tables and stays
            // int.MinValue when no table cell carries row information - confirm callers expect that.
            RowCount = maxRow;
        }
        /// <summary>
        /// Recursively walks <paramref name="pdfPath"/>, runs table OCR on every page of each PDF
        /// whose table name matches <paramref name="regex"/>, and writes the recognized text to a
        /// .txt file in the same directory.
        /// </summary>
        /// <param name="pdfPath">Directory to scan; its sub-directories are processed first.</param>
        /// <param name="regex">Filter applied to the table name (the name mapped by getFileDict, or the PDF file name when no mapping exists).</param>
        /// <param name="adjustStrDict">Dictionary handed to <c>OCRTable.AdjustStringByDict</c> for every recognized page.</param>
        /// <param name="stepFunc">Callback invoked once for every PDF file found, before the name filter is applied.</param>
        private void processPdf(string pdfPath, System.Text.RegularExpressions.Regex regex, Dictionary <string, string> adjustStrDict, Action stepFunc)
        {
            var subFolders = Directory.GetDirectories(pdfPath);

            foreach (var subFolder in subFolders)
            {
                processPdf(subFolder, regex, adjustStrDict, stepFunc);
            }
            var    pdfFiles = Directory.GetFiles(pdfPath, "*.pdf");
            string dh;
            var    dict = getFileDict(Path.Combine(pdfPath, "目录.doc"), out dh);

            foreach (var pdf in pdfFiles)
            {
                stepFunc();
                string originName = Path.GetFileNameWithoutExtension(pdf);
                string tableName  = originName;
                if (dict.ContainsKey(tableName))
                {
                    tableName = dict[tableName];
                }
                if (!regex.IsMatch(tableName))
                {
                    continue;
                }

                // Convert the PDF into a list of page images.
                var imgs  = ConvertPdf2Image.Convert(pdf, definition: ConvertPdf2Image.Definition.Four);
                var index = 1;
                foreach (var img in imgs)
                {
                    // Encode the page as a base64 PNG so it can be passed to the Huawei OCR API.
                    // The image is disposed as soon as it has been encoded, and also when
                    // encoding throws, so a failed Save no longer leaks the image.
                    string imgBase64;
                    using (img)
                    using (var stream = new MemoryStream())
                    {
                        img.Save(stream, System.Drawing.Imaging.ImageFormat.Png);
                        imgBase64 = Convert.ToBase64String(stream.ToArray());
                    }

                    // Call the Huawei OCR interface; it returns the recognition result as JSON.
                    var jsonString = OCRParser.GetTableJsonStringByBase64(imgBase64);
                    // Deserialize into an OTable object and parse it.
                    OTable   table    = JsonConvert.DeserializeObject <OTable>(jsonString);
                    OCRTable oCRTable = new OCRTable(table);
                    // Apply the text corrections to the recognized content.
                    oCRTable.AdjustStringByDict(adjustStrDict);
                    var ocrTableString = oCRTable.ToString();

                    // Build the output file name: "<dh>_<originName><tableName>", plus "_<page>"
                    // when the PDF produced more than one image.
                    string txtFileName = Path.Combine(pdfPath, string.Format("{0}_{1}{2}", dh, originName, tableName));
                    if (imgs.Count > 1)
                    {
                        txtFileName += "_" + index++;
                    }
                    txtFileName += ".txt";
                    File.WriteAllText(txtFileName, ocrTableString);
                }
            }
        }