/// <summary>
/// Runs Tesseract OCR on the PDF at <paramref name="pdfPath"/> and returns the outputs
/// selected by <paramref name="resultTypeEnum"/>.
/// </summary>
/// <param name="pdfPath">Path of the PDF file to process.</param>
/// <param name="resultTypeEnum">
/// Flags selecting which outputs to populate: <c>Text</c>, <c>OcrBest</c>, <c>OcrCompressed</c>, or <c>All</c>.
/// </param>
/// <param name="language">Language code handed to the OCR engine; defaults to "tur".</param>
/// <returns>
/// An <see cref="OcrResult"/> with the requested members filled in. When the document already
/// contains text, OCR is skipped and an empty result is returned.
/// </returns>
public OcrResult Ocr(string pdfPath, ResultTypeEnum resultTypeEnum, string language = "tur")
{
    var result = new OcrResult();

    // The document is disposed on every exit path, including the early return and exceptions.
    using (var doc = PDFDoc.Open(pdfPath))
    {
        // A PDF that already carries text is left alone; the caller gets an empty result.
        if (!string.IsNullOrEmpty(doc.GetText()))
        {
            return result;
        }

        var workDirectory = Path.Combine(Path.GetTempPath(), "Clock.hocr");
        try
        {
            doc.Ocr(Clock.Utils.OcrMode.Tesseract, language, WriteTextMode.Word, null);

            var wantsAll = resultTypeEnum.HasFlag(ResultTypeEnum.All);

            if (wantsAll || resultTypeEnum.HasFlag(ResultTypeEnum.Text))
            {
                result.Text = doc.GetText();
            }

            if (wantsAll || resultTypeEnum.HasFlag(ResultTypeEnum.OcrBest))
            {
                result.OcredBest = doc.PDFBytes;
            }

            if (wantsAll || resultTypeEnum.HasFlag(ResultTypeEnum.OcrCompressed))
            {
                result.OcredCompressed = CompressPdf(doc.PDFBytes);
            }
        }
        finally
        {
            // Remove the temporary working directory even when OCR or compression fails.
            // The existence check keeps a cleanup failure from masking the original exception.
            // NOTE(review): presumably this directory is created by the OCR step - confirm.
            if (System.IO.Directory.Exists(workDirectory))
            {
                _fileHelper.DeleteDirectory(workDirectory);
            }
        }
    }

    return result;
}

/// <summary>
/// Re-saves the given PDF as version 1.5 with full compression and returns the resulting bytes.
/// </summary>
/// <param name="pdfBytes">Bytes of the PDF to compress.</param>
/// <returns>The bytes of the compressed PDF.</returns>
private static byte[] CompressPdf(byte[] pdfBytes)
{
    var reader = new PdfReader(pdfBytes);
    try
    {
        // The stamper writes into memory, so no file handle can leak and no temp file
        // has to be written and read back.
        var output = new MemoryStream();
        var stamper = new PdfStamper(reader, output, PdfWriter.VERSION_1_5);

        // Each page's content is set back onto the reader unchanged.
        // NOTE(review): presumably this makes the stamper rewrite (and so recompress)
        // the content streams - confirm against the iTextSharp documentation.
        var pageCount = reader.NumberOfPages;
        for (var page = 1; page <= pageCount; page++)
        {
            reader.SetPageContent(page, reader.GetPageContent(page));
        }

        stamper.SetFullCompression();
        stamper.Close();

        // MemoryStream.ToArray remains valid after the stream has been closed.
        return output.ToArray();
    }
    finally
    {
        reader.Close();
    }
}
/// <summary>
/// Activity body: loads the file system item identified by the workflow argument
/// <c>ArgFileSystemItemId</c>, and if its PDF contains no text, runs Tesseract OCR on it,
/// saves the OCRed PDF and writes the recognized text to a text file.
/// </summary>
/// <param name="context">Execution context supplying the workflow data and the activity arguments.</param>
/// <remarks>
/// Failures are logged and end the activity without setting <c>Result</c>. On the success path
/// <c>Result</c> is set to <c>true</c> and <c>TesseractOcrOk</c> is logged, including when the PDF
/// already contained text and OCR was skipped.
/// </remarks>
protected override void Execute(NativeActivityContext context)
{
    WorkflowDataContext dataContext = context.DataContext;
    PropertyDescriptorCollection properties = dataContext.GetProperties();

    // Case-sensitive lookup by name; a missing property yields null instead of throwing.
    string sessionId = properties.Find("ArgSessionId", false)?.GetValue(dataContext) as string;
    object rawFileSystemItemId = properties.Find("ArgFileSystemItemId", false)?.GetValue(dataContext);

    // A missing, null or non-int value is treated as "not provided" instead of failing on unboxing.
    int fileSystemItemId = rawFileSystemItemId is int ? (int)rawFileSystemItemId : 0;

    if (string.IsNullOrWhiteSpace(sessionId))
    {
        LogHelper.AddGeneralLog(GeneralLogTypeEnum.ActivityError, "SessionId is null.");
        return;
    }

    if (fileSystemItemId == 0)
    {
        LogHelper.AddSessionLog(SessionLogTypeEnum.ActivityError, sessionId, "FileSystemItemId is null.");
        return;
    }

    FileSystemItemDto fileSystemItem = null;
    try
    {
        using (var sqlDbContext = new SqlDbContext())
        {
            fileSystemItem = sqlDbContext.FileSystemItems.FirstOrDefault(x => x.Id == fileSystemItemId);
        }
    }
    catch (Exception ex)
    {
        LogHelper.AddSessionLog(SessionLogTypeEnum.ActivityError, sessionId, $"Exception has been thrown when getting fileSystemItem. FileSystemItemId: {fileSystemItemId}", ex);
        return;
    }

    // FirstOrDefault returns null when no row matches; without this guard the code below
    // (including its catch handler) would dereference null.
    if (fileSystemItem == null)
    {
        LogHelper.AddSessionLog(SessionLogTypeEnum.ActivityError, sessionId, $"FileSystemItem could not be found. FileSystemItemId: {fileSystemItemId}");
        return;
    }

    // Choose a text file path that does not collide with an existing file.
    var textFilePath = GetUniqueFilePath(TargetFilePathForTextFile.Get(context));

    try
    {
        TempData.Instance.TempPath = Path.GetTempPath();

        using (PDFDoc doc = PDFDoc.Open(fileSystemItem.FullPath))
        {
            // OCR only documents that have no text yet.
            if (string.IsNullOrEmpty(doc.GetText()))
            {
                doc.Ocr(OcrMode.Tesseract, "tur", WriteTextMode.Word);
                doc.Save(TargetFilePathForPDFFile.Get(context));

                var ocrText = doc.GetText();

                // Write to the collision-free path so an existing text file is not overwritten.
                File.WriteAllText(textFilePath, ocrText);
            }
        }
    }
    catch (Exception ex)
    {
        LogHelper.AddFileSystemItemLog(fileSystemItemId, fileSystemItem.SourceId, sessionId, FileSystemItemLogTypeEnum.StoppedWithError, $"Exception has been thrown in OCR operation. FileSystemItemId: {fileSystemItemId}", ex);
        return;
    }

    LogHelper.AddFileSystemItemLog(fileSystemItem.Id, fileSystemItem.SourceId, sessionId, FileSystemItemLogTypeEnum.TesseractOcrOk);
    Result.Set(context, true);
}

/// <summary>
/// Returns <paramref name="path"/> when no file exists there; otherwise appends 1, 2, 3, ...
/// to the file name (before the extension) until an unused path is found.
/// </summary>
/// <param name="path">Desired file path.</param>
/// <returns>A path at which no file existed at the time of the check.</returns>
private static string GetUniqueFilePath(string path)
{
    if (!File.Exists(path))
    {
        return path;
    }

    // GetDirectoryName returns null for root paths; Path.Combine rejects null.
    var directory = Path.GetDirectoryName(path) ?? string.Empty;
    var baseName = Path.GetFileNameWithoutExtension(path);
    var extension = Path.GetExtension(path);

    // Every candidate is built from the original base name, so suffixes do not compound
    // (name1, name2, ... rather than name1, name12, ...).
    var candidate = path;
    for (var counter = 1; File.Exists(candidate); counter++)
    {
        candidate = Path.Combine(directory, baseName + counter + extension);
    }

    return candidate;
}