/// <summary>
/// Event handler for lines of data received from the Tesseract process.
/// Known informational messages are dropped; anything else is appended to
/// <c>LastError</c> (one message per line, without repeating text already
/// present) and added to <c>TesseractErrors</c>.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data; <c>e.Data</c> holds the received line.</param>
private void TesseractErrorReceived(object sender, DataReceivedEventArgs e)
{
    var line = e.Data;

    // Nothing to record for null/empty lines.
    if (string.IsNullOrEmpty(line))
    {
        return;
    }

    // The start-up banner is not an error.
    if (line.StartsWith("Tesseract Open Source OCR Engine", StringComparison.OrdinalIgnoreCase))
    {
        return;
    }

    // Messages containing any of these fragments are treated as noise and skipped.
    var ignoredFragments = new[] { "Too few characters", "Empty page", " diacritics", "Weak margin" };
    foreach (var fragment in ignoredFragments)
    {
        if (line.Contains(fragment, StringComparison.OrdinalIgnoreCase))
        {
            return;
        }
    }

    // Stop collecting once the error list has grown past 100 entries.
    if (TesseractErrors.Count > 100)
    {
        return;
    }

    if (string.IsNullOrEmpty(LastError))
    {
        LastError = line;
    }
    else if (!LastError.Contains(line))
    {
        // Only append text that is not already part of the accumulated error.
        LastError += Environment.NewLine + line;
    }

    TesseractErrors.Add(line);
}
/// <summary>
/// Runs Tesseract once over a batch of image files and returns the result of
/// parsing the hOCR output it produced. The image paths are written one per
/// line to a temporary list file which is passed to Tesseract as its input
/// argument. After the run, the image files, the list file and the output
/// file are deleted (best effort).
/// </summary>
/// <param name="languageCode">Value passed to Tesseract's <c>-l</c> option.</param>
/// <param name="psmMode">Page segmentation mode; omitted from the command line when null or empty.</param>
/// <param name="engineMode">Value for <c>--oem</c>; omitted when null or empty, or when <paramref name="run302"/> is true.</param>
/// <param name="imageFileNames">Paths of the images to process. These files are deleted after the run.</param>
/// <param name="run302">When true, uses <c>Configuration.Tesseract302Directory</c> and the <c>-psm</c> option spelling.</param>
/// <returns>
/// The parsed output; an empty list when no output file was produced or it could not be
/// read; a single-element list containing "Error!" when the process could not be started.
/// </returns>
public List<string> Run(string languageCode, string psmMode, string engineMode, List<string> imageFileNames, bool run302 = false)
{
    var dir = run302 ? Configuration.Tesseract302Directory : Configuration.TesseractDirectory;
    string inputFileName = Path.GetTempPath() + Guid.NewGuid() + ".txt";
    var filesToDelete = new List<string>();
    var sb = new StringBuilder();
    foreach (var imageFileName in imageFileNames)
    {
        filesToDelete.Add(imageFileName);
        sb.AppendLine(imageFileName);
    }

    File.WriteAllText(inputFileName, sb.ToString());
    filesToDelete.Add(inputFileName);
    var outputFileName = Path.GetTempPath() + Guid.NewGuid();

    using (var process = new Process())
    {
        // Path.Combine copes with a directory that has no trailing separator
        // (and matches how the single-image overload builds the path).
        process.StartInfo = new ProcessStartInfo(Path.Combine(dir, "tesseract.exe"))
        {
            UseShellExecute = true,
            Arguments = "\"" + inputFileName + "\" \"" + outputFileName + "\" -l " + languageCode
        };

        if (!string.IsNullOrEmpty(psmMode))
        {
            var prefix = run302 ? " -psm " : " --psm ";
            process.StartInfo.Arguments += prefix + psmMode;
        }

        if (!string.IsNullOrEmpty(engineMode) && !run302)
        {
            process.StartInfo.Arguments += " --oem " + engineMode;
        }

        process.StartInfo.Arguments += " hocr";

        if (_runningOnWindows)
        {
            if (run302)
            {
                process.StartInfo.WorkingDirectory = Configuration.Tesseract302Directory;
            }
            else
            {
                // NOTE(review): ErrorDataReceived is only raised when stderr is redirected and
                // BeginErrorReadLine() has been called; neither happens on this path in the code
                // visible here, so this handler looks inactive - confirm before relying on it.
                process.ErrorDataReceived += TesseractErrorReceived;
                process.StartInfo.Arguments = " --tessdata-dir \"" + Path.Combine(dir, "tessdata") + "\" " + process.StartInfo.Arguments.Trim();
            }
        }
        else
        {
            // Not on Windows: run "tesseract" directly instead of the bundled .exe.
            process.StartInfo.UseShellExecute = false;
            process.StartInfo.RedirectStandardError = true;
            process.StartInfo.FileName = "tesseract";
        }

        process.StartInfo.WindowStyle = ProcessWindowStyle.Hidden;

        try
        {
            process.Start();
        }
        catch (Exception exception)
        {
            LastError = exception.Message + Environment.NewLine + exception.StackTrace;
            TesseractErrors.Add(LastError);

            // Do not leave the temporary list file behind. The caller's images are left
            // untouched on this path, as before.
            try
            {
                File.Delete(inputFileName);
            }
            catch
            {
                // best effort - a leftover temp file is not worth failing over
            }

            return new List<string> { "Error!" };
        }

        // Allow a base time plus extra time per image; the process is not killed on timeout.
        process.WaitForExit(8000 + imageFileNames.Count * 500);

        var result = new List<string>();

        // The output is looked up as "<base>.html" first, then "<base>.hocr"
        // (presumably the extension depends on the Tesseract version - confirm).
        string resultFileName = outputFileName + ".html";
        if (!File.Exists(resultFileName))
        {
            resultFileName = outputFileName + ".hocr";
        }

        filesToDelete.Add(resultFileName);

        try
        {
            if (File.Exists(resultFileName))
            {
                result = ParseHocr(File.ReadAllText(resultFileName, Encoding.UTF8));
            }
        }
        catch
        {
            // Reading/parsing is best effort: on failure the (empty) result is returned.
        }
        finally
        {
            // Delete each file independently so one locked or missing file
            // does not prevent the others from being cleaned up.
            foreach (var fileName in filesToDelete)
            {
                try
                {
                    if (File.Exists(fileName))
                    {
                        File.Delete(fileName);
                    }
                }
                catch
                {
                    // ignored - cleanup is best effort
                }
            }
        }

        return result;
    }
}
/// <summary>
/// Runs Tesseract on a single image file and returns the result of parsing the
/// hOCR output it produced. <c>LastError</c> is cleared before the run. After the
/// run, the output file and the image file are deleted (best effort).
/// </summary>
/// <param name="languageCode">Value passed to Tesseract's <c>-l</c> option.</param>
/// <param name="psmMode">Page segmentation mode; omitted from the command line when null or empty.</param>
/// <param name="engineMode">Value for <c>--oem</c>; omitted when null or empty, or when <paramref name="run302"/> is true.</param>
/// <param name="imageFileName">Path of the image to process. This file is deleted after the run.</param>
/// <param name="run302">When true, uses <c>Configuration.Tesseract302Directory</c> and the <c>-psm</c> option spelling.</param>
/// <returns>
/// The parsed output; an empty string when no output file was produced or it could not be
/// read; the string "Error!" when the process could not be started.
/// </returns>
public string Run(string languageCode, string psmMode, string engineMode, string imageFileName, bool run302 = false)
{
    LastError = null;
    var dir = run302 ? Configuration.Tesseract302Directory : Configuration.TesseractDirectory;
    string tempTextFileName = Path.GetTempPath() + Guid.NewGuid();

    using (var process = new Process())
    {
        process.StartInfo = new ProcessStartInfo(Path.Combine(dir, "tesseract.exe"))
        {
            UseShellExecute = true,
            Arguments = "\"" + imageFileName + "\" \"" + tempTextFileName + "\" -l " + languageCode
        };

        if (!string.IsNullOrEmpty(psmMode))
        {
            var prefix = run302 ? " -psm " : " --psm ";
            process.StartInfo.Arguments += prefix + psmMode;
        }

        if (!string.IsNullOrEmpty(engineMode) && !run302)
        {
            process.StartInfo.Arguments += " --oem " + engineMode;
        }

        process.StartInfo.Arguments += " hocr";

        if (_runningOnWindows)
        {
            if (run302)
            {
                process.StartInfo.WorkingDirectory = Configuration.Tesseract302Directory;
            }
            else
            {
                // NOTE(review): ErrorDataReceived is only raised when stderr is redirected and
                // BeginErrorReadLine() has been called; neither happens on this path in the code
                // visible here, so this handler looks inactive - confirm before relying on it.
                process.ErrorDataReceived += TesseractErrorReceived;
                process.StartInfo.Arguments = " --tessdata-dir \"" + Path.Combine(dir, "tessdata") + "\" " + process.StartInfo.Arguments.Trim();
            }
        }
        else
        {
            // Not on Windows: run "tesseract" directly instead of the bundled .exe.
            process.StartInfo.UseShellExecute = false;
            process.StartInfo.RedirectStandardError = true;
            process.StartInfo.FileName = "tesseract";
        }

        process.StartInfo.WindowStyle = ProcessWindowStyle.Hidden;

        try
        {
            process.Start();
        }
        catch (Exception exception)
        {
            LastError = exception.Message + Environment.NewLine + exception.StackTrace;
            TesseractErrors.Add(LastError);
            return "Error!";
        }

        // Wait up to 8 seconds; the process is not killed on timeout.
        process.WaitForExit(8000);
        if (process.HasExited && process.ExitCode != 0)
        {
            LastError = "Tesseract returned with code " + process.ExitCode;
            TesseractErrors.Add(LastError);
        }
    }

    string result = string.Empty;

    // The output is looked up as "<base>.html" first, then "<base>.hocr"
    // (presumably the extension depends on the Tesseract version - confirm).
    string outputFileName = tempTextFileName + ".html";
    if (!File.Exists(outputFileName))
    {
        outputFileName = tempTextFileName + ".hocr";
    }

    try
    {
        if (File.Exists(outputFileName))
        {
            result = File.ReadAllText(outputFileName, Encoding.UTF8);
            result = ParseHocr(result);
        }
    }
    catch
    {
        // Reading/parsing is best effort: whatever was obtained so far is returned.
    }
    finally
    {
        // Clean up in "finally", one file at a time, so a read/parse failure or a
        // failed delete of one file does not leave the other file behind.
        try
        {
            if (File.Exists(outputFileName))
            {
                File.Delete(outputFileName);
            }
        }
        catch
        {
            // ignored - cleanup is best effort
        }

        try
        {
            File.Delete(imageFileName);
        }
        catch
        {
            // ignored - cleanup is best effort
        }
    }

    return result;
}