/// <summary>
/// Runs tesseract.exe on the given image and parses the hOCR document it writes.
/// </summary>
/// <param name="image">Image to recognize; it is saved to a temp file that is handed to tesseract.</param>
/// <param name="langCode">Language code passed to tesseract through the -l option.</param>
/// <returns>
/// A result whose elements are built from the "ocrx_word" nodes of the hOCR document,
/// or null if the process could not be started, timed out, or any error occurred.
/// </returns>
public OcrResult ProcessImage(Image image, string langCode)
{
    string tempImageFilePath = Path.Combine(Paths.Temp, Path.GetRandomFileName());
    string tempHocrFilePath = Path.Combine(Paths.Temp, Path.GetRandomFileName());
    // Tesseract is given the base name; the output is read back from base name + ".html".
    string tempHocrFilePathWithExt = tempHocrFilePath + ".html";
    try
    {
        image.Save(tempImageFilePath);
        var startInfo = new ProcessStartInfo
        {
            FileName = Path.Combine(ocrDependencyManager.GetExecutableDir().FullName, "tesseract.exe"),
            Arguments = string.Format("\"{0}\" \"{1}\" hocr -l {2}", tempImageFilePath, tempHocrFilePath, langCode),
            UseShellExecute = false,
            CreateNoWindow = true
        };
        var tessdataParent = ocrDependencyManager.GetLanguageDir().Parent;
        if (tessdataParent != null)
        {
            startInfo.EnvironmentVariables["TESSDATA_PREFIX"] = tessdataParent.FullName;
        }
        EnsureHocrConfigExists();

        // Process is IDisposable; the using statement releases its OS handle on every exit path
        // (a null resource is permitted and simply skipped on dispose).
        using (var tesseractProcess = Process.Start(startInfo))
        {
            if (tesseractProcess == null)
            {
                // Couldn't start tesseract for some reason
                Log.Error("Couldn't start OCR process.");
                return null;
            }
            if (!tesseractProcess.WaitForExit(TESSERACT_TIMEOUT_MS))
            {
                Log.Error("OCR process timed out.");
                try
                {
                    tesseractProcess.Kill();
                    // Kill is asynchronous; wait briefly so the process can release its
                    // file handles before the temp files are deleted below.
                    tesseractProcess.WaitForExit(200);
                }
                catch (Exception e)
                {
                    Log.ErrorException("Error killing OCR process", e);
                }
                return null;
            }

            // The document is loaded fully into memory, so the deferred query below can still
            // be enumerated after the temp file has been deleted.
            XDocument hocrDocument = XDocument.Load(tempHocrFilePathWithExt);
            return new OcrResult
            {
                Elements = hocrDocument.Descendants()
                    .Where(x => x.Attributes("class").Any(y => y.Value == "ocrx_word"))
                    .Select(x => new OcrResultElement { Text = x.Value, Bounds = GetBounds(x.Attribute("title")) })
            };
        }
    }
    catch (Exception e)
    {
        Log.ErrorException("Error running OCR", e);
        return null;
    }
    finally
    {
        // Cleanup is best-effort: a failed delete (e.g. file still locked) must not replace
        // the method's result with an exception, and must not prevent the other delete.
        foreach (var tempPath in new[] { tempImageFilePath, tempHocrFilePathWithExt })
        {
            try
            {
                File.Delete(tempPath);
            }
            catch (Exception e)
            {
                Log.ErrorException("Error cleaning up OCR temp files", e);
            }
        }
    }
}
/// <summary>
/// Runs tesseract.exe (the new or the old installation, depending on which is downloaded)
/// on the given image and parses the hOCR document it writes.
/// </summary>
/// <param name="image">Image to recognize; it is saved to a temp file that is handed to tesseract.</param>
/// <param name="langCode">Language code passed to tesseract through the -l option.</param>
/// <returns>
/// A result whose elements are built from the "ocrx_word" nodes of the hOCR document,
/// or null if the process could not be started, timed out, or any error occurred.
/// </returns>
public OcrResult ProcessImage(Image image, string langCode)
{
    bool newTesseract = ocrDependencyManager.IsNewExecutableDownloaded;
    string tempImageFilePath = Path.Combine(Paths.Temp, Path.GetRandomFileName());
    string tempHocrFilePath = Path.Combine(Paths.Temp, Path.GetRandomFileName());
    // Tesseract is given the base name; the output is read back with a version-dependent extension.
    string tempHocrFilePathWithExt = tempHocrFilePath + (newTesseract ? ".hocr" : ".html");
    try
    {
        image.Save(tempImageFilePath);
        var exeDir = newTesseract ? ocrDependencyManager.GetExecutableDir() : ocrDependencyManager.GetOldExecutableDir();
        var startInfo = new ProcessStartInfo
        {
            FileName = Path.Combine(exeDir.FullName, "tesseract.exe"),
            Arguments = string.Format("\"{0}\" \"{1}\" -l {2} hocr", tempImageFilePath, tempHocrFilePath, langCode),
            UseShellExecute = false,
            CreateNoWindow = true,
            RedirectStandardOutput = true,
            RedirectStandardError = true
        };
        var tessdata = newTesseract ? ocrDependencyManager.GetLanguageDir() : ocrDependencyManager.GetOldLanguageDir();
        var tessdataParent = tessdata.Parent;
        if (tessdataParent != null)
        {
            startInfo.EnvironmentVariables["TESSDATA_PREFIX"] = tessdataParent.FullName;
        }
        EnsureHocrConfigExists(tessdata);

        // Process is IDisposable; the using statement releases its OS handle on every exit path
        // (a null resource is permitted and simply skipped on dispose).
        using (var tesseractProcess = Process.Start(startInfo))
        {
            if (tesseractProcess == null)
            {
                // Couldn't start tesseract for some reason
                Log.Error("Couldn't start OCR process.");
                return null;
            }

            // Both streams are redirected, so they must be drained while the process runs.
            // Otherwise the child blocks as soon as a pipe buffer fills up and the wait below
            // can only end in a timeout. Asynchronous reads also avoid the deadlock that
            // sequential ReadToEnd calls on two redirected streams can cause.
            var stdout = new System.Text.StringBuilder();
            var stderr = new System.Text.StringBuilder();
            tesseractProcess.OutputDataReceived += (sender, args) =>
            {
                if (args.Data != null)
                {
                    stdout.AppendLine(args.Data);
                }
            };
            tesseractProcess.ErrorDataReceived += (sender, args) =>
            {
                if (args.Data != null)
                {
                    stderr.AppendLine(args.Data);
                }
            };
            tesseractProcess.BeginOutputReadLine();
            tesseractProcess.BeginErrorReadLine();

            var timeout = (int)(appConfigManager.Config.OcrTimeoutInSeconds * 1000);
            if (timeout == 0)
            {
                timeout = DEFAULT_TIMEOUT;
            }
            if (!tesseractProcess.WaitForExit(timeout))
            {
                Log.Error("OCR process timed out.");
                try
                {
                    tesseractProcess.Kill();
                    // Wait a bit to give the process time to release its file handles
                    Thread.Sleep(200);
                }
                catch (Exception e)
                {
                    Log.ErrorException("Error killing OCR process", e);
                }
                return null;
            }
#if DEBUG
            // The timed overload can return before the asynchronous readers have reached
            // end-of-stream; the parameterless overload waits for them to finish.
            tesseractProcess.WaitForExit();
            if (stdout.Length > 0)
            {
                Log.Error("Tesseract stdout: {0}", stdout.ToString());
            }
            if (stderr.Length > 0)
            {
                Log.Error("Tesseract stderr: {0}", stderr.ToString());
            }
#endif
            // The document is loaded fully into memory, so the deferred query below can still
            // be enumerated after the temp file has been deleted.
            XDocument hocrDocument = XDocument.Load(tempHocrFilePathWithExt);
            return new OcrResult
            {
                Elements = hocrDocument.Descendants()
                    .Where(x => x.Attributes("class").Any(y => y.Value == "ocrx_word"))
                    .Select(x => new OcrResultElement { Text = x.Value, Bounds = GetBounds(x.Attribute("title")) })
            };
        }
    }
    catch (Exception e)
    {
        Log.ErrorException("Error running OCR", e);
        return null;
    }
    finally
    {
        // Each delete is guarded separately so that a failure on the image file
        // (e.g. still locked) does not leave the hOCR file behind as well.
        foreach (var tempPath in new[] { tempImageFilePath, tempHocrFilePathWithExt })
        {
            try
            {
                File.Delete(tempPath);
            }
            catch (Exception e)
            {
                Log.ErrorException("Error cleaning up OCR temp files", e);
            }
        }
    }
}