示例#1
0
        /// <summary>
        /// <c>ErrorDataReceived</c> handler: records one received line in <c>TesseractErrors</c>
        /// and appends it to <c>LastError</c>, unless the line is empty or matches one of the
        /// ignored message patterns below.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data; <c>e.Data</c> is the received line and may be null.</param>
        private void TesseractErrorReceived(object sender, DataReceivedEventArgs e)
        {
            var line = e.Data;

            // Nothing to record for null/empty lines.
            if (string.IsNullOrEmpty(line))
            {
                return;
            }

            // Lines beginning with this text are never treated as errors.
            if (line.StartsWith("Tesseract Open Source OCR Engine", StringComparison.OrdinalIgnoreCase))
            {
                return;
            }

            // Lines containing any of these fragments (case-insensitive) are skipped as well.
            var ignoredFragments = new[] { "Too few characters", "Empty page", " diacritics", "Weak margin" };
            foreach (var fragment in ignoredFragments)
            {
                if (line.Contains(fragment, StringComparison.OrdinalIgnoreCase))
                {
                    return;
                }
            }

            // Stop collecting once the list holds more than 100 entries.
            if (TesseractErrors.Count > 100)
            {
                return;
            }

            if (string.IsNullOrEmpty(LastError))
            {
                LastError = line;
            }
            else if (!LastError.Contains(line))
            {
                // Only append text that is not already part of the accumulated message.
                LastError += Environment.NewLine + line;
            }

            TesseractErrors.Add(line);
        }
示例#2
0
        /// <summary>
        /// Runs the external Tesseract executable once over a batch of images and returns the
        /// parsed hOCR output.
        /// </summary>
        /// <remarks>
        /// The image paths are written, one per line, to a temporary list file that is passed to
        /// Tesseract as its input. After the run, the list file, the result file and every file in
        /// <paramref name="imageFileNames"/> are deleted on a best-effort basis.
        /// If the process cannot be started, the error is stored in <c>LastError</c> and
        /// <c>TesseractErrors</c>, the temporary list file is removed, and a single-element list
        /// containing "Error!" is returned; the input images are left in place in that case.
        /// </remarks>
        /// <param name="languageCode">Value passed to the <c>-l</c> option.</param>
        /// <param name="psmMode">Value for <c>-psm</c> (when <paramref name="run302"/> is true) or <c>--psm</c>; skipped when null or empty.</param>
        /// <param name="engineMode">Value for <c>--oem</c>; skipped when null or empty, or when <paramref name="run302"/> is true.</param>
        /// <param name="imageFileNames">Paths of the images to process. These files are deleted after the run.</param>
        /// <param name="run302">When true, uses <c>Configuration.Tesseract302Directory</c> instead of <c>Configuration.TesseractDirectory</c>.</param>
        /// <returns>The value produced by <c>ParseHocr</c> for the output file, or an empty list when no output file could be read.</returns>
        public List<string> Run(string languageCode, string psmMode, string engineMode, List<string> imageFileNames, bool run302 = false)
        {
            var dir = run302 ? Configuration.Tesseract302Directory : Configuration.TesseractDirectory;
            string inputFileName = Path.GetTempPath() + Guid.NewGuid() + ".txt";
            var filesToDelete = new List<string>();
            var sb = new StringBuilder();

            // Build the list file content; every input image is scheduled for deletion.
            foreach (var imageFileName in imageFileNames)
            {
                filesToDelete.Add(imageFileName);
                sb.AppendLine(imageFileName);
            }

            File.WriteAllText(inputFileName, sb.ToString());
            filesToDelete.Add(inputFileName);
            var outputFileName = Path.GetTempPath() + Guid.NewGuid();

            using (var process = new Process())
            {
                // Path.Combine (as in the single-image overload) tolerates a directory without a trailing separator.
                process.StartInfo = new ProcessStartInfo(Path.Combine(dir, "tesseract.exe"))
                {
                    UseShellExecute = true,
                    Arguments = "\"" + inputFileName + "\" \"" + outputFileName + "\" -l " + languageCode
                };

                if (!string.IsNullOrEmpty(psmMode))
                {
                    var prefix = run302 ? " -psm " : " --psm ";
                    process.StartInfo.Arguments += prefix + psmMode;
                }

                if (!string.IsNullOrEmpty(engineMode) && !run302)
                {
                    process.StartInfo.Arguments += " --oem " + engineMode;
                }

                process.StartInfo.Arguments += " hocr";

                if (_runningOnWindows)
                {
                    if (run302)
                    {
                        process.StartInfo.WorkingDirectory = Configuration.Tesseract302Directory;
                    }
                    else
                    {
                        // NOTE(review): ErrorDataReceived is only raised when stderr is redirected and
                        // BeginErrorReadLine() has been called; neither happens on this path (UseShellExecute
                        // stays true), so this handler appears to never fire here - confirm intent.
                        process.ErrorDataReceived += TesseractErrorReceived;

                        // --tessdata-dir is placed in front of the arguments assembled so far.
                        process.StartInfo.Arguments = " --tessdata-dir \"" + Path.Combine(dir, "tessdata") + "\" " + process.StartInfo.Arguments.Trim();
                    }
                }
                else
                {
                    // NOTE(review): stderr is redirected here but never read in this method - confirm
                    // whether the output should be consumed (e.g. via BeginErrorReadLine()).
                    process.StartInfo.UseShellExecute = false;
                    process.StartInfo.RedirectStandardError = true;
                    process.StartInfo.FileName = "tesseract";
                }

                process.StartInfo.WindowStyle = ProcessWindowStyle.Hidden;
                try
                {
                    process.Start();
                }
                catch (Exception exception)
                {
                    LastError = exception.Message + Environment.NewLine + exception.StackTrace;
                    TesseractErrors.Add(LastError);

                    // The list file was created by this method; do not leave it behind on failure.
                    try
                    {
                        File.Delete(inputFileName);
                    }
                    catch
                    {
                        // ignored - cleanup is best effort
                    }

                    return new List<string> { "Error!" };
                }

                // Timeout grows with the batch size: 8 s base plus 0.5 s per image.
                process.WaitForExit(8000 + imageFileNames.Count * 500);

                var result = new List<string>();

                // The output is looked for with an ".html" extension first, then ".hocr".
                // The existence check must use the candidate file name, not the extension-less base name.
                string resultFileName = outputFileName + ".html";
                if (!File.Exists(resultFileName))
                {
                    resultFileName = outputFileName + ".hocr";
                }

                filesToDelete.Add(resultFileName);
                try
                {
                    if (File.Exists(resultFileName))
                    {
                        result = ParseHocr(File.ReadAllText(resultFileName, Encoding.UTF8));
                    }
                }
                catch
                {
                    // ignored - an unreadable/unparsable result yields an empty list
                }

                // Delete each file independently so one failure does not keep the others on disk.
                foreach (var fileName in filesToDelete)
                {
                    try
                    {
                        if (File.Exists(fileName))
                        {
                            File.Delete(fileName);
                        }
                    }
                    catch
                    {
                        // ignored - cleanup is best effort
                    }
                }

                return result;
            }
        }
        /// <summary>
        /// Runs the external Tesseract executable on a single image and returns the parsed hOCR output.
        /// </summary>
        /// <remarks>
        /// <c>LastError</c> is reset at the start of the call. If the process cannot be started, the
        /// error is stored in <c>LastError</c> and <c>TesseractErrors</c> and "Error!" is returned
        /// without deleting <paramref name="imageFileName"/>. A non-zero exit code is recorded in
        /// <c>LastError</c> and <c>TesseractErrors</c> as well. After the run, the output file and the
        /// input image are deleted on a best-effort basis.
        /// </remarks>
        /// <param name="languageCode">Value passed to the <c>-l</c> option.</param>
        /// <param name="psmMode">Value for <c>-psm</c> (when <paramref name="run302"/> is true) or <c>--psm</c>; skipped when null or empty.</param>
        /// <param name="engineMode">Value for <c>--oem</c>; skipped when null or empty, or when <paramref name="run302"/> is true.</param>
        /// <param name="imageFileName">Path of the image to process. The file is deleted after the run.</param>
        /// <param name="run302">When true, uses <c>Configuration.Tesseract302Directory</c> instead of <c>Configuration.TesseractDirectory</c>.</param>
        /// <returns>The value produced by <c>ParseHocr</c> for the output file, or an empty string when no output file was found.</returns>
        public string Run(string languageCode, string psmMode, string engineMode, string imageFileName, bool run302 = false)
        {
            LastError = null;
            var dir = run302 ? Configuration.Tesseract302Directory : Configuration.TesseractDirectory;
            string tempTextFileName = Path.GetTempPath() + Guid.NewGuid();

            using (var process = new Process())
            {
                process.StartInfo = new ProcessStartInfo(Path.Combine(dir, "tesseract.exe"))
                {
                    UseShellExecute = true,
                    Arguments = "\"" + imageFileName + "\" \"" + tempTextFileName + "\" -l " + languageCode
                };

                if (!string.IsNullOrEmpty(psmMode))
                {
                    var prefix = run302 ? " -psm " : " --psm ";
                    process.StartInfo.Arguments += prefix + psmMode;
                }

                if (!string.IsNullOrEmpty(engineMode) && !run302)
                {
                    process.StartInfo.Arguments += " --oem " + engineMode;
                }

                process.StartInfo.Arguments += " hocr";

                if (_runningOnWindows)
                {
                    if (run302)
                    {
                        process.StartInfo.WorkingDirectory = Configuration.Tesseract302Directory;
                    }
                    else
                    {
                        // NOTE(review): ErrorDataReceived is only raised when stderr is redirected and
                        // BeginErrorReadLine() has been called; neither happens on this path (UseShellExecute
                        // stays true), so this handler appears to never fire here - confirm intent.
                        process.ErrorDataReceived += TesseractErrorReceived;

                        // --tessdata-dir is placed in front of the arguments assembled so far.
                        process.StartInfo.Arguments = " --tessdata-dir \"" + Path.Combine(dir, "tessdata") + "\" " + process.StartInfo.Arguments.Trim();
                    }
                }
                else
                {
                    // NOTE(review): stderr is redirected here but never read in this method - confirm
                    // whether the output should be consumed (e.g. via BeginErrorReadLine()).
                    process.StartInfo.UseShellExecute = false;
                    process.StartInfo.RedirectStandardError = true;
                    process.StartInfo.FileName = "tesseract";
                }

                process.StartInfo.WindowStyle = ProcessWindowStyle.Hidden;
                try
                {
                    process.Start();
                }
                catch (Exception exception)
                {
                    LastError = exception.Message + Environment.NewLine + exception.StackTrace;
                    TesseractErrors.Add(LastError);
                    return "Error!";
                }

                // Wait at most 8 seconds; the exit code is only inspected when the process has finished.
                process.WaitForExit(8000);

                if (process.HasExited && process.ExitCode != 0)
                {
                    LastError = "Tesseract returned with code " + process.ExitCode;
                    TesseractErrors.Add(LastError);
                }
            }

            string result = string.Empty;

            // The output is looked for with an ".html" extension first, then ".hocr".
            string outputFileName = tempTextFileName + ".html";
            if (!File.Exists(outputFileName))
            {
                outputFileName = tempTextFileName + ".hocr";
            }

            try
            {
                if (File.Exists(outputFileName))
                {
                    result = File.ReadAllText(outputFileName, Encoding.UTF8);
                    result = ParseHocr(result);
                }
            }
            catch
            {
                // ignored - reading/parsing is best effort
            }

            // Cleanup steps are independent of reading/parsing and of each other, so a failure in
            // one place no longer leaves the remaining temporary files on disk.
            try
            {
                if (File.Exists(outputFileName))
                {
                    File.Delete(outputFileName);
                }
            }
            catch
            {
                // ignored - cleanup is best effort
            }

            try
            {
                File.Delete(imageFileName);
            }
            catch
            {
                // ignored - cleanup is best effort
            }

            return result;
        }