Пример #1
0
        /// <summary>
        /// Builds the list of command-line argument fragments for a tesseract invocation.
        /// </summary>
        /// <param name="inputFilePath">Path of the file to process; always emitted first, wrapped in double quotes.</param>
        /// <param name="options">Optional settings. When <c>null</c>, only the quoted input path is returned.</param>
        /// <returns>
        /// Fragments in this order: input path, output base, <c>--dpi</c>, <c>--psm</c>, <c>--oem</c>,
        /// <c>-l</c>, <c>-c</c> entries, config file names. Unset or empty options contribute nothing.
        /// </returns>
        /// <remarks>
        /// NOTE(review): several fragments contain a space (e.g. "--dpi 300"), so the process runner
        /// presumably joins them into a single command line - confirm against the runner.
        /// </remarks>
        private List <string> BuildArgs(string inputFilePath, TesseractOptions options)
        {
            var args = new List <string> {
                $"\"{inputFilePath}\""
            };

            if (options == null)
            {
                return args;
            }

            if (!string.IsNullOrEmpty(options.OutputBasenameFilePath))
            {
                // Quote the output base the same way as the input path so paths containing
                // spaces survive; leave values that the caller already quoted untouched.
                var outputBase = options.OutputBasenameFilePath;
                var alreadyQuoted = outputBase.Length >= 2
                                    && outputBase[0] == '"'
                                    && outputBase[outputBase.Length - 1] == '"';

                args.Add(alreadyQuoted ? outputBase : $"\"{outputBase}\"");
            }

            if (options.DotPerInch != null)
            {
                args.Add($"--dpi {options.DotPerInch}");
            }

            if (options.PageSegmentationMode != null)
            {
                args.Add($"--psm {(int)options.PageSegmentationMode}");
            }

            if (options.OcrEngineMode != null)
            {
                args.Add($"--oem {(int)options.OcrEngineMode}");
            }

            if (options.Languages != null)
            {
                var languageNames = options.Languages.Select(l => l.ToName()).ToList();

                // A bare "-l " would make the next fragment be read as the language value.
                if (languageNames.Count > 0)
                {
                    args.Add("-l " + string.Join("+", languageNames));
                }
            }

            if (options.ConfigVars != null)
            {
                var configVarArgs = options.ConfigVars.Select(kv => $"-c {kv.Key}={kv.Value}").ToList();

                // Skip empty collections so no empty fragment is emitted.
                if (configVarArgs.Count > 0)
                {
                    args.Add(string.Join(" ", configVarArgs));
                }
            }

            if (options.ConfigFiles != null)
            {
                var configFileNames = options.ConfigFiles.Select(cf => cf.ToName()).ToList();

                if (configFileNames.Count > 0)
                {
                    args.Add(string.Join(" ", configFileNames));
                }
            }

            return args;
        }
Пример #2
0
        /// <summary>
        /// Extracts text from PDF data: converts the bytes to a file on disk via the PDF-to-bitmap
        /// converter, runs <c>FileToText</c> on that file, and removes the file afterwards.
        /// </summary>
        /// <param name="pdfData">Raw PDF bytes handed to the converter.</param>
        /// <param name="options">Options forwarded unchanged to <c>FileToText</c>.</param>
        /// <returns>The text returned by <c>FileToText</c>.</returns>
        public string PdfToText(byte[] pdfData, TesseractOptions options)
        {
            string tempFilePath = string.Empty;

            try
            {
                tempFilePath = this.pdfToBitmapConverter.Convert(pdfData);

                string extractedText = this.FileToText(tempFilePath, options);
                return extractedText;
            }
            finally
            {
                // Runs on success and on failure; the path is still empty if conversion itself threw.
                bool hasPath = !string.IsNullOrWhiteSpace(tempFilePath);

                if (hasPath)
                {
                    if (File.Exists(tempFilePath))
                    {
                        File.Delete(tempFilePath);
                    }
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Runs the <c>tesseract</c> command on an existing input file, with arguments built by
        /// <c>BuildArgs</c> and the <c>TESSDATA_PREFIX</c> environment variable set to the
        /// directory resolved by the resources manager.
        /// </summary>
        /// <param name="inputFilePath">Path of the file to process; must exist on disk.</param>
        /// <param name="options">Options forwarded to <c>BuildArgs</c>.</param>
        /// <returns>The result returned by the process runner.</returns>
        /// <exception cref="ArgumentException">The input file does not exist.</exception>
        /// <exception cref="InvalidOperationException">The tesseract data directory could not be resolved.</exception>
        public ProcessResult Process(string inputFilePath, TesseractOptions options)
        {
            if (!File.Exists(inputFilePath))
            {
                throw new ArgumentException($"Input file '{inputFilePath}' does not exist.", nameof(inputFilePath));
            }

            if (!this.resourcesManager.TryGetDirectory(TesseractData, out var tesseractData))
            {
                throw new InvalidOperationException($"'{TesseractData}' directory not found.");
            }

            var cmd  = "tesseract";
            var args = this.BuildArgs(inputFilePath, options);
            var environmentVariables = new List <KeyValuePair <string, string> >
            {
                new KeyValuePair <string, string>("TESSDATA_PREFIX", tesseractData)
            };

            return this.processRunner.RunProcess(cmd, args, environmentVariables: environmentVariables);
        }
Пример #4
0
        /// <summary>
        /// Runs the tesseract engine on a file and returns the text captured in the process output.
        /// </summary>
        /// <param name="inputFilePath">Path of the file to process.</param>
        /// <param name="options">
        /// Engine options; must not be <c>null</c>. Note that <c>ConfigFiles</c> on this instance is
        /// overwritten with a single <c>ConfigFile.OutputTxt</c> entry, so any value set by the caller is lost.
        /// </param>
        /// <returns>The <c>Output</c> of the process result.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="options"/> is <c>null</c>.</exception>
        /// <exception cref="TesseractException">The engine call itself threw; the original exception is attached as inner exception.</exception>
        /// <exception cref="InvalidOperationException">The process finished with a non-zero exit code; the message is the process error text.</exception>
        public string FileToText(string inputFilePath, TesseractOptions options)
        {
            // Fail with a descriptive exception instead of a NullReferenceException on the assignment below.
            if (options == null)
            {
                throw new ArgumentNullException(nameof(options));
            }

            options.ConfigFiles = new List <ConfigFile> {
                ConfigFile.OutputTxt
            };

            ProcessResult processResult;

            try
            {
                processResult = this.tessaractEngine.Process(inputFilePath, options);
            }
            catch (Exception ex)
            {
                throw new TesseractException("Fail to call tesseract", ex);
            }

            if (processResult.ExitCode != 0)
            {
                throw new InvalidOperationException(processResult.Error);
            }

            return processResult.Output;
        }