Example #1
0
 /// <summary>
 /// Runs Tesseract OCR over <paramref name="image"/> and hands the recognized page
 /// to <c>_editor.EditorResult</c>.
 /// </summary>
 /// <param name="settings">Settings used to initialize the OCR engine.</param>
 /// <param name="image">Image that is set on the engine before recognition.</param>
 public void Recognize(TesseractOcrSettings settings, VintasoftImage image)
 {
     // The engine only lives for this one call; the using scope disposes it afterwards.
     using (TesseractOcr engine = new TesseractOcr())
     {
         engine.Init(settings);
         engine.SetImage(image);

         OcrPage recognizedPage = engine.Recognize();
         _editor.EditorResult(recognizedPage);
     }
 }
Example #2
0
        /// <summary>
        /// Creates a Tesseract OCR engine from a fixed DLL directory, initializes it with
        /// settings for <c>OcrLanguage.Russian</c>, then shuts it down and disposes it.
        /// </summary>
        public void CreateTesseractEngine()
        {
            // Directory the engine is constructed with (hard-coded, machine-specific path).
            using (TesseractOcr engine = new TesseractOcr(@"G:\Programm\vintasoft\Bin\TesseractOCR"))
            {
                engine.Init(new TesseractOcrSettings(OcrLanguage.Russian));

                // Explicit shutdown before the using scope disposes the engine.
                engine.Shutdown();
            }
        }
Example #3
0
        /// <summary>
        /// Pulls the record's document into a temporary folder, runs Tesseract OCR on it,
        /// attaches the produced text file as the record's OCR rendition and saves the record.
        /// </summary>
        /// <param name="record">Record whose document is OCR'd and which is saved afterwards.</param>
        /// <returns>
        /// <c>true</c> once the record has been saved; <c>false</c> if any step threw
        /// (the exception is logged, not rethrown).
        /// </returns>
        public static bool UpdateOcrRendition(Record record)
        {
            string documentPath = string.Empty;
            string textPath     = string.Empty;

            try
            {
                // Working folder under the user's temp directory.
                string workFolder = Path.Combine(Path.GetTempPath(), "cmramble_ocr");
                if (!Directory.Exists(workFolder))
                {
                    Directory.CreateDirectory(workFolder);
                }

                // Build both file names and clear out leftovers from an earlier run.
                documentPath = Path.Combine(workFolder, $"{record.Uri}.{record.Extension}");
                textPath     = Path.Combine(workFolder, $"{record.Uri}.txt");
                FileHelper.Delete(documentPath);
                FileHelper.Delete(textPath);

                // Fetch the document to disk.
                Log.Debug($"Extracting Record {record.Number}: {documentPath}");
                record.GetDocument(documentPath, false, "OCR", string.Empty);

                // OCR it; the returned path replaces the provisional text path so that
                // the cleanup below targets the file that was actually produced.
                Log.Debug($"Tesseract Ocr Record {record.Number}: {documentPath}");
                textPath = TesseractOcr.ExtractFromFile(documentPath);

                // Record extension method that removes an existing OCR rendition (if any).
                record.AddOcrRendition(textPath);

                Log.Debug($"Saving Record {record.Number}");
                record.Save();
                Log.Debug($"Saved Record {record.Number}");
                return true;
            }
            catch (Exception ex)
            {
                Log.Error(ex);
                return false;
            }
            finally
            {
                // Always remove the temporary files, on success and on failure.
                FileHelper.Delete(documentPath);
                FileHelper.Delete(textPath);
            }
        }
Example #4
0
        /// <summary>
        /// Extracts <paramref name="sourceRendition"/> into a temporary folder, runs Tesseract OCR
        /// on the extracted file, attaches the resulting text file to <paramref name="record"/>
        /// as its OCR rendition and saves the record.
        /// </summary>
        /// <param name="record">Record that receives the OCR rendition and is then saved.</param>
        /// <param name="sourceRendition">Rendition whose document is extracted and OCR'd.</param>
        /// <returns>
        /// <c>true</c> when the rendition was added and the record saved; <c>false</c> when the
        /// extracted file is missing or any step threw (the exception is traced, not rethrown).
        /// </returns>
        public static bool GenerateOcrRendition(Record record, RecordRendition sourceRendition)
        {
            bool   success           = false;
            string extractedFilePath = string.Empty;
            string ocrFilePath       = string.Empty;

            try
            {
                // get a temp working location on disk
                var rootDirectory = Path.Combine(Path.GetTempPath(), "cmramble_ocr");
                if (!Directory.Exists(rootDirectory))
                {
                    Directory.CreateDirectory(rootDirectory);
                }
                // formulate file name to extract, delete if exists for some reason
                extractedFilePath = Path.Combine(rootDirectory, $"{sourceRendition.Uri}.{sourceRendition.Extension}");
                ocrFilePath       = Path.Combine(rootDirectory, $"{sourceRendition.Uri}.txt");
                FileHelper.Delete(extractedFilePath);
                FileHelper.Delete(ocrFilePath);
                // fetch document
                var extract = sourceRendition.GetExtractDocument();
                extract.FileName = Path.GetFileName(extractedFilePath);
                extract.DoExtract(Path.GetDirectoryName(extractedFilePath), true, false, "");
                // only OCR when the extraction actually left a file at the expected path
                if (!String.IsNullOrWhiteSpace(extract.FileName) && File.Exists(extractedFilePath))
                {
                    // the returned path replaces the provisional one so the cleanup below
                    // removes the file that was actually produced
                    ocrFilePath = TesseractOcr.ExtractFromFile(extractedFilePath);
                    // use record extension method that removes existing OCR rendition (if exists)
                    record.AddOcrRendition(ocrFilePath);
                    record.Save();
                    success = true;
                }
            }
            catch (Exception ex)
            {
                // Best-effort operation: callers only get a bool, so the exception is still not
                // rethrown, but it is traced instead of being silently discarded.
                System.Diagnostics.Trace.TraceError($"{nameof(GenerateOcrRendition)} failed: {ex}");
            }
            finally
            {
                // always remove the temporary files, on success and on failure
                FileHelper.Delete(extractedFilePath);
                FileHelper.Delete(ocrFilePath);
            }
            return success;
        }
Example #5
0
        /// <summary>
        /// Console entry point: loads an image, classifies it, runs OCR, and prints the
        /// classification scores, the weight specifier found in the OCR text, the OCR words
        /// that match a known keyword, and the total elapsed time.
        /// </summary>
        /// <param name="args">
        /// Optional. When empty, the image is read from <c>TestImageFilePath</c>; otherwise
        /// <c>args[0]</c> is treated as a URL and the image is downloaded from it.
        /// </param>
        static void Main(string[] args)
        {
            var stopwatch = Stopwatch.StartNew();

            // Handle input arguments
            byte[] image;

            if (args.Length == 0)
            {
                Console.WriteLine($"Using file: {TestImageFilePath}");
                try
                {
                    image = File.ReadAllBytes(TestImageFilePath);
                }
                catch (Exception e)
                {
                    Console.WriteLine($"Error when loading image: {e.Message}");
                    return;
                }
            }
            else
            {
                var possibleUrl = args[0];
                Console.WriteLine($"Using url from args: {possibleUrl}");
                try
                {
                    using (var webClient = new WebClient())
                    {
                        image = webClient.DownloadData(possibleUrl);
                    }
                }
                catch (Exception e)
                {
                    Console.WriteLine($"Error when loading image from URL: {e.Message}");
                    return;
                }
            }

            // Classify
            var classificationResults = new WebClassificator().ClassifyImage(image);
            // OCR
            // NOTE(review): ParseText() is called without the loaded image; which input it
            // reads is not visible from this method - confirm it matches the image above.
            var text = TesseractOcr.ParseText();

            // Print
            Console.WriteLine("------------------------");
            Console.WriteLine("OBJECT CLASSIFICATION");
            Console.WriteLine("------------------------");

            // Highest score first; values are scaled by 100 and printed as percentages.
            foreach (var classification in classificationResults.OrderByDescending(x => x.Value))
            {
                Console.WriteLine($"{classification.Key, 15} => {Math.Round((classification.Value * 100.0), 3)}%");
            }

            Console.WriteLine("------------------------");
            Console.WriteLine("OCR WEIGHT FINDER RESULTS");
            Console.WriteLine("------------------------");

            // Collapse the OCR output onto a single line before searching it.
            text = text.Trim().Replace("\r\n", " ").Replace("\n", " ");
            var regexWeightFinder = new RegexMetricWeightSubstringFinder();
            var weightResult      = regexWeightFinder.FindWeightSpecifier(text);

            Console.WriteLine(weightResult);

            Console.WriteLine("------------------------");
            Console.WriteLine("OCR KEYWORDS");
            Console.WriteLine("------------------------");

            var keywords = text.Split(' ')
                           .Where(word => !string.IsNullOrWhiteSpace(word))
                           .Where(word =>
                           {
                               // Lower-case once per word, culture-independently, so matching
                               // does not change with the machine's regional settings.
                               var lowered = word.ToLowerInvariant();
                               return OcrKeywords.Any(keyword => lowered.Contains(keyword));
                           });

            foreach (var keyword in keywords)
            {
                Console.WriteLine(keyword);
            }

            Console.WriteLine("------------------------");
            stopwatch.Stop();
            Console.WriteLine($"Total time: {stopwatch.Elapsed}");
        }