/// <summary>
/// Runs a Tesseract OCR pass over <paramref name="image"/> and hands the
/// recognized page to the editor through <c>_editor.EditorResult</c>.
/// </summary>
/// <param name="settings">Settings the OCR engine is initialized with.</param>
/// <param name="image">Image that is set on the engine before recognition.</param>
public void Recognize(TesseractOcrSettings settings, VintasoftImage image)
{
    // The engine is created for this call only; the using block disposes it
    // on every exit path, including exceptions thrown by the calls below.
    using (var engine = new TesseractOcr())
    {
        engine.Init(settings);
        engine.SetImage(image);

        OcrPage recognizedPage = engine.Recognize();
        _editor.EditorResult(recognizedPage);
    }
}
/// <summary>
/// Creates a Tesseract OCR engine from a fixed Tesseract DLL path, initializes
/// it with Russian-language settings and immediately shuts it down again.
/// </summary>
public void CreateTesseractEngine()
{
    // NOTE(review): machine-specific absolute path; presumably the directory
    // that holds the Tesseract OCR binaries - confirm before reusing elsewhere.
    const string dllPath = @"G:\Programm\vintasoft\Bin\TesseractOCR";

    using (var engine = new TesseractOcr(dllPath))
    {
        var russianSettings = new TesseractOcrSettings(OcrLanguage.Russian);

        engine.Init(russianSettings);
        engine.Shutdown();
    }
}
/// <summary>
/// Extracts the record's document into a temp working folder, runs Tesseract
/// OCR over the extracted file, attaches the resulting text file to the record
/// as its OCR rendition and saves the record.
/// </summary>
/// <param name="record">Record whose document is extracted and which receives the OCR rendition.</param>
/// <returns>
/// <c>true</c> once the record has been saved; <c>false</c> if any step threw
/// (the exception is logged and not rethrown).
/// </returns>
public static bool UpdateOcrRendition(Record record)
{
    // Declared outside the try so the finally block can clean up whatever
    // paths had been assigned by the time of a failure.
    string sourcePath = string.Empty;
    string textPath = string.Empty;

    try
    {
        // Working folder under the temp directory.
        string workFolder = Path.Combine(Path.GetTempPath(), "cmramble_ocr");
        if (!Directory.Exists(workFolder))
        {
            Directory.CreateDirectory(workFolder);
        }

        // Build the working file names and remove leftovers from earlier runs.
        sourcePath = Path.Combine(workFolder, $"{record.Uri}.{record.Extension}");
        textPath = Path.Combine(workFolder, $"{record.Uri}.txt");
        FileHelper.Delete(sourcePath);
        FileHelper.Delete(textPath);

        // Pull the document out of the record onto disk.
        Log.Debug($"Extracting Record {record.Number}: {sourcePath}");
        record.GetDocument(sourcePath, false, "OCR", string.Empty);

        // OCR the extracted file; the call returns the path of the text output,
        // which replaces the name computed above.
        Log.Debug($"Tesseract Ocr Record {record.Number}: {sourcePath}");
        textPath = TesseractOcr.ExtractFromFile(sourcePath);

        // Extension method; per the original author's note it removes an
        // existing OCR rendition (if any) before adding the new one.
        record.AddOcrRendition(textPath);

        Log.Debug($"Saving Record {record.Number}");
        record.Save();
        Log.Debug($"Saved Record {record.Number}");

        return true;
    }
    catch (Exception ex)
    {
        Log.Error(ex);
        return false;
    }
    finally
    {
        // Runs on success and failure alike (also after the returns above).
        FileHelper.Delete(sourcePath);
        FileHelper.Delete(textPath);
    }
}
/// <summary>
/// Extracts <paramref name="sourceRendition"/> into a temp working folder, runs
/// Tesseract OCR over the extracted file, attaches the resulting text file to
/// <paramref name="record"/> as its OCR rendition and saves the record.
/// </summary>
/// <param name="record">Record that receives the OCR rendition and is saved.</param>
/// <param name="sourceRendition">Rendition whose document is extracted and OCR'd.</param>
/// <returns>
/// <c>true</c> when the extracted file was found on disk and the record was
/// saved; <c>false</c> when the extraction produced no file or any step threw
/// (the exception is logged and not rethrown).
/// </returns>
public static bool GenerateOcrRendition(Record record, RecordRendition sourceRendition)
{
    bool success = false;

    // Declared outside the try so the finally block can clean up whatever
    // paths had been assigned by the time of a failure.
    string extractedFilePath = string.Empty;
    string ocrFilePath = string.Empty;

    try
    {
        // Temp working location on disk. CreateDirectory does nothing when the
        // directory already exists, so no separate existence check is needed.
        var rootDirectory = Path.Combine(Path.GetTempPath(), "cmramble_ocr");
        Directory.CreateDirectory(rootDirectory);

        // Build the working file names and remove leftovers from earlier runs.
        extractedFilePath = Path.Combine(rootDirectory, $"{sourceRendition.Uri}.{sourceRendition.Extension}");
        ocrFilePath = Path.Combine(rootDirectory, $"{sourceRendition.Uri}.txt");
        FileHelper.Delete(extractedFilePath);
        FileHelper.Delete(ocrFilePath);

        // Fetch the rendition's document into the working folder under the
        // file name computed above.
        var extract = sourceRendition.GetExtractDocument();
        extract.FileName = Path.GetFileName(extractedFilePath);
        extract.DoExtract(Path.GetDirectoryName(extractedFilePath), true, false, "");

        // Only continue when the extraction actually left a file on disk.
        if (!string.IsNullOrWhiteSpace(extract.FileName) && File.Exists(extractedFilePath))
        {
            // The OCR call returns the path of the text output, which replaces
            // the name computed above.
            ocrFilePath = TesseractOcr.ExtractFromFile(extractedFilePath);

            // Extension method; per the original author's note it removes an
            // existing OCR rendition (if any) before adding the new one.
            record.AddOcrRendition(ocrFilePath);
            record.Save();
            success = true;
        }
    }
    catch (Exception ex)
    {
        // Report the failure instead of discarding it, in the same way
        // UpdateOcrRendition does; the method still returns false.
        Log.Error(ex);
    }
    finally
    {
        // Runs on success and failure alike.
        FileHelper.Delete(extractedFilePath);
        FileHelper.Delete(ocrFilePath);
    }

    return success;
}
/// <summary>
/// Console entry point. Loads an image (from <c>TestImageFilePath</c> when no
/// arguments are given, otherwise by downloading the URL in the first
/// argument), classifies it, runs OCR, and prints the classification scores,
/// the detected weight specifier, the OCR words that contain a known keyword,
/// and the total elapsed time.
/// </summary>
/// <param name="args">Command-line arguments; <c>args[0]</c>, when present, is used as the image URL.</param>
static void Main(string[] args)
{
    var stopwatch = Stopwatch.StartNew();

    // Handle input arguments
    byte[] image;
    if (!TryLoadImage(args, out image))
    {
        // The helper has already printed the reason.
        return;
    }

    // Classify
    //var classificationResults = new TensorFlowClassificator().ClassifyImage(image);
    var classificationResults = new WebClassificator().ClassifyImage(image);

    // OCR
    // NOTE(review): ParseText() takes no arguments, so it does not receive the
    // image loaded above - confirm which input it actually reads.
    // A null result is treated as "no text" instead of failing on Trim().
    var text = TesseractOcr.ParseText() ?? string.Empty;

    // Print
    PrintSectionHeader("OBJECT CLASSIFICATION");
    foreach (var classification in classificationResults.OrderByDescending(x => x.Value))
    {
        Console.WriteLine($"{classification.Key, 15} => {Math.Round((classification.Value * 100.0), 3)}%");
    }

    // NOTE(review): "RESUTLS" is a typo in the emitted header; left unchanged
    // in case anything consumes this output verbatim.
    PrintSectionHeader("OCR WEIGHT FINDER RESUTLS");

    // Flatten the OCR output to a single line before searching it.
    text = text.Trim().Replace("\r\n", " ").Replace("\n", " ");
    var regexWeightFinder = new RegexMetricWeightSubstringFinder();
    var weightSpecifier = regexWeightFinder.FindWeightSpecifier(text);
    Console.WriteLine(weightSpecifier);

    PrintSectionHeader("OCR KEYWORDS");
    var keywords = text.Split(' ')
        .Where(word => !string.IsNullOrWhiteSpace(word))
        .Where(word =>
        {
            // Lower-case once per word, culture-independently, so matching
            // does not vary with the machine's locale.
            var lowered = word.ToLowerInvariant();
            return OcrKeywords.Any(keyword => lowered.Contains(keyword));
        });
    foreach (var keyword in keywords)
    {
        // Prints the word with its original casing.
        Console.WriteLine(keyword);
    }
    Console.WriteLine("------------------------");

    stopwatch.Stop();
    Console.WriteLine($"Total time: {stopwatch.Elapsed}");
}

/// <summary>
/// Loads the image bytes: from <c>TestImageFilePath</c> when <paramref name="args"/>
/// is empty, otherwise by downloading the URL given in <c>args[0]</c>.
/// Prints the source used and, on failure, the error message.
/// </summary>
/// <param name="args">Command-line arguments passed to <c>Main</c>.</param>
/// <param name="image">Receives the loaded bytes, or <c>null</c> on failure.</param>
/// <returns><c>true</c> when the image was loaded; <c>false</c> when loading threw.</returns>
private static bool TryLoadImage(string[] args, out byte[] image)
{
    image = null;

    if (args.Length == 0)
    {
        Console.WriteLine($"Using file: {TestImageFilePath}");
        try
        {
            image = File.ReadAllBytes(TestImageFilePath);
            return true;
        }
        catch (Exception e)
        {
            Console.WriteLine($"Error when loading image: {e.Message}");
            return false;
        }
    }

    var possibleUrl = args[0];
    Console.WriteLine($"Using url from args: {possibleUrl}");
    try
    {
        using (var webClient = new WebClient())
        {
            image = webClient.DownloadData(possibleUrl);
        }
        return true;
    }
    catch (Exception e)
    {
        Console.WriteLine($"Error when loading image from URL: {e.Message}");
        return false;
    }
}

/// <summary>
/// Prints <paramref name="title"/> framed by a dashed line above and below.
/// </summary>
/// <param name="title">Section title to print.</param>
private static void PrintSectionHeader(string title)
{
    Console.WriteLine("------------------------");
    Console.WriteLine(title);
    Console.WriteLine("------------------------");
}