/// <summary>
/// Finds amount-like strings in the OCR text and normalises their separators.
/// </summary>
/// <param name="ocrResult">
/// OCR result to scan. Its continuous text is searched and its bounding box is
/// copied onto every returned item.
/// </param>
/// <returns>
/// A lazily evaluated sequence of <see cref="AnalyzedText"/> items typed as
/// <c>TextType.Amount</c>, passed through <c>Distinct()</c>.
/// </returns>
public IEnumerable<AnalyzedText> Analyze(OcrResult ocrResult)
{
    var continuousText = ocrResult.AsContinuousText().ReplaceFaultyCharacters(Constants.NumericAnalysisOcrFixDictionary);

    // Created once and shared by all elements instead of being rebuilt inside the projection.
    var firstSeparatorRegex = new Regex("(,|\\.)");

    IEnumerable<string> amounts = Constants.TextAnalysisConfiguration.AmountRegexes
        .SelectMany(pattern => Regex.Matches(continuousText, pattern).Cast<Match>())
        .Select(match => Regex.Replace(match.Value, Constants.TextAnalysisConfiguration.AmountIgnoreRegex, string.Empty))
        .Select(amount =>
        {
            // "Both kinds present, or either kind repeated" is the same as
            // "more than one separator in total", so a single pass is enough.
            var separatorCount = amount.Count(c => c == '.' || c == ',');

            // Only the first separator is dropped; any later ones are kept.
            return separatorCount > 1
                ? firstSeparatorRegex.Replace(amount, string.Empty, 1)
                : amount;
        });

    // NOTE(review): Replace(string, string) on a sequence is not a LINQ operator, so this is
    // presumably a project extension that swaps ',' for '.' in every element - confirm.
    amounts = amounts.Replace(",", ".");

    // NOTE(review): Distinct() only removes duplicates if AnalyzedText defines value
    // equality; that type is not visible here - confirm.
    return amounts
        .Select(amount => new AnalyzedText
        {
            Text = amount,
            TextType = TextType.Amount.ToString(),
            BoundingBox = ocrResult.BoundingBox
        })
        .Distinct();
}
/// <summary>
/// Reads the OCR text in the form requested by the definition and then runs the
/// definition's replace rules over it, in order.
/// </summary>
/// <param name="textDefinition">Selects the text form (<c>GetAs</c>) and carries the optional <c>Replace</c> rules.</param>
/// <param name="ocrResult">OCR result the text is read from.</param>
/// <returns>
/// The processed text. The starting text is empty when <c>GetAs</c> is neither
/// <c>GetTextAs.Text</c> nor <c>GetTextAs.Continuous</c>.
/// </returns>
private string ParseTextDefinition(TextDefinition textDefinition, OcrResult ocrResult)
{
    string result;
    var mode = textDefinition.GetAs;

    if (mode == GetTextAs.Text)
    {
        result = ocrResult.AsString();
    }
    else if (mode == GetTextAs.Continuous)
    {
        result = ocrResult.AsContinuousText();
    }
    else
    {
        result = string.Empty;
    }

    // Nothing to rewrite when no replace rules are configured.
    var replaceDefinitions = textDefinition.Replace;
    if (replaceDefinitions == null || replaceDefinitions.Length == 0)
    {
        return result;
    }

    // Each rule receives the output of the previous one.
    foreach (var replaceDefinition in replaceDefinitions)
    {
        result = ParseReplaceDefinition(replaceDefinition, result);
    }

    return result;
}
/// <summary>
/// Finds percentage expressions in the OCR text.
/// </summary>
/// <param name="ocrResult">
/// OCR result to scan. Its continuous text is searched with '-' characters removed,
/// and its bounding box is copied onto every returned item.
/// </param>
/// <returns>
/// A lazily evaluated sequence with one <see cref="AnalyzedText"/> of type
/// <c>TextType.Percentage</c> per regex match (matching is case-insensitive).
/// </returns>
public IEnumerable<AnalyzedText> Analyze(OcrResult ocrResult)
{
    // Prepared once up front rather than once per configured regex.
    var textWithoutDashes = ocrResult.AsContinuousText().Replace("-", "");

    var percentageMatches = Constants.TextAnalysisConfiguration.PercentageRegexes
        .SelectMany(pattern => Regex.Matches(textWithoutDashes, pattern, RegexOptions.IgnoreCase).Cast<Match>());

    return percentageMatches.Select(match => new AnalyzedText
    {
        Text = match.Value,
        TextType = TextType.Percentage.ToString(),
        BoundingBox = ocrResult.BoundingBox
    });
}
/// <summary>
/// Finds numeric sequences in the OCR text.
/// </summary>
/// <param name="ocrResult">
/// OCR result to scan. Its bounding box is copied onto every returned item.
/// </param>
/// <returns>
/// A lazily evaluated sequence with one <see cref="AnalyzedText"/> of type
/// <c>TextType.Number</c> per regex match.
/// </returns>
public IEnumerable<AnalyzedText> Analyze(OcrResult ocrResult)
{
    var continuousText = ocrResult.AsContinuousText().ReplaceFaultyCharacters(Constants.NumericAnalysisOcrFixDictionary);

    // Clean-up is identical for every regex, so it is done once instead of per pattern.
    // NOTE(review): ".00" and ",00" are removed wherever they occur, including inside
    // longer decimals such as "1.005" - confirm this is intended.
    var searchableText = continuousText
        .Replace("-", "")
        .Replace(".00", "")
        .Replace(",00", "");

    return Constants.TextAnalysisConfiguration.NumberRegexes
        .SelectMany(pattern => Regex.Matches(searchableText, pattern).Cast<Match>())
        .Select(match => new AnalyzedText
        {
            Text = match.Value,
            TextType = TextType.Number.ToString(),
            BoundingBox = ocrResult.BoundingBox
        });
}
/// <summary>
/// Finds date-like strings in the OCR text and passes each through
/// <c>TryParseAndFixDate</c>.
/// </summary>
/// <param name="ocrResult">
/// OCR result to scan. Its continuous text is searched and its bounding box is
/// copied onto every returned item.
/// </param>
/// <returns>
/// A lazily evaluated sequence of <see cref="AnalyzedText"/> items typed as
/// <c>TextType.Date</c>. Items whose text is null or whitespace are dropped and the
/// rest is passed through <c>Distinct()</c>.
/// </returns>
public IEnumerable<AnalyzedText> Analyze(OcrResult ocrResult)
{
    // Read once up front rather than once per configured regex.
    var continuousText = ocrResult.AsContinuousText();

    var possibleDates = Constants.TextAnalysisConfiguration.DateRegexes
        .SelectMany(pattern => Regex.Matches(continuousText, pattern).Cast<Match>())
        .Select(match => match.Value);

    return possibleDates
        .Select(candidate => new AnalyzedText
        {
            Text = TryParseAndFixDate(candidate),
            TextType = TextType.Date.ToString(),
            BoundingBox = ocrResult.BoundingBox
        })
        // Presumably TryParseAndFixDate yields a blank value for unparseable input;
        // its implementation is not visible here - confirm.
        .Where(analyzed => !string.IsNullOrWhiteSpace(analyzed.Text))
        // NOTE(review): relies on AnalyzedText defining value equality - confirm.
        .Distinct();
}
/// <summary>
/// Extracts a page number from the OCR text in two passes: the first-pass regexes
/// locate a page-number fragment, and the second-pass regexes are then run on that
/// fragment. Both passes are case-insensitive and read the named group "page".
/// </summary>
/// <param name="ocrResult">
/// OCR result to scan. Its bounding box is copied onto the returned item.
/// </param>
/// <returns>
/// A single-element array typed as <c>TextType.PageNumber</c>, or an empty array
/// when either pass yields no non-blank "page" value.
/// </returns>
public IEnumerable<AnalyzedText> Analyze(OcrResult ocrResult)
{
    // Read once up front rather than once per first-pass regex.
    var continuousText = ocrResult.AsContinuousText(false);

    // Only the very first match (across the regexes, in configured order) is considered.
    var firstPassGroup = Constants.TextAnalysisConfiguration.FirstPassPageNumberRexeges
        .SelectMany(pattern => Regex.Matches(continuousText, pattern, RegexOptions.IgnoreCase).Cast<Match>())
        .Select(match => match.Groups["page"])
        .FirstOrDefault();

    var firstPassPageNumbers = firstPassGroup?.Value;
    if (string.IsNullOrWhiteSpace(firstPassPageNumbers))
    {
        return new AnalyzedText[] { };
    }

    var secondPassGroup = Constants.TextAnalysisConfiguration.SecondPassPageNumberRegexes
        .SelectMany(pattern => Regex.Matches(firstPassPageNumbers, pattern, RegexOptions.IgnoreCase).Cast<Match>())
        .Select(match => match.Groups["page"])
        .FirstOrDefault();

    var secondPassPageNumber = secondPassGroup?.Value;
    if (string.IsNullOrWhiteSpace(secondPassPageNumber))
    {
        return new AnalyzedText[] { };
    }

    return new AnalyzedText[]
    {
        new AnalyzedText
        {
            Text = secondPassPageNumber,
            TextType = TextType.PageNumber.ToString(),
            BoundingBox = ocrResult.BoundingBox
        }
    };
}
// The two braces below close enclosing scopes that were opened outside this chunk.
}
}
/// <summary>
/// Matches the OCR text against every data file in the text-type data directory.
/// The file name (without extension) is used as the text type of the results.
/// </summary>
/// <remarks>
/// When a from-file configuration with a non-blank <c>PostCaptureRegex</c> exists for
/// the file, each matched text is substituted for the "{textpart}" placeholder in that
/// regex. The continuous OCR text is then searched with the resulting pattern, and the
/// "textpart" and "numberpart" groups of every hit are joined into the result text.
/// Otherwise the raw matches from the file lookup are returned.
/// </remarks>
/// <param name="ocrResult">
/// OCR result to scan. Its bounding box is copied onto every post-capture item.
/// </param>
/// <returns>A lazily evaluated sequence of matches across all data files.</returns>
public IEnumerable<AnalyzedText> Analyze(OcrResult ocrResult)
{
    var baseDirectory = System.AppDomain.CurrentDomain.RelativeSearchPath ?? System.AppDomain.CurrentDomain.BaseDirectory;

    // NOTE(review): the separator is a hard-coded backslash, which only works on Windows.
    var files = Directory.GetFiles(baseDirectory + "\\" + Constants.TextTypeDataFilePath);

    return files.SelectMany(filePath =>
    {
        var fileName = Path.GetFileNameWithoutExtension(filePath);

        // Materialised so the lookup runs once even though the result is both
        // tested for emptiness and projected below.
        IEnumerable<AnalyzedText> matchedResult = ocrResult.BinarySearchMatchFromFile(fileName, filePath, false).ToList();
        if (!matchedResult.Any())
        {
            return matchedResult;
        }

        var fromFileConfiguration = Constants.FromFileConfigurations
            .FirstOrDefault(x => x.TextType.Equals(fileName, System.StringComparison.InvariantCultureIgnoreCase));
        if (fromFileConfiguration == null || string.IsNullOrWhiteSpace(fromFileConfiguration.PostCaptureRegex))
        {
            return matchedResult;
        }

        // Same text for every match of this file, so it is read once.
        var continuousText = ocrResult.AsContinuousText();

        return matchedResult
            .SelectMany(matched =>
            {
                // The matched text is data, not a pattern: escape it so regex metacharacters
                // in it (".", "(", "+", ...) are matched literally and cannot produce an
                // invalid pattern. A null text is substituted as empty, as string.Replace does.
                var literalText = Regex.Escape(matched.Text ?? string.Empty);
                var pattern = fromFileConfiguration.PostCaptureRegex.Replace("{textpart}", literalText);
                return Regex.Matches(continuousText, pattern).Cast<Match>();
            })
            .Select(match => new AnalyzedText
            {
                BoundingBox = ocrResult.BoundingBox,
                TextType = fileName,
                // A group that did not participate has an empty Value; Trim() removes the
                // separator left dangling in that case.
                Text = $"{match.Groups["textpart"].Value} {match.Groups["numberpart"].Value}".Trim()
            });
    });
}