/// <summary>
/// Finds amount-like substrings in the OCR text and normalizes their separators.
/// </summary>
/// <param name="ocrResult">OCR result whose continuous text is searched; its bounding box is copied onto every hit.</param>
/// <returns>The distinct amounts found, each tagged with <c>TextType.Amount</c>.</returns>
public IEnumerable<AnalyzedText> Analyze(OcrResult ocrResult)
        {
            var searchText = ocrResult.AsContinuousText()
                                      .ReplaceFaultyCharacters(Constants.NumericAnalysisOcrFixDictionary);

            // Every match of every configured amount regex, in configuration order.
            var rawAmounts =
                from pattern in Constants.TextAnalysisConfiguration.AmountRegexes
                from Match hit in Regex.Matches(searchText, pattern)
                select hit.Value;

            var cleanedAmounts = rawAmounts.Select(raw =>
            {
                var candidate = Regex.Replace(raw, Constants.TextAnalysisConfiguration.AmountIgnoreRegex, string.Empty);

                // More than one separator in total ('.' and ',' counted together) means
                // the leading one is dropped. Only the first occurrence is removed.
                var separatorCount = candidate.Count(c => c == '.' || c == ',');
                if (separatorCount <= 1)
                {
                    return candidate;
                }

                var firstSeparator = new Regex("(,|\\.)");
                return firstSeparator.Replace(candidate, string.Empty, 1);
            });

            // NOTE(review): Replace is invoked on the sequence, not on a string;
            // presumably a project extension that swaps ',' for '.' in each item - confirm.
            IEnumerable<string> normalizedAmounts = cleanedAmounts.Replace(",", ".");

            return normalizedAmounts
                .Select(amount => new AnalyzedText
                {
                    Text = amount,
                    TextType = TextType.Amount.ToString(),
                    BoundingBox = ocrResult.BoundingBox
                })
                .Distinct();
        }
        /// <summary>
        /// Reads the OCR text in the form requested by the definition and then runs
        /// the definition's replace rules over it, in order.
        /// </summary>
        /// <param name="textDefinition">Selects how the text is read (<c>GetAs</c>) and which replacements to apply (<c>Replace</c>).</param>
        /// <param name="ocrResult">OCR result the text is taken from.</param>
        /// <returns>The processed text; an empty string is the starting value when <c>GetAs</c> matches neither handled case.</returns>
        private string ParseTextDefinition(TextDefinition textDefinition, OcrResult ocrResult)
        {
            string parsed;

            switch (textDefinition.GetAs)
            {
                case GetTextAs.Text:
                    parsed = ocrResult.AsString();
                    break;

                case GetTextAs.Continuous:
                    parsed = ocrResult.AsContinuousText();
                    break;

                default:
                    parsed = string.Empty;
                    break;
            }

            var replaceDefinitions = textDefinition.Replace;
            if (replaceDefinitions == null)
            {
                return parsed;
            }

            // An empty collection simply skips the loop, so no separate length check is needed.
            foreach (var replaceDefinition in replaceDefinitions)
            {
                parsed = ParseReplaceDefinition(replaceDefinition, parsed);
            }

            return parsed;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Finds percentage-like substrings in the OCR text.
        /// </summary>
        /// <param name="ocrResult">OCR result whose continuous text (with every '-' removed) is searched case-insensitively.</param>
        /// <returns>One <see cref="AnalyzedText"/> of type <c>Percentage</c> per regex match, carrying the bounding box of <paramref name="ocrResult"/>.</returns>
        public IEnumerable<AnalyzedText> Analyze(OcrResult ocrResult)
        {
            return
                from pattern in Constants.TextAnalysisConfiguration.PercentageRegexes
                from Match percentageMatch in Regex.Matches(ocrResult.AsContinuousText().Replace("-", ""), pattern, RegexOptions.IgnoreCase)
                select new AnalyzedText
                {
                    Text = percentageMatch.Value,
                    TextType = TextType.Percentage.ToString(),
                    BoundingBox = ocrResult.BoundingBox
                };
        }
        /// <summary>
        /// Finds numeric sequences in the OCR text.
        /// </summary>
        /// <param name="ocrResult">OCR result whose continuous text is searched after OCR character fixes and separator clean-up.</param>
        /// <returns>One <see cref="AnalyzedText"/> of type <c>Number</c> per regex match, carrying the bounding box of <paramref name="ocrResult"/>.</returns>
        public IEnumerable<AnalyzedText> Analyze(OcrResult ocrResult)
        {
            // Build the search text once instead of repeating the Replace chain for every regex.
            // NOTE(review): the ".00" / ",00" removal is a plain substring replace, so it
            // presumably also hits inputs such as "1.005" - confirm that is intended.
            var searchText = ocrResult.AsContinuousText()
                                      .ReplaceFaultyCharacters(Constants.NumericAnalysisOcrFixDictionary)
                                      .Replace("-", "")
                                      .Replace(".00", "")
                                      .Replace(",00", "");

            var numericSequences = Constants.TextAnalysisConfiguration.NumberRegexes
                .SelectMany(pattern => Regex.Matches(searchText, pattern).Cast<Match>())
                .Select(match => match.Value);

            return numericSequences.Select(number => new AnalyzedText
            {
                Text = number,
                TextType = TextType.Number.ToString(),
                BoundingBox = ocrResult.BoundingBox
            });
        }
Exemplo n.º 5
0
        /// <summary>
        /// Finds date-like substrings in the OCR text and passes each through
        /// <c>TryParseAndFixDate</c>.
        /// </summary>
        /// <param name="ocrResult">OCR result whose continuous text is searched.</param>
        /// <returns>Distinct <see cref="AnalyzedText"/> entries of type <c>Date</c>; candidates whose fixed text is null or whitespace are dropped.</returns>
        public IEnumerable<AnalyzedText> Analyze(OcrResult ocrResult)
        {
            var dateCandidates = Constants.TextAnalysisConfiguration.DateRegexes
                .SelectMany(pattern => Regex.Matches(ocrResult.AsContinuousText(), pattern).Cast<Match>())
                .Select(match => match.Value);

            var analyzedDates = dateCandidates.Select(candidate => new AnalyzedText
            {
                Text = TryParseAndFixDate(candidate),
                TextType = TextType.Date.ToString(),
                BoundingBox = ocrResult.BoundingBox
            });

            return analyzedDates
                .Where(analyzed => !string.IsNullOrWhiteSpace(analyzed.Text))
                .Distinct();
        }
        /// <summary>
        /// Extracts a page number from the OCR text in two passes.
        /// </summary>
        /// <remarks>
        /// The first pass runs the first-pass regexes over the continuous text and keeps the
        /// "page" group of the first match. The second pass runs the second-pass regexes over
        /// that captured string and again keeps the "page" group of the first match.
        /// </remarks>
        /// <param name="ocrResult">OCR result to search; its bounding box is copied onto the result.</param>
        /// <returns>
        /// A single-element array of type <c>PageNumber</c>, or an empty array when either
        /// pass yields a null or whitespace value.
        /// </returns>
        public IEnumerable<AnalyzedText> Analyze(OcrResult ocrResult)
        {
            var firstPassPageNumbers = Constants.TextAnalysisConfiguration.FirstPassPageNumberRexeges
                                       .Select(x => Regex.Matches(ocrResult.AsContinuousText(false), x, RegexOptions.IgnoreCase))
                                       .SelectMany(y => y.Cast<Match>()
                                                   .Select(x => x.Groups["page"]))
                                       .FirstOrDefault()
                                       ?.Value;

            if (string.IsNullOrWhiteSpace(firstPassPageNumbers))
            {
                return new AnalyzedText[] { };
            }

            // The second pass only sees what the first pass captured, not the full text.
            var secondPassPageNumber = Constants.TextAnalysisConfiguration.SecondPassPageNumberRegexes
                                       .Select(x => Regex.Matches(firstPassPageNumbers, x, RegexOptions.IgnoreCase))
                                       .SelectMany(y => y.Cast<Match>()
                                                   .Select(x => x.Groups["page"]))
                                       .FirstOrDefault()
                                       ?.Value;

            if (string.IsNullOrWhiteSpace(secondPassPageNumber))
            {
                return new AnalyzedText[] { };
            }

            return new AnalyzedText[]
            {
                new AnalyzedText()
                {
                    Text = secondPassPageNumber,
                    TextType = TextType.PageNumber.ToString(),
                    BoundingBox = ocrResult.BoundingBox
                }
            };
        }
    }
}
Exemplo n.º 7
0
        /// <summary>
        /// Matches the OCR result against every data file in the text-type data directory
        /// and, where a post-capture regex is configured for that file, refines the matches.
        /// </summary>
        /// <remarks>
        /// The file name without extension is used as the text type. When a from-file
        /// configuration with a non-blank <c>PostCaptureRegex</c> exists for that type, each
        /// matched text is substituted for <c>{textpart}</c> in the regex, which is then run
        /// over the continuous OCR text. Otherwise the raw file matches are returned.
        /// </remarks>
        /// <param name="ocrResult">OCR result to match; its bounding box is copied onto refined results.</param>
        /// <returns>The raw or refined matches of all data files, concatenated.</returns>
        public IEnumerable<AnalyzedText> Analyze(OcrResult ocrResult)
        {
            var files = Directory.GetFiles((System.AppDomain.CurrentDomain.RelativeSearchPath ?? System.AppDomain.CurrentDomain.BaseDirectory) + "\\" + Constants.TextTypeDataFilePath);

            return files.SelectMany(filePath =>
            {
                var fileName = Path.GetFileNameWithoutExtension(filePath);

                var matchedResult = ocrResult.BinarySearchMatchFromFile(fileName, filePath, false);

                // Any() stops at the first element instead of counting the whole sequence.
                if (!matchedResult.Any())
                {
                    return matchedResult;
                }

                var fromFileConfiguration = Constants.FromFileConfigurations.FirstOrDefault(x => x.TextType.Equals(fileName, System.StringComparison.InvariantCultureIgnoreCase));
                if (fromFileConfiguration == null || string.IsNullOrWhiteSpace(fromFileConfiguration.PostCaptureRegex))
                {
                    return matchedResult;
                }

                // NOTE(review): x.Text is inserted into the pattern unescaped, so regex
                // metacharacters in the matched text change the pattern - confirm whether
                // Regex.Escape is wanted here.
                return matchedResult
                    .SelectMany(x => new Regex(fromFileConfiguration.PostCaptureRegex.Replace("{textpart}", x.Text))
                                     .Matches(ocrResult.AsContinuousText())
                                     .Cast<Match>())
                    .Select(y => new AnalyzedText()
                    {
                        BoundingBox = ocrResult.BoundingBox,
                        TextType = fileName,
                        Text = $"{y.Groups["textpart"].Value} {y.Groups["numberpart"]?.Value?.PadLeft(1)}".Trim()
                    });
            });
        }