Exemplo n.º 1
0
        /// <summary>
        /// Find a value within a text block by looking for a key and taking the value from the
        /// words that follow it on the same line.
        /// Every line is split on spaces. A word counts as a key when
        /// <c>LevenshteinDistance.Compute</c> between a (cleaned) key and the (cleaned) word is 3 or less.
        /// The first acceptable word after the key becomes the value; at most one result is
        /// collected per line, taken from the first key match in that line.
        /// </summary>
        /// <remarks>
        /// Side effects: the <c>Key</c> of every entry in <paramref name="keys"/> is overwritten with its
        /// cleaned form, and when <paramref name="validateValue"/> is supplied, accepted values that are
        /// not yet contained in <paramref name="valueWhiteList"/> are added to it.
        /// </remarks>
        /// <param name="textBlock">Text block as string; null is treated like an empty block</param>
        /// <param name="keys">List of keys to search for</param>
        /// <param name="valueWhiteList">List of allowed values</param>
        /// <param name="validateValue">Optional predicate a candidate value has to pass; null accepts every candidate</param>
        /// <param name="charsToRemove">Chars to remove from keys and words while comparing</param>
        /// <param name="forceWhiteList">When true, only values already contained in <paramref name="valueWhiteList"/> are accepted</param>
        /// <param name="minResultLenght">Minimum length a word needs to be considered as value</param>
        /// <returns>
        /// Result instance, which fits the most: results whose value is in the white list come first,
        /// then the smallest key distance wins. Default (null) if nothing was found.
        /// </returns>
        public static ExtractionResult FindInLine(string textBlock, IList <ExtractionKey> keys, IList <ExtractionValue> valueWhiteList, Func <string, bool> validateValue = null, string charsToRemove = ".:;", bool forceWhiteList = false, int minResultLenght = 3)
        {
            var values = new List <ExtractionResult>();

            // Clean keys (this updates the caller's key instances in place)
            foreach (var charToRemove in charsToRemove)
            {
                foreach (var key in keys)
                {
                    key.Key = key.Key.Replace(charToRemove.ToString(), "");
                }
            }

            // A missing text block simply yields no lines and therefore no result
            var lines = (textBlock ?? string.Empty).Split(new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);

            foreach (var line in lines)
            {
                // Split words
                var words = line.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                // Go through all words, SKIP the last one: nothing follows it that could be a value
                for (int i = 0; i < words.Length - 1; i++)
                {
                    var cleanedWord = words[i];
                    foreach (var charToRemove in charsToRemove)
                    {
                        cleanedWord = cleanedWord.Replace(charToRemove.ToString(), "");
                    }

                    // Compare word and keys, the first key that is close enough wins
                    ExtractionKey matchedKey = null;
                    var           similarity = 0;

                    foreach (var key in keys)
                    {
                        similarity = LevenshteinDistance.Compute(key.Key, cleanedWord);
                        if (similarity <= 3)
                        {
                            matchedKey = key;
                            break;
                        }
                    }

                    if (matchedKey == null)
                    {
                        continue;
                    }

                    // Look for the value in the remaining words of the line
                    for (int j = i + 1; j < words.Length; j++)
                    {
                        var valueString = words[j];

                        if (valueString.Length < minResultLenght)
                        {
                            continue;
                        }

                        // Check value-string is not similar to a key. A flag is required here:
                        // a 'continue' inside the key loop would only advance the key loop.
                        var resemblesKey = false;
                        if (valueString.Length >= 3)
                        {
                            foreach (var key in keys)
                            {
                                // Short keys are ignored, a distance of 3 would match almost anything
                                if (key.Key.Length <= 3)
                                {
                                    continue;
                                }

                                if (LevenshteinDistance.Compute(key.Key, valueString) <= 3)
                                {
                                    resemblesKey = true;
                                    break;
                                }
                            }
                        }

                        if (resemblesKey)
                        {
                            continue;
                        }

                        // Validate values
                        if (validateValue != null && !validateValue(valueString))
                        {
                            continue;
                        }

                        if (forceWhiteList && !valueWhiteList.Any(x => x.Value == valueString))
                        {
                            continue;
                        }

                        // Add as validated value, but do not pile up duplicates in the caller's list
                        if (validateValue != null && !valueWhiteList.Any(x => x.Value == valueString))
                        {
                            valueWhiteList.Add(new ExtractionValue {
                                Value = valueString
                            });
                        }

                        var value = new ExtractionResult
                        {
                            KeyDistance   = similarity,
                            Key           = matchedKey,
                            Value         = valueWhiteList.FirstOrDefault(x => x.Value == valueString),
                            OriginalValue = valueString,
                            CleanedKey    = cleanedWord
                        };

                        value.ValueMatched = value.Value != null;

                        values.Add(value);

                        // A value was found, stop scanning
                        break;
                    }

                    // The rest of the line was consumed by the value search (successful or not),
                    // so this line is done after its first key match.
                    break;
                }
            }

            // Return most similar value
            return values.OrderByDescending(x => x.ValueMatched).ThenBy(x => x.KeyDistance).FirstOrDefault();
        }
Exemplo n.º 2
0
        /// <summary>
        /// Find value within text block and retrieve it from the next line.
        /// Every line is split with <c>SplitLine</c> and reversed. The first word of a (reversed) line
        /// whose <c>LevenshteinDistance.Compute</c> result against a (cleaned) key is 3 or less is
        /// recorded as a match; the value is then read from the following line at the same position
        /// within the reversed word list.
        /// </summary>
        /// <remarks>
        /// Side effects: the <c>Key</c> of every entry in <paramref name="keys"/> is overwritten with its
        /// cleaned form, and when <paramref name="validateValue"/> is supplied, accepted values that are
        /// not yet contained in <paramref name="valueWhiteList"/> are added to it.
        /// Empty lines are dropped before processing, so "next line" means the next non-empty line.
        /// </remarks>
        /// <param name="textBlock">Text block as string; null is treated like an empty block</param>
        /// <param name="keys">List of keys to search for</param>
        /// <param name="valueWhiteList">List of allowed values</param>
        /// <param name="validateValue">Optional predicate a candidate value has to pass; null accepts every candidate</param>
        /// <param name="charsToRemove">Chars to remove from keys and words while comparing</param>
        /// <param name="minSplitChars">Forwarded to <c>SplitLine</c> (presumably the minimum number of split chars that separate two columns - verify against SplitLine)</param>
        /// <param name="forceWhiteList">When true, only values already contained in <paramref name="valueWhiteList"/> are accepted</param>
        /// <returns>
        /// Result instance, which fits the most: results whose value is in the white list come first,
        /// then the smallest key distance wins. Default (null) if nothing was found.
        /// </returns>
        public static ExtractionResult FindInNextLine(string textBlock, IList <ExtractionKey> keys, IList <ExtractionValue> valueWhiteList, Func <string, bool> validateValue = null, string charsToRemove = ".:;", int minSplitChars = 3, bool forceWhiteList = false)
        {
            var values = new List <ExtractionResult>();

            // Clean keys (this updates the caller's key instances in place)
            foreach (var charToRemove in charsToRemove)
            {
                foreach (var key in keys)
                {
                    key.Key = key.Key.Replace(charToRemove.ToString(), "");
                }
            }

            // A missing text block simply yields no lines and therefore no result
            var lines = (textBlock ?? string.Empty).Split(new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
            var splittedReversedLines = new List <IList <string> >();

            foreach (var line in lines)
            {
                splittedReversedLines.Add(SplitLine(line, ' ', minSplitChars, 2).Reverse().ToList());
            }

            // Pass 1: remember the first key hit of every line together with its position
            var matches = new List <FindNextLineMatch>();

            for (int lineIndex = 0; lineIndex < splittedReversedLines.Count; lineIndex++)
            {
                var line = splittedReversedLines[lineIndex];

                for (int wordIndex = 0; wordIndex < line.Count; wordIndex++)
                {
                    var cleanedWord = line[wordIndex];
                    foreach (var charToRemove in charsToRemove)
                    {
                        cleanedWord = cleanedWord.Replace(charToRemove.ToString(), "");
                    }

                    ExtractionKey matchedKey = null;
                    var           distance   = 0;

                    foreach (var key in keys)
                    {
                        distance = LevenshteinDistance.Compute(key.Key, cleanedWord);
                        if (distance <= 3)
                        {
                            matchedKey = key;
                            break;
                        }
                    }

                    if (matchedKey != null)
                    {
                        matches.Add(new FindNextLineMatch
                        {
                            Key         = matchedKey,
                            LineIndex   = lineIndex,
                            WordIndex   = wordIndex,
                            OriginalKey = cleanedWord,
                            Distance    = distance
                        });

                        // Only one match per line
                        break;
                    }
                }
            }

            // Pass 2: pick the value below each matched key
            foreach (var match in matches)
            {
                var valueLineIndex = match.LineIndex + 1;
                if (valueLineIndex >= splittedReversedLines.Count)
                {
                    // Key is on the last line, there is no next line
                    continue;
                }

                var valueLine = splittedReversedLines[valueLineIndex];
                if (match.WordIndex >= valueLine.Count)
                {
                    // Next line has no word at the key's position
                    continue;
                }

                var valueString = valueLine[match.WordIndex];

                if (validateValue != null && !validateValue(valueString))
                {
                    continue;
                }

                if (forceWhiteList && !valueWhiteList.Any(x => x.Value == valueString))
                {
                    // Skip not validated match
                    continue;
                }

                // Add as validated value, but do not pile up duplicates in the caller's list
                if (validateValue != null && !valueWhiteList.Any(x => x.Value == valueString))
                {
                    valueWhiteList.Add(new ExtractionValue {
                        Value = valueString
                    });
                }

                var value = new ExtractionResult
                {
                    KeyDistance   = match.Distance,
                    // Take the key from the recorded match; the search local is long gone here
                    Key           = match.Key,
                    Value         = valueWhiteList.FirstOrDefault(x => x.Value == valueString),
                    OriginalValue = valueString,
                    CleanedKey    = match.OriginalKey
                };

                value.ValueMatched = value.Value != null;

                values.Add(value);
            }

            // Return most similar value
            return values.OrderByDescending(x => x.ValueMatched).ThenBy(x => x.KeyDistance).FirstOrDefault();
        }