/// <summary>
/// Find value within text block. Each line is split into words; the first word
/// that is similar to one of the keys marks the key position, and the first
/// acceptable word after it on the same line is taken as the value.
/// </summary>
/// <remarks>
/// Side effects: the <paramref name="charsToRemove"/> characters are stripped from every
/// entry in <paramref name="keys"/> in place, and when <paramref name="validateValue"/>
/// is supplied every accepted value is appended to <paramref name="valueWhiteList"/>.
/// </remarks>
/// <param name="textBlock">Text block as string; null is treated like an empty block</param>
/// <param name="keys">List of keys to search for</param>
/// <param name="valueWhiteList">List of allowed values</param>
/// <param name="validateValue">Optional predicate a candidate value has to pass; null accepts every candidate</param>
/// <param name="charsToRemove">Chars to remove from a key while comparing</param>
/// <param name="forceWhiteList">When true, only values already contained in <paramref name="valueWhiteList"/> are accepted</param>
/// <param name="minResultLenght">Minimum length a word needs to be considered as a value</param>
/// <returns>
/// Result instance, which fits the most: white-listed values are preferred, then the smallest
/// key distance. The default value is returned when nothing was found.
/// </returns>
public static ExtractionResult FindInLine(string textBlock, IList<ExtractionKey> keys, IList<ExtractionValue> valueWhiteList, Func<string, bool> validateValue = null, string charsToRemove = ".:;", bool forceWhiteList = false, int minResultLenght = 3)
{
    // Largest Levenshtein distance at which two strings still count as similar
    const int maxDistance = 3;

    // Keys (and values) of this length or shorter are not used for the
    // "value looks like a key" check, because almost anything is within
    // distance 3 of such a short string
    const int shortTextLength = 3;

    var values = new List<ExtractionResult>();
    var removableChars = charsToRemove ?? string.Empty;

    // Clean keys (in place, callers see the cleaned keys afterwards)
    foreach (var charToRemove in removableChars)
    {
        foreach (var key in keys)
        {
            key.Key = key.Key.Replace(charToRemove.ToString(), "");
        }
    }

    var lines = (textBlock ?? string.Empty).Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);

    foreach (var line in lines)
    {
        // Split words
        var words = line.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

        // Go through all words, SKIP the last one: nothing can follow it as a value
        for (int keyIndex = 0; keyIndex < words.Length - 1; keyIndex++)
        {
            var cleanedWord = words[keyIndex];

            foreach (var charToRemove in removableChars)
            {
                cleanedWord = cleanedWord.Replace(charToRemove.ToString(), "");
            }

            // Compare word and keys, the first key that is close enough wins
            ExtractionKey matchedKey = null;
            var similarity = 0;

            foreach (var key in keys)
            {
                var distance = LevenshteinDistance.Compute(key.Key, cleanedWord);

                if (distance <= maxDistance)
                {
                    matchedKey = key;
                    similarity = distance;
                    break;
                }
            }

            if (matchedKey == null)
            {
                continue;
            }

            // Look for the value in the remaining words of the line
            for (int valueIndex = keyIndex + 1; valueIndex < words.Length; valueIndex++)
            {
                var valueString = words[valueIndex];

                if (valueString.Length < minResultLenght)
                {
                    continue;
                }

                // Check value-string is not similar to a key. The similarity is evaluated
                // first and the candidate skipped afterwards: a 'continue' inside the key
                // loop would only advance the key loop and never reject the candidate.
                if (valueString.Length >= shortTextLength
                    && keys.Any(k => k.Key.Length > shortTextLength
                                     && LevenshteinDistance.Compute(k.Key, valueString) <= maxDistance))
                {
                    continue;
                }

                // Validate values
                if (validateValue != null && !validateValue(valueString))
                {
                    continue;
                }

                if (forceWhiteList && !valueWhiteList.Any(x => x.Value == valueString))
                {
                    continue;
                }

                // Add as validated value
                if (validateValue != null)
                {
                    valueWhiteList.Add(new ExtractionValue { Value = valueString });
                }

                var value = new ExtractionResult
                {
                    KeyDistance = similarity,
                    Key = matchedKey,
                    Value = valueWhiteList.FirstOrDefault(x => x.Value == valueString),
                    OriginalValue = valueString,
                    CleanedKey = cleanedWord
                };
                value.ValueMatched = value.Value != null;
                values.Add(value);

                // Stop, because a value was found
                break;
            }

            // Only the first key of a line is evaluated, whether or not a value followed it
            break;
        }
    }

    // Return most similar value
    return values.OrderByDescending(x => x.ValueMatched).ThenBy(x => x.KeyDistance).FirstOrDefault();
}
/// <summary>
/// Find value within text block and retrieve it from the next line. A key is searched
/// in every line; the value is the word at the same position in the following line.
/// </summary>
/// <remarks>
/// Side effects: the <paramref name="charsToRemove"/> characters are stripped from every
/// entry in <paramref name="keys"/> in place, and when <paramref name="validateValue"/>
/// is supplied every accepted value is appended to <paramref name="valueWhiteList"/>.
/// </remarks>
/// <param name="textBlock">Text block as string; null is treated like an empty block</param>
/// <param name="keys">List of keys to search for</param>
/// <param name="valueWhiteList">List of allowed values</param>
/// <param name="validateValue">Optional predicate a candidate value has to pass; null accepts every candidate</param>
/// <param name="charsToRemove">Chars to remove from a key while comparing</param>
/// <param name="minSplitChars">Passed through to SplitLine when the lines are split into words</param>
/// <param name="forceWhiteList">When true, only values already contained in <paramref name="valueWhiteList"/> are accepted</param>
/// <returns>
/// Result instance, which fits the most: white-listed values are preferred, then the smallest
/// key distance. The default value is returned when nothing was found.
/// </returns>
public static ExtractionResult FindInNextLine(string textBlock, IList<ExtractionKey> keys, IList<ExtractionValue> valueWhiteList, Func<string, bool> validateValue = null, string charsToRemove = ".:;", int minSplitChars = 3, bool forceWhiteList = false)
{
    // Largest Levenshtein distance at which a word still counts as a key
    const int maxDistance = 3;

    var values = new List<ExtractionResult>();
    var removableChars = charsToRemove ?? string.Empty;

    // Clean keys (in place, callers see the cleaned keys afterwards)
    foreach (var charToRemove in removableChars)
    {
        foreach (var key in keys)
        {
            key.Key = key.Key.Replace(charToRemove.ToString(), "");
        }
    }

    var lines = (textBlock ?? string.Empty).Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);

    // Every line is stored as its words in reversed order, so word indexes count
    // from the end of the line (presumably to align columns from the right - TODO confirm)
    var splittedReversedLines = new List<IList<string>>();

    foreach (var line in lines)
    {
        splittedReversedLines.Add(SplitLine(line, ' ', minSplitChars, 2).Reverse().ToList());
    }

    // Collect at most one key match per line
    var matches = new List<FindNextLineMatch>();

    for (int lineIndex = 0; lineIndex < splittedReversedLines.Count; lineIndex++)
    {
        var line = splittedReversedLines[lineIndex];

        for (int wordIndex = 0; wordIndex < line.Count; wordIndex++)
        {
            var cleanedWord = line[wordIndex];

            foreach (var charToRemove in removableChars)
            {
                cleanedWord = cleanedWord.Replace(charToRemove.ToString(), "");
            }

            // The first key that is close enough wins
            ExtractionKey matchedKey = null;
            var distance = 0;

            foreach (var key in keys)
            {
                distance = LevenshteinDistance.Compute(key.Key, cleanedWord);

                if (distance <= maxDistance)
                {
                    matchedKey = key;
                    break;
                }
            }

            if (matchedKey != null)
            {
                matches.Add(new FindNextLineMatch
                {
                    Key = matchedKey,
                    LineIndex = lineIndex,
                    WordIndex = wordIndex,
                    OriginalKey = cleanedWord,
                    Distance = distance
                });

                // Continue with the next line
                break;
            }
        }
    }

    foreach (var match in matches)
    {
        // The value is expected in the line directly below the key
        if (splittedReversedLines.Count <= match.LineIndex + 1)
        {
            continue;
        }

        var valueLine = splittedReversedLines[match.LineIndex + 1];

        if (valueLine.Count <= match.WordIndex)
        {
            continue;
        }

        var valueString = valueLine[match.WordIndex];

        // Validate values
        if (validateValue != null && !validateValue(valueString))
        {
            continue;
        }

        if (forceWhiteList && !valueWhiteList.Any(x => x.Value == valueString))
        {
            // Skip not validated match
            continue;
        }

        // Add as validated value
        if (validateValue != null)
        {
            valueWhiteList.Add(new ExtractionValue { Value = valueString });
        }

        var value = new ExtractionResult
        {
            KeyDistance = match.Distance,
            // Take the key from the match itself; the scan state of the
            // search loop above no longer refers to this match
            Key = match.Key,
            Value = valueWhiteList.FirstOrDefault(x => x.Value == valueString),
            OriginalValue = valueString,
            CleanedKey = match.OriginalKey
        };
        value.ValueMatched = value.Value != null;
        values.Add(value);
    }

    // Return most similar value
    return values.OrderByDescending(x => x.ValueMatched).ThenBy(x => x.KeyDistance).FirstOrDefault();
}