/// <summary>
/// Spell checks every word of <paramref name="line"/> and tries to repair the words that are
/// not recognized, either by automatic guessing or by prompting the user.
/// </summary>
/// <param name="wordsNotFound">Receives the number of words that were counted as unknown and were not fixed afterwards.</param>
/// <param name="line">The text line to check. A null or empty line is returned unchanged.</param>
/// <param name="index">Zero-based line index; only used (as <c>index + 1</c>) in log entries.</param>
/// <param name="bitmap">Image passed through to <c>SpellCheckOcrText</c> when prompting.</param>
/// <param name="autoFix">When true (and <paramref name="autoGuess"/> is not <c>None</c>), guesses are generated and applied automatically.</param>
/// <param name="promptForFixingErrors">When true, unknown words that could not be auto-fixed are passed to <c>SpellCheckOcrText</c>.</param>
/// <param name="log">When true, unknown words and applied guesses are recorded in <c>UnknownWordsFound</c> / <c>AutoGuessesUsed</c>.</param>
/// <param name="autoGuess">Selects which guess strategy is used for unknown words.</param>
/// <returns>The line with all accepted fixes applied.</returns>
public string FixUnknownWordsViaGuessOrPrompt(out int wordsNotFound, string line, int index, Bitmap bitmap, bool autoFix, bool promptForFixingErrors, bool log, AutoGuessLevel autoGuess)
{
    var localIgnoreWords = new List<string>();
    wordsNotFound = 0;

    // Nothing to check; also avoids a NullReferenceException on line.Length below.
    if (string.IsNullOrEmpty(line))
    {
        return line;
    }

    // A single unknown character cannot be spell checked - go straight to the prompt.
    if (promptForFixingErrors && line.Length == 1 && !IsWordKnownOrNumber(line, line))
    {
        SpellCheckOcrTextResult res = SpellCheckOcrText(line, bitmap, line, localIgnoreWords);
        if (res.FixedWholeLine || res.Fixed)
        {
            return res.Line;
        }

        wordsNotFound++;
        return line;
    }

    if (_hunspell == null)
    {
        return line;
    }

    // Remove multi-word names (first occurrence of each) that stand as whole words,
    // so their parts are not spell checked individually.
    const string wordBoundaryChars = @" ¡¿,.!?:;()[]{}+-$£""”“#&%…—♪";
    string nameBoundaryChars = Environment.NewLine + wordBoundaryChars;
    string tempLine = line;
    foreach (string name in _namesEtcMultiWordList)
    {
        int start = tempLine.FastIndexOf(name);
        if (start < 0)
        {
            continue;
        }

        if (start == 0 || nameBoundaryChars.Contains(tempLine[start - 1]))
        {
            int end = start + name.Length;
            if (end >= tempLine.Length || nameBoundaryChars.Contains(tempLine[end]))
            {
                tempLine = tempLine.Remove(start, name.Length);
            }
        }
    }

    // Note: '$' is intentionally not a separator here (it is handled by the guess rules below).
    string[] words = tempLine.Replace("</i>", string.Empty).Split((Environment.NewLine + " ¡¿,.!?:;()[]{}+-£\"”“#&%…—♪").ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
    for (int i = 0; i < words.Length; i++)
    {
        string word = words[i].TrimStart('\'');
        string wordNotEndTrimmed = word;
        word = word.TrimEnd('\'');
        string wordNoItalics = HtmlUtil.RemoveOpenCloseTags(word, HtmlUtil.TagItalic);
        if (IsWordKnownOrNumber(wordNoItalics, line) || localIgnoreWords.Contains(wordNoItalics))
        {
            continue;
        }

        bool correct = DoSpell(word);
        if (!correct)
        {
            correct = DoSpell(word.Trim('\''));
        }

        // For some languages, also accept the word when it is known without trailing 's'.
        if (!correct && word.Length > 3 && !word.EndsWith("ss", StringComparison.Ordinal) && !string.IsNullOrEmpty(_threeLetterIsoLanguageName) &&
            (_threeLetterIsoLanguageName == "eng" || _threeLetterIsoLanguageName == "dan" || _threeLetterIsoLanguageName == "swe" || _threeLetterIsoLanguageName == "nld"))
        {
            correct = DoSpell(word.TrimEnd('s'));
        }

        if (!correct)
        {
            correct = DoSpell(wordNoItalics);
        }

        if (!correct && _userWordList.Contains(wordNoItalics))
        {
            correct = true;
        }

        if (!correct && !line.Contains(word))
        {
            correct = true; // already fixed by an earlier replacement in this line
        }

        // English "-in'" written for "-ing" (e.g. "goin'"): check the word with a 'g' appended.
        if (!correct && Configuration.Settings.Tools.SpellCheckEnglishAllowInQuoteAsIng && wordNotEndTrimmed.EndsWith('\'') &&
            SpellCheckDictionaryName.StartsWith("en_", StringComparison.Ordinal) && word.EndsWith("in", StringComparison.OrdinalIgnoreCase))
        {
            correct = DoSpell(word + "g");
        }

        if (!correct)
        {
            // look for match via dash'ed word, e.g. sci-fi
            string dashedWord = GetDashedWordBefore(wordNoItalics, line, words, i);
            if (!string.IsNullOrEmpty(dashedWord))
            {
                correct = IsWordKnownOrNumber(dashedWord, line) || DoSpell(dashedWord);
            }

            if (!correct)
            {
                dashedWord = GetDashedWordAfter(wordNoItalics, line, words, i);
                if (!string.IsNullOrEmpty(dashedWord))
                {
                    correct = IsWordKnownOrNumber(dashedWord, line) || DoSpell(dashedWord);
                }
            }
        }

        // "word/word": accept when every part has at least two characters and is known.
        if (!correct && word.Contains('/') && !word.Contains("//"))
        {
            var slashedWords = word.Split(new[] { '/' }, StringSplitOptions.RemoveEmptyEntries);
            bool allSlashedCorrect = true;
            foreach (var slashedWord in slashedWords)
            {
                if (slashedWord.Length < 2 || !(DoSpell(slashedWord) || IsWordKnownOrNumber(slashedWord, line)))
                {
                    allSlashedCorrect = false;
                    break;
                }
            }

            correct = allSlashedCorrect;
        }

        if (word.Length == 0)
        {
            correct = true;
        }

        if (correct)
        {
            continue;
        }

        wordsNotFound++;
        if (log)
        {
            string nf = word;
            if (nf.StartsWith("<i>", StringComparison.Ordinal))
            {
                nf = nf.Remove(0, 3);
            }

            UnknownWordsFound.Add(string.Format("#{0}: {1}", index + 1, nf));
        }

        if (autoFix && autoGuess != AutoGuessLevel.None)
        {
            var guesses = new List<string>();
            if (word.Length > 5 && autoGuess == AutoGuessLevel.Aggressive)
            {
                guesses = (List<string>)CreateGuessesFromLetters(word);
                if (word[0] == 'L')
                {
                    guesses.Add("I" + word.Substring(1));
                }

                if (word.Contains('$'))
                {
                    guesses.Add(word.Replace("$", "s"));
                }

                string wordWithCasingChanged = GetWordWithDominatedCasing(word);
                if (DoSpell(word.ToLower()))
                {
                    guesses.Insert(0, wordWithCasingChanged);
                }
            }
            else if (Configuration.Settings.Tools.OcrFixUseHardcodedRules)
            {
                if (word[0] == 'L')
                {
                    guesses.Add("I" + word.Substring(1));
                }

                if (word.Length > 2 && word[0] == 'I' && char.IsLower(word[1]))
                {
                    guesses.Add("l" + word.Substring(1));
                }

                if (i == 0)
                {
                    guesses.Add(word.Replace(@"\/", "V"));
                }
                else
                {
                    guesses.Add(word.Replace(@"\/", "v"));
                }

                // Ligatures are written as escapes (U+FB01 "fi" ligature, U+FB02 "fl" ligature)
                // so they survive editors/encodings that flatten them to plain ASCII letters.
                guesses.Add(word.Replace("\uFB01", "fi"));
                guesses.Add(word.Replace("\uFB01", "fj"));
                guesses.Add(word.Replace("\uFB02", "fl"));
                if (word.Contains('$'))
                {
                    guesses.Add(word.Replace("$", "s"));
                }

                if (!word.EndsWith('€') && !word.StartsWith('€'))
                {
                    guesses.Add(word.Replace("€", "e"));
                }

                guesses.Add(word.Replace("/", "l"));
                guesses.Add(word.Replace(")/", "y"));
            }

            // Apply the first guess that is spelled correctly and actually changes the line.
            foreach (string guess in guesses)
            {
                if (IsWordOrWordsCorrect(guess) && !guess.StartsWith("f ", StringComparison.Ordinal))
                {
                    string replacedLine = OcrFixReplaceList.ReplaceWord(line, word, guess);
                    if (replacedLine != line)
                    {
                        if (log)
                        {
                            AutoGuessesUsed.Add(string.Format("#{0}: {1} -> {2} in line via '{3}': {4}", index + 1, word, guess, "OCRFixReplaceList.xml", line.Replace(Environment.NewLine, " ")));
                        }

                        line = replacedLine;
                        wordsNotFound--;

                        // The word was logged as unknown just above; take that entry back.
                        if (log && UnknownWordsFound.Count > 0)
                        {
                            UnknownWordsFound.RemoveAt(UnknownWordsFound.Count - 1);
                        }

                        correct = true;
                        break;
                    }
                }
            }
        }

        if (!correct && promptForFixingErrors)
        {
            var suggestions = new List<string>();
            if ((word == "Lt's" || word == "Lt'S") && SpellCheckDictionaryName.StartsWith("en_", StringComparison.Ordinal))
            {
                suggestions.Add("It's");
            }
            else
            {
                string upperWord = word.ToUpper();
                if (upperWord != "LT'S" && upperWord != "SOX'S") // TODO: Get fixed nhunspell
                {
                    suggestions = DoSuggest(word); // 0.9.6 fails on "Lt'S"
                }
            }

            if (word.StartsWith("<i>", StringComparison.Ordinal))
            {
                word = word.Remove(0, 3);
            }

            if (word.EndsWith("</i>", StringComparison.Ordinal))
            {
                word = word.Remove(word.Length - 4, 4);
            }

            SpellCheckOcrTextResult res = SpellCheckOcrText(line, bitmap, word, suggestions);
            if (res.FixedWholeLine)
            {
                return res.Line;
            }

            if (res.Fixed)
            {
                localIgnoreWords.Add(word);
                line = res.Line;
                wordsNotFound--;
            }
        }
    }

    return line;
}
/// <summary>
/// Runs the automatic OCR fix pipeline on one text: hardcoded character replacements,
/// line-level fixes, per-word common error fixes, spell-check based guessing (without
/// prompting) and finally language-specific 'I'/'l' corrections.
/// </summary>
/// <param name="text">The OCR text to fix.</param>
/// <param name="index">Zero-based line index, passed on to <c>FixUnknownWordsViaGuessOrPrompt</c>.</param>
/// <param name="lastLine">The previous line, passed on to the line-level fix helpers.</param>
/// <param name="logSuggestions">Passed as the <c>log</c> argument of <c>FixUnknownWordsViaGuessOrPrompt</c>.</param>
/// <param name="autoGuess">Guess level passed on to <c>FixUnknownWordsViaGuessOrPrompt</c>.</param>
/// <returns>The fixed text.</returns>
public string FixOcrErrors(string text, int index, string lastLine, bool logSuggestions, AutoGuessLevel autoGuess)
{
    // Characters that end a word ('$' is intentionally not included).
    const string wordSeparators = " ¡¿,.!?:;()[]{}+-£\"#&%\r\n";

    var sb = new StringBuilder();
    var word = new StringBuilder();

    if (Configuration.Settings.Tools.OcrFixUseHardcodedRules)
    {
        // Written as escapes so the special characters cannot be flattened to their
        // ASCII look-alikes (which would turn these replacements into no-ops).
        text = text.Replace("\uFB01", "fi"); // U+FB01 latin small ligature fi
        text = text.Replace("\uFB02", "fl"); // U+FB02 latin small ligature fl
        text = text.Replace('\u03BD', 'v'); // U+03BD greek small letter nu, looks like 'v'
    }

    text = ReplaceWordsBeforeLineFixes(text);
    text = FixCommenOcrLineErrors(text, lastLine);

    string lastWord = null;
    for (int i = 0; i < text.Length; i++)
    {
        char ch = text[i];
        if (!wordSeparators.Contains(ch))
        {
            word.Append(ch);
            continue;
        }

        if (word.Length > 0)
        {
            string current = word.ToString();
            string fixedWord;
            if (lastWord != null && lastWord.Contains("COLOR=", StringComparison.OrdinalIgnoreCase))
            {
                // The token after "color=" is left untouched.
                fixedWord = current;
            }
            else if (word.Length == 1 && sb.Length > 1 && sb.EndsWith('-'))
            {
                // A single character right after a dash is left untouched.
                fixedWord = current;
            }
            else
            {
                fixedWord = _ocrFixReplaceList.FixCommonWordErrors(current);
            }

            sb.Append(fixedWord);
            lastWord = fixedWord;
            word.Clear();
        }

        sb.Append(ch);
    }

    if (word.Length > 0) // last word (no trailing separator)
    {
        bool singleCharAfterDash = word.Length == 1 && sb.Length > 1 && sb.EndsWith('-');
        string current = word.ToString();
        sb.Append(singleCharAfterDash ? current : _ocrFixReplaceList.FixCommonWordErrors(current));
    }

    text = FixCommenOcrLineErrors(sb.ToString(), lastLine);

    // Auto-fix only: autoFix = true, promptForFixingErrors = false, no bitmap.
    int wordsNotFound;
    text = FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, text, index, null, true, false, logSuggestions, autoGuess);

    if (Configuration.Settings.Tools.OcrFixUseHardcodedRules)
    {
        text = FixLowercaseIToUppercaseI(text, lastLine);
        if (SpellCheckDictionaryName.StartsWith("en_", StringComparison.Ordinal) || _threeLetterIsoLanguageName == "eng")
        {
            string oldText = text;
            text = FixCommonErrors.FixAloneLowercaseIToUppercaseLine(RegexAloneI, oldText, text, 'i');
            text = FixCommonErrors.FixAloneLowercaseIToUppercaseLine(RegexAloneIasL, oldText, text, 'l');
        }
        else if (_threeLetterIsoLanguageName == "fra")
        {
            // Straight and typographic apostrophe variants, applied in this order.
            var apostrophePrefixes = new[] { " I'", " L'", " l'", " I’", " L’", " l’" };
            foreach (string prefix in apostrophePrefixes)
            {
                text = FixFrenchLApostrophe(text, prefix, lastLine);
            }
        }

        text = RemoveSpaceBetweenNumbers(text);
    }

    return text;
}