/// <summary>
/// OCRs one subtitle image with Tesseract and post-processes the result: retries with another page
/// segmentation mode, a resized image, a one-color image and an un-italicized image, optionally asks
/// MODI, runs the OCR fix engine, logs guesses/fixes and colors the list view row according to how
/// many words are still unknown.
/// </summary>
/// <param name="bitmap">
/// Image to OCR. It is disposed before a normal return when <c>_vobSubMergedPackist</c> is not null
/// (it is NOT disposed on the abort path).
/// </param>
/// <param name="index">
/// Subtitle index; used for the pre-computed text lookup, the fix engine, logging and row coloring.
/// </param>
/// <returns>
/// The final text, or <see cref="string.Empty"/> when <paramref name="bitmap"/> is null or the fix
/// engine reported an abort.
/// </returns>
private string OcrViaTesseract(Bitmap bitmap, int index)
{
    if (bitmap == null)
        return string.Empty;

    // The handler is invoked for its side effects; presumably it creates _ocrFixEngine - TODO confirm.
    if (_ocrFixEngine == null)
        comboBoxDictionaries_SelectedIndexChanged(null, null);

    // NOTE(review): badWords is never incremented, so the "no dictionary" coloring below only
    // depends on numberOfWords. Kept as-is to preserve the existing coloring.
    int badWords = 0;

    string textWithOutFixes;
    if (!string.IsNullOrEmpty(_tesseractAsyncStrings[index]))
    {
        // A text is already available for this index - use it instead of running Tesseract again.
        textWithOutFixes = _tesseractAsyncStrings[index];
    }
    else
    {
        if (_tesseractAsyncIndex <= index)
            _tesseractAsyncIndex = index + 10;
        textWithOutFixes = Tesseract3DoOcrViaExe(bitmap, _languageId, "-psm 6"); // 6 = Assume a single uniform block of text.
    }

    // Short/low images: also try "single text line" mode and keep whichever result looks better.
    // Fixed: CountTagInText takes (text, tag) at every other call site in this method; the arguments were swapped here.
    if ((!textWithOutFixes.Contains(Environment.NewLine) || Utilities.CountTagInText(textWithOutFixes, "\n") > 2) &&
        (textWithOutFixes.Length < 17 || bitmap.Height < 50))
    {
        string psm = Tesseract3DoOcrViaExe(bitmap, _languageId, "-psm 7"); // 7 = Treat the image as a single text line.
        if (textWithOutFixes != psm)
        {
            bool usePsm;
            if (textWithOutFixes.Trim().Length == 0)
            {
                usePsm = true;
            }
            else if (psm.Length > textWithOutFixes.Length)
            {
                usePsm = HasSuspectTokenMissingFrom(textWithOutFixes, psm, "9", "6", "5", "3", "1", "$", "•", "Y", "'", "€");
            }
            else if (psm.Length == textWithOutFixes.Length)
            {
                // these chars are often mistaken
                usePsm = HasSuspectTokenMissingFrom(textWithOutFixes, psm, "0", "9", "8", "5", "3", "1", "$", "€", "•", "Y", "'", "/", "(", ")", "_") ||
                         (psm.EndsWith(".") && !textWithOutFixes.EndsWith("."));
            }
            else
            {
                usePsm = false;
            }

            if (usePsm)
                textWithOutFixes = psm;
        }
    }

    if (!checkBoxTesseractItalicsOn.Checked)
        textWithOutFixes = WithoutItalicTags(textWithOutFixes);

    // Sometimes Tesseract has problems with small fonts - it helps to make the image larger
    // Fixed: same swapped CountTagInText arguments as above.
    if (WithoutItalicTags(textWithOutFixes).Replace("@", string.Empty).Replace("%", string.Empty).Replace("|", string.Empty).Trim().Length < 3 ||
        Utilities.CountTagInText(textWithOutFixes, "\n") > 2)
    {
        textWithOutFixes = TesseractResizeAndRetry(bitmap);
        if (!checkBoxTesseractItalicsOn.Checked)
            textWithOutFixes = WithoutItalicTags(textWithOutFixes);
    }

    // fix italics
    textWithOutFixes = FixItalics(textWithOutFixes);

    int numberOfWords = textWithOutFixes.Split((" " + Environment.NewLine).ToCharArray(), StringSplitOptions.RemoveEmptyEntries).Length;
    string line = textWithOutFixes.Trim();

    if (_ocrFixEngine.IsDictionaryLoaded)
    {
        if (checkBoxAutoFixCommonErrors.Checked)
            line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);

        int correctWords;
        int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords);
        int oldCorrectWords = correctWords;

        if (wordsNotFound > 0 || correctWords == 0)
        {
            // Remember the unknown words so they can be restored if the retry is rejected.
            List<string> oldUnkownWords = new List<string>(_ocrFixEngine.UnknownWordsFound);
            _ocrFixEngine.UnknownWordsFound.Clear();

            string newUnfixedText = TesseractResizeAndRetry(bitmap);
            string newText = _ocrFixEngine.FixOcrErrors(newUnfixedText, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
            int newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText, out correctWords);

            if (wordsNotFound == 1 && newWordsNotFound == 1 && newUnfixedText.EndsWith("!!") && textWithOutFixes.EndsWith("u") && newText.Length > 1)
            {
                // Trailing "u" in the first pass vs. "!!" in the retry: replace the "u" with "!!".
                // Only newText/newWordsNotFound are updated here; line is left unchanged (as in the original code).
                _ocrFixEngine.UnknownWordsFound.Clear();
                newText = textWithOutFixes.Substring(0, textWithOutFixes.Length - 1) + "!!";
                newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText, out correctWords);
            }
            else if (((!newText.Contains("9") || textWithOutFixes.Contains("9")) &&
                      (!newText.Replace("</i>", string.Empty).Contains("/") || textWithOutFixes.Replace("</i>", string.Empty).Contains("/")) &&
                      newUnfixedText.Trim().Length > 0 &&
                      newWordsNotFound < wordsNotFound) ||
                     (newWordsNotFound == wordsNotFound && newText.EndsWith("!") && textWithOutFixes.EndsWith("l")))
            {
                // NOTE(review): the parentheses above only make the original operator precedence
                // explicit ("&&" binds tighter than "||"); verify that grouping is the intended one.
                wordsNotFound = newWordsNotFound;

                // Carry over the end punctuation of the first pass if the retry lost it.
                bool newTextLacksEnding = !newText.EndsWith(".") && !newText.EndsWith(",") && !newText.EndsWith("!") &&
                                          !newText.EndsWith("?") && !newText.EndsWith("</i>");
                if (newTextLacksEnding)
                {
                    if (textWithOutFixes.Length > 3 && textWithOutFixes.EndsWith("..."))
                        newText = newText.TrimEnd() + "...";
                    else if (textWithOutFixes.EndsWith("."))
                        newText = newText.TrimEnd() + ".";
                    else if (textWithOutFixes.EndsWith("?"))
                        newText = newText.TrimEnd() + "?";
                }

                textWithOutFixes = newUnfixedText;
                line = FixItalics(newText);
            }
            else if (correctWords > oldCorrectWords + 1 || (correctWords > oldCorrectWords && !textWithOutFixes.Contains(" ")))
            {
                wordsNotFound = newWordsNotFound;
                textWithOutFixes = newUnfixedText;
                line = newText;
            }
            else
            {
                // Retry rejected - restore the unknown words of the first pass.
                _ocrFixEngine.UnknownWordsFound.Clear();
                _ocrFixEngine.UnknownWordsFound.AddRange(oldUnkownWords);
            }
        }

        if (wordsNotFound > 0 || correctWords == 0 || IsAlmostEmptyIgnoringTilde(textWithOutFixes))
        {
            if (_bluRaySubtitles != null && !line.Contains("<i>"))
            {
                _ocrFixEngine.AutoGuessesUsed.Clear();
                _ocrFixEngine.UnknownWordsFound.Clear();

                // which is best - normal image or one color image?
                var nbmp = new NikseBitmap(bitmap);
                nbmp.MakeOneColor(Color.White);
                string oneColorText;
                using (Bitmap oneColorBitmap = nbmp.GetBitmap())
                {
                    oneColorText = Tesseract3DoOcrViaExe(oneColorBitmap, _languageId, "-psm 6"); // 6 = Assume a single uniform block of text.
                }

                if (IsPlausibleAlternativeOcr(oneColorText, line))
                {
                    int modiCorrectWords;
                    int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(oneColorText, out modiCorrectWords);
                    string modiTextOcrFixed = oneColorText;
                    if (checkBoxAutoFixCommonErrors.Checked)
                        modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(oneColorText, index, _lastLine, false, checkBoxGuessUnknownWords.Checked);
                    int modiOcrCorrectedCorrectWords;
                    int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out modiOcrCorrectedCorrectWords);
                    if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound)
                    {
                        oneColorText = modiTextOcrFixed;
                        modiWordsNotFound = modiOcrCorrectedWordsNotFound;
                        modiCorrectWords = modiOcrCorrectedCorrectWords;
                    }

                    bool useOneColorText =
                        modiWordsNotFound < wordsNotFound ||
                        (textWithOutFixes.Length == 1 && modiWordsNotFound == 0) ||
                        (wordsNotFound == modiWordsNotFound && oneColorText.EndsWith("!") && (line.EndsWith("l") || line.EndsWith("fl")));
                    if (useOneColorText)
                    {
                        line = FixItalics(oneColorText); // use one-color text
                        wordsNotFound = modiWordsNotFound;
                        correctWords = modiCorrectWords;
                        if (checkBoxAutoFixCommonErrors.Checked)
                            line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
                    }
                }
            }
        }

        if (checkBoxTesseractItalicsOn.Checked)
        {
            if (line.Contains("<i>") || wordsNotFound > 0 || correctWords == 0 || IsAlmostEmptyIgnoringTilde(textWithOutFixes))
            {
                _ocrFixEngine.AutoGuessesUsed.Clear();
                _ocrFixEngine.UnknownWordsFound.Clear();

                // which is best - normal image or de-italic'ed? We find out here
                string unItalicText;
                var unItalicedBmp = UnItalic(bitmap, _unItalicFactor);
                try
                {
                    unItalicText = Tesseract3DoOcrViaExe(unItalicedBmp, _languageId, "-psm 6"); // 6 = Assume a single uniform block of text.
                }
                finally
                {
                    unItalicedBmp.Dispose();
                }

                if (unItalicText.Length > 1)
                {
                    int modiCorrectWords;
                    int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(unItalicText, out modiCorrectWords);
                    string modiTextOcrFixed = unItalicText;
                    if (checkBoxAutoFixCommonErrors.Checked)
                        modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(unItalicText, index, _lastLine, false, checkBoxGuessUnknownWords.Checked);
                    int modiOcrCorrectedCorrectWords;
                    int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out modiOcrCorrectedCorrectWords);
                    if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound)
                    {
                        unItalicText = modiTextOcrFixed;
                        modiWordsNotFound = modiOcrCorrectedWordsNotFound;
                        modiCorrectWords = modiOcrCorrectedCorrectWords;
                    }

                    bool ok = modiWordsNotFound < wordsNotFound || (textWithOutFixes.Length == 1 && modiWordsNotFound == 0);
                    if (!ok)
                        ok = wordsNotFound == modiWordsNotFound && unItalicText.EndsWith("!") && (line.EndsWith("l") || line.EndsWith("fl"));
                    if (!ok)
                        ok = wordsNotFound == modiWordsNotFound && line.StartsWith("<i>") && line.EndsWith("</i>");

                    // Reject the un-italicized text if it contains noticeably more of these symbols than line does.
                    if (ok && Utilities.CountTagInText(unItalicText, "/") > Utilities.CountTagInText(line, "/") + 1)
                        ok = false;
                    if (ok && Utilities.CountTagInText(unItalicText, "\\") > Utilities.CountTagInText(line, "\\"))
                        ok = false;
                    if (ok && Utilities.CountTagInText(unItalicText, ")") > Utilities.CountTagInText(line, ")") + 1)
                        ok = false;
                    if (ok && Utilities.CountTagInText(unItalicText, "(") > Utilities.CountTagInText(line, "(") + 1)
                        ok = false;
                    if (ok && Utilities.CountTagInText(unItalicText, "$") > Utilities.CountTagInText(line, "$") + 1)
                        ok = false;
                    if (ok && Utilities.CountTagInText(unItalicText, "€") > Utilities.CountTagInText(line, "€") + 1)
                        ok = false;
                    if (ok && Utilities.CountTagInText(unItalicText, "•") > Utilities.CountTagInText(line, "•"))
                        ok = false;

                    if (ok)
                    {
                        wordsNotFound = modiWordsNotFound;
                        correctWords = modiCorrectWords;

                        line = WithoutItalicTags(line).Trim();

                        // Drop a leading "I " that the original text does not have.
                        if (line.Length > 7 && unItalicText.Length > 7 && unItalicText.StartsWith("I ") &&
                            line.StartsWith(unItalicText.Remove(0, 2).Substring(0, 4)))
                        {
                            unItalicText = unItalicText.Remove(0, 2);
                        }

                        if (checkBoxTesseractMusicOn.Checked)
                        {
                            // line and textWithOutFixes are not modified inside this block, so these are computed once.
                            bool lineHasNotePrefix = line.StartsWith("J' ") || line.StartsWith("J“ ") || line.StartsWith("J* ") || line.StartsWith("♪ ");
                            bool lineHasMisreadNotePrefix = line.StartsWith("J` ") || line.StartsWith("J“ ") || line.StartsWith("J' ") || line.StartsWith("J* ");
                            bool noteSeenAtStart = line.StartsWith("♪") || textWithOutFixes.StartsWith("♪") || textWithOutFixes.StartsWith("<i>♪");

                            if (lineHasNotePrefix && unItalicText.Length > 3 && WithoutItalicTags(unItalicText).Substring(1, 2) == "' ")
                                unItalicText = "♪ " + unItalicText.Remove(0, 2).TrimStart();

                            if (lineHasNotePrefix && unItalicText.Length > 3 && WithoutItalicTags(unItalicText).Substring(1, 1) == " ")
                                unItalicText = ReplacePrefixWithMusicNote(unItalicText, 2);

                            if (lineHasNotePrefix && unItalicText.Length > 3 && WithoutItalicTags(unItalicText).Substring(2, 1) == " ")
                                unItalicText = ReplacePrefixWithMusicNote(unItalicText, 2);

                            if (unItalicText.StartsWith("J'") && (noteSeenAtStart || unItalicText.EndsWith("♪")))
                                unItalicText = ReplacePrefixWithMusicNote(unItalicText, 2);

                            if (lineHasMisreadNotePrefix && unItalicText.StartsWith("S "))
                                unItalicText = ReplacePrefixWithMusicNote(unItalicText, 2);

                            // NOTE(review): 8 is the length of "<i>S</i> " but the prefix is removed AFTER
                            // Utilities.RemoveHtmlTags has run - looks like it removes too much; confirm.
                            if (lineHasMisreadNotePrefix && unItalicText.StartsWith("<i>S</i> "))
                                unItalicText = ReplacePrefixWithMusicNote(unItalicText, 8);

                            if (unItalicText.StartsWith(";'") && (noteSeenAtStart || unItalicText.EndsWith("♪")))
                                unItalicText = ReplacePrefixWithMusicNote(unItalicText, 2);

                            if (unItalicText.StartsWith(",{*") && (noteSeenAtStart || unItalicText.EndsWith("♪")))
                                unItalicText = ReplacePrefixWithMusicNote(unItalicText, 3);

                            if (unItalicText.EndsWith("J'") &&
                                (line.EndsWith("♪") || textWithOutFixes.EndsWith("♪") || textWithOutFixes.EndsWith("♪</i>") || unItalicText.StartsWith("♪")))
                            {
                                bool ita = unItalicText.StartsWith("<i>") && unItalicText.EndsWith("</i>");
                                unItalicText = Utilities.RemoveHtmlTags(unItalicText);
                                // Fixed: was Remove(Length - 3, 2), which removed the char before "J" and kept the apostrophe.
                                unItalicText = unItalicText.Remove(unItalicText.Length - 2, 2).TrimEnd() + " ♪";
                                if (ita)
                                    unItalicText = "<i>" + unItalicText + "</i>";
                            }
                        }

                        // Remove brackets/braces that only the un-italicized text has.
                        if (unItalicText.StartsWith("[") && !line.StartsWith("["))
                        {
                            unItalicText = unItalicText.Remove(0, 1);
                            if (unItalicText.EndsWith("]"))
                                unItalicText = unItalicText.TrimEnd(']');
                        }
                        if (unItalicText.StartsWith("{") && !line.StartsWith("{"))
                        {
                            unItalicText = unItalicText.Remove(0, 1);
                            if (unItalicText.EndsWith("}"))
                                unItalicText = unItalicText.TrimEnd('}');
                        }
                        if (unItalicText.EndsWith("}") && !line.EndsWith("}"))
                            unItalicText = unItalicText.TrimEnd('}');

                        // Make the end punctuation of the un-italicized text agree with line.
                        if (line.EndsWith("...") && unItalicText.EndsWith("”!"))
                            unItalicText = unItalicText.TrimEnd('!').TrimEnd('”') + ".";
                        if (line.EndsWith("...") && unItalicText.EndsWith("\"!"))
                            unItalicText = unItalicText.TrimEnd('!').TrimEnd('"') + ".";

                        if (line.EndsWith(".") && !unItalicText.EndsWith(".") && !unItalicText.EndsWith(".</i>"))
                        {
                            string post = string.Empty;
                            if (unItalicText.EndsWith("</i>"))
                            {
                                post = "</i>";
                                unItalicText = unItalicText.Remove(unItalicText.Length - 4);
                            }
                            if (unItalicText.EndsWith("'") && !line.EndsWith("'."))
                                unItalicText = unItalicText.TrimEnd('\'');
                            unItalicText += "." + post;
                        }

                        // Complete "." / ".." to "..." (the text ends with "." here, so it cannot end with "</i>").
                        if (unItalicText.EndsWith(".") && !unItalicText.EndsWith("...") && !unItalicText.EndsWith("...</i>") && line.EndsWith("..."))
                            unItalicText += "..";
                        if (unItalicText.EndsWith("..") && !unItalicText.EndsWith("...") && !unItalicText.EndsWith("...</i>") && line.EndsWith("..."))
                            unItalicText += ".";

                        if (line.EndsWith("!") && !unItalicText.EndsWith("!") && !unItalicText.EndsWith("!</i>"))
                        {
                            if (unItalicText.EndsWith("!'"))
                            {
                                unItalicText = unItalicText.TrimEnd('\'');
                            }
                            else
                            {
                                char[] wordSeparators = " .,!?<>:;'-$@£()[]<>/\"".ToCharArray();
                                if (unItalicText.EndsWith("l</i>"))
                                {
                                    // A trailing "l" may be a misread "!": drop it if the last word does not spell-check.
                                    string w = unItalicText.Substring(0, unItalicText.Length - 4);
                                    int wIdx = w.LastIndexOfAny(wordSeparators);
                                    if (wIdx + 1 < w.Length && unItalicText.Length > 5)
                                    {
                                        if (!_ocrFixEngine.DoSpell(w.Substring(wIdx + 1)))
                                            unItalicText = unItalicText.Remove(unItalicText.Length - 5, 1);
                                    }
                                    unItalicText = unItalicText.Insert(unItalicText.Length - 4, "!");
                                }
                                else if (unItalicText.EndsWith("l"))
                                {
                                    string w = unItalicText;
                                    int wIdx = w.LastIndexOfAny(wordSeparators);
                                    if (wIdx + 1 < w.Length && unItalicText.Length > 5)
                                    {
                                        if (!_ocrFixEngine.DoSpell(w.Substring(wIdx + 1)))
                                            unItalicText = unItalicText.Remove(unItalicText.Length - 1, 1);
                                    }
                                    unItalicText += "!";
                                }
                                else
                                {
                                    unItalicText += "!";
                                }
                            }
                        }

                        if (line.EndsWith("?") && !unItalicText.EndsWith("?") && !unItalicText.EndsWith("?</i>"))
                        {
                            if (unItalicText.EndsWith("?'"))
                                unItalicText = unItalicText.TrimEnd('\'');
                            else
                                unItalicText += "?";
                        }

                        line = WithoutItalicTags(unItalicText);
                        if (checkBoxAutoFixCommonErrors.Checked)
                        {
                            if (line.Contains("'.") && !textWithOutFixes.Contains("'.") && textWithOutFixes.Contains(":") &&
                                !line.EndsWith("'.") && Configuration.Settings.Tools.OcrFixUseHardcodedRules)
                            {
                                line = line.Replace("'.", ":");
                            }
                            line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
                        }
                        line = "<i>" + line + "</i>";
                    }
                    else
                    {
                        unItalicText = unItalicText.Replace("</i>", string.Empty);
                        if (line.EndsWith("</i>") && unItalicText.EndsWith("."))
                        {
                            line = line.Remove(line.Length - 4, 4);
                            if (line.EndsWith("-"))
                                line = line.TrimEnd('-') + ".";
                            // Fixed: guard against an empty line (line was exactly "</i>"), which made Substring throw.
                            if (line.Length > 0 && Utilities.AllLetters.Contains(line.Substring(line.Length - 1)))
                                line += ".";
                            line += "</i>";
                        }
                    }
                }
            }
        }

        if (checkBoxTesseractMusicOn.Checked)
        {
            // Tesseract tends to read the music note as "J'" - map it back to the note symbol.
            if (line == "[J'J'J~]" || line == "[J'J'J']")
                line = "[ ♪ ♪ ♪ ]";

            line = line.Replace(" J' ", " ♪ ");

            if (line.StartsWith("J'"))
                line = "♪ " + line.Remove(0, 2).TrimStart();
            if (line.StartsWith("<i>J'"))
                line = "<i>♪ " + line.Remove(0, 5).TrimStart();
            if (line.StartsWith("[J'"))
                line = "[♪ " + line.Remove(0, 3).TrimStart();
            if (line.StartsWith("<i>[J'"))
                line = "<i>[♪ " + line.Remove(0, 6).TrimStart();
            if (line.EndsWith("J'"))
                line = line.Remove(line.Length - 2, 2).TrimEnd() + " ♪";
            if (line.EndsWith("J'</i>"))
                line = line.Remove(line.Length - 6, 6).TrimEnd() + " ♪</i>";

            if (line.Contains(Environment.NewLine + "J'"))
            {
                line = line.Replace(Environment.NewLine + "J'", Environment.NewLine + "♪ ");
                // Fixed: Replace(" ", " ") was a no-op; collapse the double space the substitution can create.
                line = line.Replace("  ", " ");
            }
            if (line.Contains("J'" + Environment.NewLine))
            {
                line = line.Replace("J'" + Environment.NewLine, " ♪" + Environment.NewLine);
                line = line.Replace("  ", " ");
            }
        }

        if (wordsNotFound > 0 || correctWords == 0 || IsAlmostEmptyIgnoringTilde(textWithOutFixes))
        {
            _ocrFixEngine.AutoGuessesUsed.Clear();
            _ocrFixEngine.UnknownWordsFound.Clear();

            if (_modiEnabled && checkBoxUseModiInTesseractForUnknownWords.Checked)
            {
                // which is best - modi or tesseract - we find out here
                string modiText = CallModi(index);
                if (modiText.Length == 0)
                    modiText = CallModi(index); // retry... strange MODI
                if (modiText.Length == 0)
                    modiText = CallModi(index); // retry... strange MODI

                if (IsPlausibleAlternativeOcr(modiText, line))
                {
                    // Note: correctWords is reused as the out argument here, as in the original code.
                    int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiText, out correctWords);

                    string modiTextOcrFixed = modiText;
                    if (checkBoxAutoFixCommonErrors.Checked)
                        modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(modiText, index, _lastLine, false, checkBoxGuessUnknownWords.Checked);
                    int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out correctWords);
                    if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound)
                        modiText = modiTextOcrFixed;

                    if (modiWordsNotFound < wordsNotFound || (textWithOutFixes.Length == 1 && modiWordsNotFound == 0))
                        line = modiText; // use the modi ocr'ed text
                    else if (wordsNotFound == modiWordsNotFound && modiText.EndsWith("!") && (line.EndsWith("l") || line.EndsWith("fl")))
                        line = modiText;
                }
            }

            // take the best option - before ocr fixing, which we do again to save suggestions and prompt for user input
            // (also the path used when modi is not available)
            line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, index, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, checkBoxGuessUnknownWords.Checked);
        }

        if (_ocrFixEngine.Abort)
        {
            ButtonStopClick(null, null);
            _ocrFixEngine.Abort = false;
            return string.Empty;
        }

        // A disabled (commented-out) experiment that cross-checked trailing punctuation via nOCR
        // (NocrFastCheck) used to live here; see version control history if it is needed again.

        // Log used word guesses (via word replace list)
        foreach (string guess in _ocrFixEngine.AutoGuessesUsed)
            listBoxLogSuggestions.Items.Add(guess);
        _ocrFixEngine.AutoGuessesUsed.Clear();

        // Log unknown words guess (found via spelling dictionaries)
        LogUnknownWords();

        // Fixed: the original had "if (>= 3) Red; if (== 2) ... else ..." so Red was always overwritten.
        Color rowColor;
        if (wordsNotFound >= 3)
            rowColor = Color.Red;
        else if (wordsNotFound == 2)
            rowColor = Color.Orange;
        else if (wordsNotFound == 1 || line.Length == 1 || line.Contains("_") || HasSingleLetters(line))
            rowColor = Color.Yellow;
        else if (line.Trim().Length == 0)
            rowColor = Color.Orange;
        else
            rowColor = Color.LightGreen;
        subtitleListView1.SetBackgroundColor(index, rowColor);
    }
    else
    {
        // no dictionary :(
        if (checkBoxAutoFixCommonErrors.Checked)
            line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);

        Color rowColor;
        if (badWords >= numberOfWords)
            rowColor = Color.Red;
        else if (badWords >= numberOfWords / 2)
            rowColor = Color.Orange;
        else if (badWords > 0 || line.Contains("_") || HasSingleLetters(line))
            rowColor = Color.Yellow;
        else if (WithoutItalicTags(line).Trim().Length == 0)
            rowColor = Color.Orange;
        else
            rowColor = Color.LightGreen;
        subtitleListView1.SetBackgroundColor(index, rowColor);
    }

    if (textWithOutFixes.Trim() != line.Trim())
    {
        _tesseractOcrAutoFixes++;
        labelFixesMade.Text = string.Format(" - {0}", _tesseractOcrAutoFixes);
        LogOcrFix(index, textWithOutFixes, line);
    }

    if (_vobSubMergedPackist != null)
        bitmap.Dispose();

    return line;
}

/// <summary>
/// Returns true when at least one of <paramref name="tokens"/> occurs in <paramref name="original"/>
/// but not in <paramref name="candidate"/>.
/// </summary>
private static bool HasSuspectTokenMissingFrom(string original, string candidate, params string[] tokens)
{
    foreach (string token in tokens)
    {
        if (!candidate.Contains(token) && original.Contains(token))
            return true;
    }
    return false;
}

/// <summary>
/// Sanity filter for an alternative OCR result (one-color image or MODI): it must be longer than one
/// char, must not contain "CD", must not introduce the listed digits/symbols that
/// <paramref name="line"/> lacks, and must have fewer than two "(" / ")" and fewer than three line breaks.
/// </summary>
private static bool IsPlausibleAlternativeOcr(string candidate, string line)
{
    // NOTE(review): the "3" check is paired with line.Contains("4") in the original code (both call
    // sites); every other pair uses the same character on both sides, so this looks like a typo.
    // Left unchanged - confirm before altering.
    return candidate.Length > 1 &&
           !candidate.Contains("CD") &&
           (!candidate.Contains("0") || line.Contains("0")) &&
           (!candidate.Contains("2") || line.Contains("2")) &&
           (!candidate.Contains("3") || line.Contains("4")) &&
           (!candidate.Contains("5") || line.Contains("5")) &&
           (!candidate.Contains("9") || line.Contains("9")) &&
           (!candidate.Contains("•") || line.Contains("•")) &&
           (!candidate.Contains(")") || line.Contains(")")) &&
           Utilities.CountTagInText(candidate, "(") < 2 &&
           Utilities.CountTagInText(candidate, ")") < 2 &&
           Utilities.CountTagInText(candidate, Environment.NewLine) < 3;
}

/// <summary>
/// Strips html tags, replaces the first <paramref name="prefixLength"/> chars with "♪ " and re-wraps
/// the result in italic tags if the whole input was wrapped in them.
/// </summary>
private static string ReplacePrefixWithMusicNote(string text, int prefixLength)
{
    bool wholeTextItalic = text.StartsWith("<i>") && text.EndsWith("</i>");
    text = Utilities.RemoveHtmlTags(text);
    text = "♪ " + text.Remove(0, prefixLength).TrimStart();
    if (wholeTextItalic)
        text = "<i>" + text + "</i>";
    return text;
}

/// <summary>Removes all "&lt;i&gt;" and "&lt;/i&gt;" tags from <paramref name="text"/>.</summary>
private static string WithoutItalicTags(string text)
{
    return text.Replace("<i>", string.Empty).Replace("</i>", string.Empty);
}

/// <summary>True when fewer than two chars remain after removing "~" and trimming.</summary>
private static bool IsAlmostEmptyIgnoringTilde(string text)
{
    return text.Replace("~", string.Empty).Trim().Length < 2;
}