/// <summary>
/// Runs Tesseract on one subtitle image and post-processes the recognized text:
/// optionally retries with "single line" page segmentation, a resized image, a
/// one-color image, a de-italicized image and MODI, keeps whichever candidate the
/// spelling dictionary scores best, applies music-note and punctuation clean-ups,
/// logs the fixes and colors the list view row.
/// </summary>
/// <param name="bitmap">
/// Image to recognize. It is disposed before a normal return when
/// <c>_vobSubMergedPackist</c> is not null.
/// </param>
/// <param name="index">
/// Subtitle index; used to look up a pre-computed async result, for OCR fixing,
/// logging and list view coloring.
/// </param>
/// <returns>
/// The final text for the subtitle, or <see cref="string.Empty"/> when
/// <paramref name="bitmap"/> is null or the OCR fix engine signalled an abort.
/// </returns>
private string OcrViaTesseract(Bitmap bitmap, int index)
{
    if (bitmap == null)
        return string.Empty;

    // NOTE(review): presumably this handler creates _ocrFixEngine; the engine is
    // dereferenced unconditionally further down - confirm it can never stay null.
    if (_ocrFixEngine == null)
        comboBoxDictionaries_SelectedIndexChanged(null, null);

    // No bad-word counting exists on the "no dictionary" path, so this stays zero.
    const int badWords = 0;

    string textWithOutFixes;
    if (_tesseractAsyncStrings != null && !string.IsNullOrEmpty(_tesseractAsyncStrings[index]))
    {
        // A result for this index is already available.
        textWithOutFixes = _tesseractAsyncStrings[index];
    }
    else
    {
        if (_tesseractAsyncIndex <= index)
            _tesseractAsyncIndex = index + 10;
        textWithOutFixes = Tesseract3DoOcrViaExe(bitmap, _languageId, "-psm 6"); // 6 = Assume a single uniform block of text.
    }

    // Short or low images: also try "single text line" mode and pick the better result.
    if ((!textWithOutFixes.Contains(Environment.NewLine) || Utilities.CountTagInText(textWithOutFixes, '\n') > 2) &&
        (textWithOutFixes.Length < 17 || bitmap.Height < 50))
    {
        string psm = Tesseract3DoOcrViaExe(bitmap, _languageId, "-psm 7"); // 7 = Treat the image as a single text line.
        if (textWithOutFixes != psm)
        {
            if (string.IsNullOrWhiteSpace(textWithOutFixes))
            {
                textWithOutFixes = psm;
            }
            else if (psm.Length > textWithOutFixes.Length)
            {
                // These chars are often mistaken: if only the block-mode result has them, prefer psm 7.
                if (TesseractHasCharMissingFrom(psm, textWithOutFixes, "96531$•Y'€"))
                {
                    textWithOutFixes = psm;
                }
                else if (_ocrFixEngine != null && psm.IndexOf('$') < 0 && psm.IndexOf('•') < 0 && psm.IndexOf('€') < 0)
                {
                    int correctWordsNoFixes;
                    int wordsNotFoundNoFixes = _ocrFixEngine.CountUnknownWordsViaDictionary(textWithOutFixes, out correctWordsNoFixes);
                    int correctWordsPsm7;
                    int wordsNotFoundPsm7 = _ocrFixEngine.CountUnknownWordsViaDictionary(psm, out correctWordsPsm7);
                    if (wordsNotFoundPsm7 <= wordsNotFoundNoFixes && correctWordsPsm7 > correctWordsNoFixes)
                        textWithOutFixes = psm;
                }
            }
            else if (psm.Length == textWithOutFixes.Length &&
                     (TesseractHasCharMissingFrom(psm, textWithOutFixes, "098531$€•Y'/()_") || // these chars are often mistaken
                      (psm.EndsWith('.') && !textWithOutFixes.EndsWith('.'))))
            {
                textWithOutFixes = psm;
            }
        }
    }

    if (!checkBoxTesseractItalicsOn.Checked)
        textWithOutFixes = HtmlUtil.RemoveOpenCloseTags(textWithOutFixes, HtmlUtil.TagItalic);

    // Sometimes Tesseract has problems with small fonts - it helps to make the image larger
    string withoutNoise = HtmlUtil.RemoveOpenCloseTags(textWithOutFixes, HtmlUtil.TagItalic)
        .Replace("@", string.Empty)
        .Replace("%", string.Empty)
        .Replace("|", string.Empty)
        .Trim();
    if (withoutNoise.Length < 3 || Utilities.CountTagInText(textWithOutFixes, '\n') > 2)
    {
        textWithOutFixes = TesseractResizeAndRetry(bitmap);
        if (!checkBoxTesseractItalicsOn.Checked)
            textWithOutFixes = HtmlUtil.RemoveOpenCloseTags(textWithOutFixes, HtmlUtil.TagItalic);
    }

    // fix italics
    textWithOutFixes = FixItalics(textWithOutFixes);

    int numberOfWords = textWithOutFixes.Split((" " + Environment.NewLine).ToCharArray(), StringSplitOptions.RemoveEmptyEntries).Length;

    string line = textWithOutFixes.Trim();
    if (_ocrFixEngine.IsDictionaryLoaded)
    {
        if (checkBoxAutoFixCommonErrors.Checked)
            line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel());

        int correctWords;
        int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords);
        int oldCorrectWords = correctWords;

        if (wordsNotFound > 0 || correctWords == 0)
        {
            // Remember the unknown words so they can be restored if the retry is rejected.
            List<string> oldUnkownWords = new List<string>();
            oldUnkownWords.AddRange(_ocrFixEngine.UnknownWordsFound);
            _ocrFixEngine.UnknownWordsFound.Clear();

            string newUnfixedText = TesseractResizeAndRetry(bitmap);
            string newText = _ocrFixEngine.FixOcrErrors(newUnfixedText, index, _lastLine, true, GetAutoGuessLevel());
            // Note: correctWords now refers to the retry result, also for the checks further down.
            int newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText, out correctWords);

            if (wordsNotFound == 1 && newWordsNotFound == 1 && newUnfixedText.EndsWith("!!") && textWithOutFixes.EndsWith('u') && newText.Length > 1)
            {
                // NOTE(review): newText/newWordsNotFound computed here are never applied to
                // 'line' or 'wordsNotFound' - kept as in the original, confirm intent.
                _ocrFixEngine.UnknownWordsFound.Clear();
                newText = textWithOutFixes.Substring(0, textWithOutFixes.Length - 1) + "!!";
                newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText, out correctWords);
            }
            else if ((correctWords >= oldCorrectWords &&
                      (!newText.Contains('9') || textWithOutFixes.Contains('9')) &&
                      (!newText.Replace("</i>", string.Empty).Contains('/') || textWithOutFixes.Replace("</i>", string.Empty).Contains('/')) &&
                      !string.IsNullOrWhiteSpace(newUnfixedText) &&
                      newWordsNotFound < wordsNotFound) ||
                     (newWordsNotFound == wordsNotFound && newText.EndsWith('!') && textWithOutFixes.EndsWith('l')))
            {
                wordsNotFound = newWordsNotFound;

                // Carry over end punctuation the retry lost.
                bool newTextLacksTerminator = !newText.EndsWith('.') && !newText.EndsWith(',') && !newText.EndsWith('!') &&
                                              !newText.EndsWith('?') && !newText.EndsWith("</i>");
                if (newTextLacksTerminator)
                {
                    if (textWithOutFixes.Length > 3 && textWithOutFixes.EndsWith("..."))
                        newText = newText.TrimEnd() + "...";
                    else if (textWithOutFixes.Length > 0 && textWithOutFixes.EndsWith('.'))
                        newText = newText.TrimEnd() + ".";
                    else if (textWithOutFixes.Length > 0 && textWithOutFixes.EndsWith('?'))
                        newText = newText.TrimEnd() + "?";
                }

                textWithOutFixes = newUnfixedText;
                line = FixItalics(newText);
            }
            else if (correctWords > oldCorrectWords + 1 || (correctWords > oldCorrectWords && !textWithOutFixes.Contains(' ')))
            {
                wordsNotFound = newWordsNotFound;
                textWithOutFixes = newUnfixedText;
                line = newText;
            }
            else
            {
                _ocrFixEngine.UnknownWordsFound.Clear();
                _ocrFixEngine.UnknownWordsFound.AddRange(oldUnkownWords);
            }
        }

        if (TesseractNeedsSecondOpinion(wordsNotFound, correctWords, textWithOutFixes) &&
            _bluRaySubtitles != null && !line.Contains("<i>"))
        {
            _ocrFixEngine.AutoGuessesUsed.Clear();
            _ocrFixEngine.UnknownWordsFound.Clear();

            // which is best - normal image or one color image?
            var nbmp = new NikseBitmap(bitmap);
            nbmp.MakeOneColor(Color.White);
            string oneColorText;
            using (Bitmap oneColorBitmap = nbmp.GetBitmap())
            {
                oneColorText = Tesseract3DoOcrViaExe(oneColorBitmap, _languageId, "-psm 6"); // 6 = Assume a single uniform block of text.
            }

            if (TesseractIsPlausibleAlternative(oneColorText, line))
            {
                int modiCorrectWords;
                int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(oneColorText, out modiCorrectWords);
                string modiTextOcrFixed = oneColorText;
                if (checkBoxAutoFixCommonErrors.Checked)
                    modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(oneColorText, index, _lastLine, false, GetAutoGuessLevel());
                int modiOcrCorrectedCorrectWords;
                int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out modiOcrCorrectedCorrectWords);
                if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound)
                {
                    oneColorText = modiTextOcrFixed;
                    modiWordsNotFound = modiOcrCorrectedWordsNotFound;
                    modiCorrectWords = modiOcrCorrectedCorrectWords;
                }

                bool fewerUnknownWords = modiWordsNotFound < wordsNotFound || (textWithOutFixes.Length == 1 && modiWordsNotFound == 0);
                // '\uFB02' is the "fl" ligature, which Tesseract may output instead of "f" + "l".
                bool exclamationReadAsL = wordsNotFound == modiWordsNotFound && oneColorText.EndsWith('!') &&
                                          (line.EndsWith('l') || line.EndsWith('\uFB02'));
                if (fewerUnknownWords || exclamationReadAsL)
                {
                    line = FixItalics(oneColorText); // use one-color text
                    wordsNotFound = modiWordsNotFound;
                    correctWords = modiCorrectWords;
                    if (checkBoxAutoFixCommonErrors.Checked)
                        line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel());
                }
            }
        }

        if (checkBoxTesseractItalicsOn.Checked &&
            (line.Contains("<i>") || TesseractNeedsSecondOpinion(wordsNotFound, correctWords, textWithOutFixes)))
        {
            _ocrFixEngine.AutoGuessesUsed.Clear();
            _ocrFixEngine.UnknownWordsFound.Clear();

            // which is best - normal image or de-italic'ed? We find out here
            string unItalicText;
            using (var unItalicedBmp = UnItalic(bitmap, _unItalicFactor))
            {
                unItalicText = Tesseract3DoOcrViaExe(unItalicedBmp, _languageId, "-psm 6"); // 6 = Assume a single uniform block of text.
            }

            if (unItalicText.Length > 1)
            {
                int modiCorrectWords;
                int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(unItalicText, out modiCorrectWords);
                string modiTextOcrFixed = unItalicText;
                if (checkBoxAutoFixCommonErrors.Checked)
                    modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(unItalicText, index, _lastLine, false, GetAutoGuessLevel());
                int modiOcrCorrectedCorrectWords;
                int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out modiOcrCorrectedCorrectWords);
                if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound)
                {
                    unItalicText = modiTextOcrFixed;
                    modiWordsNotFound = modiOcrCorrectedWordsNotFound;
                    modiCorrectWords = modiOcrCorrectedCorrectWords;
                }

                bool ok = modiWordsNotFound < wordsNotFound || (textWithOutFixes.Length == 1 && modiWordsNotFound == 0);
                if (!ok)
                    ok = wordsNotFound == modiWordsNotFound && unItalicText.EndsWith('!') && (line.EndsWith('l') || line.EndsWith('\uFB02'));
                if (!ok)
                    ok = wordsNotFound == modiWordsNotFound && line.StartsWith("<i>") && line.EndsWith("</i>");

                if (ok)
                {
                    // Reject the de-italicized text if it introduces noticeably more of these symbols.
                    ok = Utilities.CountTagInText(unItalicText, '/') <= Utilities.CountTagInText(line, '/') + 1 &&
                         Utilities.CountTagInText(unItalicText, '\\') <= Utilities.CountTagInText(line, '\\') &&
                         Utilities.CountTagInText(unItalicText, ')') <= Utilities.CountTagInText(line, ')') + 1 &&
                         Utilities.CountTagInText(unItalicText, '(') <= Utilities.CountTagInText(line, '(') + 1 &&
                         Utilities.CountTagInText(unItalicText, '$') <= Utilities.CountTagInText(line, '$') + 1 &&
                         Utilities.CountTagInText(unItalicText, '€') <= Utilities.CountTagInText(line, '€') + 1 &&
                         Utilities.CountTagInText(unItalicText, '•') <= Utilities.CountTagInText(line, '•');
                }

                if (ok)
                {
                    wordsNotFound = modiWordsNotFound;
                    correctWords = modiCorrectWords;

                    line = HtmlUtil.RemoveOpenCloseTags(line, HtmlUtil.TagItalic).Trim();

                    // Drop a leading "I " that only the de-italicized text has.
                    if (line.Length > 7 && unItalicText.Length > 7 && unItalicText.StartsWith("I ") &&
                        line.StartsWith(unItalicText.Remove(0, 2).Substring(0, 4)))
                    {
                        unItalicText = unItalicText.Remove(0, 2);
                    }

                    if (checkBoxTesseractMusicOn.Checked)
                    {
                        // 'line' is not modified inside this section, so these can be evaluated once.
                        bool lineStartsWithNote = line.StartsWith("J' ") || line.StartsWith("J“ ") || line.StartsWith("J* ") || line.StartsWith("♪ ");
                        bool lineStartsWithJNote = line.StartsWith("J` ") || line.StartsWith("J“ ") || line.StartsWith("J' ") || line.StartsWith("J* ");

                        string plain;
                        if (lineStartsWithNote && unItalicText.Length > 3)
                        {
                            // Length is checked on the tag-stripped text too; it can be shorter than unItalicText.
                            plain = HtmlUtil.RemoveOpenCloseTags(unItalicText, HtmlUtil.TagItalic);
                            if (plain.Length >= 3 && plain.Substring(1, 2) == "' ")
                                unItalicText = "♪ " + unItalicText.Remove(0, 2).TrimStart();
                        }
                        if (lineStartsWithNote && unItalicText.Length > 3)
                        {
                            plain = HtmlUtil.RemoveOpenCloseTags(unItalicText, HtmlUtil.TagItalic);
                            if (plain.Length >= 2 && plain[1] == ' ')
                                unItalicText = TesseractReplaceLeadingWithMusicNote(unItalicText, 2);
                        }
                        if (lineStartsWithNote && unItalicText.Length > 3)
                        {
                            plain = HtmlUtil.RemoveOpenCloseTags(unItalicText, HtmlUtil.TagItalic);
                            if (plain.Length >= 3 && plain[2] == ' ')
                                unItalicText = TesseractReplaceLeadingWithMusicNote(unItalicText, 2);
                        }

                        if (unItalicText.StartsWith("J'") &&
                            (line.StartsWith('♪') || textWithOutFixes.StartsWith('♪') || textWithOutFixes.StartsWith("<i>♪") || unItalicText.EndsWith('♪')))
                        {
                            unItalicText = TesseractReplaceLeadingWithMusicNote(unItalicText, 2);
                        }
                        if (lineStartsWithJNote && unItalicText.StartsWith("S "))
                        {
                            unItalicText = TesseractReplaceLeadingWithMusicNote(unItalicText, 2);
                        }
                        if (lineStartsWithJNote && unItalicText.StartsWith("<i>S</i> "))
                        {
                            // Tags are stripped before the prefix is removed, so only "S " (2 chars)
                            // is left to drop; removing 8 chars here would eat real text.
                            unItalicText = TesseractReplaceLeadingWithMusicNote(unItalicText, 2);
                        }
                        if (unItalicText.StartsWith(";'") &&
                            (line.StartsWith('♪') || textWithOutFixes.StartsWith('♪') || textWithOutFixes.StartsWith("<i>♪") || unItalicText.EndsWith('♪')))
                        {
                            unItalicText = TesseractReplaceLeadingWithMusicNote(unItalicText, 2);
                        }
                        if (unItalicText.StartsWith(",{*") &&
                            (line.StartsWith('♪') || textWithOutFixes.StartsWith('♪') || textWithOutFixes.StartsWith("<i>♪") || unItalicText.EndsWith('♪')))
                        {
                            unItalicText = TesseractReplaceLeadingWithMusicNote(unItalicText, 3);
                        }
                        if (unItalicText.EndsWith("J'") &&
                            (line.EndsWith('♪') || textWithOutFixes.EndsWith('♪') || textWithOutFixes.EndsWith("♪</i>") || unItalicText.StartsWith('♪')))
                        {
                            bool ita = unItalicText.StartsWith("<i>") && unItalicText.EndsWith("</i>");
                            unItalicText = HtmlUtil.RemoveHtmlTags(unItalicText);
                            // Remove exactly the trailing "J'" (the last two chars), as the
                            // equivalent clean-up of 'line' further down does.
                            if (unItalicText.Length >= 2)
                                unItalicText = unItalicText.Remove(unItalicText.Length - 2, 2);
                            unItalicText = unItalicText.TrimEnd() + " ♪";
                            if (ita)
                                unItalicText = "<i>" + unItalicText + "</i>";
                        }
                    }

                    // Brackets/braces only present in the de-italicized text are treated as noise.
                    if (unItalicText.StartsWith('[') && !line.StartsWith('['))
                    {
                        unItalicText = unItalicText.Remove(0, 1);
                        if (unItalicText.EndsWith(']'))
                            unItalicText = unItalicText.TrimEnd(']');
                    }
                    if (unItalicText.StartsWith('{') && !line.StartsWith('{'))
                    {
                        unItalicText = unItalicText.Remove(0, 1);
                        if (unItalicText.EndsWith('}'))
                            unItalicText = unItalicText.TrimEnd('}');
                    }
                    if (unItalicText.EndsWith('}') && !line.EndsWith('}'))
                        unItalicText = unItalicText.TrimEnd('}');

                    if (line.EndsWith("...") && unItalicText.EndsWith("”!"))
                        unItalicText = unItalicText.TrimEnd('!').TrimEnd('”') + ".";
                    if (line.EndsWith("...") && unItalicText.EndsWith("\"!"))
                        unItalicText = unItalicText.TrimEnd('!').TrimEnd('"') + ".";

                    // Re-apply end punctuation that the original line had.
                    if (line.EndsWith('.') && !unItalicText.EndsWith('.') && !unItalicText.EndsWith(".</i>"))
                    {
                        string post = string.Empty;
                        if (unItalicText.EndsWith("</i>"))
                        {
                            post = "</i>";
                            unItalicText = unItalicText.Remove(unItalicText.Length - 4);
                        }
                        if (unItalicText.EndsWith('\'') && !line.EndsWith("'."))
                            unItalicText = unItalicText.TrimEnd('\'');
                        unItalicText += "." + post;
                    }
                    if (unItalicText.EndsWith('.') && !unItalicText.EndsWith("...") && !unItalicText.EndsWith("...</i>") && line.EndsWith("..."))
                    {
                        string post = string.Empty;
                        if (unItalicText.EndsWith("</i>"))
                        {
                            post = "</i>";
                            unItalicText = unItalicText.Remove(unItalicText.Length - 4);
                        }
                        unItalicText += ".." + post;
                    }
                    if (unItalicText.EndsWith("..") && !unItalicText.EndsWith("...") && !unItalicText.EndsWith("...</i>") && line.EndsWith("..."))
                    {
                        string post = string.Empty;
                        if (unItalicText.EndsWith("</i>"))
                        {
                            post = "</i>";
                            unItalicText = unItalicText.Remove(unItalicText.Length - 4);
                        }
                        unItalicText += "." + post;
                    }

                    if (line.EndsWith('!') && !unItalicText.EndsWith('!') && !unItalicText.EndsWith("!</i>"))
                    {
                        const string wordDelimiters = @" .,!?<>:;'-$@£()[]<>/""";
                        if (unItalicText.EndsWith("!'"))
                        {
                            unItalicText = unItalicText.TrimEnd('\'');
                        }
                        else if (unItalicText.EndsWith("l</i>") && _ocrFixEngine != null)
                        {
                            // A trailing 'l' may be a misread '!': drop it if the last word is not spelled correctly.
                            string w = unItalicText.Substring(0, unItalicText.Length - 4);
                            int wIdx = w.Length - 1;
                            while (wIdx >= 0 && wordDelimiters.IndexOf(w[wIdx]) < 0)
                                wIdx--;
                            if (wIdx + 1 < w.Length && unItalicText.Length > 5)
                            {
                                w = w.Substring(wIdx + 1);
                                if (!_ocrFixEngine.DoSpell(w))
                                    unItalicText = unItalicText.Remove(unItalicText.Length - 5, 1);
                            }
                            unItalicText = unItalicText.Insert(unItalicText.Length - 4, "!");
                        }
                        else if (unItalicText.EndsWith('l') && _ocrFixEngine != null)
                        {
                            string w = unItalicText;
                            int wIdx = w.Length - 1;
                            while (wIdx >= 0 && wordDelimiters.IndexOf(w[wIdx]) < 0)
                                wIdx--;
                            if (wIdx + 1 < w.Length && unItalicText.Length > 5)
                            {
                                w = w.Substring(wIdx + 1);
                                if (!_ocrFixEngine.DoSpell(w))
                                    unItalicText = unItalicText.Remove(unItalicText.Length - 1, 1);
                            }
                            unItalicText += "!";
                        }
                        else
                        {
                            unItalicText += "!";
                        }
                    }

                    if (line.EndsWith('?') && !unItalicText.EndsWith('?') && !unItalicText.EndsWith("?</i>"))
                    {
                        if (unItalicText.EndsWith("?'"))
                            unItalicText = unItalicText.TrimEnd('\'');
                        else
                            unItalicText += "?";
                    }

                    line = HtmlUtil.RemoveOpenCloseTags(unItalicText, HtmlUtil.TagItalic);
                    if (checkBoxAutoFixCommonErrors.Checked)
                    {
                        if (line.Contains("'.") && !textWithOutFixes.Contains("'.") && textWithOutFixes.Contains(':') &&
                            !line.EndsWith("'.") && Configuration.Settings.Tools.OcrFixUseHardcodedRules)
                        {
                            line = line.Replace("'.", ":");
                        }
                        line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel());
                    }
                    line = "<i>" + line + "</i>";
                }
                else
                {
                    unItalicText = unItalicText.Replace("</i>", string.Empty);
                    if (line.EndsWith("</i>", StringComparison.Ordinal) && unItalicText.EndsWith('.'))
                    {
                        line = line.Remove(line.Length - 4, 4);
                        if (line.EndsWith('-'))
                            line = line.TrimEnd('-') + ".";
                        // 'line' can be empty here (it was just "</i>"), so guard the index.
                        if (line.Length > 0 && char.IsLetter(line[line.Length - 1]))
                            line += ".";
                        line += "</i>";
                    }
                }
            }
        }

        if (checkBoxTesseractMusicOn.Checked)
        {
            if (line == "[J'J'J~]" || line == "[J'J'J']")
                line = "[ ♪ ♪ ♪ ]";

            line = line.Replace(" J' ", " ♪ ");

            if (line.StartsWith("J'"))
                line = "♪ " + line.Remove(0, 2).TrimStart();
            if (line.StartsWith("<i>J'"))
                line = "<i>♪ " + line.Remove(0, 5).TrimStart();
            if (line.StartsWith("[J'"))
                line = "[♪ " + line.Remove(0, 3).TrimStart();
            if (line.StartsWith("<i>[J'"))
                line = "<i>[♪ " + line.Remove(0, 6).TrimStart();
            if (line.EndsWith("J'"))
                line = line.Remove(line.Length - 2, 2).TrimEnd() + " ♪";
            if (line.EndsWith("J'</i>"))
                line = line.Remove(line.Length - 6, 6).TrimEnd() + " ♪</i>";
            if (line.Contains(Environment.NewLine + "J'"))
            {
                line = line.Replace(Environment.NewLine + "J'", Environment.NewLine + "♪ ");
                // The replacement can leave a double space after the note - collapse it.
                line = line.Replace("  ", " ");
            }
            if (line.Contains("J'" + Environment.NewLine))
            {
                line = line.Replace("J'" + Environment.NewLine, " ♪" + Environment.NewLine);
                // The replacement can leave a double space before the note - collapse it.
                line = line.Replace("  ", " ");
            }
        }

        if (TesseractNeedsSecondOpinion(wordsNotFound, correctWords, textWithOutFixes))
        {
            _ocrFixEngine.AutoGuessesUsed.Clear();
            _ocrFixEngine.UnknownWordsFound.Clear();

            if (_modiEnabled && checkBoxUseModiInTesseractForUnknownWords.Checked)
            {
                // which is best - modi or Tesseract - we find out here
                string modiText = CallModi(index);
                if (modiText.Length == 0)
                    modiText = CallModi(index); // retry... strange MODI
                if (modiText.Length == 0)
                    modiText = CallModi(index); // retry... strange MODI

                if (TesseractIsPlausibleAlternative(modiText, line))
                {
                    // correctWords is only a throw-away out target from here on.
                    int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiText, out correctWords);

                    string modiTextOcrFixed = modiText;
                    if (checkBoxAutoFixCommonErrors.Checked)
                        modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(modiText, index, _lastLine, false, GetAutoGuessLevel());
                    int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out correctWords);
                    if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound)
                        modiText = modiTextOcrFixed;

                    if (modiWordsNotFound < wordsNotFound || (textWithOutFixes.Length == 1 && modiWordsNotFound == 0))
                        line = modiText; // use the modi OCR'ed text
                    else if (wordsNotFound == modiWordsNotFound && modiText.EndsWith('!') && (line.EndsWith('l') || line.EndsWith('\uFB02')))
                        line = modiText;
                }
            }

            // Take the best option - OCR fixing is done again to save suggestions and
            // prompt for user input (also when MODI is not available).
            line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, index, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, GetAutoGuessLevel());
        }

        if (_ocrFixEngine.Abort)
        {
            // NOTE(review): unlike the normal exit, 'bitmap' is not disposed on this path
            // even when _vobSubMergedPackist is set - confirm who owns it after an abort.
            ButtonStopClick(null, null);
            _ocrFixEngine.Abort = false;
            return string.Empty;
        }

        // Log used word guesses (via word replace list)
        foreach (string guess in _ocrFixEngine.AutoGuessesUsed)
            listBoxLogSuggestions.Items.Add(guess);
        _ocrFixEngine.AutoGuessesUsed.Clear();

        // Log unknown words guess (found via spelling dictionaries)
        LogUnknownWords();

        ColorLineByNumberOfUnknownWords(index, wordsNotFound, line);
    }
    else
    {
        // no dictionary :(
        if (checkBoxAutoFixCommonErrors.Checked)
            line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel());

        if (badWords >= numberOfWords)
            subtitleListView1.SetBackgroundColor(index, Color.Red);
        else if (badWords >= numberOfWords / 2)
            subtitleListView1.SetBackgroundColor(index, Color.Orange);
        else if (badWords > 0 || line.Contains('_') || HasSingleLetters(line))
            subtitleListView1.SetBackgroundColor(index, Color.Yellow);
        else if (string.IsNullOrWhiteSpace(HtmlUtil.RemoveOpenCloseTags(line, HtmlUtil.TagItalic)))
            subtitleListView1.SetBackgroundColor(index, Color.Orange);
        else
            subtitleListView1.SetBackgroundColor(index, Color.LightGreen);
    }

    if (textWithOutFixes.Trim() != line.Trim())
    {
        _tesseractOcrAutoFixes++;
        labelFixesMade.Text = string.Format(" - {0}", _tesseractOcrAutoFixes);
        LogOcrFix(index, textWithOutFixes, line);
    }

    if (_vobSubMergedPackist != null)
        bitmap.Dispose();

    return line;
}

/// <summary>
/// Returns true if at least one character of <paramref name="chars"/> occurs in
/// <paramref name="reference"/> but not in <paramref name="candidate"/>.
/// </summary>
private static bool TesseractHasCharMissingFrom(string candidate, string reference, string chars)
{
    foreach (char ch in chars)
    {
        if (candidate.IndexOf(ch) < 0 && reference.IndexOf(ch) >= 0)
            return true;
    }
    return false;
}

/// <summary>
/// Returns true when the current result has unknown words, no correct words, or the raw
/// text (ignoring '~') is shorter than two characters - i.e. another OCR attempt is worthwhile.
/// </summary>
private static bool TesseractNeedsSecondOpinion(int wordsNotFound, int correctWords, string rawText)
{
    return wordsNotFound > 0 ||
           correctWords == 0 ||
           (rawText != null && rawText.Replace("~", string.Empty).Trim().Length < 2);
}

/// <summary>
/// Sanity gate for text from an alternative OCR pass (one-color image or MODI): it must be
/// longer than one char, must not contain "CD", must not introduce any of the listed
/// characters that <paramref name="current"/> lacks, and must have fewer than two of each
/// parenthesis and fewer than four lines.
/// </summary>
private static bool TesseractIsPlausibleAlternative(string alternative, string current)
{
    return alternative.Length > 1 &&
           !alternative.Contains("CD") &&
           !TesseractHasCharMissingFrom(current, alternative, "02359•)") &&
           Utilities.CountTagInText(alternative, '(') < 2 &&
           Utilities.CountTagInText(alternative, ')') < 2 &&
           Utilities.GetNumberOfLines(alternative) < 4;
}

/// <summary>
/// Strips HTML tags from <paramref name="text"/>, replaces its first
/// <paramref name="charsToRemove"/> characters with "♪ " and re-wraps the result in
/// italic tags if the input started with "&lt;i&gt;" and ended with "&lt;/i&gt;".
/// </summary>
private static string TesseractReplaceLeadingWithMusicNote(string text, int charsToRemove)
{
    bool italic = text.StartsWith("<i>") && text.EndsWith("</i>");
    string plain = HtmlUtil.RemoveHtmlTags(text);
    // Clamp so a short tag-stripped text cannot throw.
    plain = "♪ " + plain.Remove(0, Math.Min(charsToRemove, plain.Length)).TrimStart();
    return italic ? "<i>" + plain + "</i>" : plain;
}