Пример #1
0
        private string OcrViaTesseract(Bitmap bitmap, int index)
        {
            if (bitmap == null)
                return string.Empty;

            if (_ocrFixEngine == null)
                comboBoxDictionaries_SelectedIndexChanged(null, null);

            const int badWords = 0;
            string textWithOutFixes;
            if (_tesseractAsyncStrings != null && !string.IsNullOrEmpty(_tesseractAsyncStrings[index]))
            {
                textWithOutFixes = _tesseractAsyncStrings[index];
            }
            else
            {
                if (_tesseractAsyncIndex <= index)
                    _tesseractAsyncIndex = index + 10;
                textWithOutFixes = Tesseract3DoOcrViaExe(bitmap, _languageId, "-psm 6"); // 6 = Assume a single uniform block of text.
            }

            if ((!textWithOutFixes.Contains(Environment.NewLine) || Utilities.CountTagInText(textWithOutFixes, '\n') > 2)
                && (textWithOutFixes.Length < 17 || bitmap.Height < 50))
            {
                string psm = Tesseract3DoOcrViaExe(bitmap, _languageId, "-psm 7"); // 7 = Treat the image as a single text line.
                if (textWithOutFixes != psm)
                {
                    if (string.IsNullOrWhiteSpace(textWithOutFixes))
                    {
                        textWithOutFixes = psm;
                    }
                    else if (psm.Length > textWithOutFixes.Length)
                    {
                        if (!psm.Contains('9') && textWithOutFixes.Contains('9') ||
                            !psm.Contains('6') && textWithOutFixes.Contains('6') ||
                            !psm.Contains('5') && textWithOutFixes.Contains('5') ||
                            !psm.Contains('3') && textWithOutFixes.Contains('3') ||
                            !psm.Contains('1') && textWithOutFixes.Contains('1') ||
                            !psm.Contains('$') && textWithOutFixes.Contains('$') ||
                            !psm.Contains('•') && textWithOutFixes.Contains('•') ||
                            !psm.Contains('Y') && textWithOutFixes.Contains('Y') ||
                            !psm.Contains('\'') && textWithOutFixes.Contains('\'') ||
                            !psm.Contains('€') && textWithOutFixes.Contains('€'))
                        {
                            textWithOutFixes = psm;
                        }
                        else if (_ocrFixEngine != null && !psm.Contains('$') && !psm.Contains('•') && !psm.Contains('€'))
                        {
                            int correctWordsNoFixes;
                            int wordsNotFoundNoFixes = _ocrFixEngine.CountUnknownWordsViaDictionary(textWithOutFixes, out correctWordsNoFixes);
                            int correctWordsPsm7;
                            int wordsNotFoundPsm7 = _ocrFixEngine.CountUnknownWordsViaDictionary(psm, out correctWordsPsm7);
                            if (wordsNotFoundPsm7 <= wordsNotFoundNoFixes && correctWordsPsm7 > correctWordsNoFixes)
                            {
                                textWithOutFixes = psm;
                            }
                        }
                    }
                    else if (psm.Length == textWithOutFixes.Length &&
                             (!psm.Contains('0') && textWithOutFixes.Contains('0') ||  // these chars are often mistaken
                              !psm.Contains('9') && textWithOutFixes.Contains('9') ||
                              !psm.Contains('8') && textWithOutFixes.Contains('8') ||
                              !psm.Contains('5') && textWithOutFixes.Contains('5') ||
                              !psm.Contains('3') && textWithOutFixes.Contains('3') ||
                              !psm.Contains('1') && textWithOutFixes.Contains('1') ||
                              !psm.Contains('$') && textWithOutFixes.Contains('$') ||
                              !psm.Contains('€') && textWithOutFixes.Contains('€') ||
                              !psm.Contains('•') && textWithOutFixes.Contains('•') ||
                              !psm.Contains('Y') && textWithOutFixes.Contains('Y') ||
                              !psm.Contains('\'') && textWithOutFixes.Contains('\'') ||
                              !psm.Contains('/') && textWithOutFixes.Contains('/') ||
                              !psm.Contains('(') && textWithOutFixes.Contains('(') ||
                              !psm.Contains(')') && textWithOutFixes.Contains(')') ||
                              !psm.Contains('_') && textWithOutFixes.Contains('_')))
                    {
                        textWithOutFixes = psm;
                    }
                    else if (psm.Length == textWithOutFixes.Length && psm.EndsWith('.') && !textWithOutFixes.EndsWith('.'))
                    {
                        textWithOutFixes = psm;
                    }
                }
            }
            if (!checkBoxTesseractItalicsOn.Checked)
                textWithOutFixes = HtmlUtil.RemoveOpenCloseTags(textWithOutFixes, HtmlUtil.TagItalic);

            // Sometimes Tesseract has problems with small fonts - it helps to make the image larger
            if (HtmlUtil.RemoveOpenCloseTags(textWithOutFixes, HtmlUtil.TagItalic).Replace("@", string.Empty).Replace("%", string.Empty).Replace("|", string.Empty).Trim().Length < 3
                || Utilities.CountTagInText(textWithOutFixes, '\n') > 2)
            {
                string rs = TesseractResizeAndRetry(bitmap);
                textWithOutFixes = rs;
                if (!checkBoxTesseractItalicsOn.Checked)
                    textWithOutFixes = HtmlUtil.RemoveOpenCloseTags(textWithOutFixes, HtmlUtil.TagItalic);
            }

            // fix italics
            textWithOutFixes = FixItalics(textWithOutFixes);

            int numberOfWords = textWithOutFixes.Split((" " + Environment.NewLine).ToCharArray(), StringSplitOptions.RemoveEmptyEntries).Length;

            string line = textWithOutFixes.Trim();
            if (_ocrFixEngine.IsDictionaryLoaded)
            {
                if (checkBoxAutoFixCommonErrors.Checked)
                    line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel());
                int correctWords;
                int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords);
                int oldCorrectWords = correctWords;

                if (wordsNotFound > 0 || correctWords == 0)
                {
                    List<string> oldUnkownWords = new List<string>();
                    oldUnkownWords.AddRange(_ocrFixEngine.UnknownWordsFound);
                    _ocrFixEngine.UnknownWordsFound.Clear();

                    string newUnfixedText = TesseractResizeAndRetry(bitmap);
                    string newText = _ocrFixEngine.FixOcrErrors(newUnfixedText, index, _lastLine, true, GetAutoGuessLevel());
                    int newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText, out correctWords);

                    if (wordsNotFound == 1 && newWordsNotFound == 1 && newUnfixedText.EndsWith("!!") && textWithOutFixes.EndsWith('u') && newText.Length > 1)
                    {
                        _ocrFixEngine.UnknownWordsFound.Clear();
                        newText = textWithOutFixes.Substring(0, textWithOutFixes.Length - 1) + "!!";
                        newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText, out correctWords);
                    }
                    else if (correctWords >= oldCorrectWords &&
                             (!newText.Contains('9') || textWithOutFixes.Contains('9')) &&
                             (!newText.Replace("</i>", string.Empty).Contains('/') || textWithOutFixes.Replace("</i>", string.Empty).Contains('/')) &&
                             !string.IsNullOrWhiteSpace(newUnfixedText) &&
                             newWordsNotFound < wordsNotFound || (newWordsNotFound == wordsNotFound && newText.EndsWith('!') && textWithOutFixes.EndsWith('l')))
                    {
                        wordsNotFound = newWordsNotFound;
                        if (textWithOutFixes.Length > 3 && textWithOutFixes.EndsWith("...") && !newText.EndsWith('.') && !newText.EndsWith(',') && !newText.EndsWith('!') &&
                            !newText.EndsWith('?') && !newText.EndsWith("</i>"))
                            newText = newText.TrimEnd() + "...";
                        else if (textWithOutFixes.Length > 0 && textWithOutFixes.EndsWith('.') && !newText.EndsWith('.') && !newText.EndsWith(',') && !newText.EndsWith('!') &&
                            !newText.EndsWith('?') && !newText.EndsWith("</i>"))
                            newText = newText.TrimEnd() + ".";
                        else if (textWithOutFixes.Length > 0 && textWithOutFixes.EndsWith('?') && !newText.EndsWith('.') && !newText.EndsWith(',') && !newText.EndsWith('!') &&
                            !newText.EndsWith('?') && !newText.EndsWith("</i>"))
                            newText = newText.TrimEnd() + "?";

                        textWithOutFixes = newUnfixedText;
                        line = FixItalics(newText);
                    }
                    else if (correctWords > oldCorrectWords + 1 || (correctWords > oldCorrectWords && !textWithOutFixes.Contains(' ')))
                    {
                        wordsNotFound = newWordsNotFound;
                        textWithOutFixes = newUnfixedText;
                        line = newText;
                    }
                    else
                    {
                        _ocrFixEngine.UnknownWordsFound.Clear();
                        _ocrFixEngine.UnknownWordsFound.AddRange(oldUnkownWords);
                    }
                }

                if (wordsNotFound > 0 || correctWords == 0 || textWithOutFixes != null && textWithOutFixes.Replace("~", string.Empty).Trim().Length < 2)
                {
                    if (_bluRaySubtitles != null && !line.Contains("<i>"))
                    {
                        _ocrFixEngine.AutoGuessesUsed.Clear();
                        _ocrFixEngine.UnknownWordsFound.Clear();

                        // which is best - normal image or one color image?
                        var nbmp = new NikseBitmap(bitmap);
                        nbmp.MakeOneColor(Color.White);
                        Bitmap oneColorBitmap = nbmp.GetBitmap();
                        string oneColorText = Tesseract3DoOcrViaExe(oneColorBitmap, _languageId, "-psm 6"); // 6 = Assume a single uniform block of text.
                        oneColorBitmap.Dispose();
                        nbmp = null;

                        if (oneColorText.Length > 1 &&
                            !oneColorText.Contains("CD") &&
                            (!oneColorText.Contains('0') || line.Contains('0')) &&
                            (!oneColorText.Contains('2') || line.Contains('2')) &&
                            (!oneColorText.Contains('3') || line.Contains('4')) &&
                            (!oneColorText.Contains('5') || line.Contains('5')) &&
                            (!oneColorText.Contains('9') || line.Contains('9')) &&
                            (!oneColorText.Contains('•') || line.Contains('•')) &&
                            (!oneColorText.Contains(')') || line.Contains(')')) &&
                            Utilities.CountTagInText(oneColorText, '(') < 2 && Utilities.CountTagInText(oneColorText, ')') < 2 &&
                            Utilities.GetNumberOfLines(oneColorText) < 4)
                        {
                            int modiCorrectWords;
                            int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(oneColorText, out modiCorrectWords);
                            string modiTextOcrFixed = oneColorText;
                            if (checkBoxAutoFixCommonErrors.Checked)
                                modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(oneColorText, index, _lastLine, false, GetAutoGuessLevel());
                            int modiOcrCorrectedCorrectWords;
                            int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out modiOcrCorrectedCorrectWords);
                            if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound)
                            {
                                oneColorText = modiTextOcrFixed;
                                modiWordsNotFound = modiOcrCorrectedWordsNotFound;
                                modiCorrectWords = modiOcrCorrectedCorrectWords;
                            }

                            if (modiWordsNotFound < wordsNotFound || (textWithOutFixes.Length == 1 && modiWordsNotFound == 0))
                            {
                                line = FixItalics(oneColorText); // use one-color text
                                wordsNotFound = modiWordsNotFound;
                                correctWords = modiCorrectWords;
                                if (checkBoxAutoFixCommonErrors.Checked)
                                    line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel());
                            }
                            else if (wordsNotFound == modiWordsNotFound && oneColorText.EndsWith('!') && (line.EndsWith('l') || line.EndsWith('fl')))
                            {
                                line = FixItalics(oneColorText);
                                wordsNotFound = modiWordsNotFound;
                                correctWords = modiCorrectWords;
                                if (checkBoxAutoFixCommonErrors.Checked)
                                    line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel());
                            }
                        }
                    }
                }

                if (checkBoxTesseractItalicsOn.Checked)
                {
                    if (line.Contains("<i>") || wordsNotFound > 0 || correctWords == 0 || textWithOutFixes != null && textWithOutFixes.Replace("~", string.Empty).Trim().Length < 2)
                    {
                        _ocrFixEngine.AutoGuessesUsed.Clear();
                        _ocrFixEngine.UnknownWordsFound.Clear();

                        // which is best - normal image or de-italic'ed? We find out here
                        var unItalicedBmp = UnItalic(bitmap, _unItalicFactor);
                        string unItalicText = Tesseract3DoOcrViaExe(unItalicedBmp, _languageId, "-psm 6"); // 6 = Assume a single uniform block of text.
                        unItalicedBmp.Dispose();

                        if (unItalicText.Length > 1)
                        {
                            int modiCorrectWords;
                            int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(unItalicText, out modiCorrectWords);
                            string modiTextOcrFixed = unItalicText;
                            if (checkBoxAutoFixCommonErrors.Checked)
                                modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(unItalicText, index, _lastLine, false, GetAutoGuessLevel());
                            int modiOcrCorrectedCorrectWords;
                            int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out modiOcrCorrectedCorrectWords);
                            if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound)
                            {
                                unItalicText = modiTextOcrFixed;
                                modiWordsNotFound = modiOcrCorrectedWordsNotFound;
                                modiCorrectWords = modiOcrCorrectedCorrectWords;
                            }

                            bool ok = modiWordsNotFound < wordsNotFound || (textWithOutFixes.Length == 1 && modiWordsNotFound == 0);

                            if (!ok)
                                ok = wordsNotFound == modiWordsNotFound && unItalicText.EndsWith('!') && (line.EndsWith('l') || line.EndsWith('fl'));

                            if (!ok)
                                ok = wordsNotFound == modiWordsNotFound && line.StartsWith("<i>") && line.EndsWith("</i>");

                            if (ok && Utilities.CountTagInText(unItalicText, '/') > Utilities.CountTagInText(line, '/') + 1)
                                ok = false;
                            if (ok && Utilities.CountTagInText(unItalicText, '\\') > Utilities.CountTagInText(line, '\\'))
                                ok = false;
                            if (ok && Utilities.CountTagInText(unItalicText, ')') > Utilities.CountTagInText(line, ')') + 1)
                                ok = false;
                            if (ok && Utilities.CountTagInText(unItalicText, '(') > Utilities.CountTagInText(line, '(') + 1)
                                ok = false;
                            if (ok && Utilities.CountTagInText(unItalicText, '$') > Utilities.CountTagInText(line, '$') + 1)
                                ok = false;
                            if (ok && Utilities.CountTagInText(unItalicText, '€') > Utilities.CountTagInText(line, '€') + 1)
                                ok = false;
                            if (ok && Utilities.CountTagInText(unItalicText, '•') > Utilities.CountTagInText(line, '•'))
                                ok = false;

                            if (ok)
                            {
                                wordsNotFound = modiWordsNotFound;
                                correctWords = modiCorrectWords;

                                line = HtmlUtil.RemoveOpenCloseTags(line, HtmlUtil.TagItalic).Trim();

                                if (line.Length > 7 && unItalicText.Length > 7 && unItalicText.StartsWith("I ") &&
                                    line.StartsWith(unItalicText.Remove(0, 2).Substring(0, 4)))
                                    unItalicText = unItalicText.Remove(0, 2);

                                if (checkBoxTesseractMusicOn.Checked)
                                {
                                    if ((line.StartsWith("J' ") || line.StartsWith("J“ ") || line.StartsWith("J* ") || line.StartsWith("♪ ")) && unItalicText.Length > 3 && HtmlUtil.RemoveOpenCloseTags(unItalicText, HtmlUtil.TagItalic).Substring(1, 2) == "' ")
                                    {
                                        unItalicText = "♪ " + unItalicText.Remove(0, 2).TrimStart();
                                    }
                                    if ((line.StartsWith("J' ") || line.StartsWith("J“ ") || line.StartsWith("J* ") || line.StartsWith("♪ ")) && unItalicText.Length > 3 && HtmlUtil.RemoveOpenCloseTags(unItalicText, HtmlUtil.TagItalic)[1] == ' ')
                                    {
                                        bool ita = unItalicText.StartsWith("<i>") && unItalicText.EndsWith("</i>");
                                        unItalicText = HtmlUtil.RemoveHtmlTags(unItalicText);
                                        unItalicText = "♪ " + unItalicText.Remove(0, 2).TrimStart();
                                        if (ita)
                                            unItalicText = "<i>" + unItalicText + "</i>";
                                    }
                                    if ((line.StartsWith("J' ") || line.StartsWith("J“ ") || line.StartsWith("J* ") || line.StartsWith("♪ ")) && unItalicText.Length > 3 && HtmlUtil.RemoveOpenCloseTags(unItalicText, HtmlUtil.TagItalic)[2] == ' ')
                                    {
                                        bool ita = unItalicText.StartsWith("<i>") && unItalicText.EndsWith("</i>");
                                        unItalicText = HtmlUtil.RemoveHtmlTags(unItalicText);
                                        unItalicText = "♪ " + unItalicText.Remove(0, 2).TrimStart();
                                        if (ita)
                                            unItalicText = "<i>" + unItalicText + "</i>";
                                    }
                                    if (unItalicText.StartsWith("J'") && (line.StartsWith('♪') || textWithOutFixes.StartsWith('♪') || textWithOutFixes.StartsWith("<i>♪") || unItalicText.EndsWith('♪')))
                                    {
                                        bool ita = unItalicText.StartsWith("<i>") && unItalicText.EndsWith("</i>");
                                        unItalicText = HtmlUtil.RemoveHtmlTags(unItalicText);
                                        unItalicText = "♪ " + unItalicText.Remove(0, 2).TrimStart();
                                        if (ita)
                                            unItalicText = "<i>" + unItalicText + "</i>";
                                    }
                                    if ((line.StartsWith("J` ") || line.StartsWith("J“ ") || line.StartsWith("J' ") || line.StartsWith("J* ")) && unItalicText.StartsWith("S "))
                                    {
                                        bool ita = unItalicText.StartsWith("<i>") && unItalicText.EndsWith("</i>");
                                        unItalicText = HtmlUtil.RemoveHtmlTags(unItalicText);
                                        unItalicText = "♪ " + unItalicText.Remove(0, 2).TrimStart();
                                        if (ita)
                                            unItalicText = "<i>" + unItalicText + "</i>";
                                    }
                                    if ((line.StartsWith("J` ") || line.StartsWith("J“ ") || line.StartsWith("J' ") || line.StartsWith("J* ")) && unItalicText.StartsWith("<i>S</i> "))
                                    {
                                        bool ita = unItalicText.StartsWith("<i>") && unItalicText.EndsWith("</i>");
                                        unItalicText = HtmlUtil.RemoveHtmlTags(unItalicText);
                                        unItalicText = "♪ " + unItalicText.Remove(0, 8).TrimStart();
                                        if (ita)
                                            unItalicText = "<i>" + unItalicText + "</i>";
                                    }
                                    if (unItalicText.StartsWith(";'") && (line.StartsWith('♪') || textWithOutFixes.StartsWith('♪') || textWithOutFixes.StartsWith("<i>♪") || unItalicText.EndsWith('♪')))
                                    {
                                        bool ita = unItalicText.StartsWith("<i>") && unItalicText.EndsWith("</i>");
                                        unItalicText = HtmlUtil.RemoveHtmlTags(unItalicText);
                                        unItalicText = "♪ " + unItalicText.Remove(0, 2).TrimStart();
                                        if (ita)
                                            unItalicText = "<i>" + unItalicText + "</i>";
                                    }
                                    if (unItalicText.StartsWith(",{*") && (line.StartsWith('♪') || textWithOutFixes.StartsWith('♪') || textWithOutFixes.StartsWith("<i>♪") || unItalicText.EndsWith('♪')))
                                    {
                                        bool ita = unItalicText.StartsWith("<i>") && unItalicText.EndsWith("</i>");
                                        unItalicText = HtmlUtil.RemoveHtmlTags(unItalicText);
                                        unItalicText = "♪ " + unItalicText.Remove(0, 3).TrimStart();
                                        if (ita)
                                            unItalicText = "<i>" + unItalicText + "</i>";
                                    }

                                    if (unItalicText.EndsWith("J'") && (line.EndsWith('♪') || textWithOutFixes.EndsWith('♪') || textWithOutFixes.EndsWith("♪</i>") || unItalicText.StartsWith('♪')))
                                    {
                                        bool ita = unItalicText.StartsWith("<i>") && unItalicText.EndsWith("</i>");
                                        unItalicText = HtmlUtil.RemoveHtmlTags(unItalicText);
                                        unItalicText = unItalicText.Remove(unItalicText.Length - 3, 2).TrimEnd() + " ♪";
                                        if (ita)
                                            unItalicText = "<i>" + unItalicText + "</i>";
                                    }
                                }

                                if (unItalicText.StartsWith('[') && !line.StartsWith('['))
                                {
                                    unItalicText = unItalicText.Remove(0, 1);
                                    if (unItalicText.EndsWith(']'))
                                        unItalicText = unItalicText.TrimEnd(']');
                                }
                                if (unItalicText.StartsWith('{') && !line.StartsWith('{'))
                                {
                                    unItalicText = unItalicText.Remove(0, 1);
                                    if (unItalicText.EndsWith('}'))
                                        unItalicText = unItalicText.TrimEnd('}');
                                }
                                if (unItalicText.EndsWith('}') && !line.EndsWith('}'))
                                    unItalicText = unItalicText.TrimEnd('}');

                                if (line.EndsWith("...") && unItalicText.EndsWith("”!"))
                                    unItalicText = unItalicText.TrimEnd('!').TrimEnd('”') + ".";
                                if (line.EndsWith("...") && unItalicText.EndsWith("\"!"))
                                    unItalicText = unItalicText.TrimEnd('!').TrimEnd('"') + ".";

                                if (line.EndsWith('.') && !unItalicText.EndsWith('.') && !unItalicText.EndsWith(".</i>"))
                                {
                                    string post = string.Empty;
                                    if (unItalicText.EndsWith("</i>"))
                                    {
                                        post = "</i>";
                                        unItalicText = unItalicText.Remove(unItalicText.Length - 4);
                                    }
                                    if (unItalicText.EndsWith('\'') && !line.EndsWith("'."))
                                        unItalicText = unItalicText.TrimEnd('\'');
                                    unItalicText += "." + post;
                                }
                                if (unItalicText.EndsWith('.') && !unItalicText.EndsWith("...") && !unItalicText.EndsWith("...</i>") && line.EndsWith("..."))
                                {
                                    string post = string.Empty;
                                    if (unItalicText.EndsWith("</i>"))
                                    {
                                        post = "</i>";
                                        unItalicText = unItalicText.Remove(unItalicText.Length - 4);
                                    }
                                    unItalicText += ".." + post;
                                }
                                if (unItalicText.EndsWith("..") && !unItalicText.EndsWith("...") && !unItalicText.EndsWith("...</i>") && line.EndsWith("..."))
                                {
                                    string post = string.Empty;
                                    if (unItalicText.EndsWith("</i>"))
                                    {
                                        post = "</i>";
                                        unItalicText = unItalicText.Remove(unItalicText.Length - 4);
                                    }
                                    unItalicText += "." + post;
                                }

                                if (line.EndsWith('!') && !unItalicText.EndsWith('!') && !unItalicText.EndsWith("!</i>"))
                                {
                                    if (unItalicText.EndsWith("!'"))
                                    {
                                        unItalicText = unItalicText.TrimEnd('\'');
                                    }
                                    else
                                    {
                                        if (unItalicText.EndsWith("l</i>") && _ocrFixEngine != null)
                                        {
                                            string w = unItalicText.Substring(0, unItalicText.Length - 4);
                                            int wIdx = w.Length - 1;
                                            while (wIdx >= 0 && !@" .,!?<>:;'-$@£()[]<>/""".Contains(w[wIdx]))
                                            {
                                                wIdx--;
                                            }
                                            if (wIdx + 1 < w.Length && unItalicText.Length > 5)
                                            {
                                                w = w.Substring(wIdx + 1);
                                                if (!_ocrFixEngine.DoSpell(w))
                                                    unItalicText = unItalicText.Remove(unItalicText.Length - 5, 1);
                                            }
                                            unItalicText = unItalicText.Insert(unItalicText.Length - 4, "!");
                                        }
                                        else if (unItalicText.EndsWith('l') && _ocrFixEngine != null)
                                        {
                                            string w = unItalicText;
                                            int wIdx = w.Length - 1;
                                            while (wIdx >= 0 && !@" .,!?<>:;'-$@£()[]<>/""".Contains(w[wIdx]))
                                            {
                                                wIdx--;
                                            }
                                            if (wIdx + 1 < w.Length && unItalicText.Length > 5)
                                            {
                                                w = w.Substring(wIdx + 1);
                                                if (!_ocrFixEngine.DoSpell(w))
                                                    unItalicText = unItalicText.Remove(unItalicText.Length - 1, 1);
                                            }
                                            unItalicText += "!";
                                        }
                                        else
                                        {
                                            unItalicText += "!";
                                        }
                                    }
                                }
                                if (line.EndsWith('?') && !unItalicText.EndsWith('?') && !unItalicText.EndsWith("?</i>"))
                                {
                                    if (unItalicText.EndsWith("?'"))
                                        unItalicText = unItalicText.TrimEnd('\'');
                                    else
                                        unItalicText += "?";
                                }

                                line = HtmlUtil.RemoveOpenCloseTags(unItalicText, HtmlUtil.TagItalic);
                                if (checkBoxAutoFixCommonErrors.Checked)
                                {
                                    if (line.Contains("'.") && !textWithOutFixes.Contains("'.") && textWithOutFixes.Contains(':') && !line.EndsWith("'.") && Configuration.Settings.Tools.OcrFixUseHardcodedRules)
                                    {
                                        line = line.Replace("'.", ":");
                                    }
                                    line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel());
                                }
                                line = "<i>" + line + "</i>";
                            }
                            else
                            {
                                unItalicText = unItalicText.Replace("</i>", string.Empty);
                                if (line.EndsWith("</i>", StringComparison.Ordinal) && unItalicText.EndsWith('.'))
                                {
                                    line = line.Remove(line.Length - 4, 4);
                                    if (line.EndsWith('-'))
                                        line = line.TrimEnd('-') + ".";
                                    if (char.IsLetter(line[line.Length - 1]))
                                        line += ".";
                                    line += "</i>";
                                }
                            }
                        }
                    }
                }

                if (checkBoxTesseractMusicOn.Checked)
                {
                    if (line == "[J'J'J~]" || line == "[J'J'J']")
                        line = "[ ♪ ♪ ♪ ]";

                    line = line.Replace(" J' ", " ♪ ");

                    if (line.StartsWith("J'"))
                    {
                        line = "♪ " + line.Remove(0, 2).TrimStart();
                    }
                    if (line.StartsWith("<i>J'"))
                    {
                        line = "<i>♪ " + line.Remove(0, 5).TrimStart();
                    }
                    if (line.StartsWith("[J'"))
                    {
                        line = "[♪ " + line.Remove(0, 3).TrimStart();
                    }
                    if (line.StartsWith("<i>[J'"))
                    {
                        line = "<i>[♪ " + line.Remove(0, 6).TrimStart();
                    }
                    if (line.EndsWith("J'"))
                    {
                        line = line.Remove(line.Length - 2, 2).TrimEnd() + " ♪";
                    }
                    if (line.EndsWith("J'</i>"))
                    {
                        line = line.Remove(line.Length - 6, 6).TrimEnd() + " ♪</i>";
                    }
                    if (line.Contains(Environment.NewLine + "J'"))
                    {
                        line = line.Replace(Environment.NewLine + "J'", Environment.NewLine + "♪ ");
                        line = line.Replace("  ", " ");
                    }
                    if (line.Contains("J'" + Environment.NewLine))
                    {
                        line = line.Replace("J'" + Environment.NewLine, " ♪" + Environment.NewLine);
                        line = line.Replace("  ", " ");
                    }
                }

                if (wordsNotFound > 0 || correctWords == 0 || textWithOutFixes != null && textWithOutFixes.Replace("~", string.Empty).Trim().Length < 2)
                {
                    _ocrFixEngine.AutoGuessesUsed.Clear();
                    _ocrFixEngine.UnknownWordsFound.Clear();

                    if (_modiEnabled && checkBoxUseModiInTesseractForUnknownWords.Checked)
                    {
                        // which is best - modi or Tesseract - we find out here
                        string modiText = CallModi(index);

                        if (modiText.Length == 0)
                            modiText = CallModi(index); // retry... strange MODI
                        if (modiText.Length == 0)
                            modiText = CallModi(index); // retry... strange MODI

                        if (modiText.Length > 1 &&
                            !modiText.Contains("CD") &&
                            (!modiText.Contains('0') || line.Contains('0')) &&
                            (!modiText.Contains('2') || line.Contains('2')) &&
                            (!modiText.Contains('3') || line.Contains('4')) &&
                            (!modiText.Contains('5') || line.Contains('5')) &&
                            (!modiText.Contains('9') || line.Contains('9')) &&
                            (!modiText.Contains('•') || line.Contains('•')) &&
                            (!modiText.Contains(')') || line.Contains(')')) &&
                            Utilities.CountTagInText(modiText, '(') < 2 && Utilities.CountTagInText(modiText, ')') < 2 &&
                            Utilities.GetNumberOfLines(modiText) < 4)
                        {
                            int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiText, out correctWords);
                            //if (modiWordsNotFound > 0)
                            {
                                string modiTextOcrFixed = modiText;
                                if (checkBoxAutoFixCommonErrors.Checked)
                                    modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(modiText, index, _lastLine, false, GetAutoGuessLevel());
                                int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out correctWords);
                                if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound)
                                    modiText = modiTextOcrFixed;
                            }

                            if (modiWordsNotFound < wordsNotFound || (textWithOutFixes.Length == 1 && modiWordsNotFound == 0))
                                line = modiText; // use the modi OCR'ed text
                            else if (wordsNotFound == modiWordsNotFound && modiText.EndsWith('!') && (line.EndsWith('l') || line.EndsWith('fl')))
                                line = modiText;
                        }

                        // take the best option - before OCR fixing, which we do again to save suggestions and prompt for user input
                        line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, index, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, GetAutoGuessLevel());
                    }
                    else
                    { // fix some error manually (modi not available)
                        line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, index, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, GetAutoGuessLevel());
                    }
                }

                if (_ocrFixEngine.Abort)
                {
                    ButtonStopClick(null, null);
                    _ocrFixEngine.Abort = false;
                    return string.Empty;
                }

                //check Tesseract... find another way to do this...
                //string tmp = HtmlUtil.RemoveHtmlTags(line).Trim();
                //if (!tmp.TrimEnd().EndsWith("..."))
                //{
                //    tmp = tmp.TrimEnd('.').TrimEnd();
                //    if (tmp.Length > 2 && Utilities.LowercaseLetters.Contains(tmp[tmp.Length - 1]))
                //    {
                //        if (_nocrChars == null)
                //            _nocrChars = LoadNOcrForTesseract("Nikse.SubtitleEdit.Resources.nOCR_TesseractHelper.xml.zip");
                //        string text = HtmlUtil.RemoveHtmlTags(NocrFastCheck(bitmap).TrimEnd());
                //        string post = string.Empty;
                //        if (line.EndsWith("</i>"))
                //        {
                //            post = "</i>";
                //            line = line.Remove(line.Length - 4, 4).Trim();
                //        }
                //        if (text.EndsWith('.'))
                //        {
                //            line = line.TrimEnd('.').Trim();
                //            while (text.EndsWith('.') || text.EndsWith(' '))
                //            {
                //                line += text.Substring(text.Length - 1).Trim();
                //                text = text.Remove(text.Length - 1, 1);
                //            }
                //        }
                //        else if (text.EndsWith('l') && text.EndsWith('!') && !text.EndsWith("l!"))
                //        {
                //            line = line.Remove(line.Length - 1, 1) + "!";
                //        }
                //        line += post;
                //    }
                //}

                // Log used word guesses (via word replace list)
                foreach (string guess in _ocrFixEngine.AutoGuessesUsed)
                    listBoxLogSuggestions.Items.Add(guess);
                _ocrFixEngine.AutoGuessesUsed.Clear();

                // Log unknown-word guesses (found via spelling dictionaries)
                LogUnknownWords();

                ColorLineByNumberOfUnknownWords(index, wordsNotFound, line);
            }
            else
            { // no dictionary :(
                if (checkBoxAutoFixCommonErrors.Checked)
                    line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel());

                if (badWords >= numberOfWords)
                    subtitleListView1.SetBackgroundColor(index, Color.Red);
                else if (badWords >= numberOfWords / 2)
                    subtitleListView1.SetBackgroundColor(index, Color.Orange);
                else if (badWords > 0 || line.Contains('_') || HasSingleLetters(line))
                    subtitleListView1.SetBackgroundColor(index, Color.Yellow);
                else if (string.IsNullOrWhiteSpace(HtmlUtil.RemoveOpenCloseTags(line, HtmlUtil.TagItalic)))
                    subtitleListView1.SetBackgroundColor(index, Color.Orange);
                else
                    subtitleListView1.SetBackgroundColor(index, Color.LightGreen);
            }

            if (textWithOutFixes.Trim() != line.Trim())
            {
                _tesseractOcrAutoFixes++;
                labelFixesMade.Text = string.Format(" - {0}", _tesseractOcrAutoFixes);
                LogOcrFix(index, textWithOutFixes, line);
            }

            if (_vobSubMergedPackist != null)
                bitmap.Dispose();

            return line;
        }