Ejemplo n.º 1
0
        private string OcrViaTesseract(Bitmap bitmap, int index)
        {
            if (bitmap == null)
                return string.Empty;

            if (_ocrFixEngine == null)
                comboBoxDictionaries_SelectedIndexChanged(null, null);

            int badWords = 0;
            string textWithOutFixes;
            if (!string.IsNullOrEmpty(_tesseractAsyncStrings[index]))
            {
                textWithOutFixes = _tesseractAsyncStrings[index];
            }
            else
            {
                if (_tesseractAsyncIndex <= index)
                    _tesseractAsyncIndex = index + 10;
                textWithOutFixes = Tesseract3DoOcrViaExe(bitmap, _languageId, "-psm 6"); // 6 = Assume a single uniform block of text.
            }

            if ((!textWithOutFixes.Contains(Environment.NewLine) || Utilities.CountTagInText("\n", textWithOutFixes) > 2)
                && (textWithOutFixes.Length < 17 || bitmap.Height < 50))
            {
                string psm = Tesseract3DoOcrViaExe(bitmap, _languageId, "-psm 7"); // 7 = Treat the image as a single text line.
                if (textWithOutFixes != psm)
                {
                    if (textWithOutFixes.Trim().Length == 0)
                    {
                        textWithOutFixes = psm;
                    }
                    else if (psm.Length > textWithOutFixes.Length)
                    {
                        if (!psm.Contains("9") && textWithOutFixes.Contains("9") ||
                            !psm.Contains("6") && textWithOutFixes.Contains("6") ||
                            !psm.Contains("5") && textWithOutFixes.Contains("5") ||
                            !psm.Contains("3") && textWithOutFixes.Contains("3") ||
                            !psm.Contains("1") && textWithOutFixes.Contains("1") ||
                            !psm.Contains("$") && textWithOutFixes.Contains("$") ||
                            !psm.Contains("•") && textWithOutFixes.Contains("•") ||
                            !psm.Contains("Y") && textWithOutFixes.Contains("Y") ||
                            !psm.Contains("'") && textWithOutFixes.Contains("'") ||
                            !psm.Contains("€") && textWithOutFixes.Contains("€"))

                            textWithOutFixes = psm;
                    }
                    else if (psm.Length == textWithOutFixes.Length &&
                             (!psm.Contains("0") && textWithOutFixes.Contains("0") ||  // these chars are often mistaken
                              !psm.Contains("9") && textWithOutFixes.Contains("9") ||
                              !psm.Contains("8") && textWithOutFixes.Contains("8") ||
                              !psm.Contains("5") && textWithOutFixes.Contains("5") ||
                              !psm.Contains("3") && textWithOutFixes.Contains("3") ||
                              !psm.Contains("1") && textWithOutFixes.Contains("1") ||
                              !psm.Contains("$") && textWithOutFixes.Contains("$") ||
                              !psm.Contains("€") && textWithOutFixes.Contains("€") ||
                              !psm.Contains("•") && textWithOutFixes.Contains("•") ||
                              !psm.Contains("Y") && textWithOutFixes.Contains("Y") ||
                              !psm.Contains("'") && textWithOutFixes.Contains("'") ||
                              !psm.Contains("/") && textWithOutFixes.Contains("/") ||
                              !psm.Contains("(") && textWithOutFixes.Contains("(") ||
                              !psm.Contains(")") && textWithOutFixes.Contains(")") ||
                              !psm.Contains("_") && textWithOutFixes.Contains("_")))
                    {
                        textWithOutFixes = psm;
                    }
                    else if (psm.Length == textWithOutFixes.Length && psm.EndsWith(".") && !textWithOutFixes.EndsWith("."))
                    {
                        textWithOutFixes = psm;
                    }
                }
            }
            if (!checkBoxTesseractItalicsOn.Checked)
                textWithOutFixes = textWithOutFixes.Replace("<i>", string.Empty).Replace("</i>", string.Empty);

            // Sometimes Tesseract has problems with small fonts - it helps to make the image larger
            if (textWithOutFixes.Replace("<i>", string.Empty).Replace("</i>", string.Empty).Replace("@", string.Empty).Replace("%", string.Empty).Replace("|", string.Empty).Trim().Length < 3 ||
                Utilities.CountTagInText("\n", textWithOutFixes) > 2)
            {
                string rs = TesseractResizeAndRetry(bitmap);
                textWithOutFixes = rs;
                if (!checkBoxTesseractItalicsOn.Checked)
                    textWithOutFixes = textWithOutFixes.Replace("<i>", string.Empty).Replace("</i>", string.Empty);
            }

            // fix italics
            textWithOutFixes = FixItalics(textWithOutFixes);

            int numberOfWords = textWithOutFixes.ToString().Split((" " + Environment.NewLine).ToCharArray(), StringSplitOptions.RemoveEmptyEntries).Length;

            string line = textWithOutFixes.ToString().Trim();
            if (_ocrFixEngine.IsDictionaryLoaded)
            {
                if (checkBoxAutoFixCommonErrors.Checked)
                    line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
                int correctWords;
                int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords);
                int oldCorrectWords = correctWords;

                if (wordsNotFound > 0 || correctWords == 0)
                {
                    List<string> oldUnkownWords = new List<string>();
                    oldUnkownWords.AddRange(_ocrFixEngine.UnknownWordsFound);
                    _ocrFixEngine.UnknownWordsFound.Clear();

                    string newUnfixedText = TesseractResizeAndRetry(bitmap);
                    string newText = _ocrFixEngine.FixOcrErrors(newUnfixedText, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
                    int newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText, out correctWords);

                    if (wordsNotFound == 1 && newWordsNotFound == 1 && newUnfixedText.EndsWith("!!") && textWithOutFixes.EndsWith("u") && newText.Length > 1)
                    {
                        _ocrFixEngine.UnknownWordsFound.Clear();
                        newText = textWithOutFixes.Substring(0, textWithOutFixes.Length - 1) + "!!";
                        newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText, out correctWords);
                    }
                    else if ((!newText.Contains("9") || textWithOutFixes.Contains("9")) &&
                             (!newText.Replace("</i>", string.Empty).Contains("/") || textWithOutFixes.Replace("</i>", string.Empty).Contains("/")) &&
                             newUnfixedText.Trim().Length > 0 &&
                             newWordsNotFound < wordsNotFound || (newWordsNotFound == wordsNotFound && newText.EndsWith("!") && textWithOutFixes.EndsWith("l")))
                    {
                        wordsNotFound = newWordsNotFound;
                        if (textWithOutFixes.Length > 3 && textWithOutFixes.EndsWith("...") && !newText.EndsWith(".") && !newText.EndsWith(",") && !newText.EndsWith("!") &&
                            !newText.EndsWith("?") && !newText.EndsWith("</i>"))
                            newText = newText.TrimEnd() + "...";
                        else if (textWithOutFixes.Length > 0 && textWithOutFixes.EndsWith(".") && !newText.EndsWith(".") && !newText.EndsWith(",") && !newText.EndsWith("!") &&
                            !newText.EndsWith("?") && !newText.EndsWith("</i>"))
                            newText = newText.TrimEnd() + ".";
                        else if (textWithOutFixes.Length > 0 && textWithOutFixes.EndsWith("?") && !newText.EndsWith(".") && !newText.EndsWith(",") && !newText.EndsWith("!") &&
                            !newText.EndsWith("?") && !newText.EndsWith("</i>"))
                            newText = newText.TrimEnd() + "?";

                        textWithOutFixes = newUnfixedText;
                        line = FixItalics(newText);
                    }
                    else if (correctWords > oldCorrectWords + 1 || (correctWords > oldCorrectWords && !textWithOutFixes.Contains(" ")))
                    {
                        wordsNotFound = newWordsNotFound;
                        textWithOutFixes = newUnfixedText;
                        line = newText;
                    }
                    else
                    {
                        _ocrFixEngine.UnknownWordsFound.Clear();
                        _ocrFixEngine.UnknownWordsFound.AddRange(oldUnkownWords);
                    }
                }

                if (wordsNotFound > 0 || correctWords == 0 || textWithOutFixes != null && textWithOutFixes.ToString().Replace("~", string.Empty).Trim().Length < 2)
                {
                    if (_bluRaySubtitles != null && !line.Contains("<i>"))
                    {
                        _ocrFixEngine.AutoGuessesUsed.Clear();
                        _ocrFixEngine.UnknownWordsFound.Clear();

                        // which is best - normal image or one color image?
                        var nbmp = new NikseBitmap(bitmap);
                        nbmp.MakeOneColor(Color.White);
                        Bitmap oneColorBitmap = nbmp.GetBitmap();
                        string oneColorText = Tesseract3DoOcrViaExe(oneColorBitmap, _languageId, "-psm 6"); // 6 = Assume a single uniform block of text.
                        oneColorBitmap.Dispose();
                        nbmp = null;

                        if (oneColorText.Length > 1 &&
                            !oneColorText.Contains("CD") &&
                            (!oneColorText.Contains("0") || line.Contains("0")) &&
                            (!oneColorText.Contains("2") || line.Contains("2")) &&
                            (!oneColorText.Contains("3") || line.Contains("4")) &&
                            (!oneColorText.Contains("5") || line.Contains("5")) &&
                            (!oneColorText.Contains("9") || line.Contains("9")) &&
                            (!oneColorText.Contains("•") || line.Contains("•")) &&
                            (!oneColorText.Contains(")") || line.Contains(")")) &&
                            Utilities.CountTagInText(oneColorText, "(") < 2 && Utilities.CountTagInText(oneColorText, ")") < 2 &&
                            Utilities.CountTagInText(oneColorText, Environment.NewLine) < 3)
                        {
                            int modiCorrectWords;
                            int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(oneColorText, out modiCorrectWords);
                            string modiTextOcrFixed = oneColorText;
                            if (checkBoxAutoFixCommonErrors.Checked)
                                modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(oneColorText, index, _lastLine, false, checkBoxGuessUnknownWords.Checked);
                            int modiOcrCorrectedCorrectWords;
                            int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out modiOcrCorrectedCorrectWords);
                            if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound)
                            {
                                oneColorText = modiTextOcrFixed;
                                modiWordsNotFound = modiOcrCorrectedWordsNotFound;
                                modiCorrectWords = modiOcrCorrectedCorrectWords;
                            }

                            if (modiWordsNotFound < wordsNotFound || (textWithOutFixes.Length == 1 && modiWordsNotFound == 0))
                            {
                                line = FixItalics(oneColorText); // use one-color text
                                wordsNotFound = modiWordsNotFound;
                                correctWords = modiCorrectWords;
                                if (checkBoxAutoFixCommonErrors.Checked)
                                    line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
                            }
                            else if (wordsNotFound == modiWordsNotFound && oneColorText.EndsWith("!") && (line.EndsWith("l") || line.EndsWith("fl")))
                            {
                                line = FixItalics(oneColorText);
                                wordsNotFound = modiWordsNotFound;
                                correctWords = modiCorrectWords;
                                if (checkBoxAutoFixCommonErrors.Checked)
                                    line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
                            }
                        }
                    }
                }

                if (checkBoxTesseractItalicsOn.Checked)
                {
                    if (line.Contains("<i>") || wordsNotFound > 0 || correctWords == 0 || textWithOutFixes != null && textWithOutFixes.ToString().Replace("~", string.Empty).Trim().Length < 2)
                    {
                        _ocrFixEngine.AutoGuessesUsed.Clear();
                        _ocrFixEngine.UnknownWordsFound.Clear();

                        // which is best - normal image or de-italic'ed? We find out here
                        var unItalicedBmp = UnItalic(bitmap, _unItalicFactor);
                        string unItalicText = Tesseract3DoOcrViaExe(unItalicedBmp, _languageId, "-psm 6"); // 6 = Assume a single uniform block of text.
                        unItalicedBmp.Dispose();

                        if (unItalicText.Length > 1)
                        {
                            int modiCorrectWords;
                            int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(unItalicText, out modiCorrectWords);
                            string modiTextOcrFixed = unItalicText;
                            if (checkBoxAutoFixCommonErrors.Checked)
                                modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(unItalicText, index, _lastLine, false, checkBoxGuessUnknownWords.Checked);
                            int modiOcrCorrectedCorrectWords;
                            int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out modiOcrCorrectedCorrectWords);
                            if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound)
                            {
                                unItalicText = modiTextOcrFixed;
                                modiWordsNotFound = modiOcrCorrectedWordsNotFound;
                                modiCorrectWords = modiOcrCorrectedCorrectWords;
                            }

                            bool ok = modiWordsNotFound < wordsNotFound || (textWithOutFixes.Length == 1 && modiWordsNotFound == 0);

                            if (!ok)
                                ok = wordsNotFound == modiWordsNotFound && unItalicText.EndsWith("!") && (line.EndsWith("l") || line.EndsWith("fl"));

                            if (!ok)
                                ok = wordsNotFound == modiWordsNotFound && line.StartsWith("<i>") && line.EndsWith("</i>");

                            if (ok && Utilities.CountTagInText(unItalicText, "/") > Utilities.CountTagInText(line, "/") + 1)
                                ok = false;
                            if (ok && Utilities.CountTagInText(unItalicText, "\\") > Utilities.CountTagInText(line, "\\"))
                                ok = false;
                            if (ok && Utilities.CountTagInText(unItalicText, ")") > Utilities.CountTagInText(line, ")") + 1)
                                ok = false;
                            if (ok && Utilities.CountTagInText(unItalicText, "(") > Utilities.CountTagInText(line, "(") + 1)
                                ok = false;
                            if (ok && Utilities.CountTagInText(unItalicText, "$") > Utilities.CountTagInText(line, "$") + 1)
                                ok = false;
                            if (ok && Utilities.CountTagInText(unItalicText, "€") > Utilities.CountTagInText(line, "€") + 1)
                                ok = false;
                            if (ok && Utilities.CountTagInText(unItalicText, "•") > Utilities.CountTagInText(line, "•"))
                                ok = false;

                            if (ok)
                            {
                                wordsNotFound = modiWordsNotFound;
                                correctWords = modiCorrectWords;

                                line = line.Replace("<i>", string.Empty).Replace("</i>", string.Empty).Trim();

                                if (line.Length > 7 && unItalicText.Length > 7 && unItalicText.StartsWith("I ") &&
                                    line.StartsWith(unItalicText.Remove(0, 2).Substring(0, 4)))
                                    unItalicText = unItalicText.Remove(0, 2);

                                if (checkBoxTesseractMusicOn.Checked)
                                {
                                    if ((line.StartsWith("J' ") || line.StartsWith("J“ ") || line.StartsWith("J* ") || line.StartsWith("♪ ")) && unItalicText.Length > 3 && unItalicText.Replace("<i>", string.Empty).Replace("</i>", string.Empty).Substring(1, 2) == "' ")
                                    {
                                        unItalicText = "♪ " + unItalicText.Remove(0, 2).TrimStart();
                                    }
                                    if ((line.StartsWith("J' ") || line.StartsWith("J“ ") || line.StartsWith("J* ") || line.StartsWith("♪ ")) && unItalicText.Length > 3 && unItalicText.Replace("<i>", string.Empty).Replace("</i>", string.Empty).Substring(1, 1) == " ")
                                    {
                                        bool ita = unItalicText.StartsWith("<i>") && unItalicText.EndsWith("</i>");
                                        unItalicText = Utilities.RemoveHtmlTags(unItalicText);
                                        unItalicText = "♪ " + unItalicText.Remove(0, 2).TrimStart();
                                        if (ita)
                                            unItalicText = "<i>" + unItalicText + "</i>";
                                    }
                                    if ((line.StartsWith("J' ") || line.StartsWith("J“ ") || line.StartsWith("J* ") || line.StartsWith("♪ ")) && unItalicText.Length > 3 && unItalicText.Replace("<i>", string.Empty).Replace("</i>", string.Empty).Substring(2, 1) == " ")
                                    {
                                        bool ita = unItalicText.StartsWith("<i>") && unItalicText.EndsWith("</i>");
                                        unItalicText = Utilities.RemoveHtmlTags(unItalicText);
                                        unItalicText = "♪ " + unItalicText.Remove(0, 2).TrimStart();
                                        if (ita)
                                            unItalicText = "<i>" + unItalicText + "</i>";
                                    }
                                    if (unItalicText.StartsWith("J'") && (line.StartsWith("♪") || textWithOutFixes.StartsWith("♪") || textWithOutFixes.StartsWith("<i>♪") || unItalicText.EndsWith("♪")))
                                    {
                                        bool ita = unItalicText.StartsWith("<i>") && unItalicText.EndsWith("</i>");
                                        unItalicText = Utilities.RemoveHtmlTags(unItalicText);
                                        unItalicText = "♪ " + unItalicText.Remove(0, 2).TrimStart();
                                        if (ita)
                                            unItalicText = "<i>" + unItalicText + "</i>";
                                    }
                                    if ((line.StartsWith("J` ") || line.StartsWith("J“ ") || line.StartsWith("J' ") || line.StartsWith("J* ")) && unItalicText.StartsWith("S "))
                                    {
                                        bool ita = unItalicText.StartsWith("<i>") && unItalicText.EndsWith("</i>");
                                        unItalicText = Utilities.RemoveHtmlTags(unItalicText);
                                        unItalicText = "♪ " + unItalicText.Remove(0, 2).TrimStart();
                                        if (ita)
                                            unItalicText = "<i>" + unItalicText + "</i>";
                                    }
                                    if ((line.StartsWith("J` ") || line.StartsWith("J“ ") || line.StartsWith("J' ") || line.StartsWith("J* ")) && unItalicText.StartsWith("<i>S</i> "))
                                    {
                                        bool ita = unItalicText.StartsWith("<i>") && unItalicText.EndsWith("</i>");
                                        unItalicText = Utilities.RemoveHtmlTags(unItalicText);
                                        unItalicText = "♪ " + unItalicText.Remove(0, 8).TrimStart();
                                        if (ita)
                                            unItalicText = "<i>" + unItalicText + "</i>";
                                    }
                                    if (unItalicText.StartsWith(";'") && (line.StartsWith("♪") || textWithOutFixes.StartsWith("♪") || textWithOutFixes.StartsWith("<i>♪") || unItalicText.EndsWith("♪")))
                                    {
                                        bool ita = unItalicText.StartsWith("<i>") && unItalicText.EndsWith("</i>");
                                        unItalicText = Utilities.RemoveHtmlTags(unItalicText);
                                        unItalicText = "♪ " + unItalicText.Remove(0, 2).TrimStart();
                                        if (ita)
                                            unItalicText = "<i>" + unItalicText + "</i>";
                                    }
                                    if (unItalicText.StartsWith(",{*") && (line.StartsWith("♪") || textWithOutFixes.StartsWith("♪") || textWithOutFixes.StartsWith("<i>♪") || unItalicText.EndsWith("♪")))
                                    {
                                        bool ita = unItalicText.StartsWith("<i>") && unItalicText.EndsWith("</i>");
                                        unItalicText = Utilities.RemoveHtmlTags(unItalicText);
                                        unItalicText = "♪ " + unItalicText.Remove(0, 3).TrimStart();
                                        if (ita)
                                            unItalicText = "<i>" + unItalicText + "</i>";
                                    }

                                    if (unItalicText.EndsWith("J'") && (line.EndsWith("♪") || textWithOutFixes.EndsWith("♪") || textWithOutFixes.EndsWith("♪</i>") || unItalicText.StartsWith("♪")))
                                    {
                                        bool ita = unItalicText.StartsWith("<i>") && unItalicText.EndsWith("</i>");
                                        unItalicText = Utilities.RemoveHtmlTags(unItalicText);
                                        unItalicText = unItalicText.Remove(unItalicText.Length - 3, 2).TrimEnd() + " ♪";
                                        if (ita)
                                            unItalicText = "<i>" + unItalicText + "</i>";
                                    }
                                }

                                if (unItalicText.StartsWith("[") && !line.StartsWith("["))
                                {
                                    unItalicText = unItalicText.Remove(0, 1);
                                    if (unItalicText.EndsWith("]"))
                                        unItalicText = unItalicText.TrimEnd(']');
                                }
                                if (unItalicText.StartsWith("{") && !line.StartsWith("{"))
                                {
                                    unItalicText = unItalicText.Remove(0, 1);
                                    if (unItalicText.EndsWith("}"))
                                        unItalicText = unItalicText.TrimEnd('}');
                                }
                                if (unItalicText.EndsWith("}") && !line.EndsWith("}"))
                                    unItalicText = unItalicText.TrimEnd('}');

                                if (line.EndsWith("...") && unItalicText.EndsWith("”!"))
                                    unItalicText = unItalicText.TrimEnd('!').TrimEnd('”') + ".";
                                if (line.EndsWith("...") && unItalicText.EndsWith("\"!"))
                                    unItalicText = unItalicText.TrimEnd('!').TrimEnd('"') + ".";

                                if (line.EndsWith(".") && !unItalicText.EndsWith(".") && !unItalicText.EndsWith(".</i>"))
                                {
                                    string post = string.Empty;
                                    if (unItalicText.EndsWith("</i>"))
                                    {
                                        post = "</i>";
                                        unItalicText = unItalicText.Remove(unItalicText.Length - 4);
                                    }
                                    if (unItalicText.EndsWith("'") && !line.EndsWith("'."))
                                        unItalicText = unItalicText.TrimEnd('\'');
                                    unItalicText += "." + post;
                                }
                                if (unItalicText.EndsWith(".") && !unItalicText.EndsWith("...") && !unItalicText.EndsWith("...</i>") && line.EndsWith("..."))
                                {
                                    string post = string.Empty;
                                    if (unItalicText.EndsWith("</i>"))
                                    {
                                        post = "</i>";
                                        unItalicText = unItalicText.Remove(unItalicText.Length - 4);
                                    }
                                    unItalicText += ".." + post;
                                }
                                if (unItalicText.EndsWith("..") && !unItalicText.EndsWith("...") && !unItalicText.EndsWith("...</i>") && line.EndsWith("..."))
                                {
                                    string post = string.Empty;
                                    if (unItalicText.EndsWith("</i>"))
                                    {
                                        post = "</i>";
                                        unItalicText = unItalicText.Remove(unItalicText.Length - 4);
                                    }
                                    unItalicText += "." + post;
                                }

                                if (line.EndsWith("!") && !unItalicText.EndsWith("!") && !unItalicText.EndsWith("!</i>"))
                                {
                                    if (unItalicText.EndsWith("!'"))
                                    {
                                        unItalicText = unItalicText.TrimEnd('\'');
                                    }
                                    else
                                    {
                                        if (unItalicText.EndsWith("l</i>") && _ocrFixEngine != null)
                                        {
                                            string w = unItalicText.Substring(0, unItalicText.Length - 4);
                                            int wIdx = w.Length - 1;
                                            while (wIdx >= 0 && !(" .,!?<>:;'-$@£()[]<>/\"".Contains(w[wIdx].ToString())))
                                            {
                                                wIdx--;
                                            }
                                            if (wIdx + 1 < w.Length && unItalicText.Length > 5)
                                            {
                                                w = w.Substring(wIdx + 1);
                                                if (!_ocrFixEngine.DoSpell(w))
                                                    unItalicText = unItalicText.Remove(unItalicText.Length - 5, 1);
                                            }
                                            unItalicText = unItalicText.Insert(unItalicText.Length - 4, "!");
                                        }
                                        else if (unItalicText.EndsWith("l") && _ocrFixEngine != null)
                                        {
                                            string w = unItalicText;
                                            int wIdx = w.Length - 1;
                                            while (wIdx >= 0 && !(" .,!?<>:;'-$@£()[]<>/\"".Contains(w[wIdx].ToString())))
                                            {
                                                wIdx--;
                                            }
                                            if (wIdx + 1 < w.Length && unItalicText.Length > 5)
                                            {
                                                w = w.Substring(wIdx + 1);
                                                if (!_ocrFixEngine.DoSpell(w))
                                                    unItalicText = unItalicText.Remove(unItalicText.Length - 1, 1);
                                            }
                                            unItalicText += "!";
                                        }
                                        else
                                        {
                                            unItalicText += "!";
                                        }

                                    }
                                }
                                if (line.EndsWith("?") && !unItalicText.EndsWith("?") && !unItalicText.EndsWith("?</i>"))
                                {
                                    if (unItalicText.EndsWith("?'"))
                                        unItalicText = unItalicText.TrimEnd('\'');
                                    else
                                        unItalicText += "?";
                                }

                                line = unItalicText.Replace("<i>", string.Empty).Replace("</i>", string.Empty);
                                if (checkBoxAutoFixCommonErrors.Checked)
                                {
                                    if (line.Contains("'.") && !textWithOutFixes.Contains("'.") && textWithOutFixes.Contains(":") && !line.EndsWith("'.") && Configuration.Settings.Tools.OcrFixUseHardcodedRules)
                                    {
                                        line = line.Replace("'.", ":");
                                    }
                                    line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
                                }
                                line = "<i>" + line + "</i>";
                            }
                            else
                            {
                                unItalicText = unItalicText.Replace("</i>", string.Empty);
                                if (line.EndsWith("</i>") && unItalicText.EndsWith("."))
                                {
                                    line = line.Remove(line.Length - 4, 4);
                                    if (line.EndsWith("-"))
                                        line = line.TrimEnd('-') + ".";
                                    if (Utilities.AllLetters.Contains(line.Substring(line.Length - 1)))
                                        line += ".";
                                    line += "</i>";
                                }
                            }
                        }
                    }
                }

                if (checkBoxTesseractMusicOn.Checked)
                {
                    if (line == "[J'J'J~]" || line == "[J'J'J']")
                        line = "[ ♪ ♪ ♪ ]";

                    line = line.Replace(" J' ", " ♪ ");

                    if (line.StartsWith("J'"))
                    {
                        line = "♪ " + line.Remove(0, 2).TrimStart();
                    }
                    if (line.StartsWith("<i>J'"))
                    {
                        line = "<i>♪ " + line.Remove(0, 5).TrimStart();
                    }
                    if (line.StartsWith("[J'"))
                    {
                        line = "[♪ " + line.Remove(0, 3).TrimStart();
                    }
                    if (line.StartsWith("<i>[J'"))
                    {
                        line = "<i>[♪ " + line.Remove(0, 6).TrimStart();
                    }
                    if (line.EndsWith("J'"))
                    {
                        line = line.Remove(line.Length - 2, 2).TrimEnd() + " ♪";
                    }
                    if (line.EndsWith("J'</i>"))
                    {
                        line = line.Remove(line.Length - 6, 6).TrimEnd() + " ♪</i>";
                    }
                    if (line.Contains(Environment.NewLine + "J'"))
                    {
                        line = line.Replace(Environment.NewLine + "J'", Environment.NewLine + "♪ ");
                        line = line.Replace("  ", " ");
                    }
                    if (line.Contains("J'" + Environment.NewLine))
                    {
                        line = line.Replace("J'" + Environment.NewLine, " ♪" + Environment.NewLine);
                        line = line.Replace("  ", " ");
                    }
                }

                if (wordsNotFound > 0 || correctWords == 0 || textWithOutFixes != null && textWithOutFixes.ToString().Replace("~", string.Empty).Trim().Length < 2)
                {
                    _ocrFixEngine.AutoGuessesUsed.Clear();
                    _ocrFixEngine.UnknownWordsFound.Clear();

                    if (_modiEnabled && checkBoxUseModiInTesseractForUnknownWords.Checked)
                    {
                        // which is best - modi or tesseract - we find out here
                        string modiText = CallModi(index);

                        if (modiText.Length == 0)
                            modiText = CallModi(index); // retry... strange MODI
                        if (modiText.Length == 0)
                            modiText = CallModi(index); // retry... strange MODI

                        if (modiText.Length > 1 &&
                            !modiText.Contains("CD") &&
                            (!modiText.Contains("0") || line.Contains("0")) &&
                            (!modiText.Contains("2") || line.Contains("2")) &&
                            (!modiText.Contains("3") || line.Contains("4")) &&
                            (!modiText.Contains("5") || line.Contains("5")) &&
                            (!modiText.Contains("9") || line.Contains("9")) &&
                            (!modiText.Contains("•") || line.Contains("•")) &&
                            (!modiText.Contains(")") || line.Contains(")")) &&
                            Utilities.CountTagInText(modiText, "(") < 2 && Utilities.CountTagInText(modiText, ")") < 2 &&
                            Utilities.CountTagInText(modiText, Environment.NewLine) < 3)
                        {
                            int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiText, out correctWords);
                            //if (modiWordsNotFound > 0)
                            {
                                string modiTextOcrFixed = modiText;
                                if (checkBoxAutoFixCommonErrors.Checked)
                                    modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(modiText, index, _lastLine, false, checkBoxGuessUnknownWords.Checked);
                                int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out correctWords);
                                if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound)
                                    modiText = modiTextOcrFixed;
                            }

                            if (modiWordsNotFound < wordsNotFound || (textWithOutFixes.Length == 1 && modiWordsNotFound == 0))
                                line = modiText; // use the modi ocr'ed text
                            else if (wordsNotFound == modiWordsNotFound && modiText.EndsWith("!") && (line.EndsWith("l") || line.EndsWith("fl")))
                                line = modiText;
                        }

                        // take the best option - before ocr fixing, which we do again to save suggestions and prompt for user input
                        line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, index, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, checkBoxGuessUnknownWords.Checked);
                    }
                    else
                    { // fix some error manually (modi not available)
                        line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, index, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, checkBoxGuessUnknownWords.Checked);
                    }
                }

                if (_ocrFixEngine.Abort)
                {
                    ButtonStopClick(null, null);
                    _ocrFixEngine.Abort = false;
                    return string.Empty;
                }

                //check tesseract... find some otherway to do this...
                //string tmp = Utilities.RemoveHtmlTags(line).Trim();
                //if (!tmp.Trim().EndsWith("..."))
                //{
                //    tmp = tmp.TrimEnd('.').TrimEnd();
                //    if (tmp.Length > 2 && Utilities.LowercaseLetters.Contains(tmp.Substring(tmp.Length - 1, 1)))
                //    {
                //        if (_nocrChars == null)
                //            _nocrChars = LoadNOcrForTesseract("Nikse.SubtitleEdit.Resources.nOCR_TesseractHelper.xml.zip");
                //        string text = Utilities.RemoveHtmlTags(NocrFastCheck(bitmap).TrimEnd());
                //        string post = string.Empty;
                //        if (line.EndsWith("</i>"))
                //        {
                //            post = "</i>";
                //            line = line.Remove(line.Length - 4, 4).Trim();
                //        }
                //        if (text.EndsWith("."))
                //        {
                //            line = line.TrimEnd('.').Trim();
                //            while (text.EndsWith(".") || text.EndsWith(" "))
                //            {
                //                line += text.Substring(text.Length - 1).Trim();
                //                text = text.Remove(text.Length - 1, 1);
                //            }
                //        }
                //        else if (text.EndsWith("l") && text.EndsWith("!") && !text.EndsWith("l!"))
                //        {
                //            line = line.Remove(line.Length - 1, 1) + "!";
                //        }
                //        line += post;
                //    }
                //}

                // Log used word guesses (via word replace list)
                foreach (string guess in _ocrFixEngine.AutoGuessesUsed)
                    listBoxLogSuggestions.Items.Add(guess);
                _ocrFixEngine.AutoGuessesUsed.Clear();

                // Log unkown words guess (found via spelling dictionaries)
                LogUnknownWords();

                if (wordsNotFound >= 3)
                    subtitleListView1.SetBackgroundColor(index, Color.Red);
                if (wordsNotFound == 2)
                    subtitleListView1.SetBackgroundColor(index, Color.Orange);
                else if (wordsNotFound == 1 || line.Length == 1 || line.Contains("_") || HasSingleLetters(line))
                    subtitleListView1.SetBackgroundColor(index, Color.Yellow);
                else if (line.Trim().Length == 0)
                    subtitleListView1.SetBackgroundColor(index, Color.Orange);
                else
                    subtitleListView1.SetBackgroundColor(index, Color.LightGreen);
            }
            else
            { // no dictionary :(
                if (checkBoxAutoFixCommonErrors.Checked)
                    line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);

                if (badWords >= numberOfWords)
                    subtitleListView1.SetBackgroundColor(index, Color.Red);
                else if (badWords >= numberOfWords / 2)
                    subtitleListView1.SetBackgroundColor(index, Color.Orange);
                else if (badWords > 0 || line.Contains("_") || HasSingleLetters(line))
                    subtitleListView1.SetBackgroundColor(index, Color.Yellow);
                else if (line.Replace("<i>", string.Empty).Replace("</i>", string.Empty).Trim().Length == 0)
                    subtitleListView1.SetBackgroundColor(index, Color.Orange);
                else
                    subtitleListView1.SetBackgroundColor(index, Color.LightGreen);
            }

            if (textWithOutFixes.ToString().Trim() != line.Trim())
            {
                _tesseractOcrAutoFixes++;
                labelFixesMade.Text = string.Format(" - {0}", _tesseractOcrAutoFixes);
                LogOcrFix(index, textWithOutFixes.ToString(), line);
            }

            if (_vobSubMergedPackist != null)
                bitmap.Dispose();

            return line;
        }