CorrectOCRErrors() 공개 정적인 메소드

Corrects common Tesseract OCR errors.
public static CorrectOCRErrors ( String input ) : string
input String
리턴 string
예제 #1
0
파일: ViePP.cs 프로젝트: dominhhai/VietOCR
        /// <summary>
        /// Post-processes recognized Vietnamese text: applies the shared OCR and
        /// letter-case corrections, a fixed list of literal substitutions, and a
        /// series of regex repairs performed on the canonically decomposed string.
        /// </summary>
        /// <param name="text">The text to clean up.</param>
        /// <returns>
        /// The input itself when it contains only whitespace; otherwise the
        /// corrected text, re-normalized with the default <c>Normalize()</c> form.
        /// </returns>
        public string PostProcess(string text)
        {
            // Whitespace-only input is handed back untouched.
            if (text.Trim().Length == 0)
            {
                return text;
            }

            // Generic OCR clean-up runs before any language-specific step.
            string cleaned = TextUtilities.CorrectOCRErrors(text);

            // Vietnamese letter sequences frequently misrecognized by Tesseract 2.03.
            // Rows are applied top to bottom; the order matters because earlier
            // replacements change what later ones can match (e.g. "tmg" before "tm").
            string[,] substitutions =
            {
                { "êĩ-", "ết" },
                { "ug", "ng" },
                { "uh", "nh" },
                { "rn", "m" },
                { "iii", "m" },
                { "II", "u" },
                { "ôh", "ốn" },
                { "âỳ", "ấy" },
                { "u1I", "ưn" },
                { "q1I", "qu" },
                { "tmg", "úng" },
                { "tm", "trư" },
                { "Tm", "Trư" },
                { "êf", "ết" },
                { "rg", "ng" },
                { "êh", "ến" },
                { "fâ", "rầ" },
            };

            StringBuilder buffer = new StringBuilder(cleaned);
            for (int row = 0; row < substitutions.GetLength(0); row++)
            {
                buffer.Replace(substitutions[row, 0], substitutions[row, 1]);
            }

            // Fix letter cases on the substituted text.
            string cased = TextUtilities.CorrectLetterCases(buffer.ToString());

            // The remaining repairs operate on the decomposed form (NFD), where base
            // letters and their combining marks are separate code points.
            string decomposed = cased.Normalize(NormalizationForm.FormD);

            // uo+n -> u+o+n: add the horn (U+031B) to a 'u' (not after q) that precedes a horned 'o'.
            decomposed = Regex.Replace(decomposed, "(?i)(?<![q])(u)(?=o\u031B" + TONE + "\\p{L})", "$1\u031B");

            // u+on -> u+o+n: add the horn to an 'o' that follows a horned 'u'.
            decomposed = Regex.Replace(decomposed, "(?i)(?<=u\u031B)(o)(?=" + TONE + "\\p{L})", "$1\u031B");

            // Drop the mark on an 'i' that is followed by e, i, o or y.
            decomposed = Regex.Replace(decomposed, "(?i)(i)" + TONE + "(?=[eioy])", "$1");

            // Drop the mark on an 'i' preceded by a vowel with or without diacritics.
            // NOTE(review): \B (rather than the expected \b) is intentional per the original
            // author, who suspected a .NET quirk around combining marks and word boundaries.
            decomposed = Regex.Replace(decomposed, "(?i)(?<=[^q]" + VOWEL + "\\p{IsCombiningDiacriticalMarks}{0,2})(i)" + TONE + "\\B", "$1");

            // Circumflexed a/e/o followed by a right single quote -> circumflex + acute.
            decomposed = Regex.Replace(decomposed, "(?i)(?<=[aeo]\u0302)\u2019", "\u0301");

            // Left single quote before a circumflexed a/e/o (with no further mark) -> circumflex + grave.
            decomposed = Regex.Replace(decomposed, "(?i)\u2018([aeo]\u0302)(?!\\p{IsCombiningDiacriticalMarks})", "$1\u0300");

            return decomposed.Normalize();
        }
예제 #2
0
파일: ViePP.cs 프로젝트: dominhhai/VietOCR
        /// <summary>
        /// Post-processes recognized Vietnamese text: applies the shared OCR and
        /// letter-case corrections, a fixed list of literal substitutions, and four
        /// regex repairs performed on the canonically decomposed string.
        /// </summary>
        /// <param name="text">The text to clean up.</param>
        /// <returns>
        /// The input itself when it contains only whitespace; otherwise the
        /// corrected text, re-normalized with the default <c>Normalize()</c> form.
        /// </returns>
        public string PostProcess(string text)
        {
            // Whitespace-only input is handed back untouched.
            if (text.Trim().Length == 0)
            {
                return text;
            }

            // Generic OCR clean-up runs before any language-specific step.
            string cleaned = TextUtilities.CorrectOCRErrors(text);

            // Vietnamese letter sequences frequently misrecognized by Tesseract 2.03.
            // Rows are applied top to bottom; the order matters because earlier
            // replacements change what later ones can match (e.g. "tmg" before "tm").
            string[,] substitutions =
            {
                { "êĩ-", "ết" },
                { "ug", "ng" },
                { "uh", "nh" },
                { "rn", "m" },
                { "iii", "m" },
                { "ll", "u" },
                { "II", "u" },
                { "ôh", "ốn" },
                { "âỳ", "ấy" },
                { "u1I", "ưn" },
                { "q1I", "qu" },
                { "tmg", "úng" },
                { "tm", "trư" },
                { "Tm", "Trư" },
                { "êf", "ết" },
                { "rg", "ng" },
                { "êh", "ến" },
                { "‘â", "ầ" },
                { "fâ", "rầ" },
            };

            StringBuilder buffer = new StringBuilder(cleaned);
            for (int row = 0; row < substitutions.GetLength(0); row++)
            {
                buffer.Replace(substitutions[row, 0], substitutions[row, 1]);
            }

            // Fix letter cases on the substituted text.
            string cased = TextUtilities.CorrectLetterCases(buffer.ToString());

            // Hook (horn) marks are added on the decomposed form (NFD), where base
            // letters and their combining marks are separate code points.
            string decomposed = cased.Normalize(NormalizationForm.FormD);

            // uo+n -> u+o+n: add the horn (U+031B) to a 'u' (not after q) that precedes a horned 'o'.
            decomposed = Regex.Replace(decomposed, "(?i)(?<![qQ])(u)(?=o\u031B" + TONE + "\\p{L})", "$1\u031B");

            // u+on -> u+o+n: add the horn to an 'o' that follows a horned 'u'.
            decomposed = Regex.Replace(decomposed, "(?i)(?<=u\u031B)(o)(?=" + TONE + "\\p{L})", "$1\u031B");

            // Drop the mark on an 'i' that is followed by e, i, o or y.
            decomposed = Regex.Replace(decomposed, "(?i)(i)" + TONE + "(?=[eioy])", "$1");

            // Drop the mark on an 'i' that is preceded by a vowel (with its marks) and ends a word.
            decomposed = Regex.Replace(decomposed, "(?i)(?<=" + VOWEL + DOT_BELOW + TONE + MARK + ")(i)" + TONE + "\\b", "$1");

            return decomposed.Normalize();
        }
예제 #3
0
        /// <summary>
        /// Post-processes recognized text by applying the shared OCR-error
        /// correction followed by the letter-case correction.
        /// </summary>
        /// <param name="text">The text to clean up.</param>
        /// <returns>
        /// The input itself when it contains only whitespace; otherwise the
        /// corrected text.
        /// </returns>
        public string PostProcess(string text)
        {
            bool isBlank = text.Trim().Length == 0;
            if (isBlank)
            {
                // Nothing to correct.
                return text;
            }

            // OCR-error correction first, then letter cases on its result.
            return TextUtilities.CorrectLetterCases(TextUtilities.CorrectOCRErrors(text));
        }
예제 #4
0
        /// <summary>
        /// Post-processes recognized text: optionally applies replacement rules read
        /// from a <c>&lt;langCode&gt;.DangAmbigs.txt</c> file, then runs the
        /// language-specific post-processor, the shared OCR-error correction and the
        /// letter-case correction, in that order.
        /// </summary>
        /// <param name="text">The text to clean up.</param>
        /// <param name="langCode">Language code used to pick the rules file and the post-processor.</param>
        /// <param name="dangAmbigsPath">Directory containing the <c>.DangAmbigs.txt</c> files.</param>
        /// <param name="dangAmbigsOn">Whether to apply the external replacement rules.</param>
        /// <returns>
        /// The input itself when it contains only whitespace; otherwise the
        /// corrected text.
        /// </returns>
        /// <exception cref="NotSupportedException">
        /// <paramref name="dangAmbigsOn"/> is true and no replacement rules were
        /// loaded for <paramref name="langCode"/> (nor for its first three characters).
        /// </exception>
        public static string PostProcess(string text, string langCode, string dangAmbigsPath, bool dangAmbigsOn)
        {
            if (text.Trim().Length == 0)
            {
                return text;
            }

            // External x.DangAmbigs.txt rules, when enabled, are applied before anything else.
            if (dangAmbigsOn)
            {
                string rulesFile = Path.Combine(dangAmbigsPath, langCode + ".DangAmbigs.txt");
                Dictionary<string, string> rules = TextUtilities.LoadMap(rulesFile);

                // No rules under the full code: fall back on the three-character base code.
                if (rules.Count == 0 && langCode.Length > 3)
                {
                    string baseRulesFile = Path.Combine(dangAmbigsPath, langCode.Substring(0, 3) + ".DangAmbigs.txt");
                    rules = TextUtilities.LoadMap(baseRulesFile);
                }

                if (rules.Count == 0)
                {
                    throw new NotSupportedException(langCode);
                }

                // Apply every rule, in the dictionary's enumeration order.
                StringBuilder buffer = new StringBuilder(text);
                foreach (KeyValuePair<string, string> rule in rules)
                {
                    buffer.Replace(rule.Key, rule.Value);
                }

                text = buffer.ToString();
            }

            // Language-specific post-processor.
            string processed = PostProcess(text, langCode);

            // Common OCR errors, then letter cases.
            string corrected = TextUtilities.CorrectOCRErrors(processed);
            return TextUtilities.CorrectLetterCases(corrected);
        }