C# (CSharp) VietOCR.NET.Postprocessing TextUtilities.CorrectOCRErrors 예제들

프로그래밍 언어: C# (CSharp)

네임스페이스/패키지 이름: VietOCR.NET.Postprocessing

클래스/타입: TextUtilities

메소드/함수: CorrectOCRErrors

hotexamples.com에서의 예제들: 4

C# (CSharp) VietOCR.NET.Postprocessing TextUtilities.CorrectOCRErrors - 4개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 C# (CSharp) VietOCR.NET.Postprocessing.TextUtilities.CorrectOCRErrors의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

CorrectLetterCases(5)

CorrectOCRErrors(4)

LoadMap(3)

CorrectOCRErrors() 공개(public) 정적(static) 메서드

Corrects common Tesseract OCR errors.

public static CorrectOCRErrors ( String input ) : string
input	String
리턴	string

TextUtilities 1 문서

예제 #1

파일 보기

파일: ViePP.cs 프로젝트: dominhhai/VietOCR

        /// <summary>
        /// Cleans up Vietnamese OCR output: applies the shared OCR-error and
        /// letter-case corrections from <c>TextUtilities</c>, a fixed list of
        /// literal substitutions, and a series of regex fixes performed on the
        /// canonically decomposed (NFD) form of the text.
        /// </summary>
        /// <param name="text">Raw text to clean up.</param>
        /// <returns>
        /// The corrected text, re-normalized with <c>Normalize()</c>; the input
        /// itself when it contains nothing but white space.
        /// </returns>
        public string PostProcess(string text)
        {
            // Blank input is handed back untouched.
            if (text.Trim().Length == 0)
            {
                return text;
            }

            // Generic OCR fixes come first.
            string corrected = TextUtilities.CorrectOCRErrors(text);

            // Vietnamese letter groups frequently misrecognized by Tesseract 2.03.
            // Order matters: each pair is applied to the result of the previous ones
            // (e.g. "tmg" must be handled before "tm").
            string[,] substitutions =
            {
                { "êĩ-", "ết" },
                { "ug", "ng" },
                { "uh", "nh" },
                { "rn", "m" },
                { "iii", "m" },
                { "II", "u" },
                { "ôh", "ốn" },
                { "âỳ", "ấy" },
                { "u1I", "ưn" },
                { "q1I", "qu" },
                { "tmg", "úng" },
                { "tm", "trư" },
                { "Tm", "Trư" },
                { "êf", "ết" },
                { "rg", "ng" },
                { "êh", "ến" },
                { "fâ", "rầ" },
            };

            StringBuilder buffer = new StringBuilder(corrected);
            for (int row = 0; row < substitutions.GetLength(0); row++)
            {
                buffer.Replace(substitutions[row, 0], substitutions[row, 1]);
            }

            // Letter-case fixes run on the substituted text.
            string cased = TextUtilities.CorrectLetterCases(buffer.ToString());

            // The remaining fixes operate on decomposed text so that base letters and
            // combining marks can be matched separately. TONE and VOWEL are pattern
            // fragments defined outside this method.
            string decomposed = cased.Normalize(NormalizationForm.FormD);

            // uo+n to u+o+n: add U+031B to a 'u' (not preceded by 'q') that precedes o+U+031B.
            decomposed = Regex.Replace(
                decomposed,
                "(?i)(?<![q])(u)(?=o\u031B" + TONE + "\\p{L})",
                "$1\u031B");

            // u+on to u+o+n: add U+031B to an 'o' that follows u+U+031B.
            decomposed = Regex.Replace(
                decomposed,
                "(?i)(?<=u\u031B)(o)(?=" + TONE + "\\p{L})",
                "$1\u031B");

            // Remove the mark on an 'i' that is followed by certain vowels.
            decomposed = Regex.Replace(
                decomposed,
                "(?i)(i)" + TONE + "(?=[eioy])",
                "$1");

            // Remove the mark on an 'i' preceded by vowels, with or without diacritics.
            // It seems to be a bug with .NET: it should be \b, not \B,
            // unless combining diacritical characters are not considered as words by .NET.
            decomposed = Regex.Replace(
                decomposed,
                "(?i)(?<=[^q]" + VOWEL + "\\p{IsCombiningDiacriticalMarks}{0,2})(i)" + TONE + "\\B",
                "$1");

            // Circumflexed a/e/o followed by a right single quote: quote becomes an acute.
            decomposed = Regex.Replace(
                decomposed,
                "(?i)(?<=[aeo]\u0302)\u2019",
                "\u0301");

            // Left single quote before a circumflexed a/e/o carrying no further mark:
            // drop the quote and append a grave.
            decomposed = Regex.Replace(
                decomposed,
                "(?i)\u2018([aeo]\u0302)(?!\\p{IsCombiningDiacriticalMarks})",
                "$1\u0300");

            return decomposed.Normalize();
        }

예제 #2

파일 보기

파일: ViePP.cs 프로젝트: dominhhai/VietOCR

        /// <summary>
        /// Cleans up Vietnamese OCR output: applies the shared OCR-error and
        /// letter-case corrections from <c>TextUtilities</c>, a fixed list of
        /// literal substitutions, and a series of regex fixes performed on the
        /// canonically decomposed (NFD) form of the text.
        /// </summary>
        /// <param name="text">Raw text to clean up.</param>
        /// <returns>
        /// The corrected text, re-normalized with <c>Normalize()</c>; the input
        /// itself when it contains nothing but white space.
        /// </returns>
        public string PostProcess(string text)
        {
            // Blank input is handed back untouched.
            if (text.Trim().Length == 0)
            {
                return text;
            }

            // Generic OCR fixes come first.
            string corrected = TextUtilities.CorrectOCRErrors(text);

            // Vietnamese letter groups frequently misrecognized by Tesseract 2.03.
            // Order matters: each pair is applied to the result of the previous ones
            // (e.g. "tmg" must be handled before "tm").
            string[,] substitutions =
            {
                { "êĩ-", "ết" },
                { "ug", "ng" },
                { "uh", "nh" },
                { "rn", "m" },
                { "iii", "m" },
                { "ll", "u" },
                { "II", "u" },
                { "ôh", "ốn" },
                { "âỳ", "ấy" },
                { "u1I", "ưn" },
                { "q1I", "qu" },
                { "tmg", "úng" },
                { "tm", "trư" },
                { "Tm", "Trư" },
                { "êf", "ết" },
                { "rg", "ng" },
                { "êh", "ến" },
                { "‘â", "ầ" },
                { "fâ", "rầ" },
            };

            StringBuilder buffer = new StringBuilder(corrected);
            for (int row = 0; row < substitutions.GetLength(0); row++)
            {
                buffer.Replace(substitutions[row, 0], substitutions[row, 1]);
            }

            // Letter-case fixes run on the substituted text.
            string cased = TextUtilities.CorrectLetterCases(buffer.ToString());

            // The remaining fixes operate on decomposed text so that base letters and
            // combining marks can be matched separately. TONE, VOWEL, DOT_BELOW and MARK
            // are pattern fragments defined outside this method.
            string decomposed = cased.Normalize(NormalizationForm.FormD);

            // uo+n to u+o+n: add U+031B to a 'u' (not preceded by 'q') that precedes o+U+031B.
            decomposed = Regex.Replace(
                decomposed,
                "(?i)(?<![qQ])(u)(?=o\u031B" + TONE + "\\p{L})",
                "$1\u031B");

            // u+on to u+o+n: add U+031B to an 'o' that follows u+U+031B.
            decomposed = Regex.Replace(
                decomposed,
                "(?i)(?<=u\u031B)(o)(?=" + TONE + "\\p{L})",
                "$1\u031B");

            // Remove the mark on an 'i' that is followed by certain vowels.
            decomposed = Regex.Replace(
                decomposed,
                "(?i)(i)" + TONE + "(?=[eioy])",
                "$1");

            // Remove the mark on a word-final 'i' that is preceded by vowels.
            decomposed = Regex.Replace(
                decomposed,
                "(?i)(?<=" + VOWEL + DOT_BELOW + TONE + MARK + ")(i)" + TONE + "\\b",
                "$1");

            return decomposed.Normalize();
        }

예제 #3

파일 보기

        /// <summary>
        /// Runs the two generic <c>TextUtilities</c> corrections on OCR output:
        /// common OCR errors first, then letter cases.
        /// </summary>
        /// <param name="text">Raw text to clean up.</param>
        /// <returns>
        /// The corrected text; the input itself when it contains nothing but white space.
        /// </returns>
        public string PostProcess(string text)
        {
            bool hasContent = text.Trim().Length != 0;
            if (hasContent)
            {
                // OCR-error correction feeds the letter-case correction.
                string ocrFixed = TextUtilities.CorrectOCRErrors(text);
                text = TextUtilities.CorrectLetterCases(ocrFixed);
            }

            return text;
        }

예제 #4

파일 보기

        /// <summary>
        /// Post-processes OCR output for the given language: optionally applies the
        /// replacement rules loaded from a <c>*.DangAmbigs.txt</c> file, then the
        /// language-specific post-processor, and finally the generic OCR-error and
        /// letter-case corrections from <c>TextUtilities</c>.
        /// </summary>
        /// <param name="text">Raw text to clean up.</param>
        /// <param name="langCode">
        /// Language code used to pick the rule file (<c>langCode + ".DangAmbigs.txt"</c>);
        /// when that yields no rules and the code is longer than three characters,
        /// its first three characters are tried instead.
        /// </param>
        /// <param name="dangAmbigsPath">Directory that holds the rule files.</param>
        /// <param name="dangAmbigsOn">Whether to apply the rule-file replacements at all.</param>
        /// <returns>
        /// The corrected text; the input itself when it contains nothing but white space.
        /// </returns>
        /// <exception cref="NotSupportedException">
        /// <paramref name="dangAmbigsOn"/> is set but no replacement rules were loaded
        /// for <paramref name="langCode"/> or its three-character prefix.
        /// </exception>
        public static string PostProcess(string text, string langCode, string dangAmbigsPath, bool dangAmbigsOn)
        {
            if (text.Trim().Length == 0)
            {
                return text;
            }

            // correct using external x.DangAmbigs.txt file first, if enabled
            if (dangAmbigsOn)
            {
                // replace text based on entries read from an x.DangAmbigs.txt file
                Dictionary<string, string> replaceRules = TextUtilities.LoadMap(Path.Combine(dangAmbigsPath, langCode + ".DangAmbigs.txt"));
                if (replaceRules.Count == 0 && langCode.Length > 3)
                {
                    // fall back on the base language code
                    replaceRules = TextUtilities.LoadMap(Path.Combine(dangAmbigsPath, langCode.Substring(0, 3) + ".DangAmbigs.txt"));
                }

                if (replaceRules.Count == 0)
                {
                    throw new NotSupportedException(langCode);
                }

                // Iterate key/value pairs directly: same enumeration order as the key
                // collection, without a second lookup per key.
                StringBuilder strB = new StringBuilder(text);
                foreach (KeyValuePair<string, string> rule in replaceRules)
                {
                    strB.Replace(rule.Key, rule.Value);
                }

                text = strB.ToString();
            }

            // language-specific postprocessor
            text = PostProcess(text, langCode);

            // correct common errors caused by OCR
            text = TextUtilities.CorrectOCRErrors(text);

            // correct letter cases
            return TextUtilities.CorrectLetterCases(text);
        }