/// <summary>
/// Post-processes OCR output for Vietnamese: runs the generic OCR corrections,
/// replaces letter sequences that are commonly misrecognized, fixes letter cases,
/// and then repairs horn and tone marks on the canonically decomposed text.
/// </summary>
/// <param name="text">Recognized text to clean up.</param>
/// <returns>
/// The corrected text, re-normalized with <c>Normalize()</c>; input that is empty
/// or whitespace-only is returned unchanged.
/// </returns>
public string PostProcess(string text)
{
    // Nothing to do for blank input.
    if (text.Trim().Length == 0)
    {
        return text;
    }

    // Generic OCR error correction runs before the Vietnamese-specific fixes.
    string corrected = TextUtilities.CorrectOCRErrors(text);

    // Vietnamese letter sequences frequently misrecognized by Tesseract 2.03,
    // as { misread, intended } pairs. They are applied strictly in this order,
    // so longer sequences (e.g. "tmg") must stay ahead of their prefixes ("tm").
    string[][] substitutions =
    {
        new string[] { "êĩ-", "ết" },
        new string[] { "ug", "ng" },
        new string[] { "uh", "nh" },
        new string[] { "rn", "m" },
        new string[] { "iii", "m" },
        new string[] { "II", "u" },
        new string[] { "ôh", "ốn" },
        new string[] { "âỳ", "ấy" },
        new string[] { "u1I", "ưn" },
        new string[] { "q1I", "qu" },
        new string[] { "tmg", "úng" },
        new string[] { "tm", "trư" },
        new string[] { "Tm", "Trư" },
        new string[] { "êf", "ết" },
        new string[] { "rg", "ng" },
        new string[] { "êh", "ến" },
        new string[] { "fâ", "rầ" },
    };

    StringBuilder buffer = new StringBuilder(corrected);
    foreach (string[] pair in substitutions)
    {
        buffer.Replace(pair[0], pair[1]);
    }

    // Correct letter cases on the substituted text.
    string cased = TextUtilities.CorrectLetterCases(buffer.ToString());

    // The mark fixes below operate on the decomposed form (NFD), where the
    // combining horn (U+031B) and other marks are separate code points.
    string nfdText = cased.Normalize(NormalizationForm.FormD);

    // uo+n to u+o+n: add the horn to a 'u' (not preceded by 'q') that comes
    // before a horned 'o'.
    nfdText = Regex.Replace(nfdText, "(?i)(?<![q])(u)(?=o\u031B" + TONE + "\\p{L})", "$1\u031B");

    // u+on to u+o+n: add the horn to an 'o' that follows a horned 'u'.
    nfdText = Regex.Replace(nfdText, "(?i)(?<=u\u031B)(o)(?=" + TONE + "\\p{L})", "$1\u031B");

    // Remove the mark on an 'i' that is followed by one of the vowels e, i, o, y.
    nfdText = Regex.Replace(nfdText, "(?i)(i)" + TONE + "(?=[eioy])", "$1");

    // Remove the mark on an 'i' preceded by a vowel, with or without diacritics.
    // NOTE: \B (rather than \b) is deliberate here. The original author observed
    // that \b did not work as expected in .NET, possibly because combining
    // diacritical characters are not treated as word characters at this position.
    nfdText = Regex.Replace(
        nfdText,
        "(?i)(?<=[^q]" + VOWEL + "\\p{IsCombiningDiacriticalMarks}{0,2})(i)" + TONE + "\\B",
        "$1");

    // Circumflexed a/e/o followed by a right single quote: the quote becomes a
    // combining acute accent.
    nfdText = Regex.Replace(nfdText, "(?i)(?<=[aeo]\u0302)\u2019", "\u0301");

    // Left single quote before a circumflexed a/e/o that carries no further
    // combining mark: drop the quote and append a combining grave accent.
    nfdText = Regex.Replace(
        nfdText,
        "(?i)\u2018([aeo]\u0302)(?!\\p{IsCombiningDiacriticalMarks})",
        "$1\u0300");

    return nfdText.Normalize();
}
/// <summary>
/// Post-processes OCR output for Vietnamese: runs the generic OCR corrections,
/// replaces letter sequences that are commonly misrecognized, fixes letter cases,
/// and then repairs horn and tone marks on the canonically decomposed text.
/// </summary>
/// <param name="text">Recognized text to clean up.</param>
/// <returns>
/// The corrected text, re-normalized with <c>Normalize()</c>; input that is empty
/// or whitespace-only is returned unchanged.
/// </returns>
public string PostProcess(string text)
{
    // Nothing to do for blank input.
    if (text.Trim().Length == 0)
    {
        return text;
    }

    // Generic OCR error correction runs before the Vietnamese-specific fixes.
    string corrected = TextUtilities.CorrectOCRErrors(text);

    // Vietnamese letter sequences frequently misrecognized by Tesseract 2.03,
    // as { misread, intended } pairs. They are applied strictly in this order,
    // so longer sequences (e.g. "tmg") must stay ahead of their prefixes ("tm").
    string[][] substitutions =
    {
        new string[] { "êĩ-", "ết" },
        new string[] { "ug", "ng" },
        new string[] { "uh", "nh" },
        new string[] { "rn", "m" },
        new string[] { "iii", "m" },
        new string[] { "ll", "u" },
        new string[] { "II", "u" },
        new string[] { "ôh", "ốn" },
        new string[] { "âỳ", "ấy" },
        new string[] { "u1I", "ưn" },
        new string[] { "q1I", "qu" },
        new string[] { "tmg", "úng" },
        new string[] { "tm", "trư" },
        new string[] { "Tm", "Trư" },
        new string[] { "êf", "ết" },
        new string[] { "rg", "ng" },
        new string[] { "êh", "ến" },
        new string[] { "‘â", "ầ" },
        new string[] { "fâ", "rầ" },
    };

    StringBuilder buffer = new StringBuilder(corrected);
    foreach (string[] pair in substitutions)
    {
        buffer.Replace(pair[0], pair[1]);
    }

    // Correct letter cases on the substituted text.
    string cased = TextUtilities.CorrectLetterCases(buffer.ToString());

    // The mark fixes below operate on the decomposed form (NFD), where the
    // combining horn (U+031B) and other marks are separate code points.
    string nfdText = cased.Normalize(NormalizationForm.FormD);

    // uo+n to u+o+n: add the horn to a 'u' (not preceded by 'q') that comes
    // before a horned 'o'.
    nfdText = Regex.Replace(nfdText, "(?i)(?<![qQ])(u)(?=o\u031B" + TONE + "\\p{L})", "$1\u031B");

    // u+on to u+o+n: add the horn to an 'o' that follows a horned 'u'.
    nfdText = Regex.Replace(nfdText, "(?i)(?<=u\u031B)(o)(?=" + TONE + "\\p{L})", "$1\u031B");

    // Remove the mark on an 'i' that is followed by one of the vowels e, i, o, y.
    nfdText = Regex.Replace(nfdText, "(?i)(i)" + TONE + "(?=[eioy])", "$1");

    // Remove the mark on an 'i' that ends a word and is preceded by a vowel
    // (the look-behind is assembled from VOWEL, DOT_BELOW, TONE and MARK).
    nfdText = Regex.Replace(
        nfdText,
        "(?i)(?<=" + VOWEL + DOT_BELOW + TONE + MARK + ")(i)" + TONE + "\\b",
        "$1");

    return nfdText.Normalize();
}
/// <summary>
/// Post-processes OCR output by applying the common OCR error corrections
/// followed by letter-case correction.
/// </summary>
/// <param name="text">Recognized text to clean up.</param>
/// <returns>
/// The corrected text; input that is empty or whitespace-only is returned unchanged.
/// </returns>
public string PostProcess(string text)
{
    bool isBlank = text.Trim().Length == 0;
    if (isBlank)
    {
        return text;
    }

    // OCR error correction first, then letter-case correction on its result.
    return TextUtilities.CorrectLetterCases(TextUtilities.CorrectOCRErrors(text));
}
/// <summary>
/// Post-processes OCR output for a given language. When enabled, replacement
/// rules from an external <c>x.DangAmbigs.txt</c> file are applied first; the text
/// then goes through the language post-processor, the common OCR error
/// corrections, and letter-case correction.
/// </summary>
/// <param name="text">Recognized text to clean up.</param>
/// <param name="langCode">Language code used to pick the rules file and the post-processor.</param>
/// <param name="dangAmbigsPath">Directory containing the <c>*.DangAmbigs.txt</c> files.</param>
/// <param name="dangAmbigsOn">Whether to apply the external replacement rules.</param>
/// <returns>
/// The corrected text; input that is empty or whitespace-only is returned unchanged.
/// </returns>
/// <exception cref="NotSupportedException">
/// <paramref name="dangAmbigsOn"/> is true and no replacement rules could be loaded
/// for <paramref name="langCode"/> (including the three-letter fallback).
/// </exception>
public static string PostProcess(string text, string langCode, string dangAmbigsPath, bool dangAmbigsOn)
{
    if (text.Trim().Length == 0)
    {
        return text;
    }

    // Apply the external x.DangAmbigs.txt replacements first, if enabled.
    if (dangAmbigsOn)
    {
        string rulesFile = Path.Combine(dangAmbigsPath, langCode + ".DangAmbigs.txt");
        Dictionary<string, string> rules = TextUtilities.LoadMap(rulesFile);

        // Fall back on the base language (first three characters of the code)
        // when the full code yields no rules.
        if (rules.Count == 0 && langCode.Length > 3)
        {
            string baseRulesFile = Path.Combine(dangAmbigsPath, langCode.Substring(0, 3) + ".DangAmbigs.txt");
            rules = TextUtilities.LoadMap(baseRulesFile);
        }

        if (rules.Count == 0)
        {
            throw new NotSupportedException(langCode);
        }

        // Replace every rule key with its mapped value, in the map's enumeration order.
        StringBuilder buffer = new StringBuilder(text);
        foreach (KeyValuePair<string, string> rule in rules)
        {
            buffer.Replace(rule.Key, rule.Value);
        }

        text = buffer.ToString();
    }

    // Language-specific post-processor.
    string processed = PostProcess(text, langCode);

    // Correct common OCR errors, then letter cases.
    string corrected = TextUtilities.CorrectOCRErrors(processed);
    return TextUtilities.CorrectLetterCases(corrected);
}