/// ------------------------------------------------------------------------------------
/// <summary>
/// Determines whether the specified character is a mark. A category override supplied
/// by the language definition takes precedence; when there is no override (or no
/// language definition), the character property engine decides.
/// </summary>
/// <param name="chr">The character.</param>
/// <param name="langDef">The language definition (can be null).</param>
/// <param name="cpe">The character property engine.</param>
/// <returns><c>true</c> if the override category begins with 'M', or, when no override
/// exists, if the character property engine reports the character as a mark.</returns>
/// ------------------------------------------------------------------------------------
private static bool IsMark(char chr, LanguageDefinition langDef,
	ILgCharacterPropertyEngine cpe)
{
	if (langDef != null)
	{
		string overrideCategory = langDef.GetOverrideCharCategory(chr);

		// An override completely replaces the engine's answer.
		if (overrideCategory != null)
			return overrideCategory[0] == 'M';
	}

	return cpe.get_IsMark(chr);
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Update the characters in the Private Use Area: drop PUA definitions from the
/// language definition that are no longer wanted, then process every code unit of the
/// supplied characters through UpdateLangDefPUACollection.
/// </summary>
/// <param name="langDef">The language definition whose PUA definitions are updated.
/// </param>
/// <param name="chars">The list of valid characters (each entry may contain more than
/// one code unit).</param>
/// ------------------------------------------------------------------------------------
public static void UpdatePUACollection(LanguageDefinition langDef, List<string> chars)
{
	List<PUACharacter> customPuaCharacters = GetDefinedCustomPUACharsFromICU();

	// Pass 1: walk the existing PUA definitions from last to first (so removals do not
	// disturb the indexes still to be visited) and remove each one whose character is
	// neither in the character list nor overridden to a mark category.
	for (int iDef = langDef.PuaDefinitionCount - 1; iDef >= 0; iDef--)
	{
		int codePoint;
		string data;
		langDef.GetPuaDefinition(iDef, out codePoint, out data);

		char character = (char)new PUACharacter(codePoint).Character;
		string category = langDef.GetOverrideCharCategory(character);

		// Still listed: keep it.
		if (chars.Contains(character.ToString()))
			continue;

		// Overridden to a mark category ('M...'): keep it as well.
		if (category != null && category[0] == 'M')
			continue;

		langDef.RemovePuaDefinition(new CharDef(codePoint, data));
	}

	// Pass 2: hand every code unit of every listed character to
	// UpdateLangDefPUACollection so that custom PUA characters in the list are also
	// reflected in the language definition's PUA collection.
	foreach (string chr in chars)
	{
		for (int ich = 0; ich < chr.Length; ich++)
			UpdateLangDefPUACollection((int)chr[ich], langDef, customPuaCharacters);
	}
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Finds the first base character/combining diacritic combination and removes any
/// remaining characters. A single space (Zs), line separator (Zl) or format character
/// (Cf) is returned unchanged.
/// </summary>
/// <param name="origChars">The original string of characters.</param>
/// <param name="langDef">The language definition which could override some of the
/// properties defined by the <paramref name="cpe"/> (can be null).</param>
/// <param name="cpe">The character property engine.</param>
/// <returns>The first base character together with the marks (and ZWJ/ZWNJ between
/// marks) that follow it; or the NFKC-normalized form of the whole input when that
/// form is a single character; or an empty string if no base character is found.
/// </returns>
/// ------------------------------------------------------------------------------------
public static string ValidateCharacterSequence(string origChars,
	LanguageDefinition langDef, ILgCharacterPropertyEngine cpe)
{
	// Spaces (Zs), hard line breaks (Zl), and other formatting characters (Cf) are
	// accepted only when they stand alone.
	if (origChars.Length == 1)
	{
		char onlyChar = origChars[0];
		string overrideCat = null;
		if (langDef != null)
			overrideCat = langDef.GetOverrideCharCategory(onlyChar);

		if (overrideCat == "Zl" || overrideCat == "Zs" || overrideCat == "Cf")
			return origChars;

		// No override: fall back on the engine's general category.
		if (overrideCat == null)
		{
			if (cpe.get_GeneralCategory(onlyChar) == LgGeneralCharCategory.kccZl ||
				cpe.get_GeneralCategory(onlyChar) == LgGeneralCharCategory.kccZs ||
				cpe.get_GeneralCategory(onlyChar) == LgGeneralCharCategory.kccCf)
			{
				return origChars;
			}
		}
	}

	StringBuilder result = new StringBuilder();
	bool fHaveBase = false;
	bool fPrevWasMark = false;

	for (int ich = 0; ich < origChars.Length; ich++)
	{
		char current = origChars[ich];

		if (fHaveBase)
		{
			// After the base, only marks are kept, plus a ZWNJ/ZWJ that sits between
			// two marks. Anything else ends the sequence.
			if (IsMark(current, langDef, cpe))
			{
				fPrevWasMark = true;
			}
			else
			{
				bool fIsJoiner = (current == '\u200C' || current == '\u200D');
				if (fIsJoiner && fPrevWasMark && ich + 1 < origChars.Length &&
					IsMark(origChars[ich + 1], langDef, cpe))
				{
					fPrevWasMark = false;
				}
				else
				{
					// This handles special situations like Korean, where multiple
					// base letters (representing phonemes) can compose into a single
					// base letter (representing a syllable).
					string composed = Icu.Normalize(origChars,
						Icu.UNormalizationMode.UNORM_NFKC);
					if (composed.Length == 1)
						return composed;
					break;
				}
			}

			result.Append(current);
			continue;
		}

		// Still looking for the base: letters, numbers, private-use (Co), punctuation
		// and symbols qualify; everything else is skipped.
		string cat = (langDef == null) ? null : langDef.GetOverrideCharCategory(current);
		bool fIsBase;
		if (cat == null)
		{
			fIsBase = cpe.get_IsLetter(current) || cpe.get_IsNumber(current) ||
				cpe.get_GeneralCategory(current) == LgGeneralCharCategory.kccCo ||
				cpe.get_IsPunctuation(current) || cpe.get_IsSymbol(current);
		}
		else
		{
			fIsBase = cat[0] == 'L' || cat[0] == 'N' || cat == "Co" ||
				cat[0] == 'P' || cat[0] == 'S';
		}

		if (!fIsBase)
			continue;

		fHaveBase = true;
		result.Append(current);
	}

	return result.ToString();
}