/// <summary>
/// Reads "UnicodeData.txt" and "SpecialCasing.txt" and fills the static <see cref="CodePoint"/> tables
/// (code points by value, plus the lowercase, titlecase and uppercase mapping dictionaries) from them.
/// </summary>
/// <remarks>
/// <c>GetFile</c> is called for each file name before anything is read; presumably it makes the file
/// available locally (TODO confirm against <c>GetFile</c>). Both files are then opened by their bare
/// file names, i.e. relative to the current directory, and decoded as UTF-8.
/// </remarks>
public static void LoadUnicodeData()
{
    const string UnicodeDataFileName = "UnicodeData.txt";
    const string SpecialCasingFileName = "SpecialCasing.txt";

    GetFile(UnicodeDataFileName);
    GetFile(SpecialCasingFileName);

    LoadUnicodeDataFile(UnicodeDataFileName, CodePoint.codePointsByValue);
    LoadSpecialCasingFile(
        SpecialCasingFileName,
        CodePoint.lowercaseMappings,
        CodePoint.titlecaseMappings,
        CodePoint.uppercaseMappings);
}

/// <summary>
/// Parses every data line of the UnicodeData file and stores one <see cref="CodePoint"/> per line in
/// <paramref name="codePointsByValue"/>, keyed by the code point value.
/// </summary>
/// <param name="fileName">Path of the semicolon-separated UnicodeData file.</param>
/// <param name="codePointsByValue">Table that receives the parsed code points; existing keys are overwritten.</param>
private static void LoadUnicodeDataFile(string fileName, SortedList<uint, CodePoint> codePointsByValue)
{
    // Field positions within a semicolon-separated line.
    const int ValueIndex = 0;
    const int NameIndex = 1;
    const int GeneralCategoryIndex = 2;
    const int CanonicalCombiningClassIndex = 3;
    const int BidiClassIndex = 4;
    // 5, 6, 7, 8 omitted for the moment
    const int BidiMirroredIndex = 9;
    const int Unicode1NameIndex = 10;
    const int IsoCommentIndex = 11;
    const int SimpleUppercaseMappingIndex = 12;
    const int SimpleLowercaseMappingIndex = 13;
    const int SimpleTitlecaseMappingIndex = 14;

    Dictionary<string, GeneralCategory> generalCategoryLookup = BuildEnumNameLookup<GeneralCategory>();
    Dictionary<string, BidiClass> bidiClassLookup = BuildEnumNameLookup<BidiClass>();
    char[] semicolonArray = new char[] { ';' };

    string[] unicodeDataLines = File.ReadAllLines(fileName, Encoding.UTF8);
    for (int i = 0; i < unicodeDataLines.Length; i++)
    {
        string unicodeDataLine = unicodeDataLines[i];
        if (string.IsNullOrEmpty(unicodeDataLine) || unicodeDataLine[0] == '#')
        {
            // Blank line or whole-line comment.
            continue;
        }

        string[] tokens = unicodeDataLine.Split(semicolonArray);
        Debug.Assert(tokens.Length >= 15);

        uint value = uint.Parse(tokens[ValueIndex], NumberStyles.HexNumber, NumberFormatInfo.InvariantInfo);

        string name = tokens[NameIndex];
        if (string.IsNullOrEmpty(name))
        {
            name = NameDefault;
        }

        // An empty field falls back to a default; a non-empty field that is not a known enum name
        // throws KeyNotFoundException from the dictionary indexer.
        string generalCategoryString = tokens[GeneralCategoryIndex];
        GeneralCategory generalCategory = string.IsNullOrEmpty(generalCategoryString)
            ? GeneralCategory.Cn
            : generalCategoryLookup[generalCategoryString];

        string canonicalCombiningClassString = tokens[CanonicalCombiningClassIndex];
        byte canonicalCombiningClass = string.IsNullOrEmpty(canonicalCombiningClassString)
            ? (byte)0
            : byte.Parse(canonicalCombiningClassString, NumberStyles.Integer, NumberFormatInfo.InvariantInfo);

        string bidiClassString = tokens[BidiClassIndex];
        BidiClass bidiClass = string.IsNullOrEmpty(bidiClassString)
            ? BidiClass.Invalid
            : bidiClassLookup[bidiClassString];

        // 5, 6, 7, 8 omitted for the moment

        bool bidiMirrored = (tokens[BidiMirroredIndex] == "Y");
        string unicode1Name = NullIfEmpty(tokens[Unicode1NameIndex]);
        string isoComment = NullIfEmpty(tokens[IsoCommentIndex]);

        uint? simpleUppercaseMapping = ParseSimpleCaseMapping(tokens[SimpleUppercaseMappingIndex], value);
        uint? simpleLowercaseMapping = ParseSimpleCaseMapping(tokens[SimpleLowercaseMappingIndex], value);
        uint? simpleTitlecaseMapping = ParseSimpleCaseMapping(tokens[SimpleTitlecaseMappingIndex], value);

        codePointsByValue[value] = new CodePoint(
            value,
            name,
            generalCategory,
            canonicalCombiningClass,
            bidiClass,
            bidiMirrored,
            unicode1Name,
            isoComment,
            simpleUppercaseMapping,
            simpleLowercaseMapping,
            simpleTitlecaseMapping);
    }
}

/// <summary>
/// Parses the SpecialCasing file and passes the lower, title and upper mapping fields of every
/// unconditional entry to <c>ProcessTokenStringForCodePoints</c> together with the matching dictionary.
/// </summary>
/// <param name="fileName">Path of the semicolon-separated SpecialCasing file.</param>
/// <param name="lowercaseMappings">Dictionary handed over with each lowercase mapping field.</param>
/// <param name="titlecaseMappings">Dictionary handed over with each titlecase mapping field.</param>
/// <param name="uppercaseMappings">Dictionary handed over with each uppercase mapping field.</param>
private static void LoadSpecialCasingFile(
    string fileName,
    Dictionary<uint, CodePoint[]> lowercaseMappings,
    Dictionary<uint, CodePoint[]> titlecaseMappings,
    Dictionary<uint, CodePoint[]> uppercaseMappings)
{
    // Field positions within a semicolon-separated line.
    const int ValueIndex = 0;
    const int LowerMappingIndex = 1;
    const int TitleMappingIndex = 2;
    const int UpperMappingIndex = 3;
    const int ConditionIndex = 4;

    char[] spaceArray = new char[] { ' ' };
    char[] semicolonArray = new char[] { ';' };

    // One list instance is passed to every ProcessTokenStringForCodePoints call
    // (presumably a reusable scratch buffer - confirm against that method).
    List<CodePoint> mappingCodePoints = new List<CodePoint>();

    string[] specialCasingLines = File.ReadAllLines(fileName, Encoding.UTF8);
    for (int i = 0; i < specialCasingLines.Length; i++)
    {
        string specialCasingLine = specialCasingLines[i];
        if (string.IsNullOrEmpty(specialCasingLine))
        {
            continue;
        }

        // Strip a trailing '#' comment; a line that consists only of a comment is skipped.
        int commentStartIndex = specialCasingLine.IndexOf('#');
        if (commentStartIndex >= 0)
        {
            specialCasingLine = specialCasingLine.Remove(commentStartIndex).Trim(spaceArray);
            if (string.IsNullOrEmpty(specialCasingLine))
            {
                continue;
            }
        }

        string[] tokens = specialCasingLine.Split(semicolonArray);
        Debug.Assert(tokens.Length >= 4);

        uint value = uint.Parse(tokens[ValueIndex], NumberStyles.HexNumber, NumberFormatInfo.InvariantInfo);

        if (tokens.Length >= 5 && !string.IsNullOrEmpty(tokens[ConditionIndex].Trim(spaceArray)))
        {
            // We don't want any conditional mappings
            continue;
        }

        ProcessTokenStringForCodePoints(value, tokens[LowerMappingIndex], spaceArray, mappingCodePoints, lowercaseMappings);
        ProcessTokenStringForCodePoints(value, tokens[TitleMappingIndex], spaceArray, mappingCodePoints, titlecaseMappings);
        ProcessTokenStringForCodePoints(value, tokens[UpperMappingIndex], spaceArray, mappingCodePoints, uppercaseMappings);
    }
}

/// <summary>
/// Builds an ordinal, case-sensitive lookup from every declared member name of <typeparamref name="TEnum"/>
/// to its value.
/// </summary>
/// <remarks>
/// The table is keyed by declared names rather than by <c>value.ToString("G")</c>: when several members
/// share one underlying value, the name returned by <c>ToString</c> is unspecified, so an alias could
/// otherwise be missing from the table. For enums without aliases the result is the same either way.
/// </remarks>
/// <typeparam name="TEnum">The enumeration type to index.</typeparam>
/// <returns>A dictionary containing one entry per declared member name.</returns>
private static Dictionary<string, TEnum> BuildEnumNameLookup<TEnum>() where TEnum : struct
{
    string[] names = Enum.GetNames(typeof(TEnum));
    Dictionary<string, TEnum> lookup = new Dictionary<string, TEnum>(names.Length, StringComparer.Ordinal);
    for (int i = 0; i < names.Length; i++)
    {
        string name = names[i];
        lookup[name] = (TEnum)Enum.Parse(typeof(TEnum), name);
    }
    return lookup;
}

/// <summary>
/// Parses a hexadecimal simple case mapping field.
/// </summary>
/// <param name="mappingToken">The raw field text; may be null or empty.</param>
/// <param name="value">The code point the mapping belongs to.</param>
/// <returns>
/// The mapped code point value, or <c>null</c> when the field is empty or maps the code point to itself.
/// </returns>
private static uint? ParseSimpleCaseMapping(string mappingToken, uint value)
{
    if (string.IsNullOrEmpty(mappingToken))
    {
        return null;
    }

    uint mapping = uint.Parse(mappingToken, NumberStyles.HexNumber, NumberFormatInfo.InvariantInfo);
    if (mapping == value)
    {
        // A mapping to itself carries no information and is stored as "no mapping".
        return null;
    }
    return mapping;
}

/// <summary>
/// Returns <paramref name="text"/> unchanged unless it is null or empty, in which case <c>null</c> is returned.
/// </summary>
/// <param name="text">The field text to normalize.</param>
/// <returns><c>null</c> for a null or empty string; otherwise <paramref name="text"/>.</returns>
private static string NullIfEmpty(string text)
{
    return string.IsNullOrEmpty(text) ? null : text;
}
/// <summary>
/// Maps a Unicode general category constant to a short human-readable description.
/// </summary>
/// <param name="category">The category constant to describe.</param>
/// <returns>
/// The fixed description text for <paramref name="category"/>. For a value that is not handled below,
/// a debug assertion carrying the value's name fails and an empty string is returned.
/// </returns>
public static string GetCategoryDescription(GeneralCategory category)
{
    string description;

    switch (category)
    {
        // Groupings that cover several categories at once.
        case GeneralCategory.AllControlCharacters:
            description = "All control characters. This includes the Cc, Cf, Cs, Co, and Cn categories.";
            break;
        case GeneralCategory.AllDiacriticMarks:
            description = "All diacritic marks. This includes the Mn, Mc, and Me categories.";
            break;
        case GeneralCategory.AllLetterCharacters:
            description = "All letter characters. This includes the Lu, Ll, Lt, Lm, and Lo characters.";
            break;
        case GeneralCategory.AllNumbers:
            description = "All numbers. This includes the Nd, Nl, and No categories.";
            break;
        case GeneralCategory.AllPunctuationCharacters:
            description = "All punctuation characters. This includes the Pc, Pd, Ps, Pe, Pi, Pf, and Po categories.";
            break;
        case GeneralCategory.AllSeparatorCharacters:
            description = "All separator characters. This includes the Zs, Zl, and Zp categories.";
            break;
        case GeneralCategory.AllSymbols:
            description = "All symbols. This includes the Sm, Sc, Sk, and So categories.";
            break;

        // Letters.
        case GeneralCategory.LetterLowercase:
            description = "Letter, Lowercase";
            break;
        case GeneralCategory.LetterModifier:
            description = "Letter, Modifier";
            break;
        case GeneralCategory.LetterOther:
            description = "Letter, Other";
            break;
        case GeneralCategory.LetterTitlecase:
            description = "Letter, Titlecase";
            break;
        case GeneralCategory.LetterUppercase:
            description = "Letter, Uppercase";
            break;

        // Marks.
        case GeneralCategory.MarkEnclosing:
            description = "Mark, Enclosing";
            break;
        case GeneralCategory.MarkNonspacing:
            description = "Mark, Nonspacing";
            break;
        case GeneralCategory.MarkSpacingCombining:
            description = "Mark, Spacing Combining";
            break;

        // Numbers.
        case GeneralCategory.NumberDecimalDigit:
            description = "Number, Decimal Digit";
            break;
        case GeneralCategory.NumberLetter:
            description = "Number, Letter";
            break;
        case GeneralCategory.NumberOther:
            description = "Number, Other";
            break;

        // Other.
        case GeneralCategory.OtherControl:
            description = "Other, Control";
            break;
        case GeneralCategory.OtherFormat:
            description = "Other, Format";
            break;
        case GeneralCategory.OtherNotAssigned:
            description = "Other, Not Assigned (no characters have this property)";
            break;
        case GeneralCategory.OtherPrivateUse:
            description = "Other, Private Use";
            break;
        case GeneralCategory.OtherSurrogate:
            description = "Other, Surrogate";
            break;

        // Punctuation.
        case GeneralCategory.PunctuationClose:
            description = "Punctuation, Close";
            break;
        case GeneralCategory.PunctuationConnector:
            description = "Punctuation, Connector";
            break;
        case GeneralCategory.PunctuationDash:
            description = "Punctuation, Dash";
            break;
        case GeneralCategory.PunctuationFinalQuote:
            description = "Punctuation, Final quote (may behave like Ps or Pe depending on usage)";
            break;
        case GeneralCategory.PunctuationInitialQuote:
            description = "Punctuation, Initial quote (may behave like Ps or Pe depending on usage)";
            break;
        case GeneralCategory.PunctuationOpen:
            description = "Punctuation, Open";
            break;
        case GeneralCategory.PunctuationOther:
            description = "Punctuation, Other";
            break;

        // Separators.
        case GeneralCategory.SeparatorLine:
            description = "Separator, Line";
            break;
        case GeneralCategory.SeparatorParagraph:
            description = "Separator, Paragraph";
            break;
        case GeneralCategory.SeparatorSpace:
            description = "Separator, Space";
            break;

        // Symbols.
        case GeneralCategory.SymbolCurrency:
            description = "Symbol, Currency";
            break;
        case GeneralCategory.SymbolMath:
            description = "Symbol, Math";
            break;
        case GeneralCategory.SymbolModifier:
            description = "Symbol, Modifier";
            break;
        case GeneralCategory.SymbolOther:
            description = "Symbol, Other";
            break;

        default:
            // Unhandled value: flag it in debug builds, fall back to an empty description.
            Debug.Assert(false, category.ToString());
            description = "";
            break;
    }

    return description;
}