/// <summary>
/// Creates a group entry from its range, names, ids and chart.
/// The supplied range is copied and the group starts without sub-groups.
/// </summary>
/// <param name="range">Range to copy into UnicodeRange</param>
/// <param name="groupName">Value stored in GroupName</param>
/// <param name="name">Value stored in Name</param>
/// <param name="ids">Value stored in Ids</param>
/// <param name="chart">Value stored in UnicodeChart</param>
public Group(UnicodeRange range, string groupName, string name, string ids, UnicodeChart chart)
{
    // Copy first so the group never shares the caller's range instance
    UnicodeRange rangeCopy = new UnicodeRange(range);

    this.UnicodeRange = rangeCopy;
    this.UnicodeChart = chart;
    this.GroupName = groupName;
    this.Name = name;
    this.Ids = ids;
    this.SubGroups = null;
}
/// <summary>
/// Define LineBreakProperty class,
/// <a href="http://unicode.org/reports/tr13/tr13-5.html">Newline</a>
/// </summary>
/// <param name="expectedRange">Range that must contain at least one line break code point</param>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when no line break code point falls inside <paramref name="expectedRange"/>.
/// </exception>
public LineBreakProperty(UnicodeRange expectedRange)
{
    if (InitializeLineBreakCharDictionary(expectedRange))
    {
        return;
    }

    // Use the (paramName, message) overload: the single-string overload would
    // treat the whole text as the parameter name rather than as the message.
    throw new ArgumentOutOfRangeException(
        "expectedRange",
        "LineBreakProperty, Linebreak ranges are beyond expected range, " +
        String.Format(CultureInfo.InvariantCulture, "0x{0:X}", expectedRange.StartOfUnicodeRange) +
        " - " +
        String.Format(CultureInfo.InvariantCulture, "0x{0:X}", expectedRange.EndOfUnicodeRange) +
        ". Refer to CR, LF, CRLF, NEL, VT, FF, LS, and PS.");
}
/// <summary>
/// Get a random Unicode point (points if it is Surrogate) from the given range
/// </summary>
/// <param name="range">Range to draw from; both ends are inclusive</param>
/// <param name="iterations">Number of code points to generate</param>
/// <param name="exclusions">Code points that must not be produced, or null; sorted in place</param>
/// <param name="seed">Seed for the random number generator</param>
/// <returns>
/// The generated code points as a string; code points above 0xFFFF are written as a surrogate pair.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when MAXNUMITERATION consecutive redraws were needed for a single code point.
/// </exception>
public static string GetRandomCodePoint(UnicodeRange range, int iterations, int [] exclusions, int seed)
{
    Random rand = new Random(seed);
    System.Text.StringBuilder builder = new System.Text.StringBuilder();

    // Random.Next excludes its upper bound while EndOfUnicodeRange is inclusive,
    // so add one; otherwise the last code point of the range is never generated.
    int lowerBound = range.StartOfUnicodeRange;
    int upperBound = range.EndOfUnicodeRange + 1;

    if (null != exclusions)
    {
        // BinarySearch below requires a sorted array
        Array.Sort(exclusions);
    }

    for (int i = 0; i < iterations; i++)
    {
        int codePoint = rand.Next(lowerBound, upperBound);

        if (null != exclusions)
        {
            int ctr = 0;
            while (Array.BinarySearch(exclusions, codePoint) >= 0)
            {
                codePoint = rand.Next(lowerBound, upperBound);
                ctr++;
                if (MAXNUMITERATION == ctr)
                {
                    // (paramName, message) overload so the text ends up in Message
                    throw new ArgumentOutOfRangeException(
                        "exclusions",
                        "TextUtil, " + ctr + " loop has been reached. GetRandomCodePoint may have infinite loop." +
                        " Range " +
                        String.Format(CultureInfo.InvariantCulture, "0x{0:X}", range.StartOfUnicodeRange) +
                        " - " +
                        String.Format(CultureInfo.InvariantCulture, "0x{0:X}", range.EndOfUnicodeRange) +
                        " are likely excluded ");
                }
            }
        }

        if (codePoint > 0xFFFF)
        {
            // In case it is surrogate
            builder.Append(Convert.ToChar((codePoint - 0x10000) / 0x400 + 0xD800));
            builder.Append(Convert.ToChar((codePoint - 0x10000) % 0x400 + 0xDC00));
        }
        else
        {
            builder.Append(Convert.ToChar(codePoint));
        }
    }

    return builder.ToString();
}
/// <summary>
/// Computes the part of <paramref name="range"/> that lies inside <paramref name="expectedRange"/>.
/// </summary>
/// <param name="range">Range to be clipped</param>
/// <param name="expectedRange">Range to clip against</param>
/// <returns>
/// A new UnicodeRange: a copy of <paramref name="range"/> when <paramref name="expectedRange"/>
/// spans 0 to TextUtil.MaxUnicodePoint, otherwise the overlap of the two ranges.
/// Returns null when the ranges do not overlap.
/// </returns>
public static UnicodeRange GetRange(UnicodeRange range, UnicodeRange expectedRange)
{
    int expectedStart = expectedRange.StartOfUnicodeRange;
    int expectedEnd = expectedRange.EndOfUnicodeRange;
    int rangeStart = range.StartOfUnicodeRange;
    int rangeEnd = range.EndOfUnicodeRange;

    // The whole Unicode range places no restriction on the result
    bool coversEverything = (0 == expectedStart) && (TextUtil.MaxUnicodePoint == expectedEnd);
    if (coversEverything)
    {
        return new UnicodeRange(rangeStart, rangeEnd);
    }

    bool overlaps = (expectedStart <= rangeEnd) && (expectedEnd >= rangeStart);
    if (!overlaps)
    {
        return null;
    }

    // Clip to the larger start and the smaller end
    int low = rangeStart;
    if (expectedStart > low)
    {
        low = expectedStart;
    }

    int high = rangeEnd;
    if (expectedEnd < high)
    {
        high = expectedEnd;
    }

    return new UnicodeRange(low, high);
}
/// <summary>
/// Define SurrogatePairProperty class
/// <a href="http://www.unicode.org/charts/PDF/UD800.pdf">High Surrogates</a>
/// <a href="http://www.unicode.org/charts/PDF/UDC00.pdf">Low Surrogates</a>
/// </summary>
/// <param name="unicodeDb">Database searched for the "Surrogates" group</param>
/// <param name="expectedRanges">Ranges the surrogate data must intersect</param>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when no expected range intersects the Surrogates group or 0x10000 - TextUtil.MaxUnicodePoint.
/// </exception>
public SurrogatePairProperty(UnicodeRangeDatabase unicodeDb, Collection<UnicodeRange> expectedRanges)
{
    bool isValid = false;

    foreach (UnicodeRange range in expectedRanges)
    {
        if (RangePropertyCollector.BuildPropertyDataList(
                unicodeDb,
                range,
                surrogatePairRangeList,
                "Surrogates",
                GroupAttributes.GroupName))
        {
            foreach (UnicodeRangeProperty data in surrogatePairRangeList)
            {
                if (data.Name.Equals("High Surrogates", StringComparison.OrdinalIgnoreCase))
                {
                    highMin = data.Range.StartOfUnicodeRange;
                    highMax = data.Range.EndOfUnicodeRange;
                }
                else if (data.Name.Equals("Low Surrogates", StringComparison.OrdinalIgnoreCase))
                {
                    lowMin = data.Range.StartOfUnicodeRange;
                    lowMax = data.Range.EndOfUnicodeRange;
                }
            }

            isValid = true;
        }

        // Only store an actual overlap. Assigning unconditionally would let a later
        // non-overlapping range reset surrogateRange to null after an earlier match.
        UnicodeRange overlap = RangePropertyCollector.GetRange(
            new UnicodeRange(0x10000, TextUtil.MaxUnicodePoint),
            range);
        if (null != overlap)
        {
            surrogateRange = overlap;
            isValid = true;
        }
    }

    if (!isValid)
    {
        throw new ArgumentOutOfRangeException(
            "expectedRanges",
            "SurrogatePairProperty, SurrogatePair ranges are beyond expected range. " +
            "Refer to Surrogates range and UTF32.");
    }
}
/// <summary>
/// Registers the line break sequences in lineBreakCharDictionary and stores in
/// lineBreakCodePoints every registered character that lies inside the expected range.
/// </summary>
/// <param name="expectedRange">Inclusive range used to filter the characters</param>
/// <returns>true when at least one character was inside the range</returns>
private bool InitializeLineBreakCharDictionary(UnicodeRange expectedRange)
{
    char[] carriageReturn = { '\u000D' };
    char[] lineFeed = { '\u000A' };
    char[] carriageReturnLineFeed = { '\u000D', '\u000A' };
    char[] nextLine = { '\u0085' };
    char[] verticalTab = { '\u000B' };
    char[] formFeed = { '\u000C' };
    char[] lineSeparator = { '\u2028' };
    char[] paragraphSeparator = { '\u2029' };

    lineBreakCharDictionary.Add("CR", carriageReturn);
    lineBreakCharDictionary.Add("LF", lineFeed);
    lineBreakCharDictionary.Add("CRLF", carriageReturnLineFeed);
    lineBreakCharDictionary.Add("NEL", nextLine);
    lineBreakCharDictionary.Add("VT", verticalTab);
    lineBreakCharDictionary.Add("FF", formFeed);
    lineBreakCharDictionary.Add("LS", lineSeparator);
    lineBreakCharDictionary.Add("PS", paragraphSeparator);

    // Sized for the case where every character is in range; trimmed afterwards
    int capacity = carriageReturn.Length + lineFeed.Length + carriageReturnLineFeed.Length
                   + nextLine.Length + verticalTab.Length + formFeed.Length
                   + lineSeparator.Length + paragraphSeparator.Length;
    lineBreakCodePoints = new int[capacity];

    int count = 0;
    foreach (KeyValuePair<string, char[]> entry in lineBreakCharDictionary)
    {
        foreach (char candidate in entry.Value)
        {
            bool outside = candidate < expectedRange.StartOfUnicodeRange
                           || candidate > expectedRange.EndOfUnicodeRange;
            if (outside)
            {
                continue;
            }

            lineBreakCodePoints[count] = (int)candidate;
            count++;
        }
    }

    Array.Resize(ref lineBreakCodePoints, count);
    return count > 0;
}
/// <summary>
/// Walk through Unicode range database to build up property according to Group attribute
/// </summary>
/// <param name="unicodeDb">Database whose Scripts and SymbolsAndPunctuation groups are searched</param>
/// <param name="expectedRange">Range each matching group and sub-group is clipped against</param>
/// <param name="dataList">List that receives one entry per overlapping range</param>
/// <param name="name">Value to match, compared case-insensitively (ordinal)</param>
/// <param name="attribute">Selects which group string (GroupName, Name or Ids) is compared with name</param>
/// <returns>true when at least one entry was added to dataList</returns>
public static bool BuildPropertyDataList(
    UnicodeRangeDatabase unicodeDb,
    UnicodeRange expectedRange,
    List<UnicodeRangeProperty> dataList,
    string name,
    GroupAttributes attribute)
{
    bool isAdded = false;

    foreach (Group script in unicodeDb.Scripts)
    {
        if (IsGroupMatch(script, name, attribute)
            && AddGroupRanges(script, TextUtil.UnicodeChartType.Script, expectedRange, dataList))
        {
            isAdded = true;
        }
    }

    foreach (Group symbol in unicodeDb.SymbolsAndPunctuation)
    {
        if (IsGroupMatch(symbol, name, attribute)
            && AddGroupRanges(symbol, GetSymbolChartType(symbol), expectedRange, dataList))
        {
            isAdded = true;
        }
    }

    return isAdded;
}

// Returns true when the group string selected by attribute equals name (ordinal, ignoring case).
private static bool IsGroupMatch(Group entry, string name, GroupAttributes attribute)
{
    string value = entry.GroupName;
    if (attribute == GroupAttributes.Name)
    {
        value = entry.Name;
    }
    else if (attribute == GroupAttributes.Ids)
    {
        value = entry.Ids;
    }

    // Static Equals tolerates a null attribute value instead of throwing
    return string.Equals(value, name, StringComparison.OrdinalIgnoreCase);
}

// Classifies a symbol/punctuation group by the words found in its GroupName or Name.
private static TextUtil.UnicodeChartType GetSymbolChartType(Group symbol)
{
    if (ContainsToken(symbol.GroupName, "symbols") || ContainsToken(symbol.Name, "symbols"))
    {
        return TextUtil.UnicodeChartType.Symbol;
    }

    if (ContainsToken(symbol.GroupName, "punctuation") || ContainsToken(symbol.Name, "punctuation"))
    {
        return TextUtil.UnicodeChartType.Punctuation;
    }

    return TextUtil.UnicodeChartType.Other;
}

// Null-safe test for a lower-case token inside text, using invariant-culture lower-casing.
private static bool ContainsToken(string text, string token)
{
    return null != text && text.ToLower(CultureInfo.InvariantCulture).Contains(token);
}

// Adds the overlap of the group and of each of its sub-groups with expectedRange; true when anything was added.
private static bool AddGroupRanges(
    Group entry,
    TextUtil.UnicodeChartType type,
    UnicodeRange expectedRange,
    List<UnicodeRangeProperty> dataList)
{
    bool isAdded = false;

    UnicodeRange range = GetRange(entry.UnicodeRange, expectedRange);
    if (null != range)
    {
        dataList.Add(new UnicodeRangeProperty(type, entry.Name, entry.Ids, range));
        isAdded = true;
    }

    if (null != entry.SubGroups)
    {
        foreach (SubGroup subEntry in entry.SubGroups)
        {
            range = GetRange(subEntry.UnicodeRange, expectedRange);
            if (null != range)
            {
                dataList.Add(new UnicodeRangeProperty(type, subEntry.SubGroupName, subEntry.SubIds, range));
                isAdded = true;
            }
        }
    }

    return isAdded;
}
/// <summary>
/// Copy constructor
/// </summary>
/// <param name="range">A UnicodeRange object to be copied</param>
/// <exception cref="System.ArgumentNullException">
/// Thrown when <paramref name="range"/> is null.
/// </exception>
public UnicodeRange(UnicodeRange range)
{
    // Report a null argument explicitly rather than failing with a NullReferenceException
    if (null == range)
    {
        throw new System.ArgumentNullException("range");
    }

    startOfUnicodeRange = range.StartOfUnicodeRange;
    endOfUnicodeRange = range.EndOfUnicodeRange;
}
/// <summary>
/// Initializes a UnicodeRangeProperty from a chart type, a name, culture ids and a range.
/// The range is stored as a new UnicodeRange built from the start and end of the given one.
/// </summary>
/// <param name="type">Value stored in Type</param>
/// <param name="name">Value stored in Name</param>
/// <param name="ids">Value stored in CultureIDs</param>
/// <param name="range">Range whose start and end are copied into Range</param>
public UnicodeRangeProperty(TextUtil.UnicodeChartType type, string name, string ids, UnicodeRange range)
{
    Type = type;
    Name = name;
    CultureIDs = ids;

    int first = range.StartOfUnicodeRange;
    int last = range.EndOfUnicodeRange;
    Range = new UnicodeRange(first, last);
}
/// <summary>
/// Define TextNormalizationProperty class
/// <a href="http://www.unicode.org/reports/tr15/">Unicode Normalization Forms</a>
/// <a href="http://www.unicode.org/charts/normalization/">Normalization charts</a>
/// </summary>
/// <param name="unicodeDb">Database searched for the groups used for text normalization</param>
/// <param name="expectedRange">Range the normalization data must intersect</param>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when neither the searched groups nor the normalization dictionary
/// contribute anything inside <paramref name="expectedRange"/>.
/// </exception>
public TextNormalizationProperty(UnicodeRangeDatabase unicodeDb, UnicodeRange expectedRange)
{
    string[] groupNames =
    {
        "Latin",
        "CJK Unified Ideographs (Han)",
        "CJK Compatibility Ideographs",
        "Katakana",
        "Hangul Jamo",
        "Hangul Syllables",
        "Arabic",
        "Greek"
    };

    bool isValid = false;

    // Every group is collected even after one has matched, so no short-circuit here
    foreach (string groupName in groupNames)
    {
        if (RangePropertyCollector.BuildPropertyDataList(
                unicodeDb,
                expectedRange,
                textNormalizationRangeList,
                groupName,
                GroupAttributes.Name))
        {
            isValid = true;
        }
    }

    if (InitializeTextNormalizationPropertyDictionary(expectedRange))
    {
        isValid = true;
    }

    if (!isValid)
    {
        // (paramName, message) overload: the single-string overload would treat the text as the parameter name
        throw new ArgumentOutOfRangeException(
            "expectedRange",
            "TextNormalizationProperty, code points for text normalization ranges are beyond expected range, " +
            String.Format(CultureInfo.InvariantCulture, "0x{0:X}", expectedRange.StartOfUnicodeRange) +
            " - " +
            String.Format(CultureInfo.InvariantCulture, "0x{0:X}", expectedRange.EndOfUnicodeRange) +
            ". Refer to Latin, CJK Unified Ideographs (Han), " +
            "CJK Compatibility Ideographs, Katakana, Hangul Jamo, Hangul Syllables, Arabic, and Greek ranges.");
    }
}
/// <summary>
/// Creates each property object requested by <paramref name="properties"/>, registers it in
/// propertyDictionary and adds its share to minNumOfCodePoint.
/// </summary>
/// <param name="properties">Requested string properties; unset (null) members are skipped</param>
/// <param name="unicodeDb">Database handed to the property constructors that take one</param>
/// <param name="expectedRange">Range handed to every property constructor</param>
private void CreateProperties(StringProperties properties, UnicodeRangeDatabase unicodeDb, UnicodeRange expectedRange)
{
    // Flag-style properties: created only when explicitly set to true
    if (null != properties.HasNumbers && (bool)properties.HasNumbers)
    {
        numberProperty = new NumberProperty(unicodeDb, expectedRange);
        minNumOfCodePoint += NumberProperty.MINNUMOFCODEPOINT;
        propertyDictionary.Add(PropertyName.Number, numberProperty);
    }

    if (null != properties.IsBidirectional && (bool)properties.IsBidirectional)
    {
        bidiProperty = new BidiProperty(unicodeDb, expectedRange);
        minNumOfCodePoint += BidiProperty.MINNUMOFCODEPOINT;
        propertyDictionary.Add(PropertyName.Bidi, bidiProperty);
    }

    // Created whenever a normalization form is set, whatever its value
    if (null != properties.NormalizationForm)
    {
        textNormalizationProperty = new TextNormalizationProperty(unicodeDb, expectedRange);
        minNumOfCodePoint += TextNormalizationProperty.MINNUMOFCODEPOINT;
        propertyDictionary.Add(PropertyName.TextNormalization, textNormalizationProperty);
    }

    // Count-style properties: created only for a non-null, non-zero count,
    // which also scales the minimum number of code points
    if (null != properties.MinNumberOfCombiningMarks && 0 != properties.MinNumberOfCombiningMarks)
    {
        combiningMarksProperty = new CombiningMarksProperty(unicodeDb, expectedRange);
        minNumOfCodePoint += CombiningMarksProperty.MINNUMOFCODEPOINT * (int)properties.MinNumberOfCombiningMarks;
        propertyDictionary.Add(PropertyName.CombiningMarks, combiningMarksProperty);
    }

    if (null != properties.MinNumberOfEndUserDefinedCodePoints && 0 != properties.MinNumberOfEndUserDefinedCodePoints)
    {
        eudcProperty = new EudcProperty(unicodeDb, expectedRange);
        minNumOfCodePoint += EudcProperty.MINNUMOFCODEPOINT * (int)properties.MinNumberOfEndUserDefinedCodePoints;
        propertyDictionary.Add(PropertyName.Eudc, eudcProperty);
    }

    if (null != properties.MinNumberOfLineBreaks && 0 != properties.MinNumberOfLineBreaks)
    {
        lineBreakProperty = new LineBreakProperty(expectedRange);
        minNumOfCodePoint += LineBreakProperty.MINNUMOFCODEPOINT * (int)properties.MinNumberOfLineBreaks;
        propertyDictionary.Add(PropertyName.LineBreak, lineBreakProperty);
    }

    if (null != properties.MinNumberOfSurrogatePairs && 0 != properties.MinNumberOfSurrogatePairs)
    {
        surrogatePairProperty = new SurrogatePairProperty(unicodeDb, expectedRange);
        minNumOfCodePoint += SurrogatePairProperty.MINNUMOFCODEPOINT * (int)properties.MinNumberOfSurrogatePairs;
        propertyDictionary.Add(PropertyName.Surrogate, surrogatePairProperty);
    }

    if (null != properties.MinNumberOfTextSegmentationCodePoints && 0 != properties.MinNumberOfTextSegmentationCodePoints)
    {
        textSegmentationProperty = new TextSegmentationProperty(unicodeDb, expectedRange);
        minNumOfCodePoint += TextSegmentationProperty.MINNUMOFCODEPOINT * (int)properties.MinNumberOfTextSegmentationCodePoints;
        propertyDictionary.Add(PropertyName.TextSegmentation, textSegmentationProperty);
    }
}
/// <summary>
/// Fills the sample grapheme cluster, grapheme cluster break, word break and sentence break
/// dictionaries, then stores in textSegmentationCodePoints (sorted ascending) every
/// registered character that lies inside the expected range.
/// </summary>
/// <param name="expectedRange">Inclusive range used to filter the characters</param>
/// <returns>true when at least one character was inside the range</returns>
private bool InitializeDictionaries(UnicodeRange expectedRange)
{
    // Sample grapheme clusters
    char[] ko = { '\u1100', '\u1161', '\u11A8' };
    sampleGraphemeClusterDictionary.Add("Ko", ko);

    char[] ta = { '\u0BA8', '\u0BBF' };
    sampleGraphemeClusterDictionary.Add("ta", ta);

    char[] th = { '\u0E40', '\u0E01' };
    sampleGraphemeClusterDictionary.Add("th", th);

    char[] devanagari = { '\u0937', '\u093F', '\u0915', '\u094D', '\u0937', '\u093F' };
    sampleGraphemeClusterDictionary.Add("devanagari", devanagari);

    char[] sk = { '\u0063', '\u0068' };
    sampleGraphemeClusterDictionary.Add("sk", sk);

    char[] other = { '\u0067', '\u0308', '\u006B', '\u02B7' };
    sampleGraphemeClusterDictionary.Add("other", other);

    // Grapheme cluster break property values
    char[] all =
    {
        '\u000D', '\u000A', '\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005',
        '\u0006', '\u0007', '\u0008', '\u0009', '\u000B', '\u000C', '\u000E', '\u000F',
        '\u0010', '\u0011', '\u0012', '\u0013', '\u0014', '\u0015', '\u0016', '\u0017',
        '\u0018', '\u0019', '\u001A', '\u001B', '\u001C', '\u001D', '\u001E', '\u001F',
        '\u0020', '\u007F', '\u0080', '\u0081', '\u0082', '\u0083', '\u0084', '\u0085',
        '\u0086', '\u0087', '\u0088', '\u0089', '\u008A', '\u008B', '\u008C', '\u008D',
        '\u008E', '\u008F', '\u0090', '\u0091', '\u0092', '\u0093', '\u0094', '\u0095',
        '\u0096', '\u0097', '\u0098', '\u0099', '\u009A', '\u009B', '\u009C', '\u009D',
        '\u009E', '\u009F', '\u00A0', '\u00AD', '\u2000', '\u2001', '\u2002', '\u2003',
        '\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009', '\u200A'
    };
    graphemeClusterBreakPropertyValuesDictionary.Add("all", all);

    char[] th1 = { '\u0E30', '\u0E32', '\u0E33', '\u0E40', '\u0E41', '\u0E42', '\u0E43', '\u0E44', '\u0E45' };
    graphemeClusterBreakPropertyValuesDictionary.Add("th", th1);

    char[] lao = { '\u0EB0', '\u0EB2', '\u0EB3', '\u0EC0', '\u0EC1', '\u0EC2', '\u0EC3', '\u0EC4' };
    graphemeClusterBreakPropertyValuesDictionary.Add("lao", lao);

    char[] ko1 =
    {
        '\u1100', '\u1101', '\u1102', '\u1103', '\u1104', '\u1105', '\u1106', '\u1107',
        '\u1108', '\u1109', '\u110A', '\u110B', '\u110C', '\u110D', '\u110E', '\u110F',
        '\u1110', '\u1111', '\u1112', '\u1113', '\u1114', '\u1115', '\u1116', '\u1117',
        '\u1118', '\u1119', '\u111A', '\u111B', '\u111C', '\u111D', '\u111E', '\u111F',
        '\u1120', '\u1121', '\u1122', '\u1123', '\u1124', '\u1125', '\u1126', '\u1127',
        '\u1128', '\u1129', '\u112A', '\u112B', '\u112C', '\u112D', '\u112E', '\u112F',
        '\u1130', '\u1131', '\u1132', '\u1133', '\u1134', '\u1135', '\u1136', '\u1137',
        '\u1138', '\u1139', '\u1140', '\u1141', '\u1142', '\u1143', '\u1144', '\u1145',
        '\u1146', '\u1147', '\u1148', '\u1149', '\u114A', '\u114B', '\u114C', '\u114D',
        '\u114E', '\u114F', '\u1150', '\u1151', '\u1152', '\u1153', '\u1154', '\u1155',
        '\u1156', '\u1157', '\u1158', '\u1159',
        // U+115F (Hangul choseong filler); this slot previously repeated '\u111F'
        '\u115F',
        '\u1160', '\u1161', '\u1162', '\u1163', '\u1164', '\u1165', '\u1166', '\u1167',
        '\u1168', '\u1169', '\u116A', '\u116B', '\u116C', '\u116D', '\u116E', '\u116F',
        '\u1170', '\u1171', '\u1172', '\u1173', '\u1174', '\u1175', '\u1176', '\u1177',
        '\u1178', '\u1179', '\u117A', '\u117B', '\u117C', '\u117D', '\u117E', '\u117F',
        '\u1180', '\u1181', '\u1182', '\u1183', '\u1184', '\u1185', '\u1186', '\u1187',
        '\u1188', '\u1189', '\u118A', '\u118B', '\u118C', '\u118D', '\u118E', '\u118F',
        '\u1190', '\u1191', '\u1192', '\u1193', '\u1194', '\u1195', '\u1196', '\u1197',
        '\u1198', '\u1199', '\u119A', '\u119B', '\u119C', '\u119E', '\u119F', '\u11A0',
        '\u11A1', '\u11A2', '\u11A8', '\u11A9', '\u11AA', '\u11AB', '\u11AC', '\u11AD',
        '\u11AE', '\u11AF', '\u11B0', '\u11B1', '\u11B2', '\u11B3', '\u11B4', '\u11B5',
        '\u11B6', '\u11B7', '\u11B8', '\u11B9', '\u11BA', '\u11BB', '\u11BC', '\u11BD',
        '\u11BE', '\u11BF', '\u11C0', '\u11C1', '\u11C2', '\u11C3', '\u11C4', '\u11C5',
        '\u11C6', '\u11C7', '\u11C8', '\u11C9', '\u11CA', '\u11CB', '\u11CC', '\u11CE',
        '\u11CF', '\u11D0', '\u11D1', '\u11D2', '\u11D3', '\u11D4', '\u11D5', '\u11D6',
        '\u11D7', '\u11D8', '\u11D9', '\u11DA', '\u11DB', '\u11DC', '\u11DE', '\u11DF',
        '\u11F0', '\u11F1', '\u11F2', '\u11F3', '\u11F4', '\u11F5', '\u11F7', '\u11F8',
        '\u11F9', '\uAC00', '\uAC1C', '\uAC38', '\uAC01', '\uAC02', '\uAC03', '\uAC04'
    };
    graphemeClusterBreakPropertyValuesDictionary.Add("ko", ko1);

    // Word break property values
    char[] all1 =
    {
        '\u000A', '\u000D', '\u000B', '\u000C', '\u0020', '\u0027', '\u0085', '\u002D',
        '\u002E', '\u202F', '\u00A0', '\u2028', '\u2029', '\u2000', '\u2001', '\u2002',
        '\u2003', '\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009', '\u200A',
        '\u2010', '\u2011', '\u2018', '\u2019', '\u201B', '\u2024', '\uFE52', '\uFF07',
        '\uFF0E', '\u00B7', '\u05F4', '\u2027', '\u003A', '\u0387', '\uFE13', '\uFE55',
        '\uFF1A', '\u066C', '\uFE50', '\uFE54', '\uFE63', '\uFF0D', '\uFF0C', '\uFF1B'
    };
    wordBreakPropertyValuesDictionary.Add("all", all1);

    char[] katakana = { '\u3031', '\u3032', '\u3033', '\u3034', '\u3035', '\u309B', '\u309C', '\u30A0', '\u30FC', '\uFF70' };
    wordBreakPropertyValuesDictionary.Add("ja", katakana);

    char[] he = { '\u05F3' };
    wordBreakPropertyValuesDictionary.Add("he", he);

    char[] hy = { '\u055A', '\u058A' };
    wordBreakPropertyValuesDictionary.Add("hy", hy);

    char[] tibet = { '\u0F0B' };
    wordBreakPropertyValuesDictionary.Add("tibet", tibet);

    char[] mongolia = { '\u1806' };
    wordBreakPropertyValuesDictionary.Add("mongolia", mongolia);

    // Sentence break property values
    char[] all2 =
    {
        '\u000A', '\u000D', '\u0085', '\u00A0', '\u05F3', '\u2000', '\u2001', '\u2002',
        '\u2003', '\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009', '\u200A',
        '\u2028', '\u2029', '\u002E', '\u2024', '\uFE52', '\uFF0E', '\u002D', '\u003A',
        '\u055D', '\u060C', '\u060D', '\u07F8', '\u1802', '\u1808', '\u2013', '\u2014',
        '\u3001', '\uFE10', '\uFE11', '\uFE13', '\uFE31', '\uFE32', '\uFE50', '\uFE51',
        '\uFE55', '\uFE58', '\uFE63', '\uFF0C', '\uFF0D', '\uFF1A', '\uFF64'
    };
    sentenceBreakPropertyValuesDictionary.Add("all", all2);

    // Sized for the case where every character is in range; trimmed afterwards
    textSegmentationCodePoints = new int[
        ko.Length + ta.Length + th.Length + devanagari.Length + sk.Length + other.Length
        + all.Length + th1.Length + lao.Length + ko1.Length
        + all1.Length + katakana.Length + he.Length + hy.Length + tibet.Length + mongolia.Length
        + all2.Length];

    int count = 0;
    count = CopyCodePointsInRange(sampleGraphemeClusterDictionary, expectedRange, textSegmentationCodePoints, count);
    count = CopyCodePointsInRange(graphemeClusterBreakPropertyValuesDictionary, expectedRange, textSegmentationCodePoints, count);
    count = CopyCodePointsInRange(wordBreakPropertyValuesDictionary, expectedRange, textSegmentationCodePoints, count);
    count = CopyCodePointsInRange(sentenceBreakPropertyValuesDictionary, expectedRange, textSegmentationCodePoints, count);

    Array.Resize(ref textSegmentationCodePoints, count);
    Array.Sort(textSegmentationCodePoints);

    return count > 0;
}

// Appends every character of source that lies inside expectedRange to destination, starting at count; returns the new count.
private static int CopyCodePointsInRange(
    Dictionary<string, char[]> source,
    UnicodeRange expectedRange,
    int[] destination,
    int count)
{
    foreach (char[] values in source.Values)
    {
        foreach (char codePoint in values)
        {
            if (codePoint >= expectedRange.StartOfUnicodeRange && codePoint <= expectedRange.EndOfUnicodeRange)
            {
                destination[count++] = (int)codePoint;
            }
        }
    }

    return count;
}
/// <summary>
/// Resolves the working Unicode range, creates the property factory and derives
/// minNumCodePoints / maxNumCodePoints from the requested string properties.
/// </summary>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when the requested or required number of code points is inconsistent with the
/// allowed maximum or with each other.
/// </exception>
private static void InitializeProperties()
{
    if (null != properties.UnicodeRange)
    {
        range = new UnicodeRange(
            properties.UnicodeRange.StartOfUnicodeRange,
            properties.UnicodeRange.EndOfUnicodeRange);
    }
    else
    {
        // No range requested: use the whole Unicode range
        range = new UnicodeRange(0, TextUtil.MaxUnicodePoint);
    }

    // Validation for Unicode range provided against each property is done when each property is created
    propertyFactory = new PropertyFactory(properties, database, range);

    // Combining mark property needs latin alphabet
    if (propertyFactory.HasProperty(PropertyFactory.PropertyName.CombiningMarks))
    {
        InitializeAlphabetRangeList();
    }

    // Get minimum number of points
    minNumCodePoints = propertyFactory.MinNumOfCodePoint;

    bool hasMin = null != properties.MinNumberOfCodePoints;
    bool hasMax = null != properties.MaxNumberOfCodePoints;

    // All throws use the (paramName, message) overload; the single-string overload
    // would treat the text as the parameter name instead of the message.
    if (!hasMin && !hasMax)
    {
        // minNumCodePoints already holds the factory minimum assigned above
        maxNumCodePoints = TextUtil.MAXNUMOFCODEPOINT;
        if (minNumCodePoints > maxNumCodePoints)
        {
            throw new ArgumentOutOfRangeException(
                "properties",
                "StringFactory, minimum number of code points needed, " + minNumCodePoints +
                ", is greater than maximum allowed " + maxNumCodePoints + ".");
        }
    }
    else if (hasMin && !hasMax)
    {
        minNumCodePoints = (int)properties.MinNumberOfCodePoints;
        if (minNumCodePoints > TextUtil.MAXNUMOFCODEPOINT)
        {
            throw new ArgumentOutOfRangeException(
                "properties",
                "StringFactory, maximum number of code points allowed is " + TextUtil.MAXNUMOFCODEPOINT + ".");
        }

        maxNumCodePoints = TextUtil.MAXNUMOFCODEPOINT;
    }
    else if (!hasMin && hasMax)
    {
        maxNumCodePoints = (int)properties.MaxNumberOfCodePoints;
        if (maxNumCodePoints < propertyFactory.MinNumOfCodePoint)
        {
            throw new ArgumentOutOfRangeException(
                "properties",
                "StringFactory, minimum number of code points needed is " + propertyFactory.MinNumOfCodePoint + ".");
        }

        minNumCodePoints = propertyFactory.MinNumOfCodePoint;
    }
    else
    {
        minNumCodePoints = (int)properties.MinNumberOfCodePoints;
        maxNumCodePoints = (int)properties.MaxNumberOfCodePoints;
        if (minNumCodePoints > maxNumCodePoints)
        {
            throw new ArgumentOutOfRangeException(
                "properties",
                "StringFactory, MinNumberOfCodePoints, " + minNumCodePoints + " cannot be bigger than " +
                "MaxNumberOfCodePoints, " + maxNumCodePoints + ".");
        }
    }
}
/// <summary>
/// Define BidiProperty class
/// <a href="http://unicode.org/reports/tr9/">Unicode Bidirectional Algorithm</a>
/// </summary>
/// <param name="unicodeDb">Database searched for the Arabic and Hebrew groups</param>
/// <param name="expectedRange">Range the Bidi data and the Latin ranges must intersect</param>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when <paramref name="expectedRange"/> contains no Bidi data, or when it intersects none of
/// 0x0030 - 0x0039, 0x0041 - 0x005A and 0x0061 - 0x007A.
/// </exception>
public BidiProperty(UnicodeRangeDatabase unicodeDb, UnicodeRange expectedRange)
{
    bool isValid = false;

    if (RangePropertyCollector.BuildPropertyDataList(
            unicodeDb,
            expectedRange,
            bidiPropertyRangeList,
            "Arabic",
            GroupAttributes.Name))
    {
        isValid = true;
    }

    if (RangePropertyCollector.BuildPropertyDataList(
            unicodeDb,
            expectedRange,
            bidiPropertyRangeList,
            "Hebrew",
            GroupAttributes.Name))
    {
        isValid = true;
    }

    if (InitializeBidiDictionary(expectedRange))
    {
        isValid = true;
    }

    if (!isValid)
    {
        // (paramName, message) overload: the single-string overload would treat the text as the parameter name
        throw new ArgumentOutOfRangeException(
            "expectedRange",
            "BidiProperty, Bidi ranges are beyond expected range, " +
            String.Format(CultureInfo.InvariantCulture, "0x{0:X}", expectedRange.StartOfUnicodeRange) +
            " - " +
            String.Format(CultureInfo.InvariantCulture, "0x{0:X}", expectedRange.EndOfUnicodeRange) +
            ". Refer to Arabic and Hebrew ranges.");
    }

    // Reset isValid to validate Latin range. Without the reset the flag is still
    // true from the Bidi check above and the Latin check below can never fail.
    isValid = false;

    int[,] latinBounds =
    {
        { 0x0030, 0x0039 },
        { 0x0041, 0x005A },
        { 0x0061, 0x007A }
    };

    for (int row = 0; row < latinBounds.GetLength(0); row++)
    {
        UnicodeRange range = RangePropertyCollector.GetRange(
            new UnicodeRange(latinBounds[row, 0], latinBounds[row, 1]),
            expectedRange);
        if (null != range)
        {
            latinRangeList.Add(range);
            isValid = true;
        }
    }

    if (!isValid)
    {
        throw new ArgumentOutOfRangeException(
            "expectedRange",
            "BidiProperty, Bidi ranges are beyond expected range, " +
            String.Format(CultureInfo.InvariantCulture, "0x{0:X}", expectedRange.StartOfUnicodeRange) +
            " - " +
            String.Format(CultureInfo.InvariantCulture, "0x{0:X}", expectedRange.EndOfUnicodeRange) +
            ". 0x0030 - 0x0039, 0x0041 - 0x005A, and 0x0061 - 0x007A ranges are needed to construct Bidi string.");
    }
}
/// <summary>
/// Registers the combining mark characters per key in combiningMarksDictionary and stores in
/// combiningMarks every registered character that lies inside the expected range.
/// </summary>
/// <param name="expectedRange">Inclusive range used to filter the characters</param>
/// <returns>true when at least one character was inside the range</returns>
private bool InitializeCombiningMarksDictionary(UnicodeRange expectedRange)
{
    char[] other =
    {
        '\u0302', '\u0307', '\u030A', '\u0315', '\u0316', '\u0317', '\u0318', '\u0319',
        '\u031A', '\u031C', '\u031D', '\u031E', '\u031F', '\u0320', '\u0321', '\u0322',
        '\u0324', '\u032A', '\u032B', '\u032C', '\u032E', '\u0330', '\u0332', '\u0333',
        '\u0334', '\u0335', '\u0336', '\u0337', '\u0338', '\u0339', '\u033A', '\u033B',
        '\u033C', '\u033D', '\u033F', '\u0346', '\u0347', '\u0348', '\u0349', '\u034A',
        '\u034B', '\u034C', '\u034D', '\u034E', '\u034F', '\u0358', '\u0359', '\u035A',
        '\u035B', '\u035C', '\u035D', '\u035E', '\u0360', '\u0361', '\u0362', '\u0323',
        '\u0328', '\u032D', '\u032F', '\u1DC8', '\u1DC9', '\u1DCA', '\u1DCE', '\u1DCF',
        '\u1DD0', '\u1DD1', '\u1DD2', '\u1DD3', '\u1DD4', '\u1DD5', '\u1DD6', '\u1DD7',
        '\u1DD8', '\u1DD9', '\u1DDA', '\u1DDB', '\u1DDC', '\u1DDD', '\u1DDE', '\u1DDF',
        '\u1DE0', '\u1DE1', '\u1DE2', '\u1DE3', '\u1DE4', '\u1DE5', '\u1DE6', '\uFE20',
        '\uFE21', '\uFE22', '\uFE23'
    };
    combiningMarksDictionary.Add("other", other);

    char[] vi = { '\u0303', '\u0308', '\u031B', '\u0323', '\u0340', '\u0341' };
    combiningMarksDictionary.Add("vi", vi);

    char[] el =
    {
        '\u0300', '\u0301', '\u0304', '\u0305', '\u0306', '\u0308', '\u0313', '\u0314',
        '\u0331', '\u0342', '\u0343', '\u0344', '\u0345', '\u1DC0', '\u1DC1', '\u1DC4',
        '\u1DC5', '\u1DC6', '\u1DC7', '\uFE24', '\uFE25', '\uFE26'
    };
    combiningMarksDictionary.Add("el", el);

    char[] hu = { '\u030B', '\u0350', '\u0351', '\u0352', '\u0353', '\u0354', '\u0355', '\u0356', '\u0357' };
    combiningMarksDictionary.Add("hu", hu);

    char[] cs = { '\u030C' };
    combiningMarksDictionary.Add("cs", cs);

    char[] id = { '\u030D', '\u030E', '\u0325' };
    combiningMarksDictionary.Add("id", id);

    char[] ms = { '\u030D', '\u030E' };
    combiningMarksDictionary.Add("ms", ms);

    char[] srsp =
    {
        '\u030F', '\u0311', '\u0313', '\u0314', '\u033E', '\u0351', '\u0352', '\u0353',
        '\u0354', '\u0355', '\u0356', '\u0357', '\u1DC3'
    };
    combiningMarksDictionary.Add("sr-sp", srsp);

    char[] hr = { '\u030F', '\u1DC3' };
    combiningMarksDictionary.Add("hr", hr);

    char[] hi = { '\u0310', '\u0325' };
    combiningMarksDictionary.Add("hi", hi);

    char[] azaz = { '\u0311', '\u0313', '\u0314', '\u033E', '\u0327' };
    combiningMarksDictionary.Add("az-az", azaz);

    char[] uzuz = { '\u0311', '\u0313', '\u0314', '\u033E' };
    combiningMarksDictionary.Add("uz-uz", uzuz);

    char[] lv = { '\u0312', '\u0326' };
    combiningMarksDictionary.Add("lv", lv);

    char[] fi = { '\u0326', '\u0350', '\u0351', '\u0352', '\u0353', '\u0354', '\u0355', '\u0356', '\u0357' };
    combiningMarksDictionary.Add("fi", fi);

    char[] hy = { '\u0313', '\u0314' };
    combiningMarksDictionary.Add("hy", hy);

    char[] he = { '\u0323' };
    combiningMarksDictionary.Add("he", he);

    char[] ar = { '\u0323' };
    combiningMarksDictionary.Add("ar", ar);

    char[] ro = { '\u0326', '\u0350', '\u0351', '\u0352', '\u0353', '\u0354', '\u0355', '\u0356', '\u0357' };
    combiningMarksDictionary.Add("ro", ro);

    char[] fr = { '\u0327' };
    combiningMarksDictionary.Add("fr", fr);

    char[] tr = { '\u0327' };
    combiningMarksDictionary.Add("tr", tr);

    char[] pl = { '\u0328' };
    combiningMarksDictionary.Add("pl", pl);

    char[] lt = { '\u0328', '\u035B', '\u1DCB', '\u1DCC' };
    combiningMarksDictionary.Add("lt", lt);

    char[] yoruba = { '\u0329' };
    combiningMarksDictionary.Add("yoruba", yoruba);

    char[] de =
    {
        '\u0329', '\u0363', '\u0364', '\u0365', '\u0366', '\u0367', '\u0368', '\u0369',
        '\u036A', '\u036B', '\u036C', '\u036D', '\u036E', '\u036F'
    };
    combiningMarksDictionary.Add("de", de);

    char[] et = { '\u0350', '\u0351', '\u0352', '\u0353', '\u0354', '\u0355', '\u0356', '\u0357' };
    combiningMarksDictionary.Add("et", et);

    char[] ru = { '\u030B', '\u0350', '\u0351', '\u0352', '\u0353', '\u0354', '\u0355', '\u0356', '\u0357', '\u1DC3' };
    combiningMarksDictionary.Add("ru", ru);

    char[] sk = { '\u0351', '\u0352', '\u0353', '\u0354', '\u0355', '\u0356', '\u0357', '\u1DC3' };
    combiningMarksDictionary.Add("sk", sk);

    char[] be = { '\u1DC3' };
    combiningMarksDictionary.Add("be", be);

    // Register the "bg" array itself; the "be" array was being passed here
    char[] bg = { '\u1DC3' };
    combiningMarksDictionary.Add("bg", bg);

    char[] mk = { '\u1DC3' };
    combiningMarksDictionary.Add("mk", mk);

    char[] sl = { '\u1DC3' };
    combiningMarksDictionary.Add("sl", sl);

    char[] uk = { '\u1DC3' };
    combiningMarksDictionary.Add("uk", uk);

    char[] symbol =
    {
        '\u20D0', '\u20D1', '\u20D2', '\u20D3', '\u20D4', '\u20D5', '\u20D6', '\u20D7',
        '\u20D8', '\u20D9', '\u20DA', '\u20DF', '\u20E0', '\u20E1', '\u20E2', '\u20E3',
        '\u20E4', '\u20E5', '\u20E6', '\u20E7', '\u20E8', '\u20E9', '\u20EA', '\u20EB',
        '\u20EC', '\u20ED', '\u20EF', '\u20F0'
    };
    combiningMarksDictionary.Add("symbol", symbol);

    bool isValid = false;
    int i = 0;

    // Sized for the case where every character is in range; trimmed afterwards
    combiningMarks = new int[
        other.Length + vi.Length + el.Length + hu.Length + cs.Length + id.Length + ms.Length
        + srsp.Length + hr.Length + hi.Length + azaz.Length + uzuz.Length + lv.Length + fi.Length
        + hy.Length + he.Length + ar.Length + ro.Length + fr.Length + tr.Length + pl.Length
        + lt.Length + yoruba.Length + de.Length + et.Length + ru.Length + sk.Length + be.Length
        + bg.Length + mk.Length + sl.Length + uk.Length + symbol.Length];

    Dictionary<string, char[]>.ValueCollection valueColl = combiningMarksDictionary.Values;
    foreach (char[] values in valueColl)
    {
        foreach (char codePoint in values)
        {
            if (codePoint >= expectedRange.StartOfUnicodeRange && codePoint <= expectedRange.EndOfUnicodeRange)
            {
                combiningMarks[i++] = (int)codePoint;
                isValid = true;
            }
        }
    }

    Array.Resize(ref combiningMarks, i);

    return isValid;
}
/// <summary>
/// Resets all property fields, then creates and registers the property objects
/// requested by <paramref name="properties"/>.
/// </summary>
/// <param name="properties">Requested string properties</param>
/// <param name="unicodeDb">Database passed on to CreateProperties</param>
/// <param name="expectedRange">Range passed on to CreateProperties</param>
public PropertyFactory(StringProperties properties, UnicodeRangeDatabase unicodeDb, UnicodeRange expectedRange)
{
    // Bookkeeping must be in place before CreateProperties adds to it
    this.propertyDictionary = new Dictionary<PropertyFactory.PropertyName, IStringProperty>();
    this.minNumOfCodePoint = 0;

    // No property exists until CreateProperties decides it is needed
    this.textSegmentationProperty = null;
    this.textNormalizationProperty = null;
    this.surrogatePairProperty = null;
    this.numberProperty = null;
    this.lineBreakProperty = null;
    this.eudcProperty = null;
    this.combiningMarksProperty = null;
    this.bidiProperty = null;

    this.CreateProperties(properties, unicodeDb, expectedRange);
}
/// <summary>
/// Define BidiProperty class
/// <a href="http://unicode.org/reports/tr9/">Unicode Bidirectional Algorithm</a>
/// </summary>
/// <param name="unicodeDb">Database searched for the Arabic and Hebrew groups</param>
/// <param name="expectedRanges">Ranges the Bidi data and the Latin ranges must intersect</param>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when the expected ranges contain no Bidi data, or when they intersect none of
/// 0x0030 - 0x0039, 0x0041 - 0x005A and 0x0061 - 0x007A.
/// </exception>
public BidiProperty(UnicodeRangeDatabase unicodeDb, Collection<UnicodeRange> expectedRanges)
{
    // Step 1: collect Bidi data. '|=' always evaluates its right side, so every collector runs.
    bool hasBidiData = false;
    foreach (UnicodeRange candidate in expectedRanges)
    {
        hasBidiData |= RangePropertyCollector.BuildPropertyDataList(
            unicodeDb, candidate, bidiPropertyRangeList, "Arabic", GroupAttributes.Name);
        hasBidiData |= RangePropertyCollector.BuildPropertyDataList(
            unicodeDb, candidate, bidiPropertyRangeList, "Hebrew", GroupAttributes.Name);
    }

    hasBidiData |= InitializeBidiDictionary(expectedRanges);

    if (!hasBidiData)
    {
        throw new ArgumentOutOfRangeException(
            "expectedRanges",
            "BidiProperty, Bidi ranges are beyond expected range. " +
            "Refer to Arabic and Hebrew ranges.");
    }

    // Step 2: validate the Latin ranges separately, with a flag of their own
    int[,] latinBounds =
    {
        { 0x0030, 0x0039 },
        { 0x0041, 0x005A },
        { 0x0061, 0x007A }
    };

    bool hasLatinRange = false;
    foreach (UnicodeRange expectedRange in expectedRanges)
    {
        for (int row = 0; row < latinBounds.GetLength(0); row++)
        {
            UnicodeRange overlap = RangePropertyCollector.GetRange(
                new UnicodeRange(latinBounds[row, 0], latinBounds[row, 1]),
                expectedRange);
            if (null == overlap)
            {
                continue;
            }

            latinRangeList.Add(overlap);
            hasLatinRange = true;
        }
    }

    if (!hasLatinRange)
    {
        throw new ArgumentOutOfRangeException(
            "expectedRanges",
            "BidiProperty, Bidi ranges are beyond expected range. " +
            "0x0030 - 0x0039, 0x0041 - 0x005A, and 0x0061 - 0x007A ranges are needed to construct Bidi string.");
    }
}
/// <summary>
/// Fills textNormalizationPropertyDictionary with tables of code points, keyed by
/// category or script name, and stores in codePointsWithDifferentNormalizationForms
/// (sorted ascending) every table entry that lies inside <paramref name="expectedRange"/>.
/// </summary>
/// <param name="expectedRange">
/// Inclusive range (StartOfUnicodeRange through EndOfUnicodeRange) a code point must fall in to be collected.
/// </param>
/// <returns>true when at least one table entry lies inside the expected range; otherwise false.</returns>
private bool InitializeTextNormalizationPropertyDictionary(UnicodeRange expectedRange)
{
    textNormalizationPropertyDictionary.Add("othersymbols", new int[]
    {
        0xFFE4, 0x21CD, 0xFFE8, 0xFFED, 0xFFEE, 0x3036, 0x1D15E, 0x1D15F, 0x1D160, 0x1D161, 0x1D162, 0x1D163, 0x1D164,
        0x1D1BB, 0x1D1BD, 0x1D1BF, 0x1D1BC, 0x1D1BE, 0x1D1C0
    });

    textNormalizationPropertyDictionary.Add("modifiersymbols", new int[]
    {
        0x00B4, 0x0384, 0x1FFD, 0x02DC, 0x00AF, 0xFFE3, 0x02D8, 0x02D9, 0x00A8, 0x1FED, 0x0385, 0x1FEE, 0x1FC1, 0x02DA,
        0x02DD, 0x1FBD, 0x1FBF, 0x1FCD, 0x1FCE, 0x1FCF, 0x1FFE, 0x1FDD, 0x1FDE, 0x1FDF, 0x00B8, 0x02DB, 0x1FC0, 0x309B,
        0x309C, 0xFF3E, 0x1FEF, 0xFF40
    });

    textNormalizationPropertyDictionary.Add("currencysymbols", new int[]
    {
        0xFE69, 0xFF04, 0xFFE0, 0xFFE1, 0xFFE5, 0xFFE6
    });

    textNormalizationPropertyDictionary.Add("mathsymbols", new int[]
    {
        0x207A, 0x208A, 0xFB29, 0xFE62, 0xFF0B, 0x2A74, 0xFE64, 0xFF1C, 0x226E, 0x207C, 0x208C, 0xFE66, 0xFF1D, 0x2A75,
        0x2A76, 0x2260, 0xFE65, 0xFF1E, 0x226F, 0xFF5C, 0xFF5E, 0xFFE2, 0xFFE9, 0x219A, 0xFFEA, 0xFFEB, 0x219B, 0xFFEC,
        0x21AE, 0x21CF, 0x21CE, 0x1D6DB, 0x1D715, 0x1D74F, 0x1D789, 0x1D7C3, 0x2204, 0x1D6C1, 0x1D6FB, 0x1D735, 0x1D76F,
        0x1D7A9, 0x2209, 0x220C, 0x2140, 0x207B, 0x208B, 0x2224, 0x2226, 0x222C, 0x222D, 0x2A0C, 0x222F, 0x2230, 0x2241,
        0x2244, 0x2247, 0x2249, 0x226D, 0x2262, 0x2270, 0x2271, 0x2274, 0x2275, 0x2278, 0x2279, 0x2280, 0x2281, 0x22E0,
        0x22E1, 0x2284, 0x2285, 0x2288, 0x2289, 0x22E2, 0x22E3, 0x22AC, 0x22AD, 0x22AE, 0x22AF, 0x22EA, 0x22EB, 0x22EC,
        0x22ED, 0x2ADC
    });

    textNormalizationPropertyDictionary.Add("modifierletter", new int[]
    {
        0x037A, 0x0374, 0xFF9E, 0xFF9F, 0xFF70
    });

    textNormalizationPropertyDictionary.Add("otherletter", new int[]
    {
        0xFE70, 0xFE72, 0xFC5E, 0xFE74, 0xFC5F, 0xFE76, 0xFC60, 0xFE78, 0xFC61, 0xFE7A, 0xFC62, 0xFE7C, 0xFC63, 0xFE7E,
        0xFE71, 0xFE77, 0xFCF2, 0xFE79, 0xFCF3, 0xFE7B, 0xFCF4, 0xFE7D, 0xFE7F
    });

    textNormalizationPropertyDictionary.Add("nonspacingmark", new int[]
    {
        0x0340, 0x0341, 0x0344, 0x0343
    });

    textNormalizationPropertyDictionary.Add("spaceseparator", new int[]
    {
        0x00A0, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202F, 0x205F,
        0x3000
    });

    // One row per digit 0-9: the fullwidth form followed by the five mathematical forms.
    // U+200A (HAIR SPACE) used to sit in the digit-one row; it is a space separator that is
    // already listed under "spaceseparator", so it was removed from this table.
    textNormalizationPropertyDictionary.Add("decimalnumber", new int[]
    {
        0xFF10, 0x1D7CE, 0x1D7D8, 0x1D7E2, 0x1D7EC, 0x1D7F6,
        0xFF11, 0x1D7CF, 0x1D7D9, 0x1D7E3, 0x1D7ED, 0x1D7F7,
        0xFF12, 0x1D7D0, 0x1D7DA, 0x1D7E4, 0x1D7EE, 0x1D7F8,
        0xFF13, 0x1D7D1, 0x1D7DB, 0x1D7E5, 0x1D7EF, 0x1D7F9,
        0xFF14, 0x1D7D2, 0x1D7DC, 0x1D7E6, 0x1D7F0, 0x1D7FA,
        0xFF15, 0x1D7D3, 0x1D7DD, 0x1D7E7, 0x1D7F1, 0x1D7FB,
        0xFF16, 0x1D7D4, 0x1D7DE, 0x1D7E8, 0x1D7F2, 0x1D7FC,
        0xFF17, 0x1D7D5, 0x1D7DF, 0x1D7E9, 0x1D7F3, 0x1D7FD,
        0xFF18, 0x1D7D6, 0x1D7E0, 0x1D7EA, 0x1D7F4, 0x1D7FE,
        0xFF19, 0x1D7D7, 0x1D7E1, 0x1D7EB, 0x1D7F5, 0x1D7FF
    });

    textNormalizationPropertyDictionary.Add("othernumber", new int[]
    {
        0x2474, 0x247D, 0x247E, 0x247F, 0x2480, 0x2481, 0x2482, 0x2483, 0x2484, 0x2485, 0x2486, 0x2475, 0x2487, 0x2476,
        0x2477, 0x2478, 0x2479, 0x247A, 0x247B, 0x247C, 0x2070, 0x2080, 0x24EA, 0x1F101, 0x1F100, 0x2189, 0x00B9, 0x2081,
        0x2460, 0x1F102, 0x2488, 0x2469, 0x2491, 0x246A, 0x2492, 0x246B, 0x2493, 0x246C, 0x2494, 0x246D, 0x2495, 0x246E,
        0x2496, 0x246F, 0x2497, 0x2470, 0x2498, 0x2499, 0x2472, 0x249A, 0x215F, 0x2152, 0x00BD, 0x2153, 0x00BC, 0x2155,
        0x2159, 0x2150, 0x215B, 0x2151, 0x00B2, 0x2082, 0x2461, 0x1F103, 0x2489, 0x2473, 0x249B, 0x3251, 0x3252, 0x3253,
        0x3254, 0x3255, 0x3256, 0x3257, 0x3258, 0x3259, 0x2154, 0x2156, 0x00B3, 0x2083, 0x2462, 0x1F104, 0x248A, 0x325A,
        0x325B, 0x325C, 0x325D, 0x325E, 0x325F, 0x32B1, 0x32B2, 0x32B3, 0x32B4, 0x00BE, 0x2157, 0x215C, 0x2074, 0x2084,
        0x2463, 0x1F105, 0x248B, 0x32B5, 0x32B6, 0x32B7, 0x32B8, 0x32B9, 0x32BA, 0x32BB, 0x32BC, 0x32BD, 0x32BE, 0x2158,
        0x2075, 0x2085, 0x2464, 0x1F106, 0x248C, 0x32BF, 0x215A, 0x215D, 0x2076, 0x2086, 0x2465, 0x1F107, 0x248D, 0x2077,
        0x2087, 0x2466, 0x1F108, 0x248E, 0x215E, 0x2078, 0x2088, 0x2467, 0x1F109, 0x248F, 0x2079, 0x2089, 0x2468, 0x1F10A,
        0x2490
    });

    textNormalizationPropertyDictionary.Add("kaithi", new int[]
    {
        0x1109A, 0x1109C, 0x110AB
    });

    textNormalizationPropertyDictionary.Add("balinese", new int[]
    {
        0x1B06, 0x1B08, 0x1B0A, 0x1B0C, 0x1B0E, 0x1B12, 0x1B3B, 0x1B3D, 0x1B40, 0x1B41, 0x1B43
    });

    textNormalizationPropertyDictionary.Add("tifinagh", new int[]
    {
        0x2D6F
    });

    textNormalizationPropertyDictionary.Add("hiragana", new int[]
    {
        0x3094, 0x304C, 0x304E, 0x3050, 0x3052, 0x3054, 0x3056, 0x3058, 0x305A, 0x305C, 0x305E, 0x3060, 0x3062, 0x3065,
        0x3067, 0x3069, 0x3070, 0x3071, 0x3073, 0x3074, 0x3076, 0x3077, 0x3079, 0x307A, 0x1F200, 0x307C, 0x307D, 0x309F,
        0x309E
    });

    textNormalizationPropertyDictionary.Add("georgian", new int[]
    {
        0x10FC
    });

    textNormalizationPropertyDictionary.Add("myanmar", new int[]
    {
        0x1026
    });

    textNormalizationPropertyDictionary.Add("tibetan", new int[]
    {
        0x0F0C, 0x0F69, 0x0F43, 0x0F4D, 0x0F52, 0x0F57, 0x0F5C, 0x0F73, 0x0F75, 0x0F81, 0x0FB9, 0x0F93, 0x0F9D, 0x0FA2,
        0x0FA7, 0x0FAC, 0x0F77, 0x0F76, 0x0F79, 0x0F78
    });

    textNormalizationPropertyDictionary.Add("lao", new int[]
    {
        0x0EDC, 0x0EDD, 0x0EB3
    });

    textNormalizationPropertyDictionary.Add("th", new int[]
    {
        0x0E33
    });

    textNormalizationPropertyDictionary.Add("sinhala", new int[]
    {
        0x0DDA, 0x0DDC, 0x0DDD, 0x0DDE
    });

    textNormalizationPropertyDictionary.Add("malayalam", new int[]
    {
        0x0D4A, 0x0D4C, 0x0D4B
    });

    textNormalizationPropertyDictionary.Add("kannada", new int[]
    {
        0x0CC0, 0x0CCA, 0x0CCB, 0x0CC7, 0x0CC8
    });

    textNormalizationPropertyDictionary.Add("telugu", new int[]
    {
        0x0C48
    });

    textNormalizationPropertyDictionary.Add("ta", new int[]
    {
        0x0B94, 0x0BCA, 0x0BCC, 0x0BCB
    });

    textNormalizationPropertyDictionary.Add("oriya", new int[]
    {
        0x0B5C, 0x0B5D, 0x0B4B, 0x0B48, 0x0B4C
    });

    textNormalizationPropertyDictionary.Add("gurmukhi", new int[]
    {
        0x0A59, 0x0A5A, 0x0A5B, 0x0A5E, 0x0A33, 0x0A36
    });

    textNormalizationPropertyDictionary.Add("bengali", new int[]
    {
        0x09DC, 0x09DD, 0x09DF, 0x09CB, 0x09CC
    });

    // U+095E was listed twice and U+095F (DEVANAGARI LETTER YYA) was missing;
    // the second U+095E is replaced by U+095F.
    textNormalizationPropertyDictionary.Add("devanagari", new int[]
    {
        0x0958, 0x0959, 0x095A, 0x095B, 0x095C, 0x095D, 0x0929, 0x095E, 0x095F, 0x0931, 0x0934
    });

    // U+FB2D was listed twice and U+FB2A (HEBREW LETTER SHIN WITH SHIN DOT) was missing;
    // the second U+FB2D is replaced by U+FB2A.
    textNormalizationPropertyDictionary.Add("he", new int[]
    {
        0x2135, 0xFB21, 0xFB2E, 0xFB2F, 0xFB30, 0xFB4F, 0x2136, 0xFB31, 0xFB4C, 0x2137, 0xFB32, 0x2138, 0xFB22, 0xFB33,
        0xFB23, 0xFB34, 0xFB4B, 0xFB35, 0xFB36, 0xFB38, 0xFB1D, 0xFB39, 0xFB3A, 0xFB24, 0xFB3B, 0xFB4D, 0xFB25, 0xFB3C,
        0xFB26, 0xFB3E, 0xFB40, 0xFB41, 0xFB20, 0xFB43, 0xFB44, 0xFB4E, 0xFB46, 0xFB47, 0xFB27, 0xFB48, 0xFB49, 0xFB2C,
        0xFB2D, 0xFB2A, 0xFB2B, 0xFB28, 0xFB4A, 0xFB1F
    });

    textNormalizationPropertyDictionary.Add("hy", new int[]
    {
        0x0587, 0xFB14, 0xFB15, 0xFB17, 0xFB13, 0xFB16
    });

    // U+FB20 (HEBREW LETTER ALTERNATIVE AYIN) used to sit between 0x04EE and 0x04EF;
    // it is a Hebrew letter already listed under "he", so it was removed from this table.
    textNormalizationPropertyDictionary.Add("cyrillic", new int[]
    {
        0x04D0, 0x04D1, 0x04D2, 0x04D3, 0x0403, 0x0453, 0x0400, 0x0450, 0x04D6, 0x04D7, 0x0401, 0x0451, 0x04C1, 0x04C2,
        0x04DC, 0x04DD, 0x04DE, 0x04DF, 0x040D, 0x045D, 0x04E2, 0x04E3, 0x0419, 0x0439, 0x04E4, 0x04E5, 0x040C, 0x045C,
        0x1D78, 0x04E6, 0x04E7, 0x04EE, 0x04EF, 0x040E, 0x045E, 0x04F0, 0x04F1, 0x04F2, 0x04F3, 0x04F4, 0x04F5, 0x04F6,
        0x04F7, 0x04F8, 0x04F9, 0x04EC, 0x04ED, 0x0407, 0x0457, 0x0476, 0x0477, 0x04DA, 0x04DB, 0x04EA, 0x04EB
    });

    // Collect into a growable list so the result size no longer depends on a
    // hand-written sum of table lengths that must be kept in sync with the tables above.
    List<int> codePointsInRange = new List<int>();
    foreach (int[] table in textNormalizationPropertyDictionary.Values)
    {
        foreach (int codePoint in table)
        {
            if (codePoint >= expectedRange.StartOfUnicodeRange && codePoint <= expectedRange.EndOfUnicodeRange)
            {
                codePointsInRange.Add(codePoint);
            }
        }
    }

    codePointsWithDifferentNormalizationForms = codePointsInRange.ToArray();
    Array.Sort(codePointsWithDifferentNormalizationForms);

    return codePointsInRange.Count > 0;
}