/// <summary>
/// Reads the file named by <c>BlocksFileName</c> from the UCD source and adds one block entry to the builder per line.
/// </summary>
/// <param name="ucdSource">Data source the blocks file is opened from.</param>
/// <param name="builder">Builder that receives a <see cref="UnicodeBlock"/> for every line read.</param>
private static async Task ProcessBlocksFile(IDataSource ucdSource, UnicodeInfoBuilder builder)
{
    var blocksStream = await ucdSource.OpenDataFileAsync(BlocksFileName).ConfigureAwait(false);

    using (var blocksReader = new UnicodeDataFileReader(blocksStream, ';'))
    {
        while (blocksReader.MoveToNextLine())
        {
            // The first field is parsed as a code point range; the second one is read trimmed
            // (presumably the block name, confirm against UnicodeBlock's constructor).
            var blockRange = UnicodeCodePointRange.Parse(blocksReader.ReadField());
            var blockLabel = blocksReader.ReadTrimmedField();

            builder.AddBlockEntry(new UnicodeBlock(blockRange, blockLabel));
        }
    }
}
/// <summary>
/// Reads the file named by <c>EmojiDataFileName</c> from the emoji source and applies each recognized
/// emoji property to the code point range declared on the same line.
/// </summary>
/// <param name="emojiSource">Data source the emoji data file is opened from.</param>
/// <param name="builder">Builder on which the properties are set.</param>
private static async Task ProcessEmojiDataFile(IDataSource emojiSource, UnicodeInfoBuilder builder)
{
    var emojiStream = await emojiSource.OpenDataFileAsync(EmojiDataFileName).ConfigureAwait(false);

    using (var emojiReader = new UnicodeDataFileReader(emojiStream, ';'))
    {
        while (emojiReader.MoveToNextLine())
        {
            var codePoints = UnicodeCodePointRange.Parse(emojiReader.ReadTrimmedField());
            var propertyName = emojiReader.ReadTrimmedField();

            // Lines whose property name has no EmojiProperties counterpart are skipped without error.
            if (!EnumHelper<EmojiProperties>.TryGetNamedValue(propertyName, out var emojiProperty))
            {
                continue;
            }

            builder.SetProperties(emojiProperty, codePoints);
        }
    }
}
/// <summary>
/// Reads the file named by <c>NameAliasesFileName</c> from the UCD source and appends every alias it declares
/// to the name alias list of the corresponding code point.
/// </summary>
/// <param name="ucdSource">Data source the name aliases file is opened from.</param>
/// <param name="builder">Builder providing the per-code-point data the aliases are added to.</param>
/// <exception cref="InvalidDataException">A line specifies an alias kind that is not a named <see cref="UnicodeNameAliasKind"/> value.</exception>
private static async Task ProcessNameAliasesFile(IDataSource ucdSource, UnicodeInfoBuilder builder)
{
    using (var reader = new UnicodeDataFileReader(await ucdSource.OpenDataFileAsync(NameAliasesFileName).ConfigureAwait(false), ';'))
    {
        while (reader.MoveToNextLine())
        {
            // Fields, in order: hexadecimal code point, alias text, alias kind.
            var ucd = builder.GetUcd(int.Parse(reader.ReadField(), NumberStyles.HexNumber));
            string name = reader.ReadField();
            string kindName = reader.ReadField();

            if (!EnumHelper<UnicodeNameAliasKind>.TryGetNamedValue(kindName, out var kind))
            {
                // The message ends with a plain period; a stray "3" used to be appended after it.
                throw new InvalidDataException("Unrecognized name alias: " + kindName + ".");
            }

            ucd.NameAliases.Add(new UnicodeNameAlias(name, kind));
        }
    }
}
/// <summary>
/// Reads the file named by <c>CjkRadicalsFileName</c> from the UCD source and stores the radical data
/// of every line into the builder.
/// </summary>
/// <remarks>
/// A radical index followed by an apostrophe is treated as the simplified form of the radical read just before it.
/// Indices without an apostrophe must appear in strictly consecutive order, starting at 1.
/// </remarks>
/// <param name="ucdSource">Data source the CJK radicals file is opened from.</param>
/// <param name="builder">Builder that receives the radical information.</param>
/// <exception cref="InvalidDataException">A radical index does not follow the expected sequence.</exception>
/// <exception cref="InvalidOperationException">
/// A non-simplified radical code point has its 0x8000 bit set, or the last radical read is not
/// <c>UnicodeInfoBuilder.CjkRadicalCount</c>.
/// </exception>
private static async Task ProcessCjkRadicalsFile(IDataSource ucdSource, UnicodeInfoBuilder builder)
{
    var radicalsStream = await ucdSource.OpenDataFileAsync(CjkRadicalsFileName).ConfigureAwait(false);

    using (var radicalsReader = new UnicodeDataFileReader(radicalsStream, ';'))
    {
        int previousRadical = 0;

        while (radicalsReader.MoveToNextLine())
        {
            string indexText = radicalsReader.ReadField();
            int lastCharIndex = indexText.Length - 1;
            bool isSimplified = indexText[lastCharIndex] == '\'';
            int radicalIndex = int.Parse(isSimplified ? indexText.Substring(0, lastCharIndex) : indexText);

            bool isExpectedIndex;

            if (isSimplified)
            {
                // A simplified form must directly follow the regular radical bearing the same number.
                isExpectedIndex = radicalIndex == previousRadical;
            }
            else
            {
                // Regular radicals must be numbered consecutively; the index is remembered in every case.
                isExpectedIndex = radicalIndex == previousRadical + 1;
                previousRadical = radicalIndex;
            }

            if (!isExpectedIndex)
            {
                throw new InvalidDataException("Did not expect radical number " + indexText + ".");
            }

            // Both code points must fit in a single UTF-16 code unit, hence the checked conversions.
            char radicalCodePoint = checked((char)int.Parse(radicalsReader.ReadTrimmedField(), NumberStyles.HexNumber));
            char characterCodePoint = checked((char)int.Parse(radicalsReader.ReadTrimmedField(), NumberStyles.HexNumber));

            if (isSimplified)
            {
                // Merge the simplified form into the data already stored for this radical.
                var updatedData = UpdateRadicalData(builder.GetRadicalInfo(radicalIndex), radicalCodePoint, characterCodePoint);
                builder.SetRadicalInfo(radicalIndex, updatedData);
                continue;
            }

            if ((radicalCodePoint & 0x8000) != 0)
            {
                throw new InvalidOperationException("Did not expect the radical code point to be higher than U+8000 for radical " + radicalIndex.ToString() + ".");
            }

            builder.SetRadicalInfo(radicalIndex, new CjkRadicalData(radicalCodePoint, characterCodePoint));
        }

        if (previousRadical != UnicodeInfoBuilder.CjkRadicalCount)
        {
            throw new InvalidOperationException("There was not enough data for the 214 CJK radicals.");
        }
    }
}
/// <summary>
/// Reads the file named by <c>UnicodeDataFileName</c> from the UCD source and inserts one character data entry
/// into the builder per code point or per code point range.
/// </summary>
/// <remarks>
/// A pair of lines whose names end with ", First&gt;" and ", Last&gt;" is merged into a single range entry.
/// Only the fields of the "Last" line are used for the range.
/// </remarks>
/// <param name="ucdSource">Data source the file is opened from.</param>
/// <param name="builder">Builder that receives the character data.</param>
/// <exception cref="InvalidDataException">
/// The range markers are unbalanced (including a "First" marker left open at the end of the file), a name tag is
/// not recognized, the bidirectional class is not recognized, or the numeric fields are inconsistent.
/// </exception>
private static async Task ProcessUnicodeDataFile(IDataSource ucdSource, UnicodeInfoBuilder builder)
{
    using (var reader = new UnicodeDataFileReader(await ucdSource.OpenDataFileAsync(UnicodeDataFileName).ConfigureAwait(false), ';'))
    {
        // First code point of a pending "<..., First>" line, or -1 when no range is currently open.
        int rangeStartCodePoint = -1;

        while (reader.MoveToNextLine())
        {
            var codePoint = new UnicodeCodePointRange(int.Parse(reader.ReadField(), NumberStyles.HexNumber));
            string name = reader.ReadField();

            if (!string.IsNullOrEmpty(name) && name[0] == '<' && name[name.Length - 1] == '>')
            {
                if (name.EndsWith(", First>", StringComparison.OrdinalIgnoreCase))
                {
                    if (rangeStartCodePoint >= 0)
                    {
                        throw new InvalidDataException("Invalid range data in UnicodeData.txt.");
                    }

                    // Remember where the range starts; the remaining fields of this line are not read.
                    rangeStartCodePoint = codePoint.FirstCodePoint;
                    continue;
                }
                else if (name.EndsWith(", Last>", StringComparison.OrdinalIgnoreCase))
                {
                    if (rangeStartCodePoint < 0)
                    {
                        throw new InvalidDataException("Invalid range data in UnicodeData.txt.");
                    }

                    codePoint = new UnicodeCodePointRange(rangeStartCodePoint, codePoint.LastCodePoint);
                    // Strip the leading '<' and the trailing ", Last>" (8 characters in total).
                    // Upper-case the name in order to respect the Unicode naming scheme. (Spec says all names are uppercase ASCII)
                    name = name.Substring(1, name.Length - 8).ToUpperInvariant();
                    rangeStartCodePoint = -1;
                }
                else if (name == "<control>") // Ignore the name of the property for these code points, as it should really be empty by the spec.
                {
                    // For control characters, we can derive a character label of the form <control-NNNN>, which is not the character name.
                    name = null;
                }
                else
                {
                    throw new InvalidDataException("Unexpected code point name tag: " + name + ".");
                }
            }
            else if (rangeStartCodePoint >= 0)
            {
                // A "First" line must be immediately followed by its "Last" line.
                throw new InvalidDataException("Invalid range data in UnicodeData.txt.");
            }

            // NB: Fields 10 and 11 are deemed obsolete. Field 11 should always be empty, and will be ignored here.
            var characterData = new UnicodeCharacterDataBuilder(codePoint)
            {
                Name = NullIfEmpty(name),
                Category = UnicodeCategoryInfo.FromShortName(reader.ReadField()).Category,
                CanonicalCombiningClass = (CanonicalCombiningClass)byte.Parse(reader.ReadField()),
            };

            if (EnumHelper<BidirectionalClass>.TryGetNamedValue(reader.ReadField(), out var bidirectionalClass))
            {
                characterData.BidirectionalClass = bidirectionalClass;
            }
            else
            {
                throw new InvalidDataException(string.Format("Missing Bidi_Class property for code point(s) {0}.", codePoint));
            }

            characterData.CharacterDecompositionMapping = CharacterDecompositionMapping.Parse(NullIfEmpty(reader.ReadField()));

            // The three numeric fields are only interpreted once the remaining fields have been read.
            string numericDecimalField = NullIfEmpty(reader.ReadField());
            string numericDigitField = NullIfEmpty(reader.ReadField());
            string numericNumericField = NullIfEmpty(reader.ReadField());

            characterData.BidirectionalMirrored = reader.ReadField() == "Y";
            characterData.OldName = NullIfEmpty(reader.ReadField());
            reader.SkipField(); // Field 11 (see the note above).
            characterData.SimpleUpperCaseMapping = ParseSimpleCaseMapping(reader.ReadField());
            characterData.SimpleLowerCaseMapping = ParseSimpleCaseMapping(reader.ReadField());
            characterData.SimpleTitleCaseMapping = ParseSimpleCaseMapping(reader.ReadField());

            // Handle Numeric_Type & Numeric_Value:
            // If field 6 is set, fields 7 and 8 should have the same value, and Numeric_Type is Decimal.
            // If field 6 is not set but field 7 is set, field 8 should be set and have the same value. Then, the type is Digit.
            // If field 6 and 7 are not set, but field 8 is set, then Numeric_Type is Numeric.
            if (numericNumericField != null)
            {
                characterData.NumericValue = UnicodeRationalNumber.Parse(numericNumericField);

                if (numericDigitField != null)
                {
                    if (numericDigitField != numericNumericField)
                    {
                        throw new InvalidDataException("Invalid value for field 7 of code point " + characterData.CodePointRange.ToString() + ".");
                    }

                    if (numericDecimalField != null)
                    {
                        if (numericDecimalField != numericDigitField)
                        {
                            throw new InvalidDataException("Invalid value for field 6 of code point " + characterData.CodePointRange.ToString() + ".");
                        }

                        characterData.NumericType = UnicodeNumericType.Decimal;
                    }
                    else
                    {
                        characterData.NumericType = UnicodeNumericType.Digit;
                    }
                }
                else
                {
                    characterData.NumericType = UnicodeNumericType.Numeric;
                }
            }

            builder.Insert(characterData);
        }

        // A "First" line that was never closed by a "Last" line before the end of the file is malformed
        // data as well: report it instead of silently dropping the whole range.
        if (rangeStartCodePoint >= 0)
        {
            throw new InvalidDataException("Invalid range data in UnicodeData.txt.");
        }
    }
}
/// <summary>
/// Initializes a new <see cref="UnihanDataFileReader"/> that reads tab-separated fields from the given stream.
/// </summary>
/// <param name="stream">Stream handed to the underlying <see cref="UnicodeDataFileReader"/>.</param>
/// <param name="leaveOpen">
/// Forwarded as-is to the underlying reader; presumably keeps <paramref name="stream"/> open once the reader
/// is disposed (confirm against <see cref="UnicodeDataFileReader"/>).
/// </param>
public UnihanDataFileReader(Stream stream, bool leaveOpen)
{
    const char FieldSeparator = '\t';

    reader = new UnicodeDataFileReader(stream, FieldSeparator, leaveOpen);
}