예제 #1
0
 /// <summary>
 /// Reads the file named by <c>BlocksFileName</c> and registers one block entry per data line.
 /// </summary>
 /// <param name="ucdSource">Data source used to open the blocks file.</param>
 /// <param name="builder">Builder that receives an <c>AddBlockEntry</c> call for every line read.</param>
 private static async Task ProcessBlocksFile(IDataSource ucdSource, UnicodeInfoBuilder builder)
 {
     var stream = await ucdSource.OpenDataFileAsync(BlocksFileName).ConfigureAwait(false);

     using (var reader = new UnicodeDataFileReader(stream, ';'))
     {
         while (reader.MoveToNextLine())
         {
             // Field order matters: the code point range comes first, then the trimmed second field
             // (presumably the block name).
             var range = UnicodeCodePointRange.Parse(reader.ReadField());
             var blockName = reader.ReadTrimmedField();

             builder.AddBlockEntry(new UnicodeBlock(range, blockName));
         }
     }
 }
예제 #2
0
 /// <summary>
 /// Reads the file named by <c>EmojiDataFileName</c> and applies each recognized emoji property to its code point range.
 /// </summary>
 /// <param name="emojiSource">Data source used to open the emoji data file.</param>
 /// <param name="builder">Builder that receives a <c>SetProperties</c> call for every recognized line.</param>
 private static async Task ProcessEmojiDataFile(IDataSource emojiSource, UnicodeInfoBuilder builder)
 {
     var stream = await emojiSource.OpenDataFileAsync(EmojiDataFileName).ConfigureAwait(false);

     using (var reader = new UnicodeDataFileReader(stream, ';'))
     {
         while (reader.MoveToNextLine())
         {
             var codePoints = UnicodeCodePointRange.Parse(reader.ReadTrimmedField());
             var propertyName = reader.ReadTrimmedField();

             // Lines whose second field is not resolved by TryGetNamedValue are skipped without error.
             if (!EnumHelper<EmojiProperties>.TryGetNamedValue(propertyName, out var property))
             {
                 continue;
             }

             builder.SetProperties(property, codePoints);
         }
     }
 }
예제 #3
0
        /// <summary>
        /// Creates a <see cref="UnicodeInfoBuilder"/> and fills it by processing every supported data file in a fixed order.
        /// </summary>
        /// <param name="ucdSource">Source of the Unicode version and of the files handled by the UCD processing steps.</param>
        /// <param name="unihanSource">Source of the files handled by the Unihan processing steps.</param>
        /// <param name="emojiSource">Source of the emoji data file.</param>
        /// <returns>The populated builder.</returns>
        public static async Task<UnicodeInfoBuilder> BuildDataAsync(IDataSource ucdSource, IDataSource unihanSource, IDataSource emojiSource)
        {
            var unicodeVersion = await ReadUnicodeVersionAsync(ucdSource).ConfigureAwait(false);
            var result = new UnicodeInfoBuilder(unicodeVersion);

            // The steps run strictly one after another; later steps look up entries created by earlier ones,
            // so the order below must not be changed.
            await ProcessUnicodeDataFile(ucdSource, result).ConfigureAwait(false);
            await ProcessPropListFile(ucdSource, result).ConfigureAwait(false);
            await ProcessDerivedCorePropertiesFile(ucdSource, result).ConfigureAwait(false);
            await ProcessEmojiDataFile(emojiSource, result).ConfigureAwait(false);
            await ProcessCjkRadicalsFile(ucdSource, result).ConfigureAwait(false);
            await ProcessNameAliasesFile(ucdSource, result).ConfigureAwait(false);
            await ProcessNamesListFile(ucdSource, result).ConfigureAwait(false);
            await ProcessBlocksFile(ucdSource, result).ConfigureAwait(false);

            // Unihan database files.
            await ProcessUnihanReadings(unihanSource, result).ConfigureAwait(false);
            await ProcessUnihanVariants(unihanSource, result).ConfigureAwait(false);
            await ProcessUnihanNumericValues(unihanSource, result).ConfigureAwait(false);
            await ProcessUnihanIrgSources(unihanSource, result).ConfigureAwait(false);

            return result;
        }
예제 #4
0
        /// <summary>
        /// Reads the file named by <c>NameAliasesFileName</c> and adds every alias to the matching code point entry.
        /// </summary>
        /// <param name="ucdSource">Data source used to open the name aliases file.</param>
        /// <param name="builder">Builder whose existing code point entries receive the aliases.</param>
        /// <exception cref="InvalidDataException">
        /// The alias kind is not a named value of <c>UnicodeNameAliasKind</c>, or the code point has no entry in the builder.
        /// </exception>
        private static async Task ProcessNameAliasesFile(IDataSource ucdSource, UnicodeInfoBuilder builder)
        {
            using (var reader = new UnicodeDataFileReader(await ucdSource.OpenDataFileAsync(NameAliasesFileName).ConfigureAwait(false), ';'))
            {
                while (reader.MoveToNextLine())
                {
                    // The data file is machine-readable: always parse it with the invariant culture.
                    int codePoint = int.Parse(reader.ReadField(), NumberStyles.HexNumber, CultureInfo.InvariantCulture);
                    var ucd = builder.GetUcd(codePoint);

                    // GetUcd may return null for code points that were never defined; fail with a descriptive
                    // error instead of a NullReferenceException further down.
                    if (ucd == null)
                    {
                        throw new InvalidDataException("Did not expect a name alias for undefined code point U+" + codePoint.ToString("X4", CultureInfo.InvariantCulture) + ".");
                    }

                    string name     = reader.ReadField();
                    string kindName = reader.ReadField();

                    if (!EnumHelper<UnicodeNameAliasKind>.TryGetNamedValue(kindName, out var kind))
                    {
                        throw new InvalidDataException("Unrecognized name alias: " + kindName + ".");
                    }

                    ucd.NameAliases.Add(new UnicodeNameAlias(name, kind));
                }
            }
        }
예제 #5
0
        /// <summary>
        /// Reads the file named by <c>CjkRadicalsFileName</c> and stores the radical data of every line in the builder.
        /// </summary>
        /// <remarks>
        /// A radical number followed by an apostrophe marks a simplified form; it must directly follow the
        /// entry carrying the same number. Plain radical numbers must appear in strictly consecutive order, starting at 1.
        /// </remarks>
        /// <param name="ucdSource">Data source used to open the CJK radicals file.</param>
        /// <param name="builder">Builder receiving the radical information.</param>
        /// <exception cref="InvalidDataException">The radical number is missing or out of sequence.</exception>
        /// <exception cref="InvalidOperationException">
        /// A non-simplified radical code point has bit 15 set, or the last radical read does not equal
        /// <c>UnicodeInfoBuilder.CjkRadicalCount</c>.
        /// </exception>
        private static async Task ProcessCjkRadicalsFile(IDataSource ucdSource, UnicodeInfoBuilder builder)
        {
            using (var reader = new UnicodeDataFileReader(await ucdSource.OpenDataFileAsync(CjkRadicalsFileName).ConfigureAwait(false), ';'))
            {
                int lastReadRadical = 0;

                while (reader.MoveToNextLine())
                {
                    string radicalIndexText = reader.ReadField();

                    // Guard against an empty field, which would otherwise fail with an index error below.
                    if (string.IsNullOrEmpty(radicalIndexText))
                    {
                        throw new InvalidDataException("Missing radical number in the CJK radicals data.");
                    }

                    bool isSimplified = radicalIndexText[radicalIndexText.Length - 1] == '\'';
                    int  radicalIndex = int.Parse(
                        isSimplified ? radicalIndexText.Substring(0, radicalIndexText.Length - 1) : radicalIndexText,
                        CultureInfo.InvariantCulture);

                    bool isOutOfSequence;

                    if (isSimplified)
                    {
                        // A simplified form refers to the radical that was read last.
                        isOutOfSequence = radicalIndex != lastReadRadical;
                    }
                    else
                    {
                        // A regular entry must be the direct successor of the previous radical.
                        isOutOfSequence = radicalIndex != lastReadRadical + 1;
                        lastReadRadical = radicalIndex;
                    }

                    if (isOutOfSequence)
                    {
                        throw new InvalidDataException("Did not expect radical number " + radicalIndexText + ".");
                    }

                    // The checked casts reject code points that do not fit in a single UTF-16 code unit.
                    char radicalCodePoint   = checked((char)int.Parse(reader.ReadTrimmedField(), NumberStyles.HexNumber, CultureInfo.InvariantCulture));
                    char characterCodePoint = checked((char)int.Parse(reader.ReadTrimmedField(), NumberStyles.HexNumber, CultureInfo.InvariantCulture));

                    // Regular radicals must not have bit 15 set.
                    if (!isSimplified && (radicalCodePoint & 0x8000) != 0)
                    {
                        throw new InvalidOperationException("Did not expect the radical code point to be higher than U+8000 for radical " + radicalIndex.ToString(CultureInfo.InvariantCulture) + ".");
                    }

                    if (isSimplified)
                    {
                        // Merge the simplified form into the data already stored for this radical.
                        builder.SetRadicalInfo(radicalIndex, UpdateRadicalData(builder.GetRadicalInfo(radicalIndex), radicalCodePoint, characterCodePoint));
                    }
                    else
                    {
                        builder.SetRadicalInfo(radicalIndex, new CjkRadicalData(radicalCodePoint, characterCodePoint));
                    }
                }

                if (lastReadRadical != UnicodeInfoBuilder.CjkRadicalCount)
                {
                    throw new InvalidOperationException("There was not enough data for the 214 CJK radicals.");
                }
            }
        }
예제 #6
0
        /// <summary>
        /// Reads the file named by <c>UnicodeDataFileName</c> and inserts one <c>UnicodeCharacterDataBuilder</c> per entry into the builder.
        /// </summary>
        /// <remarks>
        /// Two consecutive lines whose names end with <c>, First&gt;</c> and <c>, Last&gt;</c> are merged into a single
        /// entry spanning the whole range; the properties are taken from the "Last" line.
        /// </remarks>
        /// <param name="ucdSource">Data source used to open the file.</param>
        /// <param name="builder">Builder receiving the character data.</param>
        /// <exception cref="InvalidDataException">
        /// The range markers are unbalanced, a name tag is not recognized, the Bidi_Class value is unknown,
        /// or the numeric fields are inconsistent.
        /// </exception>
        private static async Task ProcessUnicodeDataFile(IDataSource ucdSource, UnicodeInfoBuilder builder)
        {
            using (var reader = new UnicodeDataFileReader(await ucdSource.OpenDataFileAsync(UnicodeDataFileName).ConfigureAwait(false), ';'))
            {
                // Start of a pending "<..., First>" range, or -1 when no range is open.
                int rangeStartCodePoint = -1;

                while (reader.MoveToNextLine())
                {
                    var codePoint = new UnicodeCodePointRange(int.Parse(reader.ReadField(), NumberStyles.HexNumber, CultureInfo.InvariantCulture));

                    string name = reader.ReadField();

                    if (!string.IsNullOrEmpty(name) && name[0] == '<' && name[name.Length - 1] == '>')
                    {
                        if (name.EndsWith(", First>", StringComparison.OrdinalIgnoreCase))
                        {
                            if (rangeStartCodePoint >= 0)
                            {
                                throw new InvalidDataException("Invalid range data in UnicodeData.txt.");
                            }

                            rangeStartCodePoint = codePoint.FirstCodePoint;

                            // The remaining fields of a "First" line are ignored; the "Last" line supplies them.
                            continue;
                        }
                        else if (name.EndsWith(", Last>", StringComparison.OrdinalIgnoreCase))
                        {
                            if (rangeStartCodePoint < 0)
                            {
                                throw new InvalidDataException("Invalid range data in UnicodeData.txt.");
                            }

                            codePoint = new UnicodeCodePointRange(rangeStartCodePoint, codePoint.LastCodePoint);

                            // Strip the leading "<" and the trailing ", Last>" (8 characters in total), then
                            // upper-case the name in order to respect the Unicode naming scheme. (Spec says all names are uppercase ASCII)
                            name = name.Substring(1, name.Length - 8).ToUpperInvariant();

                            rangeStartCodePoint = -1;
                        }
                        else if (name == "<control>") // Ignore the name of the property for these code points, as it should really be empty by the spec.
                        {
                            // For control characters, we can derive a character label of the form <control-NNNN>, which is not the character name.
                            name = null;
                        }
                        else
                        {
                            throw new InvalidDataException("Unexpected code point name tag: " + name + ".");
                        }
                    }
                    else if (rangeStartCodePoint >= 0)
                    {
                        // A "First" line must be immediately followed by its "Last" line.
                        throw new InvalidDataException("Invalid range data in UnicodeData.txt.");
                    }

                    // NB: Fields 10 and 11 are deemed obsolete. Field 11 should always be empty, and will be ignored here.
                    // The initializer reads the fields in file order; do not reorder its members.
                    var characterData = new UnicodeCharacterDataBuilder(codePoint)
                    {
                        Name     = NullIfEmpty(name),
                        Category = UnicodeCategoryInfo.FromShortName(reader.ReadField()).Category,
                        CanonicalCombiningClass = (CanonicalCombiningClass)byte.Parse(reader.ReadField(), CultureInfo.InvariantCulture),
                    };

                    if (!EnumHelper<BidirectionalClass>.TryGetNamedValue(reader.ReadField(), out var bidirectionalClass))
                    {
                        throw new InvalidDataException(string.Format(CultureInfo.InvariantCulture, "Missing Bidi_Class property for code point(s) {0}.", codePoint));
                    }

                    characterData.BidirectionalClass = bidirectionalClass;

                    characterData.CharacterDecompositionMapping = CharacterDecompositionMapping.Parse(NullIfEmpty(reader.ReadField()));

                    string numericDecimalField = NullIfEmpty(reader.ReadField());
                    string numericDigitField   = NullIfEmpty(reader.ReadField());
                    string numericNumericField = NullIfEmpty(reader.ReadField());

                    characterData.BidirectionalMirrored = reader.ReadField() == "Y";
                    characterData.OldName = NullIfEmpty(reader.ReadField());
                    reader.SkipField();
                    characterData.SimpleUpperCaseMapping = ParseSimpleCaseMapping(reader.ReadField());
                    characterData.SimpleLowerCaseMapping = ParseSimpleCaseMapping(reader.ReadField());
                    characterData.SimpleTitleCaseMapping = ParseSimpleCaseMapping(reader.ReadField());

                    // Handle Numeric_Type & Numeric_Value:
                    // If field 6 is set, fields 7 and 8 should have the same value, and Numeric_Type is Decimal.
                    // If field 6 is not set but field 7 is set, field 8 should be set and have the same value. Then, the type is Digit.
                    // If field 6 and 7 are not set, but field 8 is set, then Numeric_Type is Numeric.
                    if (numericNumericField != null)
                    {
                        characterData.NumericValue = UnicodeRationalNumber.Parse(numericNumericField);

                        if (numericDigitField != null)
                        {
                            if (numericDigitField != numericNumericField)
                            {
                                throw new InvalidDataException("Invalid value for field 7 of code point " + characterData.CodePointRange.ToString() + ".");
                            }

                            if (numericDecimalField != null)
                            {
                                if (numericDecimalField != numericDigitField)
                                {
                                    throw new InvalidDataException("Invalid value for field 6 of code point " + characterData.CodePointRange.ToString() + ".");
                                }
                                characterData.NumericType = UnicodeNumericType.Decimal;
                            }
                            else
                            {
                                characterData.NumericType = UnicodeNumericType.Digit;
                            }
                        }
                        else
                        {
                            characterData.NumericType = UnicodeNumericType.Numeric;
                        }
                    }

                    builder.Insert(characterData);
                }

                // A "First" line without its "Last" counterpart at the end of the file would otherwise
                // drop the whole range silently.
                if (rangeStartCodePoint >= 0)
                {
                    throw new InvalidDataException("Invalid range data in UnicodeData.txt.");
                }
            }
        }
예제 #7
0
        /// <summary>
        /// Reads the file named by <c>UnihanIrgSourcesFileName</c> and records the radical/stroke counts found in
        /// <c>UnihanProperty.RSUnicode</c> values. All other properties are ignored.
        /// </summary>
        /// <remarks>
        /// Each space-separated value is decoded as <c>radical.strokes</c> or <c>radical'.strokes</c>, where the
        /// apostrophe marks a simplified radical.
        /// </remarks>
        /// <param name="unihanDataSource">Data source used to open the file.</param>
        /// <param name="builder">Builder whose Unihan entries receive the radical/stroke counts.</param>
        /// <exception cref="InvalidDataException">A value does not contain a valid radical/stroke separator.</exception>
        private static async Task ProcessUnihanIrgSources(IDataSource unihanDataSource, UnicodeInfoBuilder builder)
        {
            using (var reader = new UnihanDataFileReader(await unihanDataSource.OpenDataFileAsync(UnihanIrgSourcesFileName).ConfigureAwait(false)))
            {
                while (reader.Read())
                {
                    switch (reader.PropertyName)
                    {
                    case UnihanProperty.RSUnicode:
                        var entry = builder.GetUnihan(reader.CodePoint);

                        foreach (string value in reader.PropertyValue.Split(' '))
                        {
                            // Locate the first apostrophe or dot, which ends the radical number.
                            int index = 0;

                            while (index < value.Length && value[index] != '\'' && value[index] != '.')
                            {
                                ++index;
                            }

                            if (index == value.Length)
                            {
                                throw new InvalidDataException("Failed to decode value for kRSUnicode / Unicode_Radical_Stroke.");
                            }

                            bool isSimplified = value[index] == '\'';

                            // An apostrophe must be followed by the dot; otherwise the stroke count would be read
                            // from the wrong position (or past the end of the string).
                            if (isSimplified && (index + 1 >= value.Length || value[index + 1] != '.'))
                            {
                                throw new InvalidDataException("Failed to decode value for kRSUnicode / Unicode_Radical_Stroke.");
                            }

                            int strokeCountOffset = index + (isSimplified ? 2 : 1);

                            entry.UnicodeRadicalStrokeCounts.Add(
                                new UnicodeRadicalStrokeCount(
                                    byte.Parse(value.Substring(0, index), NumberStyles.None, CultureInfo.InvariantCulture),
                                    sbyte.Parse(value.Substring(strokeCountOffset), NumberStyles.AllowLeadingSign, CultureInfo.InvariantCulture),
                                    isSimplified));
                        }
                        break;

                    default:
                        // Ignore unhandled properties for now.
                        break;
                    }
                }
            }
        }
예제 #8
0
        /// <summary>
        /// Reads the file named by <c>UnihanNumericValuesFileName</c> and stores the numeric type and value of every entry.
        /// </summary>
        /// <param name="unihanDataSource">Data source used to open the file.</param>
        /// <param name="builder">Builder whose Unihan entries receive the numeric information.</param>
        /// <exception cref="InvalidDataException">The file contains a property other than the three numeric ones.</exception>
        private static async Task ProcessUnihanNumericValues(IDataSource unihanDataSource, UnicodeInfoBuilder builder)
        {
            using (var reader = new UnihanDataFileReader(await unihanDataSource.OpenDataFileAsync(UnihanNumericValuesFileName).ConfigureAwait(false)))
            {
                while (reader.Read())
                {
                    var entry = builder.GetUnihan(reader.CodePoint);

                    switch (reader.PropertyName)
                    {
                    case UnihanProperty.AccountingNumeric:
                        entry.NumericType = UnihanNumericType.Accounting;
                        break;

                    case UnihanProperty.OtherNumeric:
                        entry.NumericType = UnihanNumericType.Other;
                        break;

                    case UnihanProperty.PrimaryNumeric:
                        entry.NumericType = UnihanNumericType.Primary;
                        break;

                    default:
                        throw new InvalidDataException("Unrecognized property name: " + reader.PropertyName + ".");
                    }

                    // The value comes from a machine-readable file: parse it independently of the current culture.
                    entry.NumericValue = long.Parse(reader.PropertyValue, CultureInfo.InvariantCulture);
                }
            }
        }
예제 #9
0
        /// <summary>
        /// Reads the file named by <c>UnihanVariantsFileName</c> and stores the simplified and traditional variants.
        /// Every other property is skipped without touching the builder.
        /// </summary>
        /// <param name="unihanDataSource">Data source used to open the file.</param>
        /// <param name="builder">Builder whose Unihan entries receive the variants.</param>
        private static async Task ProcessUnihanVariants(IDataSource unihanDataSource, UnicodeInfoBuilder builder)
        {
            var stream = await unihanDataSource.OpenDataFileAsync(UnihanVariantsFileName).ConfigureAwait(false);

            using (var reader = new UnihanDataFileReader(stream))
            {
                while (reader.Read())
                {
                    // The Unihan entry is only requested for handled properties.
                    switch (reader.PropertyName)
                    {
                    case UnihanProperty.SimplifiedVariant:
                        builder.GetUnihan(reader.CodePoint).SimplifiedVariant = char.ConvertFromUtf32(HexCodePoint.ParsePrefixed(reader.PropertyValue));
                        break;

                    case UnihanProperty.TraditionalVariant:
                        builder.GetUnihan(reader.CodePoint).TraditionalVariant = char.ConvertFromUtf32(HexCodePoint.ParsePrefixed(reader.PropertyValue));
                        break;

                    default:
                        // Ignore unhandled properties for now.
                        break;
                    }
                }
            }
        }
예제 #10
0
        /// <summary>
        /// Reads the file named by <c>UnihanReadingsFileName</c> and stores the definition and the supported readings.
        /// Every other property is skipped without touching the builder.
        /// </summary>
        /// <param name="unihanDataSource">Data source used to open the file.</param>
        /// <param name="builder">Builder whose Unihan entries receive the readings.</param>
        private static async Task ProcessUnihanReadings(IDataSource unihanDataSource, UnicodeInfoBuilder builder)
        {
            var stream = await unihanDataSource.OpenDataFileAsync(UnihanReadingsFileName).ConfigureAwait(false);

            using (var reader = new UnihanDataFileReader(stream))
            {
                while (reader.Read())
                {
                    // The Unihan entry is only requested when there is meaningful data to store.
                    switch (reader.PropertyName)
                    {
                    case UnihanProperty.Definition:
                        builder.GetUnihan(reader.CodePoint).Definition = reader.PropertyValue;
                        break;

                    case UnihanProperty.Mandarin:
                        builder.GetUnihan(reader.CodePoint).MandarinReading = reader.PropertyValue;
                        break;

                    case UnihanProperty.Cantonese:
                        builder.GetUnihan(reader.CodePoint).CantoneseReading = reader.PropertyValue;
                        break;

                    case UnihanProperty.JapaneseKun:
                        builder.GetUnihan(reader.CodePoint).JapaneseKunReading = reader.PropertyValue;
                        break;

                    case UnihanProperty.JapaneseOn:
                        builder.GetUnihan(reader.CodePoint).JapaneseOnReading = reader.PropertyValue;
                        break;

                    case UnihanProperty.Korean:
                        builder.GetUnihan(reader.CodePoint).KoreanReading = reader.PropertyValue;
                        break;

                    case UnihanProperty.Hangul:
                        builder.GetUnihan(reader.CodePoint).HangulReading = reader.PropertyValue;
                        break;

                    case UnihanProperty.Vietnamese:
                        builder.GetUnihan(reader.CodePoint).VietnameseReading = reader.PropertyValue;
                        break;

                    default:
                        // Ignore unhandled properties for now.
                        break;
                    }
                }
            }
        }
예제 #11
0
        /// <summary>
        /// Reads the file named by <c>NamesListFileName</c> and collects the cross-references of every code point entry.
        /// </summary>
        /// <remarks>
        /// A line starting with a hexadecimal digit opens the entry of a code point. Following lines starting with
        /// a tab and <c>x</c> add cross-references, either as <c>x HHHH</c> or as <c>x (name - HHHH)</c>. An empty
        /// line, or a line starting with <c>@</c>, <c>;</c> or a tab outside of an entry, closes the current entry.
        /// </remarks>
        /// <param name="ucdSource">Data source used to open the file.</param>
        /// <param name="builder">Builder whose existing code point entries receive the cross-references.</param>
        /// <exception cref="InvalidDataException">A line does not match any of the recognized forms.</exception>
        private static async Task ProcessNamesListFile(IDataSource ucdSource, UnicodeInfoBuilder builder)
        {
            using (var reader = new StreamReader(await ucdSource.OpenDataFileAsync(NamesListFileName).ConfigureAwait(false), Encoding.UTF8, false))
            {
                string line;
                // Entry of the code point currently being described, or null when lines must be skipped.
                UnicodeCharacterDataBuilder characterData = null;

                while ((line = reader.ReadLine()) != null)
                {
                    if (line.Length == 0)
                    {
                        characterData = null;
                        continue;
                    }

                    if (characterData != null && line.Length > 3 && line[0] == '\t')
                    {
                        if (line[1] == 'x')
                        {
                            // We should get at least 7 characters for a valid line: <tab> "x" <space> [0-9A-Z][0-9A-Z][0-9A-Z][0-9A-Z]
                            if (line.Length < 7)
                            {
                                characterData = null;
                                continue;
                            }
                            if (line[2] != ' ')
                            {
                                throw new InvalidDataException();
                            }

                            int length;

                            if (line[3].IsHexDigit())
                            {
                                // Form "x HHHH[ ...]": the code point runs up to the next space or the end of the line.
                                length = line.IndexOf(' ', 3);
                                if (length < 0)
                                {
                                    length = line.Length;
                                }
                                length -= 3;

                                characterData.CrossRerefences.Add(int.Parse(line.Substring(3, length), NumberStyles.HexNumber, CultureInfo.InvariantCulture));
                            }
                            else if (line[3] == '(')
                            {
                                // Form "x (name - HHHH)" or "x (<label> - HHHH)": the code point follows the dash.
                                bool hasBrackets     = line[4] == '<';
                                // Ordinal search: the separators are plain ASCII and must not be matched linguistically.
                                int  codePointOffset = line.IndexOf(hasBrackets ? "> - " : "- ", 4, StringComparison.Ordinal);

                                if (codePointOffset < 0)
                                {
                                    throw new InvalidDataException();
                                }
                                codePointOffset += hasBrackets ? 4 : 2;

                                length = line.IndexOf(')', codePointOffset);
                                if (length < 0)
                                {
                                    throw new InvalidDataException();
                                }
                                length -= codePointOffset;

                                characterData.CrossRerefences.Add(int.Parse(line.Substring(codePointOffset, length), NumberStyles.HexNumber, CultureInfo.InvariantCulture));
                            }
                            else
                            {
                                throw new InvalidDataException();
                            }
                        }
                        continue;
                    }

                    if (line[0].IsHexDigit())
                    {
                        int separatorIndex = line.IndexOf('\t');

                        // Without a tab there is no code point field to extract.
                        if (separatorIndex < 0)
                        {
                            throw new InvalidDataException("Unrecognized data in NamesList.txt.");
                        }

                        int codePoint = int.Parse(line.Substring(0, separatorIndex), NumberStyles.HexNumber, CultureInfo.InvariantCulture);
                        // This may return null, but for now, we will ignore code points that are not defined in UnicodeData.txt.
                        characterData = builder.GetUcd(codePoint);
                        // There should be no NamesList.txt entries for code points defined in a range.
                        if (characterData != null && !characterData.CodePointRange.IsSingleCodePoint)
                        {
                            // The only exception to this rule will be when we added the "Noncharacter_Code_Point" property to a few ranges, and we will ignore those.
                            if ((characterData.ContributoryProperties & ContributoryProperties.NonCharacterCodePoint) != 0)
                            {
                                characterData = null;
                            }
                            else
                            {
                                throw new InvalidDataException("Did not expect an NamesList.txt entry for U+" + codePoint.ToString("X4", CultureInfo.InvariantCulture) + ".");
                            }
                        }
                        continue;
                    }

                    switch (line[0])
                    {
                    case '@':
                    case ';':
                    case '\t':
                        characterData = null;
                        break;

                    default:
                        throw new InvalidDataException("Unrecognized data in NamesList.txt.");
                    }
                }
            }
        }