// BD9. Pairs every isolate initiator (LRI, RLI, FSI) with its matching PDI.
//
// outMatchingPDI[i]:
//   index of the matching PDI when types[i] is an isolate initiator that has one,
//   types.Length when types[i] is an isolate initiator without a matching PDI,
//   -1 for every other character.
// outMatchingIsolateInitiator[i]:
//   index of the matching isolate initiator when types[i] is a matched PDI,
//   -1 for every other character (including unmatched PDIs and the initiators themselves).
private static void GetMatchingPDI(byte[] types, out int[] outMatchingPDI, out int[] outMatchingIsolateInitiator)
{
    int length = types.Length;
    int[] matchingPDI = new int[length];
    int[] matchingIsolateInitiator = new int[length];

    // Indices of isolate initiators that are still waiting for their PDI; innermost is last.
    // Popping the innermost open initiator on each PDI yields the same pairing as the
    // BD9 counter scan, in a single pass.
    int[] openInitiators = new int[length];
    int openCount = 0;

    for (int i = 0; i < length; i++)
    {
        // Default for every position; overwritten below only for matched pairs.
        matchingPDI[i] = -1;
        matchingIsolateInitiator[i] = -1;

        BidiClass cct = (BidiClass)types[i];
        if (cct == BidiClass.LRI || cct == BidiClass.RLI || cct == BidiClass.FSI)
        {
            openInitiators[openCount++] = i;
        }
        else if (cct == BidiClass.PDI && openCount > 0)
        {
            int initiator = openInitiators[--openCount];
            matchingPDI[initiator] = i;
            matchingIsolateInitiator[i] = initiator;
        }
    }

    // Initiators left open run to the end of the paragraph.
    while (openCount > 0)
    {
        matchingPDI[openInitiators[--openCount]] = length;
    }

    outMatchingPDI = matchingPDI;
    outMatchingIsolateInitiator = matchingIsolateInitiator;
}
/// <summary>
/// Reads the DerivedBidiClass resource line by line and expands every parsed
/// entry into one dictionary item per code point in its (inclusive) range.
/// </summary>
/// <returns>A map from code point to the <see cref="BidiClass"/> listed for it.</returns>
/// <remarks>
/// Lines that <c>PropsFileEntry.TryParseLine</c> rejects are skipped. A code point that
/// appears in more than one entry makes <see cref="Dictionary{TKey,TValue}.Add"/> throw.
/// </remarks>
private static Dictionary<int, BidiClass> ProcessDerivedBidiClassFile()
{
    using (Stream resourceStream = Resources.OpenResource(Resources.DerivedBidiClass))
    using (StreamReader lineReader = new StreamReader(resourceStream))
    {
        var bidiClassByCodePoint = new Dictionary<int, BidiClass>();

        for (string line = lineReader.ReadLine(); line != null; line = lineReader.ReadLine())
        {
            if (!PropsFileEntry.TryParseLine(line, out PropsFileEntry entry))
            {
                continue;
            }

            // Property name as written in the file -> enum value.
            BidiClass entryClass = BidiClassMap[entry.PropName];

            int codePoint = entry.FirstCodePoint;
            while (codePoint <= entry.LastCodePoint) // LastCodePoint is inclusive
            {
                bidiClassByCodePoint.Add(codePoint, entryClass);
                codePoint++;
            }
        }

        return bidiClassByCodePoint;
    }
}
/// <summary>
/// Creates a code point record by copying each argument into its backing field.
/// No validation or normalization is performed here.
/// </summary>
/// <param name="value">Numeric value of the code point.</param>
/// <param name="name">Name stored for the code point.</param>
/// <param name="generalCategory">General category stored for the code point.</param>
/// <param name="canonicalCombiningClass">Canonical combining class stored for the code point.</param>
/// <param name="bidiClass">Bidi class stored for the code point.</param>
/// <param name="bidiMirrored">Bidi-mirrored flag stored for the code point.</param>
/// <param name="unicode1Name">Unicode 1 name, or <c>null</c> when there is none.</param>
/// <param name="isoComment">ISO comment, or <c>null</c> when there is none.</param>
/// <param name="simpleUppercaseMappingValue">Simple uppercase mapping, or <c>null</c> when there is none.</param>
/// <param name="simpleLowercaseMappingValue">Simple lowercase mapping, or <c>null</c> when there is none.</param>
/// <param name="simpleTitlecasemappingValue">Simple titlecase mapping, or <c>null</c> when there is none.</param>
private CodePoint(
    uint value,
    string name,
    GeneralCategory generalCategory,
    byte canonicalCombiningClass,
    BidiClass bidiClass,
    bool bidiMirrored,
    string unicode1Name,
    string isoComment,
    uint? simpleUppercaseMappingValue,
    uint? simpleLowercaseMappingValue,
    uint? simpleTitlecasemappingValue)
{
    // Identity
    _value = value;
    _name = name;
    _unicode1Name = unicode1Name;
    _isoComment = isoComment;

    // Classification
    _generalCategory = generalCategory;
    _canonicalCombiningClass = canonicalCombiningClass;
    _bidiClass = bidiClass;
    _bidiMirrored = bidiMirrored;

    // Simple (one-to-one) case mappings
    _simpleUppercaseMappingValue = simpleUppercaseMappingValue;
    _simpleLowercaseMappingValue = simpleLowercaseMappingValue;
    _simpleTitlecaseMappingValue = simpleTitlecasemappingValue;
}
/// <summary>
/// Parses <c>UnicodeData.txt</c> and <c>SpecialCasing.txt</c> from the current directory and
/// fills the static <c>CodePoint</c> tables: one <c>CodePoint</c> per UnicodeData record, plus
/// the unconditional lower/title/upper mappings from SpecialCasing.
/// </summary>
/// <remarks>
/// <c>GetFile</c> is called for both files before anything is read; presumably it makes the
/// file available locally (confirm against its definition).
/// </remarks>
public static void LoadUnicodeData()
{
    const string UnicodeDataFileName = "UnicodeData.txt";
    const string SpecialCasingFileName = "SpecialCasing.txt";

    // Field positions inside a UnicodeData.txt record (fields 5, 6, 7, 8 omitted for the moment).
    const int UcdValueField = 0;
    const int UcdNameField = 1;
    const int UcdGeneralCategoryField = 2;
    const int UcdCanonicalCombiningClassField = 3;
    const int UcdBidiClassField = 4;
    const int UcdBidiMirroredField = 9;
    const int UcdUnicode1NameField = 10;
    const int UcdIsoCommentField = 11;
    const int UcdSimpleUppercaseField = 12;
    const int UcdSimpleLowercaseField = 13;
    const int UcdSimpleTitlecaseField = 14;

    // Field positions inside a SpecialCasing.txt record.
    const int CasingValueField = 0;
    const int CasingLowerField = 1;
    const int CasingTitleField = 2;
    const int CasingUpperField = 3;
    const int CasingConditionField = 4;

    GetFile(UnicodeDataFileName);
    GetFile(SpecialCasingFileName);

    SortedList<uint, CodePoint> byValue = CodePoint.codePointsByValue;
    Dictionary<uint, CodePoint[]> upperTable = CodePoint.uppercaseMappings;
    Dictionary<uint, CodePoint[]> lowerTable = CodePoint.lowercaseMappings;
    Dictionary<uint, CodePoint[]> titleTable = CodePoint.titlecaseMappings;

    char[] spaceSeparator = new char[] { ' ' };
    char[] semicolonSeparator = new char[] { ';' };

    // ---- UnicodeData.txt ----

    // Enum member name ("G" format) -> enum value, so file abbreviations can be looked up directly.
    GeneralCategory[] allCategories = (GeneralCategory[])Enum.GetValues(typeof(GeneralCategory));
    Dictionary<string, GeneralCategory> categoryByName =
        new Dictionary<string, GeneralCategory>(allCategories.Length, StringComparer.Ordinal);
    foreach (GeneralCategory category in allCategories)
    {
        categoryByName[category.ToString("G")] = category;
    }

    BidiClass[] allBidiClasses = (BidiClass[])Enum.GetValues(typeof(BidiClass));
    Dictionary<string, BidiClass> bidiClassByName =
        new Dictionary<string, BidiClass>(allBidiClasses.Length, StringComparer.Ordinal);
    foreach (BidiClass bidi in allBidiClasses)
    {
        bidiClassByName[bidi.ToString("G")] = bidi;
    }

    foreach (string record in File.ReadAllLines(UnicodeDataFileName, Encoding.UTF8))
    {
        // Skip blank lines and lines that start with '#'.
        if (string.IsNullOrEmpty(record) || record[0] == '#')
        {
            continue;
        }

        string[] fields = record.Split(semicolonSeparator);
        Debug.Assert(fields.Length >= 15);

        uint codePointValue = uint.Parse(fields[UcdValueField], NumberStyles.HexNumber, NumberFormatInfo.InvariantInfo);

        string rawName = fields[UcdNameField];
        string codePointName = string.IsNullOrEmpty(rawName) ? NameDefault : rawName;

        // Empty fields fall back to Cn / 0 / Invalid respectively.
        string rawCategory = fields[UcdGeneralCategoryField];
        GeneralCategory category = string.IsNullOrEmpty(rawCategory)
            ? GeneralCategory.Cn
            : categoryByName[rawCategory];

        string rawCombiningClass = fields[UcdCanonicalCombiningClassField];
        byte combiningClass = string.IsNullOrEmpty(rawCombiningClass)
            ? (byte)0
            : byte.Parse(rawCombiningClass, NumberStyles.Integer, NumberFormatInfo.InvariantInfo);

        string rawBidiClass = fields[UcdBidiClassField];
        BidiClass bidi = string.IsNullOrEmpty(rawBidiClass)
            ? BidiClass.Invalid
            : bidiClassByName[rawBidiClass];

        bool mirrored = fields[UcdBidiMirroredField] == "Y";

        // Empty optional text fields are stored as null rather than "".
        string rawUnicode1Name = fields[UcdUnicode1NameField];
        string unicode1Name = string.IsNullOrEmpty(rawUnicode1Name) ? null : rawUnicode1Name;

        string rawIsoComment = fields[UcdIsoCommentField];
        string isoComment = string.IsNullOrEmpty(rawIsoComment) ? null : rawIsoComment;

        uint? simpleUpper = ParseSimpleCaseMappingToken(fields[UcdSimpleUppercaseField], codePointValue);
        uint? simpleLower = ParseSimpleCaseMappingToken(fields[UcdSimpleLowercaseField], codePointValue);
        uint? simpleTitle = ParseSimpleCaseMappingToken(fields[UcdSimpleTitlecaseField], codePointValue);

        byValue[codePointValue] = new CodePoint(
            codePointValue,
            codePointName,
            category,
            combiningClass,
            bidi,
            mirrored,
            unicode1Name,
            isoComment,
            simpleUpper,
            simpleLower,
            simpleTitle);
    }

    // ---- SpecialCasing.txt ----

    // Scratch list handed to ProcessTokenStringForCodePoints for every mapping.
    List<CodePoint> scratchCodePoints = new List<CodePoint>();

    foreach (string rawLine in File.ReadAllLines(SpecialCasingFileName, Encoding.UTF8))
    {
        if (string.IsNullOrEmpty(rawLine))
        {
            continue;
        }

        // Cut off a trailing "# comment"; a line that was only a comment is skipped.
        string entry = rawLine;
        int hashIndex = entry.IndexOf('#');
        if (hashIndex >= 0)
        {
            entry = entry.Substring(0, hashIndex).Trim(spaceSeparator);
            if (string.IsNullOrEmpty(entry))
            {
                continue;
            }
        }

        string[] casingFields = entry.Split(semicolonSeparator);
        Debug.Assert(casingFields.Length >= 4);

        uint sourceValue = uint.Parse(casingFields[CasingValueField], NumberStyles.HexNumber, NumberFormatInfo.InvariantInfo);

        // We don't want any conditional mappings
        bool isConditional = casingFields.Length >= 5
            && !string.IsNullOrEmpty(casingFields[CasingConditionField].Trim(spaceSeparator));
        if (isConditional)
        {
            continue;
        }

        ProcessTokenStringForCodePoints(sourceValue, casingFields[CasingLowerField], spaceSeparator, scratchCodePoints, lowerTable);
        ProcessTokenStringForCodePoints(sourceValue, casingFields[CasingTitleField], spaceSeparator, scratchCodePoints, titleTable);
        ProcessTokenStringForCodePoints(sourceValue, casingFields[CasingUpperField], spaceSeparator, scratchCodePoints, upperTable);
    }
}

/// <summary>
/// Parses one simple case-mapping field of a UnicodeData record.
/// </summary>
/// <param name="token">Hexadecimal code point text; may be null or empty.</param>
/// <param name="codePointValue">The code point the record describes.</param>
/// <returns>
/// <c>null</c> when the field is empty or maps the code point to itself; otherwise the parsed value.
/// </returns>
private static uint? ParseSimpleCaseMappingToken(string token, uint codePointValue)
{
    if (string.IsNullOrEmpty(token))
    {
        return null;
    }

    uint mapped = uint.Parse(token, NumberStyles.HexNumber, NumberFormatInfo.InvariantInfo);
    if (mapped == codePointValue)
    {
        return null;
    }

    return mapped;
}
// Writes newType into sequence.types for every index in the half-open range [start, limit).
// Nothing is written when start >= limit.
private static void SetRunTypes(this IsolatingRunSequence sequence, int start, int limit, BidiClass newType)
{
    int index = start;
    while (index < limit)
    {
        sequence.types[index] = (byte)newType;
        index++;
    }
}
// 3.3.4 Resolve Neutral Types
// Rewrites every run of neutral/isolate types (B, S, WS, ON, LRI, RLI, FSI, PDI) in
// sequence.types with a single resolved type.
private static void ResolveNeutrals(this IsolatingRunSequence sequence)
{
    // TODO: N0 rule (Paired Brackets algorithm)

    // The set of types that GetRunLimit treats as belonging to one neutral run.
    var neutralTypes = new BidiClass[]
    {
        BidiClass.B, BidiClass.S, BidiClass.WS, BidiClass.ON,
        BidiClass.LRI, BidiClass.RLI, BidiClass.FSI, BidiClass.PDI
    };

    int position = 0;
    while (position < sequence.length)
    {
        bool startsNeutralRun;
        switch ((BidiClass)sequence.types[position])
        {
            case BidiClass.B:
            case BidiClass.S:
            case BidiClass.WS:
            case BidiClass.ON:
            case BidiClass.LRI:
            case BidiClass.RLI:
            case BidiClass.FSI:
            case BidiClass.PDI:
                startsNeutralRun = true;
                break;
            default:
                startsNeutralRun = false;
                break;
        }

        if (!startsNeutralRun)
        {
            position++;
            continue;
        }

        int runStart = position;
        int runLimit = sequence.GetRunLimit(runStart, sequence.length, neutralTypes);

        // Type in front of the run: sos at the sequence start, otherwise the preceding
        // character with AN/EN counted as R.
        BidiClass before;
        if (runStart == 0)
        {
            before = sequence.sos;
        }
        else
        {
            before = (BidiClass)sequence.types[runStart - 1];
            switch (before)
            {
                case BidiClass.AN:
                case BidiClass.EN:
                    before = BidiClass.R;
                    break;
            }
        }

        // Type behind the run: eos at the sequence end, otherwise the following
        // character with AN/EN counted as R.
        BidiClass after;
        if (runLimit == sequence.length)
        {
            after = sequence.eos;
        }
        else
        {
            after = (BidiClass)sequence.types[runLimit];
            switch (after)
            {
                case BidiClass.AN:
                case BidiClass.EN:
                    after = BidiClass.R;
                    break;
            }
        }

        if (before == after)
        {
            // N1: both sides agree, the run takes that direction.
            sequence.SetRunTypes(runStart, runLimit, before);
        }
        else
        {
            // N2: otherwise the run takes the direction of the sequence's embedding level.
            sequence.SetRunTypes(runStart, runLimit, GetTypeForLevel(sequence.level));
        }

        // The element at runLimit is what ended the run, so scanning resumes after it.
        position = runLimit + 1;
    }
}
// 3.3.3 Resolve Weak Types
// Applies rules W1-W7 in order, rewriting sequence.types in place. Each rule is a
// separate pass because later rules depend on the results of earlier ones.
// sequence.sos / sequence.eos stand in for the neighbours beyond the sequence boundaries.
private static void ResolveWeaks(this IsolatingRunSequence sequence)
{
    // W1 NSM
    // An NSM takes the type of the previous character (sos at the start of the sequence),
    // or ON when the previous character is an isolate initiator or PDI.
    for (int i = 0; i < sequence.length; i++)
    {
        if ((BidiClass)sequence.types[i] != BidiClass.NSM)
        {
            continue;
        }

        var prevType = i == 0 ? sequence.sos : (BidiClass)sequence.types[i - 1];
        bool isIsolateOrPDI = prevType == BidiClass.LRI || prevType == BidiClass.RLI
            || prevType == BidiClass.FSI || prevType == BidiClass.PDI;
        sequence.types[i] = isIsolateOrPDI ? (byte)BidiClass.ON : (byte)prevType;
    }

    // W2 EN
    // For each EN, search backward for the first strong type (R, L or AL).
    // If that type is AL the EN becomes AN.
    for (int i = 0; i < sequence.length; i++)
    {
        if ((BidiClass)sequence.types[i] != BidiClass.EN)
        {
            continue;
        }

        for (int j = i - 1; j >= 0; j--)
        {
            var type = (BidiClass)sequence.types[j];
            if (type == BidiClass.R || type == BidiClass.AL || type == BidiClass.L)
            {
                if (type == BidiClass.AL)
                {
                    sequence.types[i] = (byte)BidiClass.AN;
                }

                // Only the nearest strong type counts; an AL further back must not be
                // seen through an intervening L or R.
                break;
            }
        }
    }

    // W3 AL
    // Resolve all ALs to R
    for (int i = 0; i < sequence.length; i++)
    {
        if ((BidiClass)sequence.types[i] == BidiClass.AL)
        {
            sequence.types[i] = (byte)BidiClass.R;
        }
    }

    // W4 ES, CS (Number Separators)
    // A single ES between two ENs becomes EN.
    // A single CS between two numbers of the same type becomes that number type.
    // The first and last positions have no neighbour on one side and are skipped.
    for (int i = 1; i < sequence.length - 1; i++)
    {
        var cct = (BidiClass)sequence.types[i];
        var prevType = (BidiClass)sequence.types[i - 1];
        var nextType = (BidiClass)sequence.types[i + 1];

        if (cct == BidiClass.ES && prevType == BidiClass.EN && nextType == BidiClass.EN)
        {
            // EN ES EN -> EN EN EN
            sequence.types[i] = (byte)BidiClass.EN;
        }
        else if (cct == BidiClass.CS
            && ((prevType == BidiClass.EN && nextType == BidiClass.EN)
                || (prevType == BidiClass.AN && nextType == BidiClass.AN)))
        {
            // EN CS EN -> EN EN EN, AN CS AN -> AN AN AN
            sequence.types[i] = (byte)prevType;
        }
    }

    // W5 ET(s) adjacent to EN resolve to EN(s)
    var typesSet = new BidiClass[] { BidiClass.ET };
    for (int i = 0; i < sequence.length; i++)
    {
        if ((BidiClass)sequence.types[i] != BidiClass.ET)
        {
            continue;
        }

        int runStart = i;
        int runEnd = sequence.GetRunLimit(runStart, sequence.length, typesSet);

        // Look at the type before the run first; fall back to the type after it.
        var type = runStart > 0 ? (BidiClass)sequence.types[runStart - 1] : sequence.sos;
        if (type != BidiClass.EN)
        {
            type = runEnd < sequence.length ? (BidiClass)sequence.types[runEnd] : sequence.eos;
        }

        if (type == BidiClass.EN)
        {
            sequence.SetRunTypes(runStart, runEnd, BidiClass.EN);
        }

        i = runEnd; // advance to end of the ET run
    }

    // W6 Separators and Terminators -> ON
    for (int i = 0; i < sequence.length; i++)
    {
        var t = (BidiClass)sequence.types[i];
        if (t == BidiClass.ET || t == BidiClass.ES || t == BidiClass.CS)
        {
            sequence.types[i] = (byte)BidiClass.ON;
        }
    }

    // W7 same search as W2, but EN -> L when the nearest strong type is L
    for (int i = 0; i < sequence.length; i++)
    {
        if ((BidiClass)sequence.types[i] != BidiClass.EN)
        {
            continue;
        }

        var prevStrong = sequence.sos; // Default to sos if the start is reached
        for (int j = i - 1; j >= 0; j--)
        {
            var t = (BidiClass)sequence.types[j];
            if (t == BidiClass.R || t == BidiClass.L || t == BidiClass.AL)
            {
                prevStrong = t;
                break;
            }
        }

        // Decide only after the backward search has finished, so that both a found L
        // and the sos fallback (including an EN at index 0) are handled.
        if (prevStrong == BidiClass.L)
        {
            sequence.types[i] = (byte)BidiClass.L;
        }
    }
}
// 3.3.2 Determine Explicit Embedding Levels and directions
// Walks types once (rules X1-X8) and writes the resulting embedding level of each
// character into levels. Characters under an active directional override also have
// their entry in types replaced by the override direction (X6).
//
// level       - paragraph embedding level
// types       - Bidi class of every character (as byte); modified in place under overrides
// levels      - receives the embedding levels; must be at least types.Length long
// matchingPDI - for each isolate initiator, the index of its matching PDI (see GetMatchingPDI)
//
// Levels of RLE/LRE/RLO/LRO/PDF are left untouched here.
// NOTE(review): X5a/X5b/X6a also reset the type of an isolate initiator / PDI when an
// override is active; that is not done here - confirm whether later stages rely on the
// original types.
private static void GetExplicitEmbeddingLevels(byte level, byte[] types, ref byte[] levels, int[] matchingPDI)
{
    // X1.
    // Directional status stack, seeded with the paragraph-level entry.
    Stack<DirectionalStatus> dirStatusStack = new Stack<DirectionalStatus>(MAX_DEPTH + 2);
    DirectionalStatus dirEntry = new DirectionalStatus
    {
        paragraphEmbeddingLevel = level,
        directionalOverrideStatus = (int)BidiClass.ON,
        directionalIsolateStatus = false
    };
    dirStatusStack.Push(dirEntry);

    int overflowIsolateCount = 0;
    int overflowEmbeddingCount = 0;
    int validIsolateCount = 0;

    // X2-X8
    for (int i = 0; i < types.Length; i++)
    {
        BidiClass cct = (BidiClass)types[i];
        switch (cct)
        {
            case BidiClass.RLE:
            case BidiClass.RLO:
            case BidiClass.LRE:
            case BidiClass.LRO:
            case BidiClass.LRI:
            case BidiClass.RLI:
            case BidiClass.FSI:
            {
                // Must be decided before FSI is rewritten below: an FSI is an isolate
                // initiator regardless of the direction it resolves to.
                bool isIsolate = cct == BidiClass.RLI || cct == BidiClass.LRI || cct == BidiClass.FSI;

                // X5a, X5b .1 The isolate initiator itself gets the level of the last stack entry.
                if (isIsolate)
                {
                    levels[i] = dirStatusStack.Peek().paragraphEmbeddingLevel;
                }

                // X5c. Determine the direction of the text between FSI and its matching PDI.
                // FSI = RLI if that embedding level is 1, otherwise LRI.
                if (cct == BidiClass.FSI)
                {
                    byte el = GetParagraphEmbeddingLevel(types, matchingPDI, i + 1, matchingPDI[i]);
                    cct = el == 1 ? BidiClass.RLI : BidiClass.LRI;
                }

                // 1 (RLE RLO RLI, LRE LRO LRI) Least odd/even level greater than the level
                // of the last entry on the directional status stack.
                byte newLevel;
                if (cct == BidiClass.RLE || cct == BidiClass.RLO || cct == BidiClass.RLI)
                {
                    newLevel = (byte)LeastGreaterOdd(dirStatusStack.Peek().paragraphEmbeddingLevel);
                }
                else
                {
                    newLevel = (byte)LeastGreaterEven(dirStatusStack.Peek().paragraphEmbeddingLevel);
                }

                // 2 Valid when the new level does not exceed MAX_DEPTH and no overflow is pending.
                if (newLevel <= MAX_DEPTH && overflowIsolateCount == 0 && overflowEmbeddingCount == 0)
                {
                    // X5a/X5b .3
                    if (isIsolate)
                    {
                        validIsolateCount++;
                    }

                    byte dos = cct == BidiClass.RLO ? (byte)BidiClass.R     // RLO = R directional override status
                        : cct == BidiClass.LRO ? (byte)BidiClass.L          // LRO = L directional override status
                        : (byte)BidiClass.ON;                               // All the rest are neutral

                    dirStatusStack.Push(new DirectionalStatus()
                    {
                        paragraphEmbeddingLevel = newLevel,
                        directionalOverrideStatus = dos,
                        directionalIsolateStatus = isIsolate
                    });
                }
                // 3 Otherwise this is an overflow.
                else if (isIsolate)
                {
                    // Overflow isolate initiator: count it so that its PDI is consumed
                    // without touching the stack (see X6a).
                    overflowIsolateCount++;
                }
                else if (overflowIsolateCount == 0)
                {
                    // Overflow embedding/override: counted only outside an overflow isolate.
                    overflowEmbeddingCount++;
                }
            }
            break;

            // X6a Terminating Isolates
            case BidiClass.PDI:
            {
                if (overflowIsolateCount > 0)
                {
                    // This PDI matches an overflow isolate initiator
                    overflowIsolateCount--;
                }
                else if (validIsolateCount == 0)
                {
                    // No matching isolate initiator (valid or overflow), do nothing
                }
                else
                {
                    // This PDI matches a valid isolate initiator: drop any embeddings
                    // opened inside the isolate, then the isolate entry itself.
                    overflowEmbeddingCount = 0;
                    while (dirStatusStack.Peek().directionalIsolateStatus == false)
                    {
                        dirStatusStack.Pop();
                    }
                    dirStatusStack.Pop();
                    validIsolateCount--;
                }

                levels[i] = dirStatusStack.Peek().paragraphEmbeddingLevel;
            }
            break;

            // X7
            case BidiClass.PDF:
            {
                if (overflowIsolateCount > 0) // X7 .1
                {
                    // Do nothing
                }
                else if (overflowEmbeddingCount > 0) // X7 .2
                {
                    overflowEmbeddingCount--;
                }
                else if (dirStatusStack.Count > 1 && !dirStatusStack.Peek().directionalIsolateStatus) // X7 .3
                {
                    dirStatusStack.Pop();
                }
                else
                {
                    // Do nothing
                }
            }
            break;

            // X8
            case BidiClass.B:
            {
                // Paragraph separators.
                // 1 Terminate (reset) all directional embeddings, overrides and isolates.
                overflowEmbeddingCount = 0;
                overflowIsolateCount = 0;
                validIsolateCount = 0;
                dirStatusStack.Clear();

                // Restore the X1 entry so the stack is never empty; otherwise any character
                // that follows the separator would make Peek() throw.
                dirStatusStack.Push(dirEntry);

                // 2 Assign separator character an embedding level equal to paragraph embedding level
                levels[i] = level;
            }
            break;

            // X6 Non-formatting characters
            default:
            {
                levels[i] = dirStatusStack.Peek().paragraphEmbeddingLevel;
                if (dirStatusStack.Peek().directionalOverrideStatus != (int)BidiClass.ON) // X6.b (6.2.0 naming)
                {
                    // An override is active: the character takes the override direction.
                    types[i] = dirStatusStack.Peek().directionalOverrideStatus;
                }
            }
            break;
        }
    }
}