/// <summary>
/// Helper for get_NormalizedFormAndFixOffsets: pairs each character index of an original string
/// segment with the character index it corresponds to in the decomposed (normalized) segment.
/// Each mapping also records whether the match is the *first* codepoint of the decomposition of
/// its original codepoint. Offset fix-up needs this: an offset that pointed at (say) LATIN SMALL
/// LETTER U WITH HOOK must end up on the decomposed LATIN SMALL LETTER U, never on the
/// COMBINING HOOK ABOVE that follows it.
/// Algorithm: decompose the original codepoints one at a time, and for every codepoint of each
/// decomposition claim the first not-yet-claimed occurrence of that codepoint in the normalized
/// segment.
/// </summary>
/// <param name="segment">Segment of original string</param>
/// <param name="normalizedSegment">Corresponding segment from normalized string</param>
/// <param name="icuNormalizer">ICU normalizer that created the corresponding segment</param>
/// <returns>
/// A lazily produced sequence with one <c>RearrangedIndexMapping</c> per decomposed codepoint that
/// could be located in <paramref name="normalizedSegment"/>. Each is constructed from the character
/// index in <paramref name="segment"/>, the matched character index in
/// <paramref name="normalizedSegment"/>, and the "first codepoint of its decomposition" flag.
/// Decomposed codepoints with no remaining match are silently skipped.
/// </returns>
private IEnumerable<RearrangedIndexMapping> MatchUpIndexesAfterNormalization(string segment, string normalizedSegment, IntPtr icuNormalizer)
{
	// Each entry pairs a *character* index (Key) with the codepoint found there (Value). Character
	// indexes are what we ultimately report; they diverge from codepoint positions as soon as a
	// surrogate pair is involved.
	List<KeyValuePair<int, int>> sourceEntries = CodepointsByIndex(segment);
	List<KeyValuePair<int, int>> unclaimedTargets = CodepointsByIndex(normalizedSegment);

	// Written over every claimed entry; -1 can never equal a real codepoint, so a claimed entry
	// is invisible to all later searches.
	var claimedMarker = new KeyValuePair<int, int>(-1, -1);

	foreach (KeyValuePair<int, int> sourceEntry in sourceEntries)
	{
		string decomposition = Icu.GetDecompositionFromUtf32(icuNormalizer, sourceEntry.Value);

		foreach (KeyValuePair<int, int> piece in CodepointsByIndex(decomposition))
		{
			int wantedCodepoint = piece.Value;

			// Position within the list (i.e. counted in codepoints) of the first unclaimed match.
			int slot = unclaimedTargets.FindIndex(candidate => candidate.Value == wantedCodepoint);

			// A miss should never happen, but if it does we simply emit nothing for this piece.
			if (slot >= 0)
			{
				// Translate the list position back into the character index stored alongside it.
				int targetCharIndex = unclaimedTargets[slot].Key;

				// Retire the entry so repeated codepoints each claim a distinct position.
				unclaimedTargets[slot] = claimedMarker;

				// The piece at index 0 of the decomposition is its leading codepoint (the base
				// "U" rather than the combining hook, in the example above).
				bool leadsDecomposition = piece.Key == 0;

				yield return new RearrangedIndexMapping(sourceEntry.Key, targetCharIndex, leadsDecomposition);
			}
		}
	}
}