/// <summary>
/// Given an ICU normalizer, enumerates the limit indices of the "segments" of this string.
/// A "segment" is a group of characters that interact with each other under the given
/// normalization, and which therefore cannot be split apart and normalized separately without
/// changing the result. For example, under NFC, if LATIN SMALL LETTER C (U+0063) is followed by
/// COMBINING CEDILLA (U+0327), which is followed by LATIN SMALL LETTER D (U+0064), then the c
/// and the cedilla form one segment: normalizing them separately would produce a different
/// result than normalizing them together. So this method would yield (among other values) the
/// index of LATIN SMALL LETTER D, the first index that is not part of that segment (that is,
/// the segment's limit index).
///
/// The last index yielded is always equal to the length of the string, and the index 0 is never
/// yielded. (If the string is null or empty, nothing is yielded.) It is therefore always safe
/// to call GetChars(previousIndex, thisIndex) in a foreach loop to get the "current" segment,
/// provided previousIndex starts out as 0.
/// </summary>
/// <remarks>
/// Indices are UTF-16 code unit offsets. A valid surrogate pair is decoded into one supplementary
/// code point and skipped as a unit, so a limit is never reported between the two halves of a
/// pair. An unpaired surrogate is handed to the normalizer as its own one-unit code point instead
/// of raising an exception, so malformed text can still be enumerated.
/// </remarks>
/// <param name="icuNormalizer">IntPtr to the ICU normalizer to use (get this from Icu.GetIcuNormalizer)</param>
/// <returns>An enumerable of indexes into "this" TsString, at all the normalization "segment"
/// boundaries, suitable for passing into GetChars(prevIdx, thisIdx)</returns>
private IEnumerable<int> EnumerateSegmentLimits(IntPtr icuNormalizer)
{
    // Read the property once; every index below refers to this snapshot.
    string text = Text;
    if (String.IsNullOrEmpty(text))
    {
        yield break;
    }

    int index = 0;
    while (index < text.Length)
    {
        int codepoint;
        int width;
        if (Char.IsSurrogatePair(text, index))
        {
            // Well-formed pair: combine both code units into one supplementary code point.
            codepoint = Char.ConvertToUtf32(text[index], text[index + 1]);
            width = 2;
        }
        else
        {
            // BMP character, or an unpaired surrogate. Char.ConvertToUtf32(string, int) would
            // throw ArgumentException on the latter, so use the raw code unit value instead.
            codepoint = text[index];
            width = 1;
        }

        // Index 0 is never reported, so test it first and skip the ICU call whose answer
        // would be discarded anyway.
        if (index > 0 && Icu.HasNormalizationBoundaryBefore(icuNormalizer, codepoint))
        {
            yield return index;
        }

        index += width;
    }

    // The end of the string always closes the final segment.
    yield return text.Length;
}