Beispiel #1
0
        /// <summary>
        /// Helper for get_NormalizedFormAndFixOffsets. Pairs character indexes in a segment of the
        /// original string with character indexes in the corresponding segment of the normalized string.
        /// Each codepoint of the original segment is decomposed on its own, and every codepoint of that
        /// decomposition is paired with the first not-yet-paired occurrence of the same codepoint in the
        /// normalized segment.
        /// Each pairing also records whether it came from the *first* codepoint of the decomposition.
        /// Offset fix-up needs this: an offset that pointed at (say) LATIN SMALL LETTER U WITH HOOK must
        /// land on the decomposed LATIN SMALL LETTER U and never on COMBINING HOOK ABOVE.
        /// </summary>
        /// <param name="segment">Segment of original string</param>
        /// <param name="normalizedSegment">Corresponding segment from normalized string</param>
        /// <param name="icuNormalizer">ICU normalizer that created the corresponding segment</param>
        /// <returns>
        /// A lazily evaluated sequence with one <c>RearrangedIndexMapping</c> per decomposed codepoint that
        /// was found in the normalized segment, built from the original character index, the matching
        /// character index in the normalized segment, and the first-of-decomposition flag. Decomposed
        /// codepoints with no remaining match in the normalized segment produce no entry.
        /// </returns>
        private IEnumerable <RearrangedIndexMapping> MatchUpIndexesAfterNormalization(string segment, string normalizedSegment, IntPtr icuNormalizer)
        {
            // Work with (character index, codepoint) pairs rather than raw codepoint positions: once a
            // surrogate pair is involved the two numberings diverge, and callers need character indexes.
            List<KeyValuePair<int, int>> sourceEntries = CodepointsByIndex(segment);
            List<KeyValuePair<int, int>> targetEntries = CodepointsByIndex(normalizedSegment);

            // Marker written over entries that have already been paired. Its codepoint of -1 can never
            // equal a real codepoint, so a claimed slot is never found again.
            var claimed = new KeyValuePair<int, int>(-1, -1);

            foreach (KeyValuePair<int, int> sourceEntry in sourceEntries)
            {
                string decomposition = Icu.GetDecompositionFromUtf32(icuNormalizer, sourceEntry.Value);
                foreach (KeyValuePair<int, int> decomposedEntry in CodepointsByIndex(decomposition))
                {
                    int wantedCodePoint = decomposedEntry.Value;
                    int slot = targetEntries.FindIndex(entry => entry.Value == wantedCodePoint);
                    if (slot >= 0)
                    {
                        // slot counts *codepoints*; the stored key is the *character* index we must report.
                        int targetCharIdx = targetEntries[slot].Key;
                        targetEntries[slot] = claimed;

                        // Index 0 within the decomposition marks the base character, which is where a
                        // fixed-up offset should point (the U, not the combining hook).
                        yield return new RearrangedIndexMapping(sourceEntry.Key, targetCharIdx, decomposedEntry.Key == 0);
                    }
                    // A negative slot is not expected to happen; such a codepoint is simply skipped.
                }
            }
        }