Exemplo n.º 1
0
        /// <summary>
        /// Return whether the string is already in the specified normal form.
        /// Note that a string may be considered to be in NFC
        /// even though its text (the plain character sequence) is not.
        /// This is because we don't collapse otherwise collapsible pairs if they
        /// have different style properties.
        /// </summary>
        public bool get_IsNormalizedForm(FwNormalizationMode nm)
        {
            if (IsAlreadyNormalized(nm))
            {
                return(true);
            }
            if (string.IsNullOrEmpty(Text))
            {
                NoteAlreadyNormalized(nm);
                return(true);
            }

            var normalizer = CustomIcu.GetIcuNormalizer(nm);

            if (normalizer.IsNormalized(Text))
            {
                // Don't do this work a second time
                if (nm == FwNormalizationMode.knmNFSC)
                {
                    NoteAlreadyNormalized(FwNormalizationMode.knmNFC);                     // NFC includes NFSC
                }
                else
                {
                    NoteAlreadyNormalized(nm);
                }
                return(true);
            }
            if (nm == FwNormalizationMode.knmNFSC)
            {
                // NFSC is a special case, where we just have to normalize and compare.
                if (Equals(get_NormalizedForm(nm)))
                {
                    NoteAlreadyNormalized(nm);
                    return(true);
                }
            }
            return(false);
        }
Exemplo n.º 2
0
        // Implementation of both get_NormalizedForm and NfdAndFixOffsets
        private ITsString get_NormalizedFormAndFixOffsets(FwNormalizationMode nm, ArrayPtr oldOffsetsToFix, int numOffsetsToFix)
        {
            // Can we skip unnecessary work?
            if (IsAlreadyNormalized(nm))
            {
                return(this);
            }
            if (string.IsNullOrEmpty(Text))
            {
                NoteAlreadyNormalized(nm);
                return(this);
            }

            if (nm == FwNormalizationMode.knmLim)
            {
                throw new ArgumentException("Normalization mode may not be knmLim", "nm");
            }

            // NFSC needs to be decomposed first, then recomposed as NFC.
            if (nm == FwNormalizationMode.knmNFSC && !get_IsNormalizedForm(FwNormalizationMode.knmNFD))
            {
                var nfd = (TsString)get_NormalizedForm(FwNormalizationMode.knmNFD);
                // Line below is *not* a typo; this call will not recurse infinitely.
                return(nfd.get_NormalizedFormAndFixOffsets(FwNormalizationMode.knmNFSC, oldOffsetsToFix, numOffsetsToFix));
            }

            bool willFixOffsets = numOffsetsToFix > 0 && oldOffsetsToFix != null && oldOffsetsToFix.IntPtr != IntPtr.Zero;
            // Keys = offsets into original string, values = offsets into normalized string
            var stringOffsetMapping = willFixOffsets ? new Dictionary <int, int>() : null;            // Don't allocate an object if we'll never use it

            var icuNormalizer = CustomIcu.GetIcuNormalizer(nm);

            TsStrBldr resultBuilder = new TsStrBldr();
            int       segmentMin    = 0;

            foreach (int segmentLim in EnumerateSegmentLimits(icuNormalizer))
            {
                string       segment           = GetChars(segmentMin, segmentLim);
                string       normalizedSegment = icuNormalizer.Normalize(segment);
                int          curRun            = get_RunAt(segmentMin);
                int          curRunLim         = get_LimOfRun(curRun);
                ITsTextProps curTextProps      = get_Properties(curRun);
                if (curRunLim >= segmentLim)
                {
                    // The segment is contained entirely in the current run, so our job is simple
                    int outputLenSoFar = resultBuilder.Length;
                    resultBuilder.Replace(outputLenSoFar, outputLenSoFar, normalizedSegment, curTextProps);
                    // Calculate the orig -> norm index mappings if (and only if) they're needed, since this calculation is expensive
                    if (willFixOffsets)
                    {
                        foreach (RearrangedIndexMapping mapping in MatchUpIndexesAfterNormalization(segment, normalizedSegment, icuNormalizer))
                        {
                            // Note that our local mapping is from the start of this segment, but we want to keep track of indexes from the start
                            // of the *string*. (Both the original string and the output, normalized string). So we adjust the indexes here.
                            if (mapping.isFirstCharOfDecomposition)
                            {
                                stringOffsetMapping[segmentMin + mapping.origIdx] = outputLenSoFar + mapping.normIdx;
                            }
                        }
                    }
                }
                else
                {
                    // The segment straddles two runs, so our job is harder. We have to either deal with decomposition
                    // rearranging things (and make sure the right characters maintain the right text properties), or
                    // else we have to deal with composition possibly trying to "compress" some diacritics that straddle
                    // a run border (which can happen, for example, if they have different text properties).

                    if (nm == FwNormalizationMode.knmNFD || nm == FwNormalizationMode.knmNFKD)
                    {
                        // Decomposition: we have to deal with rearranging. Some characters from after the first run's
                        // endpoint may have ended up "inside" the first run after rearranging, so their text properties
                        // will be incorrect at first. We'll fix them up after calculating the orig -> norm index mappings.

                        int outputLenSoFar = resultBuilder.Length;                         // This will be the start index from which
                        resultBuilder.Replace(outputLenSoFar, outputLenSoFar, normalizedSegment, curTextProps);

                        // Now correct the text properties, one index at a time.
                        IEnumerable <RearrangedIndexMapping> indexMappings = MatchUpIndexesAfterNormalization(segment, normalizedSegment, icuNormalizer);
                        foreach (RearrangedIndexMapping mapping in indexMappings)
                        {
                            ITsTextProps origProperties = get_PropertiesAt(segmentMin + mapping.origIdx);
                            int          outputIdx      = outputLenSoFar + mapping.normIdx;
                            int          size           = Char.IsSurrogate(normalizedSegment, mapping.normIdx) ? 2 : 1;
                            resultBuilder.SetProperties(outputIdx, outputIdx + size, origProperties);
                            // And if we also need to fix up offsets at the end, we keep track of the ones we'll need
                            if (willFixOffsets && mapping.isFirstCharOfDecomposition)
                            {
                                stringOffsetMapping[segmentMin + mapping.origIdx] = outputLenSoFar + mapping.normIdx;
                            }
                        }
                    }

                    else if (nm == FwNormalizationMode.knmNFSC)
                    {
                        // Composition that preserves styles. By this point, our input is NFD so we at least know there will be no rearranging.

                        // If there is more than one character remaining in the current run, then we might be able to compose those, at least.
                        if (curRunLim - segmentMin > 1)
                        {
                            // Unicode canonical ordering is such that any subsequence of a composed character can itself be composed, so this is safe.
                            string remainderOfFirstRun = GetChars(segmentMin, curRunLim);
                            string normalizedRemainder = icuNormalizer.Normalize(remainderOfFirstRun);
                            resultBuilder.Replace(resultBuilder.Length, resultBuilder.Length, normalizedRemainder, curTextProps);
                            // Now the start of the un-composable part is just the limit of the first run (which is the start of the second run).
                            segmentMin = curRunLim;
                        }
                        // Now there could be any NUMBER of runs between currentInputIdx and segmentLim. Maybe there are TEN composing
                        // characters, each with different text properties (and thus different runs). However, since the base character
                        // was in the first run, none of the characters from the second or subsequent runs are composable any longer. So we
                        // can copy them to the output as-is as one big TsString, which will carry text, runs and all.
                        ITsString uncomposablePartOfSegment = GetSubstring(segmentMin, segmentLim);
                        resultBuilder.ReplaceTsString(resultBuilder.Length, resultBuilder.Length, uncomposablePartOfSegment);
                    }

                    else
                    {
                        // For NFC and NFKC, we do not try to preserve styles or offset mappings, so this branch is quite simple
                        int outputLenSoFar = resultBuilder.Length;
                        resultBuilder.Replace(outputLenSoFar, outputLenSoFar, normalizedSegment, curTextProps);
                    }
                }
                segmentMin = segmentLim;                 // Next segment will start where the current segment ended
            }
            if (willFixOffsets)
            {
                stringOffsetMapping[segmentMin] = resultBuilder.Length;
                int ptrSize = Marshal.SizeOf(typeof(IntPtr));
                for (int i = 0; i < numOffsetsToFix; i++)
                {
                    IntPtr offsetPtr = Marshal.ReadIntPtr(oldOffsetsToFix.IntPtr, i * ptrSize);
                    int    oldOffset = Marshal.ReadInt32(offsetPtr);
                    int    newOffset;
                    if (stringOffsetMapping.TryGetValue(oldOffset, out newOffset))
                    {
                        Marshal.WriteInt32(offsetPtr, newOffset);
                    }
                    else
                    {
                        // The only likely way for one of the offsets we've been asked to fix up to NOT
                        // be found in the offset mapping dictionary is if it happened to be an offset
                        // to the second half of a surrogate pair. In which case we want to fix it up to
                        // point to wherever the first half of that pair ended up, so searching downwards
                        // through the offset mapping dictionary will find the best match.
                        bool found = false;
                        while (!found && oldOffset > 0)
                        {
                            oldOffset--;
                            found = stringOffsetMapping.TryGetValue(oldOffset, out newOffset);
                        }
                        // Any offset that could not be matched at all will be pointed at the beginning
                        // of the TsString, since that's safe with strings of all sizes (including empty).
                        Marshal.WriteInt32(offsetPtr, found ? newOffset : 0);
                    }
                }
            }
            var result = (TsString)resultBuilder.GetString();

            result.NoteAlreadyNormalized(nm);             // So we won't have to do all this work a second time
            return(result);
        }