/// <summary>
/// Implementation of both get_NormalizedForm and NfdAndFixOffsets.
/// Normalizes this string one segment at a time (segments come from EnumerateSegmentLimits),
/// appending each normalized segment to a TsStrBldr, and optionally rewrites a set of
/// caller-supplied offsets so that they point into the normalized string.
/// </summary>
/// <param name="nm">The normalization mode to apply. knmNFSC is handled as "NFD first, then NFC
/// composition that does not compose across run boundaries".</param>
/// <param name="oldOffsetsToFix">Native array of pointers; each pointer is read with
/// Marshal.ReadIntPtr and is expected to point at a 32-bit offset into the original string, which is
/// overwritten in place with the corresponding offset into the result. Offsets are only fixed
/// when this is non-null, its IntPtr is non-zero and <paramref name="numOffsetsToFix"/> is positive.</param>
/// <param name="numOffsetsToFix">Number of pointers in <paramref name="oldOffsetsToFix"/>.</param>
/// <returns>This instance if it is already noted as normalized in <paramref name="nm"/> or its text is
/// null/empty; otherwise the string produced by the builder, flagged as already normalized.</returns>
/// <exception cref="ArgumentException"><paramref name="nm"/> is knmLim (only checked once the
/// early-return shortcuts above it have not applied).</exception>
private ITsString get_NormalizedFormAndFixOffsets(FwNormalizationMode nm, ArrayPtr oldOffsetsToFix, int numOffsetsToFix)
{
    // Can we skip unnecessary work?
    if (IsAlreadyNormalized(nm))
    {
        return this;
    }
    if (string.IsNullOrEmpty(Text))
    {
        NoteAlreadyNormalized(nm);
        return this;
    }

    if (nm == FwNormalizationMode.knmLim)
    {
        throw new ArgumentException("Normalization mode may not be knmLim", "nm");
    }

    // NFSC needs to be decomposed first, then recomposed as NFC.
    if (nm == FwNormalizationMode.knmNFSC && !get_IsNormalizedForm(FwNormalizationMode.knmNFD))
    {
        var nfd = (TsString)get_NormalizedForm(FwNormalizationMode.knmNFD);
        // Line below is *not* a typo; this call will not recurse infinitely.
        // (The nested call is made on the NFD result, which is expected to fail the
        // "not NFD" test guarding this branch.)
        return nfd.get_NormalizedFormAndFixOffsets(FwNormalizationMode.knmNFSC, oldOffsetsToFix, numOffsetsToFix);
    }

    bool willFixOffsets = numOffsetsToFix > 0 && oldOffsetsToFix != null && oldOffsetsToFix.IntPtr != IntPtr.Zero;
    // Keys = offsets into original string, values = offsets into normalized string
    var stringOffsetMapping = willFixOffsets ? new Dictionary<int, int>() : null; // Don't allocate an object if we'll never use it

    // ICU is asked for NFC when we are doing NFSC; every other mode is cast straight across.
    Icu.UNormalizationMode icuMode = (nm == FwNormalizationMode.knmNFSC)
        ? Icu.UNormalizationMode.UNORM_NFC
        : (Icu.UNormalizationMode)nm;
    IntPtr icuNormalizer = Icu.GetIcuNormalizer(icuMode);

    TsStrBldr resultBuilder = new TsStrBldr();
    int segmentMin = 0;
    foreach (int segmentLim in EnumerateSegmentLimits(icuNormalizer))
    {
        string segment = GetChars(segmentMin, segmentLim);
        string normalizedSegment = Icu.Normalize(segment, icuNormalizer);
        int curRun = get_RunAt(segmentMin);
        int curRunLim = get_LimOfRun(curRun);
        ITsTextProps curTextProps = get_Properties(curRun);
        if (curRunLim >= segmentLim)
        {
            // The segment is contained entirely in the current run, so our job is simple
            int outputLenSoFar = resultBuilder.Length;
            resultBuilder.Replace(outputLenSoFar, outputLenSoFar, normalizedSegment, curTextProps);
            // Calculate the orig -> norm index mappings if (and only if) they're needed, since this calculation is expensive
            if (willFixOffsets)
            {
                foreach (RearrangedIndexMapping mapping in MatchUpIndexesAfterNormalization(segment, normalizedSegment, icuNormalizer))
                {
                    // Note that our local mapping is from the start of this segment, but we want to keep track of indexes from the start
                    // of the *string*. (Both the original string and the output, normalized string). So we adjust the indexes here.
                    if (mapping.isFirstCharOfDecomposition)
                    {
                        stringOffsetMapping[segmentMin + mapping.origIdx] = outputLenSoFar + mapping.normIdx;
                    }
                }
            }
        }
        else
        {
            // The segment straddles two runs, so our job is harder. We have to either deal with decomposition
            // rearranging things (and make sure the right characters maintain the right text properties), or
            // else we have to deal with composition possibly trying to "compress" some diacritics that straddle
            // a run border (which can happen, for example, if they have different text properties).
            if (nm == FwNormalizationMode.knmNFD || nm == FwNormalizationMode.knmNFKD)
            {
                // Decomposition: we have to deal with rearranging. Some characters from after the first run's
                // endpoint may have ended up "inside" the first run after rearranging, so their text properties
                // will be incorrect at first. We'll fix them up after calculating the orig -> norm index mappings.
                int outputLenSoFar = resultBuilder.Length; // Start index in the output of this segment's normalized text
                resultBuilder.Replace(outputLenSoFar, outputLenSoFar, normalizedSegment, curTextProps);

                // Now correct the text properties, one index at a time.
                IEnumerable<RearrangedIndexMapping> indexMappings = MatchUpIndexesAfterNormalization(segment, normalizedSegment, icuNormalizer);
                foreach (RearrangedIndexMapping mapping in indexMappings)
                {
                    ITsTextProps origProperties = get_PropertiesAt(segmentMin + mapping.origIdx);
                    int outputIdx = outputLenSoFar + mapping.normIdx;
                    // Only a genuine high+low surrogate pair occupies two UTF-16 code units. A lone or
                    // trailing surrogate (malformed text) must be treated as a single unit; otherwise the
                    // range would spill onto the next character, or past the end of the builder.
                    int size = Char.IsSurrogatePair(normalizedSegment, mapping.normIdx) ? 2 : 1;
                    resultBuilder.SetProperties(outputIdx, outputIdx + size, origProperties);
                    // And if we also need to fix up offsets at the end, we keep track of the ones we'll need
                    if (willFixOffsets && mapping.isFirstCharOfDecomposition)
                    {
                        stringOffsetMapping[segmentMin + mapping.origIdx] = outputLenSoFar + mapping.normIdx;
                    }
                }
            }
            else if (nm == FwNormalizationMode.knmNFSC)
            {
                // Composition that preserves styles. By this point, our input is NFD so we at least know there will be no rearranging.

                // If there is more than one character remaining in the current run, then we might be able to compose those, at least.
                if (curRunLim - segmentMin > 1)
                {
                    // Unicode canonical ordering is such that any subsequence of a composed character can itself be composed, so this is safe.
                    string remainderOfFirstRun = GetChars(segmentMin, curRunLim);
                    string normalizedRemainder = Icu.Normalize(remainderOfFirstRun, icuNormalizer);
                    resultBuilder.Replace(resultBuilder.Length, resultBuilder.Length, normalizedRemainder, curTextProps);
                    // Now the start of the un-composable part is just the limit of the first run (which is the start of the second run).
                    segmentMin = curRunLim;
                }
                // Now there could be any NUMBER of runs between segmentMin and segmentLim. Maybe there are TEN composing
                // characters, each with different text properties (and thus different runs). However, since the base character
                // was in the first run, none of the characters from the second or subsequent runs are composable any longer. So we
                // can copy them to the output as-is as one big TsString, which will carry text, runs and all.
                // Note that no orig -> norm offset mappings are recorded in this branch.
                ITsString uncomposablePartOfSegment = GetSubstring(segmentMin, segmentLim);
                resultBuilder.ReplaceTsString(resultBuilder.Length, resultBuilder.Length, uncomposablePartOfSegment);
            }
            else
            {
                // For NFC and NFKC, we do not try to preserve styles or offset mappings, so this branch is quite simple
                int outputLenSoFar = resultBuilder.Length;
                resultBuilder.Replace(outputLenSoFar, outputLenSoFar, normalizedSegment, curTextProps);
            }
        }
        segmentMin = segmentLim; // Next segment will start where the current segment ended
    }

    if (willFixOffsets)
    {
        // After the loop segmentMin is the limit of the last segment; map it to the end of the output
        // so that an offset pointing there can be fixed up too.
        stringOffsetMapping[segmentMin] = resultBuilder.Length;
        int ptrSize = IntPtr.Size; // Stride between consecutive pointers in the native array
        for (int i = 0; i < numOffsetsToFix; i++)
        {
            IntPtr offsetPtr = Marshal.ReadIntPtr(oldOffsetsToFix.IntPtr, i * ptrSize);
            int oldOffset = Marshal.ReadInt32(offsetPtr);
            int newOffset;
            if (stringOffsetMapping.TryGetValue(oldOffset, out newOffset))
            {
                Marshal.WriteInt32(offsetPtr, newOffset);
            }
            else
            {
                // The only likely way for one of the offsets we've been asked to fix up to NOT
                // be found in the offset mapping dictionary is if it happened to be an offset
                // to the second half of a surrogate pair. In which case we want to fix it up to
                // point to wherever the first half of that pair ended up, so searching downwards
                // through the offset mapping dictionary will find the best match.
                bool found = false;
                while (!found && oldOffset > 0)
                {
                    oldOffset--;
                    found = stringOffsetMapping.TryGetValue(oldOffset, out newOffset);
                }
                // Any offset that could not be matched at all will be pointed at the beginning
                // of the TsString, since that's safe with strings of all sizes (including empty).
                Marshal.WriteInt32(offsetPtr, found ? newOffset : 0);
            }
        }
    }

    var result = (TsString)resultBuilder.GetString();
    result.NoteAlreadyNormalized(nm); // So we won't have to do all this work a second time
    return result;
}