// Transcodes UTF-16 input to UTF-8 one scalar value at a time.
//
// inputLength is measured in chars; outputBytesRemaining is measured in bytes.
// On return, pInputBufferRemaining points at the next char that would have been
// consumed and pOutputBufferRemaining points at the next byte that would have
// been written. The returned status is Done when all input was transcoded, the
// status reported by Rune.DecodeFromUtf16 when decoding stopped early, or
// DestinationTooSmall when the next scalar did not fit in the output.
public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLength, byte* pOutputBuffer, int outputBytesRemaining, out char* pInputBufferRemaining, out byte* pOutputBufferRemaining)
{
    Debug.Assert(inputLength >= 0, "Input length must not be negative.");
    Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");
    Debug.Assert(outputBytesRemaining >= 0, "Destination length must not be negative.");
    Debug.Assert(pOutputBuffer != null || outputBytesRemaining == 0, "Destination length must be zero if destination buffer pointer is null.");

    var pendingChars = new ReadOnlySpan<char>(pInputBuffer, inputLength);
    var freeBytes = new Span<byte>(pOutputBuffer, outputBytesRemaining);
    OperationStatus result = OperationStatus.Done;

    while (!pendingChars.IsEmpty)
    {
        result = Rune.DecodeFromUtf16(pendingChars, out Rune scalar, out int charsUsed);
        if (result != OperationStatus.Done)
        {
            // Leave the offending chars unconsumed so the caller can locate them.
            break;
        }

        if (!scalar.TryEncodeToUtf8(freeBytes, out int bytesUsed))
        {
            result = OperationStatus.DestinationTooSmall;
            break;
        }

        pendingChars = pendingChars.Slice(charsUsed);
        freeBytes = freeBytes.Slice(bytesUsed);
    }

    // Whatever is left in each span tells us how far we advanced.
    int charsConsumed = inputLength - pendingChars.Length;
    int bytesProduced = outputBytesRemaining - freeBytes.Length;
    pInputBufferRemaining = pInputBuffer + charsConsumed;
    pOutputBufferRemaining = pOutputBuffer + bytesProduced;
    return result;
}
// Attempts to interpret a search term as exactly one Rune.
// Accepted inputs are a char, a Rune, a string, or a ustring; for the two
// text types the decode must succeed and must consume the entire value.
// Returns false (with parsed = default) for anything else.
private static bool TryParseSearchTermAsRune(object searchTerm, out Rune parsed)
{
    switch (searchTerm)
    {
        case char singleChar:
            return Rune.TryCreate(singleChar, out parsed);

        case Rune rune:
            parsed = rune;
            return true;

        case string text:
            {
                OperationStatus status = Rune.DecodeFromUtf16(text, out parsed, out int charsConsumed);
                if (status == OperationStatus.Done && charsConsumed == text.Length)
                {
                    return true;
                }
                break;
            }

        case ustring utf8Text:
            {
                OperationStatus status = Rune.DecodeFromUtf8(utf8Text.AsBytes(), out parsed, out int bytesConsumed);
                if (status == OperationStatus.Done && bytesConsumed == utf8Text.Length)
                {
                    return true;
                }
                break;
            }
    }

    // failed to turn the search term into a single Rune
    parsed = default;
    return false;
}
// skips the call to FindFirstCharacterToEncode
//
// Walks the source one scalar value at a time. Scalars for which WillEncode
// returns false are copied to the destination unchanged; everything else
// (including ill-formed input, which decodes as the replacement character)
// goes through TryEncodeUnicodeScalar.
//
// Returns NeedMoreData only when isFinalBlock is false and the source ends in
// an incomplete sequence, DestinationTooSmall when the next output does not
// fit, and Done otherwise. charsConsumed / charsWritten always report the
// progress made before returning.
private protected virtual OperationStatus EncodeCore(ReadOnlySpan<char> source, Span<char> destination, out int charsConsumed, out int charsWritten, bool isFinalBlock)
{
    int initialSourceLength = source.Length;
    int initialDestinationLength = destination.Length;
    OperationStatus result = OperationStatus.Done;

    while (!source.IsEmpty)
    {
        OperationStatus decodeStatus = Rune.DecodeFromUtf16(source, out Rune scalarValue, out int consumedNow);
        bool wellFormed = decodeStatus == OperationStatus.Done;

        if (!wellFormed)
        {
            if (!isFinalBlock && decodeStatus == OperationStatus.NeedMoreData)
            {
                // The caller may supply the rest of the sequence later.
                result = OperationStatus.NeedMoreData;
                break;
            }

            Debug.Assert(scalarValue == Rune.ReplacementChar); // should be replacement char
        }

        int writtenNow;
        if (wellFormed && !WillEncode(scalarValue.Value))
        {
            if (!scalarValue.TryEncodeToUtf16(destination, out _))
            {
                result = OperationStatus.DestinationTooSmall;
                break;
            }

            // reflecting input directly to the output, same # of chars written
            writtenNow = consumedNow;
        }
        else if (!TryEncodeUnicodeScalar((uint)scalarValue.Value, destination, out writtenNow))
        {
            result = OperationStatus.DestinationTooSmall;
            break;
        }

        source = source.Slice(consumedNow);
        destination = destination.Slice(writtenNow);
    }

    charsConsumed = initialSourceLength - source.Length;
    charsWritten = initialDestinationLength - destination.Length;
    return result;
}
public Utf16Splitter(ReadOnlySpan <char> span, ReadOnlySpan <char> separator, StringSplitOptions splitOptions) { ReadOnlySpan <char> separatorSlice = separator; do { var separatorStatus = Rune.DecodeFromUtf16(separatorSlice, out _, out int consumed); if (separatorStatus == OperationStatus.InvalidData) { throw new ArgumentException("The separator is not valid UTF16.", nameof(separator)); } separatorSlice = separatorSlice[consumed..];
// Builds the expected escaped form of a whole string by decoding it one Rune
// at a time and concatenating the per-Rune expected representation.
// The decode status is intentionally ignored: whatever Rune and char count
// DecodeFromUtf16 reports are used as-is.
private string GetExpectedEscapedRepresentation(string value)
{
    var escaped = new StringBuilder();
    int position = 0;

    while (position < value.Length)
    {
        Rune.DecodeFromUtf16(value.AsSpan(position), out Rune scalar, out int consumed);
        string piece = GetExpectedEscapedRepresentation(scalar);
        escaped.Append(piece);
        position += consumed;
    }

    return escaped.ToString();
}
// <SnippetExample> static ReadOnlySpan <char> TrimNonLettersAndNonDigits(ReadOnlySpan <char> span) { // First, trim from the front. // If any Rune can't be decoded // (return value is anything other than "Done"), // or if the Rune is a letter or digit, // stop trimming from the front and // instead work from the end. while (Rune.DecodeFromUtf16(span, out Rune rune, out int charsConsumed) == OperationStatus.Done) { if (Rune.IsLetterOrDigit(rune)) { break; } span = span[charsConsumed..];
// Escapes a single scalar value into UTF-16 (using the scratch buffer) and then
// transcodes the escaped chars into the UTF-8 destination.
// Returns false with bytesWritten = 0 if the destination is too small;
// otherwise returns true and reports the number of bytes written.
private bool TryEncodeUnicodeScalarUtf8(uint unicodeScalar, Span<char> utf16ScratchBuffer, Span<byte> utf8Destination, out int bytesWritten)
{
    if (!TryEncodeUnicodeScalar(unicodeScalar, utf16ScratchBuffer, out int escapedLength))
    {
        // We really don't expect any encoder to exceed 24 escaped chars per input scalar.
        // If this happens, throw an exception and we can figure out if we want to support it
        // in the future.
        ThrowArgumentException_MaxOutputCharsPerInputChar();
    }

    // Transcode chars -> bytes one scalar at a time.
    utf16ScratchBuffer = utf16ScratchBuffer.Slice(0, escapedLength);
    int writeIndex = 0;

    while (!utf16ScratchBuffer.IsEmpty)
    {
        OperationStatus decodeStatus = Rune.DecodeFromUtf16(utf16ScratchBuffer, out Rune scalar, out int codeUnitCount);
        if (decodeStatus != OperationStatus.Done)
        {
            // Wrote bad UTF-16 data, we cannot transcode to UTF-8.
            ThrowArgumentException_MaxOutputCharsPerInputChar();
        }

        // The helper's result is emitted low byte first, shifting right by 8
        // after each byte until nothing remains.
        uint packedUtf8 = (uint)UnicodeHelpers.GetUtf8RepresentationForScalarValue((uint)scalar.Value);
        do
        {
            if (!SpanUtility.IsValidIndex(utf8Destination, writeIndex))
            {
                bytesWritten = 0; // ran out of space in the destination
                return false;
            }

            utf8Destination[writeIndex++] = (byte)packedUtf8;
            packedUtf8 >>= 8;
        }
        while (packedUtf8 != 0);

        utf16ScratchBuffer = utf16ScratchBuffer.Slice(codeUnitCount);
    }

    bytesWritten = writeIndex;
    return true;
}
// Returns the index of the first char that cannot be passed through as-is,
// or -1 when the whole input can. Scanning stops at the first scalar for which
// WillEncode returns true, and also at the first position where decoding does
// not report Done (which includes reaching the end of the input).
public override unsafe int FindFirstCharacterToEncode(char* text, int textLength)
{
    ReadOnlySpan<char> input = new ReadOnlySpan<char>(text, textLength);
    int offset = 0;

    while (true)
    {
        OperationStatus status = Rune.DecodeFromUtf16(input.Slice(offset), out Rune scalar, out int consumed);
        if (status != OperationStatus.Done || WillEncode(scalar.Value))
        {
            // Either we ran out of decodable data or this character needs to be escaped.
            break;
        }

        offset += consumed;
    }

    // Reaching the end means none of the characters in the string needs to be escaped.
    return offset == input.Length ? -1 : offset;
}
// Reports whether every scalar value in the (non-empty) text is sortable:
// the text must decode cleanly as UTF-16 and contain no private-use or
// unassigned code points.
private static bool IcuIsSortable(ReadOnlySpan<char> text)
{
    Debug.Assert(!GlobalizationMode.Invariant);
    Debug.Assert(!GlobalizationMode.UseNls);
    Debug.Assert(!text.IsEmpty);

    do
    {
        OperationStatus status = Rune.DecodeFromUtf16(text, out Rune scalar, out int consumed);
        if (status != OperationStatus.Done)
        {
            // found an unpaired surrogate somewhere in the text
            return false;
        }

        switch (Rune.GetUnicodeCategory(scalar))
        {
            case UnicodeCategory.PrivateUse:
            case UnicodeCategory.OtherNotAssigned:
                // can't sort private use or unassigned code points
                return false;
        }

        text = text.Slice(consumed);
    }
    while (!text.IsEmpty);

    // saw no unsortable data in the buffer
    return true;
}
[InlineData(new char[] { '\ud800', '\u1234' }, OperationStatus.InvalidData, 0xFFFD, 1)] // standalone high surrogate
public static void DecodeFromUtf16(char[] data, OperationStatus expectedOperationStatus, int expectedRuneValue, int expectedCharsConsumed)
{
    // Decode once, then check each of the three outputs independently.
    OperationStatus actualStatus = Rune.DecodeFromUtf16(data, out Rune actualRune, out int actualCharsConsumed);

    Assert.Equal(expectedOperationStatus, actualStatus);
    Assert.Equal(expectedRuneValue, actualRune.Value);
    Assert.Equal(expectedCharsConsumed, actualCharsConsumed);
}
// Cross-checks several UTF-8 decoding paths against each other using the bytes
// exposed by _data.Span. Progress is written to the console and any
// disagreement is reported by throwing. The phases are:
//   1. Encoding.UTF8.GetCharCount vs. a Rune-by-Rune count of UTF-16 chars.
//   2. Encoding.UTF8.GetChars vs. the count from phase 1.
//   3. Rune-by-Rune comparison of the UTF-8 input and the decoded UTF-16.
//   4. Utf8.ToUtf16 with replacement into an exact-size buffer.
//   5. Utf8.ToUtf16 with replacement into an oversized buffer.
//   6. Utf8.ToUtf16 without replacement, manually skipping invalid data.
//   7. Rebuilding the output of a "{BAD}" replacement-fallback decoder.
public void RunTest()
{
    Console.WriteLine("-- BEGIN TEST --");

    int encodingCharCount = Encoding.UTF8.GetCharCount(_data.Span);
    Console.WriteLine($"Encoding.UTF8.GetCharCount returned {encodingCharCount}.");

    // Phase 1: count UTF-16 chars by decoding one Rune at a time.
    {
        ReadOnlySpan<byte> input = _data.Span;
        int runeIterCharCount = 0;
        while (!input.IsEmpty)
        {
            Rune.DecodeFromUtf8(input, out Rune thisRune, out int bytesConsumed);
            runeIterCharCount += thisRune.Utf16SequenceLength; // ok if U+FFFD replacement
            input = input.Slice(bytesConsumed);
        }

        Console.WriteLine($"Rune iteration said there were {runeIterCharCount} UTF-16 chars.");
        if (encodingCharCount != runeIterCharCount)
        {
            throw new Exception("Rune iteration char count mismatch!!");
        }
    }

    // Phase 2: decode with Encoding.UTF8 and keep the chars as the reference output.
    char[] chars = new char[encodingCharCount];
    int charsWritten = Encoding.UTF8.GetChars(_data.Span, chars);
    Console.WriteLine($"Encoding.UTF8.GetChars returned {charsWritten} chars written.");
    if (encodingCharCount != charsWritten)
    {
        throw new Exception("GetChars return value mismatch!!");
    }

    // Phase 3: the UTF-8 input and the reference UTF-16 must yield the same Runes.
    {
        ReadOnlySpan<byte> inputUtf8 = _data.Span;
        ReadOnlySpan<char> inputUtf16 = chars;
        while (!inputUtf8.IsEmpty && !inputUtf16.IsEmpty)
        {
            Rune.DecodeFromUtf8(inputUtf8, out Rune inputUtf8Rune, out int bytesConsumed);
            Rune.DecodeFromUtf16(inputUtf16, out Rune inputUtf16Rune, out int charsConsumed);
            if (inputUtf8Rune != inputUtf16Rune)
            {
                throw new Exception("Enumerating runes mismatch!!");
            }

            inputUtf8 = inputUtf8.Slice(bytesConsumed);
            inputUtf16 = inputUtf16.Slice(charsConsumed);
        }

        // The loop stops when either side is empty, so equal lengths here
        // means both sides were exhausted together.
        if (inputUtf8.Length != inputUtf16.Length)
        {
            throw new Exception("Rune enumeration returned mismatched lengths!");
        }
    }

    // Phase 4.
    Console.WriteLine("Running ToUtf16 with replace=true and exact size buffer.");
    {
        char[] chars2 = new char[chars.Length];
        OperationStatus opStatus = Utf8.ToUtf16(_data.Span, chars2, out int bytesReadJustNow, out int charsWrittenJustNow, replaceInvalidSequences: true, isFinalBlock: true);
        if (opStatus != OperationStatus.Done)
        {
            throw new Exception("Utf8.ToUtf16 returned wrong OperationStatus!!");
        }

        if (bytesReadJustNow != _data.Memory.Length)
        {
            throw new Exception("Utf8.ToUtf16 didn't read entire input!!");
        }

        if (charsWrittenJustNow != chars2.Length)
        {
            throw new Exception("Utf8.ToUtf16 didn't fill entire response buffer!!");
        }

        if (!chars.SequenceEqual(chars2))
        {
            throw new Exception("Utf8.ToUtf16 returned different data than Encoding.UTF8.GetChars!!");
        }
    }

    // Phase 5.
    Console.WriteLine("Running ToUtf16 with replace=true and extra large buffer.");
    {
        char[] chars2 = new char[chars.Length + 1024];
        OperationStatus opStatus = Utf8.ToUtf16(_data.Span, chars2, out int bytesReadJustNow, out int charsWrittenJustNow, replaceInvalidSequences: true, isFinalBlock: true);
        if (opStatus != OperationStatus.Done)
        {
            throw new Exception("Utf8.ToUtf16 returned wrong OperationStatus!!");
        }

        if (bytesReadJustNow != _data.Memory.Length)
        {
            throw new Exception("Utf8.ToUtf16 didn't read entire input!!");
        }

        if (charsWrittenJustNow != chars.Length)
        {
            throw new Exception("Utf8.ToUtf16 didn't fill entire response buffer!!");
        }

        if (!chars2.AsSpan(0, charsWrittenJustNow).SequenceEqual(chars))
        {
            throw new Exception("Utf8.ToUtf16 returned different data than Encoding.UTF8.GetChars!!");
        }
    }

    // Phase 6: without replacement, every chunk that ToUtf16 did transcode must
    // be well-formed on both sides and decode to identical Runes.
    Console.WriteLine("Running ToUtf16 with replace=false and extra large buffer.");
    {
        ReadOnlySpan<byte> input = _data.Span;
        Span<char> output = new char[chars.Length + 1024];
        while (!input.IsEmpty)
        {
            OperationStatus opStatus = Utf8.ToUtf16(input, output, out int bytesReadJustNow, out int charsWrittenJustNow, replaceInvalidSequences: false, isFinalBlock: true);
            ReadOnlySpan<byte> dataReadJustNow = input.Slice(0, bytesReadJustNow);
            ReadOnlySpan<char> dataWrittenJustNow = output.Slice(0, charsWrittenJustNow);

            while (!dataReadJustNow.IsEmpty && !dataWrittenJustNow.IsEmpty)
            {
                OperationStatus utf8Status = Rune.DecodeFromUtf8(dataReadJustNow, out Rune inputUtf8Rune, out int bytesConsumed);
                OperationStatus utf16Status = Rune.DecodeFromUtf16(dataWrittenJustNow, out Rune inputUtf16Rune, out int charsConsumed);
                if (utf8Status != OperationStatus.Done)
                {
                    throw new Exception("DecodeFromUtf8 returned unexpected value!!");
                }

                if (utf16Status != OperationStatus.Done)
                {
                    throw new Exception("DecodeFromUtf16 returned unexpected value!!");
                }

                if (inputUtf8Rune != inputUtf16Rune)
                {
                    throw new Exception("Enumerating runes mismatch!!");
                }

                dataReadJustNow = dataReadJustNow.Slice(bytesConsumed);
                dataWrittenJustNow = dataWrittenJustNow.Slice(charsConsumed);
            }

            if (dataReadJustNow.Length != dataWrittenJustNow.Length)
            {
                throw new Exception("Unexpected length mismatch!!");
            }

            input = input.Slice(bytesReadJustNow);
            if (opStatus != OperationStatus.Done)
            {
                // Skip over invalid data
                Rune.DecodeFromUtf8(input, out _, out int bytesToSkip);
                input = input.Slice(bytesToSkip);
            }
        }
    }

    // Phase 7: emulate a replacement-fallback decoder by appending "{BAD}"
    // wherever ToUtf16 stops on invalid data.
    Console.WriteLine("Trying custom decoder replacement.");
    {
        // use a custom replacement string
        Encoding encoding = Encoding.GetEncoding("utf-8", EncoderFallback.ExceptionFallback, new DecoderReplacementFallback("{BAD}"));
        string decoded = encoding.GetString(_data.Span);

        ReadOnlySpan<byte> input = _data.Span;
        char[] decoded2 = new char[decoded.Length];
        StringBuilder builder = new StringBuilder();
        while (!input.IsEmpty)
        {
            OperationStatus opStatus = Utf8.ToUtf16(input, decoded2, out int bytesReadJustNow, out int charsWrittenJustNow, replaceInvalidSequences: false, isFinalBlock: true);
            builder.Append(decoded2, 0, charsWrittenJustNow);
            input = input.Slice(bytesReadJustNow);
            if (opStatus != OperationStatus.Done)
            {
                // Skip over invalid data
                Rune.DecodeFromUtf8(input, out _, out int bytesToSkip);
                input = input.Slice(bytesToSkip);
                builder.Append("{BAD}");
            }
        }

        // 'decoded' is already a string, so compare it directly instead of
        // allocating a copy of it first.
        if (decoded != builder.ToString())
        {
            throw new Exception("Custom decoder replacement failed!!");
        }
    }

    Console.WriteLine("-- END TEST - SUCCESS --");
}
/*
 * OperationStatus-based APIs for transcoding of chunked data.
 * This method is similar to Encoding.UTF8.GetBytes / GetChars but has a
 * different calling convention, different error handling mechanisms, and
 * different performance characteristics.
 *
 * If 'replaceInvalidSequences' is true, the method will replace any ill-formed
 * subsequence in the source with U+FFFD when transcoding to the destination,
 * then it will continue processing the remainder of the buffers. Otherwise
 * the method will return OperationStatus.InvalidData.
 *
 * If the method does return an error code, the out parameters will represent
 * how much of the data was successfully transcoded, and the location of the
 * ill-formed subsequence can be deduced from these values.
 *
 * If 'replaceInvalidSequences' is true, the method is guaranteed never to return
 * OperationStatus.InvalidData. If 'isFinalBlock' is true, the method is
 * guaranteed never to return OperationStatus.NeedMoreData.
 */

/// <summary>
/// Transcodes the UTF-16 <paramref name="source"/> buffer to <paramref name="destination"/> as UTF-8.
/// </summary>
/// <param name="source">The UTF-16 chars to transcode.</param>
/// <param name="destination">The buffer that receives the UTF-8 bytes.</param>
/// <param name="numCharsRead">On return, the number of chars consumed from <paramref name="source"/>.</param>
/// <param name="numBytesWritten">On return, the number of bytes written to <paramref name="destination"/>.</param>
/// <param name="replaceInvalidSequences">Whether ill-formed input is replaced with U+FFFD instead of stopping.</param>
/// <param name="isFinalBlock">Whether a trailing incomplete sequence is treated as invalid rather than as a request for more data.</param>
/// <returns>
/// <see cref="OperationStatus.Done"/> when the whole source was consumed; otherwise the status
/// describing why transcoding stopped.
/// </returns>
/// <remarks>
/// If <paramref name="replaceInvalidSequences"/> is <see langword="true"/>, invalid UTF-16 sequences
/// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
/// this method will not return <see cref="OperationStatus.InvalidData"/>.
/// </remarks>
public static OperationStatus FromUtf16(ReadOnlySpan<char> source, Span<byte> destination, out int numCharsRead, out int numBytesWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
{
    int initialSourceLength = source.Length;
    int initialDestinationLength = destination.Length;
    OperationStatus result = OperationStatus.Done;

    // Read and transcode one scalar value per iteration.
    while (!source.IsEmpty)
    {
        OperationStatus decodeStatus = Rune.DecodeFromUtf16(source, out Rune scalar, out int charsConsumed);

        if (decodeStatus == OperationStatus.NeedMoreData)
        {
            // Input buffer ended with a high surrogate. This is only an error if
            // the caller said no further data is coming.
            if (!isFinalBlock)
            {
                result = OperationStatus.NeedMoreData;
                break;
            }

            decodeStatus = OperationStatus.InvalidData;
        }

        if (decodeStatus == OperationStatus.InvalidData)
        {
            // Without U+FFFD replacement, ill-formed input ends the operation.
            if (!replaceInvalidSequences)
            {
                result = OperationStatus.InvalidData;
                break;
            }

            scalar = Rune.ReplacementChar;
        }

        // The scalar to emit is now known; the only remaining failure is a
        // destination that cannot hold its UTF-8 form.
        if (!scalar.TryEncodeToUtf8(destination, out int bytesWritten))
        {
            result = OperationStatus.DestinationTooSmall;
            break;
        }

        // Advance by what the decoder consumed, not by the scalar's own length;
        // substitution may have taken place.
        source = source.Slice(charsConsumed);
        destination = destination.Slice(bytesWritten);
    }

    numCharsRead = initialSourceLength - source.Length;
    numBytesWritten = initialDestinationLength - destination.Length;

    Debug.Assert((result == OperationStatus.Done) == (numCharsRead == initialSourceLength), "Should report OperationStatus.Done if and only if we've consumed the entire input buffer.");
    return result;
}
/// <summary> /// Peek at the next <see cref="Rune"/> without advancing the position /// </summary> /// <param name="charsConsumed">The amount of <see cref="Char"/> used by the <see cref="Rune"/>.</param> /// <returns>The next <see cref="Rune"/> in the <see cref="Source"/>.</returns> internal Rune PeekRune(out Int32 charsConsumed) { ReadOnlySpan <Char> peek = Peek(2); switch (Rune.DecodeFromUtf16(peek, out Rune result, out charsConsumed)) {