/// <summary>
/// Counts how many bytes the UTF-8 encoding of a UTF-16 character span occupies,
/// without keeping the encoded output.
/// </summary>
/// <param name="utf16source">The UTF-16 characters to measure.</param>
/// <returns>The total number of UTF-8 bytes produced for the whole input.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// A code point could not be decoded from <paramref name="utf16source"/>.
/// </exception>
/// <exception cref="NotImplementedException">
/// A code point could not be encoded into the fixed scratch buffer.
/// </exception>
public static int ComputeEncodedBytes(ReadOnlySpan<char> utf16source)
{
    // Fixed 32-byte scratch area; every code point is encoded at its start and the
    // bytes are discarded, so only the count survives.
    Span<byte> scratch;
    unsafe
    {
        byte* scratchMemory = stackalloc byte[32];
        scratch = new Span<byte>(scratchMemory, 32);
    }

    var sourceBytes = utf16source.Cast<char, byte>();
    int totalBytes = 0;
    int offset = 0;

    while (offset < sourceBytes.Length)
    {
        UnicodeCodePoint codePoint;
        int consumed;
        if (!Utf16LittleEndianEncoder.TryDecodeCodePoint(sourceBytes.Slice(offset), out codePoint, out consumed))
        {
            throw new ArgumentOutOfRangeException(nameof(utf16source));
        }

        // The decoder reports how far to advance; the loop has no step of its own.
        offset += consumed;

        int produced;
        if (!TryEncodeCodePoint(codePoint, scratch, out produced))
        {
            throw new NotImplementedException("this should resize the buffer");
        }

        totalBytes += produced;
    }

    return totalBytes;
}
// TODO: this routine needs to be optimized.
/// <summary>
/// Transcodes a UTF-16 character span into UTF-8, writing into the supplied destination.
/// </summary>
/// <param name="utf16source">The UTF-16 characters to transcode.</param>
/// <param name="utf8Destination">The span receiving the UTF-8 bytes.</param>
/// <param name="encodedBytes">
/// The number of bytes written so far; on a <c>false</c> return this is the count
/// written before the code point that could not be encoded.
/// </param>
/// <returns>
/// <c>true</c> when every code point was encoded; <c>false</c> as soon as one code point
/// cannot be encoded into the remaining destination.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">
/// A code point could not be decoded from <paramref name="utf16source"/>.
/// </exception>
public static bool TryEncode(ReadOnlySpan<char> utf16source, Span<byte> utf8Destination, out int encodedBytes)
{
    var sourceBytes = utf16source.Cast<char, byte>();
    encodedBytes = 0;

    int offset = 0;
    while (offset < sourceBytes.Length)
    {
        UnicodeCodePoint codePoint;
        int consumed;
        if (!Utf16LittleEndianEncoder.TryDecodeCodePoint(sourceBytes.Slice(offset), out codePoint, out consumed))
        {
            throw new ArgumentOutOfRangeException(nameof(utf16source));
        }

        offset += consumed;

        int produced;
        if (!TryEncodeCodePoint(codePoint, utf8Destination, out produced))
        {
            return false;
        }

        // Shrink the destination window so the next code point lands after this one.
        utf8Destination = utf8Destination.Slice(produced);
        encodedBytes += produced;
    }

    return true;
}
/// <summary>
/// Decodes a UTF-8 byte range into UTF-16 characters appended to the backing
/// <c>str</c> storage, and returns a <see cref="ValueString"/> describing the appended range.
/// </summary>
/// <param name="arr">The array holding the UTF-8 bytes.</param>
/// <param name="start">Index in <paramref name="arr"/> of the first byte to decode.</param>
/// <param name="length">Number of bytes to decode.</param>
/// <returns>A <see cref="ValueString"/> over the newly written characters.</returns>
/// <exception cref="ArgumentNullException"><paramref name="arr"/> is null.</exception>
/// <exception cref="ArgumentException">
/// <paramref name="start"/> or <paramref name="length"/> is negative, or the range
/// extends past the end of <paramref name="arr"/>.
/// </exception>
/// <exception cref="OverflowException">
/// The space requested for <paramref name="length"/> does not fit in an <see cref="int"/>.
/// </exception>
public unsafe ValueString CreateValueStringFromUtf8(byte[] arr, int start, int length)
{
    if (arr == null)
    {
        throw new ArgumentNullException(nameof(arr));
    }

    // Written as a subtraction so that a huge start/length cannot wrap around and
    // slip past the check the way "start + length > arr.Length" could.
    if (start < 0 || length < 0 || start > arr.Length - length)
    {
        throw new ArgumentException();
    }

    // checked: a wrapped (negative) request must not reach EnsureSpace, because the
    // loop below writes through a raw pointer.
    // NOTE(review): EnsureSpace is assumed to guarantee room after `used` - confirm.
    EnsureSpace(checked(length * 2));

    fixed (char* ptr = str)
    {
        var ptrx = ptr + used;
        var offset = used;

        foreach (var codePoint in new Utf8String.CodePointEnumerable(arr, start, length))
        {
            int charsEncoded;
            if (!Utf16LittleEndianEncoder.TryEncodeCodePoint(codePoint, ptrx, out charsEncoded))
            {
                // TODO: Change Exception type
                throw new Exception("invalid character");
            }

            ptrx += charsEncoded;
        }

        // Characters appended by this call = distance the write cursor moved.
        var strlen = (int)(ptrx - ptr - used);
        used += strlen;
        return new ValueString(str, offset, strlen);
    }
}
private const byte b1111_1000U = 0xF8; //248

#endregion Constants

#region Decoding implementation

/// <summary>
/// Transcodes a span of UTF-8 bytes into UTF-16 characters, consuming as much of the
/// input as it can.
///
/// When the method returns <c>true</c>, all of <paramref name="utf8"/> was processed:
/// <paramref name="bytesConsumed"/> equals the length of <paramref name="utf8"/> and
/// <paramref name="charactersWritten"/> is the total written to <paramref name="utf16"/>.
///
/// When the method returns <c>false</c>, processing stopped early for one of two reasons:
/// 1) The output buffer could not take the next code point. Calling again with the input
///    sliced past <paramref name="bytesConsumed"/> and a fresh output buffer continues the work.
/// 2) The input contains a sequence that could not be decoded.
/// </summary>
/// <param name="utf8">A span containing a sequence of UTF-8 bytes.</param>
/// <param name="utf16">A span to write the UTF-16 data into.</param>
/// <param name="bytesConsumed">On exit, the number of bytes consumed from <paramref name="utf8"/>.</param>
/// <param name="charactersWritten">On exit, the running total reported by the UTF-16 encoder for <paramref name="utf16"/>.</param>
/// <returns>True if the whole input was transcoded into the output buffer, otherwise false.</returns>
public static bool TryDecode(ReadOnlySpan<byte> utf8, Span<char> utf16, out int bytesConsumed, out int charactersWritten)
{
    bytesConsumed = 0;
    charactersWritten = 0;

    while (bytesConsumed < utf8.Length)
    {
        uint codePoint;
        int consumed;
        int written;

        // One step = decode a code point at the read offset, then encode it at the
        // write offset. Either half failing ends the call with the progress so far.
        if (!(TryDecodeCodePoint(utf8, bytesConsumed, out codePoint, out consumed)
              && Utf16LittleEndianEncoder.TryEncode(codePoint, utf16, charactersWritten, out written)))
        {
            return false;
        }

        bytesConsumed += consumed;
        charactersWritten += written;
    }

    return true;
}
/// <summary>
/// Builds a <see cref="string"/> by encoding every code point of this instance as UTF-16.
/// </summary>
/// <returns>The UTF-16 string made from <c>CodePoints</c>.</returns>
/// <exception cref="Exception">A code point could not be encoded into the buffer.</exception>
public override string ToString()
{
    // Pass 1: measure. A code point inside the BMP takes one char, anything else two.
    // TODO: Optimize for characters of length 1 or 2 in UTF-8 representation (no need to read anything)
    // TODO: is compiler gonna do the right thing here?
    // TODO: Should we use Linq's Count()?
    int utf16Length = 0;
    foreach (var codePoint in CodePoints)
    {
        utf16Length += UnicodeCodePoint.IsBmp(codePoint) ? 1 : 2;
    }

    unsafe
    {
        Span<byte> destination;
        char* stackBuffer = null;
        char[] heapBuffer = null;

        if (utf16Length > 256)
        {
            // HACK: Can System.Buffers be used here?
            heapBuffer = new char[utf16Length];
            destination = heapBuffer.Slice().Cast<char, byte>();
        }
        else
        {
            // Up to 256 chars are staged in stackalloc'ed memory instead of a heap array.
            char* scratch = stackalloc char[utf16Length];
            stackBuffer = scratch;
            destination = new Span<byte>(stackBuffer, utf16Length * 2);
        }

        // Pass 2: encode each code point, advancing the window past what was written.
        foreach (var codePoint in CodePoints)
        {
            int written;
            if (!Utf16LittleEndianEncoder.TryEncodeCodePoint(codePoint, destination, out written))
            {
                // TODO: Change Exception type
                throw new Exception("invalid character");
            }

            destination = destination.Slice(written);
        }

        // TODO: We already have a char[] and this will copy, how to avoid that
        if (stackBuffer == null)
        {
            return new string(heapBuffer);
        }

        return new string(stackBuffer, 0, utf16Length);
    }
}
/// <summary>
/// Builds a <see cref="string"/> by encoding every code point of this instance as UTF-16.
/// </summary>
/// <returns>The UTF-16 string made from <c>CodePoints</c>.</returns>
/// <exception cref="Exception">A code point could not be encoded into the buffer.</exception>
public override string ToString()
{
    // get length first
    // TODO: Optimize for characters of length 1 or 2 in UTF-8 representation (no need to read anything)
    // TODO: is compiler gonna do the right thing here?
    // TODO: Should we use Linq's Count()?
    int len = 0;
    foreach (var codePoint in CodePoints)
    {
        len++;

        // A code point outside the Basic Multilingual Plane is written as a surrogate
        // pair and so needs a second char. The code point itself is never a surrogate,
        // so testing IsSurrogate here undercounted and made the encode loop fail.
        if (!UnicodeCodePoint.IsBmp(codePoint))
        {
            len++;
        }
    }

    char[] characters = new char[len];

    unsafe
    {
        fixed (char* pinnedCharacters = characters)
        {
            // View the pinned char[] as bytes: the encoder writes UTF-16 as raw bytes.
            Span<byte> buffer = new Span<byte>((byte*)pinnedCharacters, len * 2);

            foreach (var codePoint in CodePoints)
            {
                int bytesEncoded;
                if (!Utf16LittleEndianEncoder.TryEncodeCodePoint(codePoint, buffer, out bytesEncoded))
                {
                    // TODO: Change Exception type
                    throw new Exception("invalid character");
                }

                buffer = buffer.Slice(bytesEncoded);
            }
        }
    }

    // TODO: We already have a char[] and this will copy, how to avoid that
    return new string(characters);
}
/// <summary>
/// Writes the characters of <paramref name="value"/> from <paramref name="startIndex"/>
/// while the position is at or below <paramref name="endIndex"/>, encoded as UTF-8,
/// to <paramref name="writer"/>.
/// </summary>
/// <param name="writer">The writer that supplies buffers and receives commits.</param>
/// <param name="value">The UTF-16 string to read from.</param>
/// <param name="startIndex">Index of the first char to read.</param>
/// <param name="endIndex">Highest index at which a new code point may start.</param>
/// <exception cref="ArgumentException">A code point could not be decoded from <paramref name="value"/>.</exception>
private static void AppendStringInternal(this IWriter writer, string value, int startIndex, int endIndex)
{
    int position = startIndex;

    while (position <= endIndex)
    {
        UnicodeCodePoint codePoint;
        int charsRead;
        if (!Utf16LittleEndianEncoder.TryDecodeCodePointFromString(value, position, out codePoint, out charsRead))
        {
            throw new ArgumentException();
        }

        position += charsRead;

        // Request room for one code point, encode into it, then commit what was written.
        var target = writer.GetFreeBuffer(MaxUtf8CodePointBytes).ToSpan();
        int bytesWritten;
        bool encoded = Utf8Encoder.TryEncodeCodePoint(codePoint, target, out bytesWritten);
        Debug.Assert(encoded);

        writer.CommitBytes(bytesWritten);
    }
}
// TODO: This should return Utf16CodeUnits which should wrap byte[]/Span<byte>, same for other encoders
/// <summary>
/// Converts a UTF-16 string to a newly allocated array holding its UTF-8 encoding.
/// Works in two passes: the first validates the string and measures the output, the
/// second encodes into an array of exactly that size.
/// </summary>
/// <param name="s">The string to convert.</param>
/// <returns>The UTF-8 bytes of <paramref name="s"/>.</returns>
/// <exception cref="ArgumentException">
/// A code point could not be decoded from <paramref name="s"/>.
/// </exception>
/// <exception cref="Exception">An internal consistency check failed.</exception>
private static byte[] GetUtf8BytesFromString(string s)
{
    // Pass 1: validate every code point and add up the UTF-8 size.
    int len = 0;
    for (int i = 0; i < s.Length; /* intentionally no increment */)
    {
        UnicodeCodePoint codePoint;
        int encodedChars;
        if (!Utf16LittleEndianEncoder.TryDecodeCodePointFromString(s, i, out codePoint, out encodedChars))
        {
            // ArgumentException takes (message, paramName); they were previously swapped.
            throw new ArgumentException("Invalid surrogate pair in the string.", nameof(s));
        }

        // Without forward progress the loop would never end.
        if (encodedChars <= 0)
        {
            // TODO: Fix exception type
            throw new Exception("internal error");
        }

        int encodedBytes = Utf8Encoder.GetNumberOfEncodedBytes(codePoint);
        if (encodedBytes == 0)
        {
            // TODO: Fix exception type
            throw new Exception("Internal error: Utf16Decoder somehow got CodePoint out of range");
        }

        len += encodedBytes;
        i += encodedChars;
    }

    // Pass 2: encode into an array of the measured size.
    byte[] bytes = new byte[len];
    unsafe
    {
        fixed (byte* array_pinned = bytes)
        {
            Span<byte> p = new Span<byte>(array_pinned, len);
            for (int i = 0; i < s.Length; /* intentionally no increment */)
            {
                UnicodeCodePoint codePoint;
                int encodedChars;
                if (!Utf16LittleEndianEncoder.TryDecodeCodePointFromString(s, i, out codePoint, out encodedChars))
                {
                    // TODO: Fix exception type
                    throw new Exception("Internal error: we did pre-validation of the string, nothing should go wrong");
                }

                i += encodedChars;

                int encodedBytes;
                if (!Utf8Encoder.TryEncodeCodePoint(codePoint, p, out encodedBytes))
                {
                    // TODO: Fix exception type
                    throw new Exception("Internal error: Utf16Decoder somehow got CodePoint out of range or the buffer is too small");
                }

                // Move the window past the bytes just written.
                p = p.Slice(encodedBytes);
            }
        }
    }

    return bytes;
}