/// <summary>
/// Builds a <see cref="string"/> by re-encoding every code point yielded by
/// <c>CodePoints</c> as little-endian UTF-16.
/// </summary>
/// <returns>
/// The resulting UTF-16 string, or <see cref="string.Empty"/> when there is nothing to encode.
/// </returns>
/// <exception cref="InvalidOperationException">
/// <c>Utf16LittleEndianEncoder.TryEncodeCodePoint</c> rejected one of the code points.
/// </exception>
public override string ToString()
{
    // Small results are assembled on the stack; larger ones fall back to a heap array.
    const int MaxStackChars = 256;

    unsafe
    {
        // Pass 1: measure the exact UTF-16 size.
        // The encoder itself is asked how many bytes each code point needs. Counting an extra
        // unit only when IsSurrogate(codePoint) is true under-sizes the buffer: the decoder in
        // this file uses IsSurrogate as a surrogate-range test on 16-bit units, so it is false
        // for supplementary code points (>= U+10000), which are the ones needing two units.
        // A UTF-16 code point never needs more than 4 bytes, so a 4-byte scratch span suffices.
        // TODO: Optimize for characters of length 1 or 2 in UTF-8 representation (no need to read anything)
        byte *scratch = stackalloc byte[4];
        Span <byte> scratchSpan = new Span <byte>(scratch, 4);

        int byteCount = 0;
        foreach (var codePoint in CodePoints)
        {
            int bytesNeeded;
            if (!Utf16LittleEndianEncoder.TryEncodeCodePoint(codePoint, scratchSpan, out bytesNeeded))
            {
                throw new InvalidOperationException("invalid character");
            }

            byteCount += bytesNeeded;
        }

        int len = byteCount / 2;
        if (len == 0)
        {
            // Nothing to encode; also avoids a zero-length stackalloc below.
            return string.Empty;
        }

        // Pass 2: encode into the destination buffer.
        Span <byte> buffer;
        char *stackChars = null;
        char[] characters = null;

        if (len <= MaxStackChars)
        {
            char *stackallocedChars = stackalloc char[len];
            stackChars = stackallocedChars;
            buffer = new Span <byte>(stackChars, len * 2);
        }
        else
        {
            // HACK: Can System.Buffers be used here?
            characters = new char[len];
            buffer = characters.Slice().Cast <char, byte>();
        }

        foreach (var codePoint in CodePoints)
        {
            int bytesEncoded;
            if (!Utf16LittleEndianEncoder.TryEncodeCodePoint(codePoint, buffer, out bytesEncoded))
            {
                throw new InvalidOperationException("invalid character");
            }

            // Advance past the bytes just written.
            buffer = buffer.Slice(bytesEncoded);
        }

        // TODO: We already have a char[] and this will copy, how to avoid that
        return stackChars != null
            ? new string(stackChars, 0, len)
            : new string(characters);
    }
}
/// <summary>
/// Decodes a single code point from the start of a little-endian UTF-16 byte buffer.
/// </summary>
/// <param name="buffer">Bytes to decode; only the first 2 or 4 bytes are read.</param>
/// <param name="codePoint">
/// Receives the decoded code point on success; <c>default(UnicodeCodePoint)</c> on failure.
/// </param>
/// <param name="encodedBytes">
/// Receives the number of bytes consumed (2, or 4 for a surrogate pair) on success; 0 on failure.
/// </param>
/// <returns>
/// <c>true</c> when a code point was decoded. <c>false</c> when the buffer holds fewer than
/// 2 bytes, when the first unit is in the surrogate range but is not a high surrogate, when a
/// high surrogate is followed by fewer than 2 more bytes, or when the unit after a high
/// surrogate is not a low surrogate.
/// </returns>
public static bool TryDecodeCodePoint(Span <byte> buffer, out UnicodeCodePoint codePoint, out int encodedBytes)
{
    if (buffer.Length < 2)
    {
        // buffer too small
        codePoint = default(UnicodeCodePoint);
        encodedBytes = default(int);
        return false;
    }

    // First 16-bit unit, little-endian: low byte first.
    uint codePointValue = (uint)buffer[0] | ((uint)buffer[1] << 8);
    encodedBytes = 2;

    if (UnicodeCodePoint.IsSurrogate((UnicodeCodePoint)codePointValue))
    {
        // TODO: Check if compiler optimized it so codePointValue low range is checked only once
        if (!UnicodeCodePoint.IsHighSurrogate((UnicodeCodePoint)codePointValue) || buffer.Length < 4)
        {
            // invalid high surrogate or buffer too small
            codePoint = default(UnicodeCodePoint);
            encodedBytes = default(int);
            return false;
        }

        unchecked
        {
            codePointValue -= UnicodeConstants.Utf16HighSurrogateFirstCodePoint;
            encodedBytes += 2;
        }

        // The high surrogate carries the upper 10 bits of (code point - 0x10000).
        codePointValue <<= 10;

        uint lowSurrogate = (uint)buffer[2] | ((uint)buffer[3] << 8);
        if (!UnicodeCodePoint.IsLowSurrogate((UnicodeCodePoint)lowSurrogate))
        {
            // invalid low surrogate character
            codePoint = default(UnicodeCodePoint);
            encodedBytes = default(int);
            return false;
        }

        unchecked
        {
            lowSurrogate -= UnicodeConstants.Utf16LowSurrogateFirstCodePoint;
        }

        // The low surrogate carries the lower 10 bits.
        codePointValue |= lowSurrogate;

        // A surrogate pair encodes (code point - 0x10000); add the offset back so the result
        // lands in the supplementary planes (U+10000..U+10FFFF) instead of 0x10000 too low.
        // NOTE(review): assumes Utf16HighSurrogateFirstCodePoint is 0xD800 - confirm.
        codePointValue += 0x10000u;
    }

    codePoint = (UnicodeCodePoint)codePointValue;
    return true;
}
/// <summary>
/// Builds a <see cref="string"/> by re-encoding every code point yielded by
/// <c>CodePoints</c> as little-endian UTF-16 into a pinned <c>char[]</c>.
/// </summary>
/// <returns>
/// The resulting UTF-16 string, or <see cref="string.Empty"/> when there is nothing to encode.
/// </returns>
/// <exception cref="InvalidOperationException">
/// <c>Utf16LittleEndianEncoder.TryEncodeCodePoint</c> rejected one of the code points.
/// </exception>
public override string ToString()
{
    unsafe
    {
        // Pass 1: measure the exact UTF-16 size.
        // The encoder itself is asked how many bytes each code point needs. Counting an extra
        // unit only when IsSurrogate(codePoint) is true under-sizes the array: IsSurrogate is
        // used elsewhere as a surrogate-range test on 16-bit units, so it is false for
        // supplementary code points (>= U+10000), which are the ones needing two units.
        // A UTF-16 code point never needs more than 4 bytes, so a 4-byte scratch span suffices.
        // TODO: Optimize for characters of length 1 or 2 in UTF-8 representation (no need to read anything)
        byte *scratch = stackalloc byte[4];
        Span <byte> scratchSpan = new Span <byte>(scratch, 4);

        int byteCount = 0;
        foreach (var codePoint in CodePoints)
        {
            int bytesNeeded;
            if (!Utf16LittleEndianEncoder.TryEncodeCodePoint(codePoint, scratchSpan, out bytesNeeded))
            {
                throw new InvalidOperationException("invalid character");
            }

            byteCount += bytesNeeded;
        }

        int len = byteCount / 2;
        if (len == 0)
        {
            // Nothing to encode; skip the allocation and pinning entirely.
            return string.Empty;
        }

        // Pass 2: encode straight into the array's memory, viewed as bytes.
        char[] characters = new char[len];
        fixed(char *pinnedCharacters = characters)
        {
            Span <byte> buffer = new Span <byte>((byte *)pinnedCharacters, len * 2);
            foreach (var codePoint in CodePoints)
            {
                int bytesEncoded;
                if (!Utf16LittleEndianEncoder.TryEncodeCodePoint(codePoint, buffer, out bytesEncoded))
                {
                    throw new InvalidOperationException("invalid character");
                }

                // Advance past the bytes just written.
                buffer = buffer.Slice(bytesEncoded);
            }
        }

        // TODO: We already have a char[] and this will copy, how to avoid that
        return new string(characters);
    }
}