/// <summary> /// Calculates the byte count needed to encode the UTF-16 bytes from the specified UTF-8 sequence. /// /// This method will consume as many of the input bytes as possible. /// </summary> /// <param name="source">A span containing a sequence of UTF-8 bytes.</param> /// <param name="bytesNeeded">On exit, contains the number of bytes required for encoding from the <paramref name="source"/>.</param> /// <returns>A <see cref="TransformationStatus"/> value representing the expected state of the conversion.</returns> public unsafe static TransformationStatus ComputeEncodedBytesFromUtf8(ReadOnlySpan <byte> source, out int bytesNeeded) { fixed(byte *pUtf8 = &source.DangerousGetPinnableReference()) { byte *pSrc = pUtf8; byte *pSrcEnd = pSrc + source.Length; bytesNeeded = 0; int ch = 0; while (pSrc < pSrcEnd) { int availableBytes = EncodingHelper.PtrDiff(pSrcEnd, pSrc); // don't fall into the fast decoding loop if we don't have enough bytes if (availableBytes <= 13) { // try to get over the remainder of the ascii characters fast though byte *pLocalEnd = pSrc + availableBytes; while (pSrc < pLocalEnd) { ch = *pSrc; pSrc++; if (ch > 0x7F) { goto LongCodeSlow; } bytesNeeded++; } // we are done break; } // To compute the upper bound, assume that all characters are ASCII characters at this point, // the boundary will be decreased for every non-ASCII character we encounter // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences byte *pStop = pSrc + availableBytes - 7; // Fast loop while (pSrc < pStop) { ch = *pSrc; pSrc++; if (ch > 0x7F) { goto LongCode; } bytesNeeded++; // 2-byte align if ((unchecked ((int)pSrc) & 0x1) != 0) { ch = *pSrc; pSrc++; if (ch > 0x7F) { goto LongCode; } bytesNeeded++; } // 4-byte align if ((unchecked ((int)pSrc) & 0x2) != 0) { ch = *(ushort *)pSrc; if ((ch & 0x8080) != 0) { goto LongCodeWithMask16; } pSrc += 2; bytesNeeded += 2; } // Run 8 characters at a time! while (pSrc < pStop) { ch = *(int *)pSrc; int chb = *(int *)(pSrc + 4); if (((ch | chb) & unchecked ((int)0x80808080)) != 0) { goto LongCodeWithMask32; } pSrc += 8; bytesNeeded += 8; } break; #if BIGENDIAN LongCodeWithMask32: // be careful about the sign extension ch = (int)(((uint)ch) >> 16); LongCodeWithMask16: ch = (int)(((uint)ch) >> 8); #else // BIGENDIAN LongCodeWithMask32: LongCodeWithMask16: ch &= 0xFF; #endif // BIGENDIAN pSrc++; if (ch <= 0x7F) { bytesNeeded++; continue; } LongCode: int chc = *pSrc; pSrc++; // Bit 6 should be 0, and trailing byte should be 10vvvvvv if ((ch & 0x40) == 0 || (chc & unchecked ((sbyte)0xC0)) != 0x80) { goto InvalidData; } chc &= 0x3F; if ((ch & 0x20) != 0) { // Handle 3 or 4 byte encoding. // Fold the first 2 bytes together chc |= (ch & 0x0F) << 6; if ((ch & 0x10) != 0) { // 4 byte - surrogate pair ch = *pSrc; // Bit 4 should be zero + the surrogate should be in the range 0x000000 - 0x10FFFF // and the trailing byte should be 10vvvvvv if (!EncodingHelper.InRange(chc >> 4, 0x01, 0x10) || (ch & unchecked ((sbyte)0xC0)) != 0x80) { goto InvalidData; } // Merge 3rd byte then read the last byte chc = (chc << 6) | (ch & 0x3F); ch = *(pSrc + 1); // The last trailing byte still holds the form 10vvvvvv if ((ch & unchecked ((sbyte)0xC0)) != 0x80) { goto InvalidData; } pSrc += 2; ch = (chc << 6) | (ch & 0x3F); bytesNeeded++; ch = (ch & 0x3FF) + unchecked ((short)(EncodingHelper.LowSurrogateStart)); } else { // 3 byte encoding ch = *pSrc; // Check for non-shortest form of 3 byte sequence // No surrogates // Trailing byte must be in the form 10vvvvvv if ((chc & (0x1F << 5)) == 0 || (chc & (0xF800 >> 6)) == (0xD800 >> 6) || (ch & unchecked ((sbyte)0xC0)) != 0x80) { goto InvalidData; } pSrc++; ch = (chc << 6) | (ch & 0x3F); } // extra byte, we're already planning 2 chars for 2 of these bytes, // but the big loop is testing the target against pStop, so we need // to subtract 2 more or we risk overrunning the input. Subtract // one here and one below. pStop--; } else { // 2 byte encoding ch &= 0x1F; // Check for non-shortest form if (ch <= 1) { goto InvalidData; } ch = (ch << 6) | chc; } bytesNeeded++; // extra byte, we're only expecting 1 char for each of these 2 bytes, // but the loop is testing the target (not source) against pStop. // subtract an extra count from pStop so that we don't overrun the input. pStop--; } continue; LongCodeSlow: if (pSrc >= pSrcEnd) { // This is a special case where hit the end of the buffer but are in the middle // of decoding a long code. The error exit thinks we have read 2 extra bytes already, // so we add +1 to pSrc to get the count correct for the bytes consumed value. pSrc++; goto NeedMoreData; } int chd = *pSrc; pSrc++; // Bit 6 should be 0, and trailing byte should be 10vvvvvv if ((ch & 0x40) == 0 || (chd & unchecked ((sbyte)0xC0)) != 0x80) { goto InvalidData; } chd &= 0x3F; if ((ch & 0x20) != 0) { // Handle 3 or 4 byte encoding. // Fold the first 2 bytes together chd |= (ch & 0x0F) << 6; if ((ch & 0x10) != 0) { // 4 byte - surrogate pair // We need 2 more bytes if (pSrc >= pSrcEnd - 1) { goto NeedMoreData; } ch = *pSrc; // Bit 4 should be zero + the surrogate should be in the range 0x000000 - 0x10FFFF // and the trailing byte should be 10vvvvvv if (!EncodingHelper.InRange(chd >> 4, 0x01, 0x10) || (ch & unchecked ((sbyte)0xC0)) != 0x80) { goto InvalidData; } // Merge 3rd byte then read the last byte chd = (chd << 6) | (ch & 0x3F); ch = *(pSrc + 1); // The last trailing byte still holds the form 10vvvvvv // We only know for sure we have room for one more char, but we need an extra now. if ((ch & unchecked ((sbyte)0xC0)) != 0x80) { goto InvalidData; } pSrc += 2; ch = (chd << 6) | (ch & 0x3F); bytesNeeded++; ch = (ch & 0x3FF) + unchecked ((short)(EncodingHelper.LowSurrogateStart)); } else { // 3 byte encoding if (pSrc >= pSrcEnd) { goto NeedMoreData; } ch = *pSrc; // Check for non-shortest form of 3 byte sequence // No surrogates // Trailing byte must be in the form 10vvvvvv if ((chd & (0x1F << 5)) == 0 || (chd & (0xF800 >> 6)) == (0xD800 >> 6) || (ch & unchecked ((sbyte)0xC0)) != 0x80) { goto InvalidData; } pSrc++; ch = (chd << 6) | (ch & 0x3F); } } else { // 2 byte encoding ch &= 0x1F; // Check for non-shortest form if (ch <= 1) { goto InvalidData; } ch = (ch << 6) | chd; } bytesNeeded++; } bytesNeeded <<= 1; // Count we have is chars, double for bytes. return(EncodingHelper.PtrDiff(pSrcEnd, pSrc) == 0 ? TransformationStatus.Done : TransformationStatus.DestinationTooSmall); NeedMoreData: bytesNeeded <<= 1; // Count we have is chars, double for bytes. return(TransformationStatus.NeedMoreSourceData); InvalidData: bytesNeeded <<= 1; // Count we have is chars, double for bytes. return(TransformationStatus.InvalidData); } }