public void EncodeUtf8_EmptyInput_AlwaysSucceeds() { // Arrange var encoder = new ConfigurableScalarTextEncoder(_ => false /* disallow everything */); // Act & assert Assert.Equal(OperationStatus.Done, encoder.EncodeUtf8(ReadOnlySpan <byte> .Empty, Span <byte> .Empty, out int bytesConsumed, out int bytesWritten, isFinalBlock: true)); Assert.Equal(0, bytesConsumed); Assert.Equal(0, bytesWritten); Assert.Equal(OperationStatus.Done, encoder.EncodeUtf8(ReadOnlySpan <byte> .Empty, Span <byte> .Empty, out bytesConsumed, out bytesWritten, isFinalBlock: false)); Assert.Equal(0, bytesConsumed); Assert.Equal(0, bytesWritten); }
public void EncodeUtf8_WellFormedInput_DoesNotRequireEncoding_CopiedToDestinationCorrectly(int destinationSize, int expectedBytesCopied) { // This test considers input which is well-formed and doesn't need to be encoded. // If the destination buffer is large enough, the data should be copied in its entirety. // If the destination buffer is too small, only complete UTF-8 subsequences should be copied. // We should never copy a partial subsequence, as it would cause a future call to EncodeUtf8 // to misinterpret the data as ill-formed. // Arrange byte[] fullUtf8Input = new byte[] { 0xC2, 0x82, 0x40, 0xE2, 0x90, 0x91, 0xF3, 0xA0, 0xA1, 0xA2, 0x50 }; // UTF-8 subsequences of varying length var encoder = new ConfigurableScalarTextEncoder(_ => true /* allow everything */); // Act & assert OperationStatus expectedOpStatus = (expectedBytesCopied == fullUtf8Input.Length) ? OperationStatus.Done : OperationStatus.DestinationTooSmall; byte[] destination = new byte[destinationSize]; Assert.Equal(expectedOpStatus, encoder.EncodeUtf8(fullUtf8Input, destination, out int bytesConsumed, out int bytesWritten, isFinalBlock: true)); Assert.Equal(expectedBytesCopied, bytesConsumed); Assert.Equal(expectedBytesCopied, bytesWritten); // bytes written should match bytes consumed if no encoding needs to take place Assert.Equal(fullUtf8Input.AsSpan(0, bytesConsumed).ToArray(), destination.AsSpan(0, bytesWritten).ToArray()); // ensure byte-for-byte copy Assert.True(destination.AsSpan(bytesWritten).ToArray().All(el => el == 0)); // all remaining bytes should be unchanged destination = new byte[destinationSize]; Assert.Equal(expectedOpStatus, encoder.EncodeUtf8(fullUtf8Input, destination, out bytesConsumed, out bytesWritten, isFinalBlock: false)); Assert.Equal(expectedBytesCopied, bytesConsumed); Assert.Equal(expectedBytesCopied, bytesWritten); // bytes written should match bytes consumed if no encoding needs to take place Assert.Equal(fullUtf8Input.AsSpan(0, bytesConsumed).ToArray(), destination.AsSpan(0, bytesWritten).ToArray()); // ensure byte-for-byte copy Assert.True(destination.AsSpan(bytesWritten).ToArray().All(el => el == 0)); // all remaining bytes should be unchanged }
public void EncodeUtf8_MixedInputWhichRequiresEncodingOrReplacement() { // Arrange var fullInput = new[] { new { utf8Bytes = new byte[] { 0x40 }, output = "@" }, new { utf8Bytes = new byte[] { 0xC3, 0x85 }, output = "[00C5]" }, // U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE (encoded since odd scalar value) new { utf8Bytes = new byte[] { 0xC3, 0x86 }, output = "\u00C6" }, // U+00C6 LATIN CAPITAL LETTER AE (on allow list) new { utf8Bytes = new byte[] { 0xFF }, output = "[FFFD]" }, // (invalid UTF-8, replaced with encoded form of U+FFFD) new { utf8Bytes = new byte[] { 0xEF, 0xBF, 0xBD }, output = "[FFFD]" }, // U+FFFD REPLACEMENT CHARACTER (encoded since not on allow list) new { utf8Bytes = new byte[] { 0xF0, 0x90, 0x82, 0x82 }, output = "\U00010082" }, // U+10082 LINEAR B IDEOGRAM B104 DEER (not encoded since on allow list) new { utf8Bytes = new byte[] { 0xF0, 0x90, 0x82, 0x83 }, output = "[10083]" }, // U+10083 LINEAR B IDEOGRAM B105 EQUID (encoded since not on allow list) }; var encoder = new ConfigurableScalarTextEncoder(scalarValue => (scalarValue % 2) == 0 /* allow only even-valued scalars to be represented unescaped */); // Act & assert List <byte> aggregateInputBytesSoFar = new List <byte>(); List <byte> expectedOutputBytesSoFar = new List <byte>(); foreach (var entry in fullInput) { int aggregateInputByteCountAtStartOfLoop = aggregateInputBytesSoFar.Count; byte[] destination; int bytesConsumed, bytesWritten; for (int i = 0; i < entry.utf8Bytes.Length - 1; i++) { aggregateInputBytesSoFar.Add(entry.utf8Bytes[i]); // If not final block, partial encoding should say "needs more data". // We'll try with various destination lengths just to make sure it doesn't affect result. foreach (int destinationLength in new[] { expectedOutputBytesSoFar.Count, expectedOutputBytesSoFar.Count + 1024 }) { destination = new byte[destinationLength]; Assert.Equal(OperationStatus.NeedMoreData, encoder.EncodeUtf8(aggregateInputBytesSoFar.ToArray(), destination, out bytesConsumed, out bytesWritten, isFinalBlock: false)); Assert.Equal(aggregateInputByteCountAtStartOfLoop, bytesConsumed); Assert.Equal(expectedOutputBytesSoFar.Count, bytesWritten); Assert.Equal(expectedOutputBytesSoFar.ToArray(), new Span <byte>(destination, 0, expectedOutputBytesSoFar.Count).ToArray()); } // Now try it with "isFinalBlock = true" to force the U+FFFD conversion destination = new byte[expectedOutputBytesSoFar.Count]; // first with not enough output space to write "[FFFD]" Assert.Equal(OperationStatus.DestinationTooSmall, encoder.EncodeUtf8(aggregateInputBytesSoFar.ToArray(), destination, out bytesConsumed, out bytesWritten, isFinalBlock: true)); Assert.Equal(aggregateInputByteCountAtStartOfLoop, bytesConsumed); Assert.Equal(expectedOutputBytesSoFar.Count, bytesWritten); Assert.Equal(expectedOutputBytesSoFar.ToArray(), new Span <byte>(destination, 0, expectedOutputBytesSoFar.Count).ToArray()); destination = new byte[expectedOutputBytesSoFar.Count + 1024]; // then with enough output space to write "[FFFD]" Assert.Equal(OperationStatus.Done, encoder.EncodeUtf8(aggregateInputBytesSoFar.ToArray(), destination, out bytesConsumed, out bytesWritten, isFinalBlock: true)); Assert.Equal(aggregateInputBytesSoFar.Count, bytesConsumed); Assert.Equal(expectedOutputBytesSoFar.Count + "[FFFD]".Length, bytesWritten); Assert.Equal(expectedOutputBytesSoFar.Concat(Encoding.UTF8.GetBytes("[FFFD]")).ToArray(), new Span <byte>(destination, 0, expectedOutputBytesSoFar.Count + "[FFFD]".Length).ToArray()); } // Consume the remainder of this entry and make sure it escaped properly (if needed). aggregateInputBytesSoFar.Add(entry.utf8Bytes.Last()); // First with not enough space in the destination buffer. destination = new byte[expectedOutputBytesSoFar.Count + Encoding.UTF8.GetByteCount(entry.output) - 1]; Assert.Equal(OperationStatus.DestinationTooSmall, encoder.EncodeUtf8(aggregateInputBytesSoFar.ToArray(), destination, out bytesConsumed, out bytesWritten, isFinalBlock: true)); Assert.Equal(aggregateInputByteCountAtStartOfLoop, bytesConsumed); Assert.Equal(expectedOutputBytesSoFar.Count, bytesWritten); Assert.Equal(expectedOutputBytesSoFar.ToArray(), new Span <byte>(destination, 0, expectedOutputBytesSoFar.Count).ToArray()); // Then with exactly enough space in the destination buffer, // and again with more than enough space in the destination buffer. expectedOutputBytesSoFar.AddRange(Encoding.UTF8.GetBytes(entry.output)); foreach (int destinationLength in new[] { expectedOutputBytesSoFar.Count, expectedOutputBytesSoFar.Count + 1024 }) { destination = new byte[destinationLength]; Assert.Equal(OperationStatus.Done, encoder.EncodeUtf8(aggregateInputBytesSoFar.ToArray(), destination, out bytesConsumed, out bytesWritten, isFinalBlock: false)); Assert.Equal(aggregateInputBytesSoFar.Count, bytesConsumed); Assert.Equal(expectedOutputBytesSoFar.Count, bytesWritten); Assert.Equal(expectedOutputBytesSoFar.ToArray(), new Span <byte>(destination, 0, expectedOutputBytesSoFar.Count).ToArray()); } } }