/// <summary>
/// Decodes both fixture byte sequences through a surrogate-escape wrapper around
/// big-endian UTF-16 and checks the resulting characters.
/// </summary>
public void TestEndiannessWithtUtf16BE() {
    Encoding bigEndianUtf16 = new PythonSurrogateEscapeEncoding(Encoding.BigEndianUnicode);

    // First sequence is expected to decode to U+0A00 followed by U+0000.
    char[] firstDecoded = bigEndianUtf16.GetChars(_bytes1);
    Assert.AreEqual("\u0a00\u0000", firstDecoded);

    // Second sequence is expected to decode to U+0000 followed by U+000A.
    char[] secondDecoded = bigEndianUtf16.GetChars(_bytes2);
    Assert.AreEqual("\u0000\u000a", secondDecoded);
}
/// <summary>
/// Decodes both fixture byte sequences through a surrogate-escape wrapper around
/// big-endian UTF-32 (no byte order mark) and checks the resulting characters.
/// </summary>
public void TestEndiannessWithtUtf32BE() {
    UTF32Encoding bigEndianUtf32 = new UTF32Encoding(bigEndian: true, byteOrderMark: false);
    Encoding escaping = new PythonSurrogateEscapeEncoding(bigEndianUtf32);

    // First sequence: every byte is expected to come back as an escape character in the U+DCxx range.
    char[] firstDecoded = escaping.GetChars(_bytes1);
    Assert.AreEqual("\udc0a\udc00\udc00\udc00", firstDecoded);

    // Second sequence is expected to decode to a single U+000A.
    char[] secondDecoded = escaping.GetChars(_bytes2);
    Assert.AreEqual("\u000a", secondDecoded);
}
/// <summary>
/// Checks that a surrogate-escape wrapper around big-endian UTF-32 (no byte order mark)
/// rejects the first fixture sequence and decodes the second one.
/// </summary>
public void TestEndiannessWithUtf32BE() {
    UTF32Encoding bigEndianUtf32 = new UTF32Encoding(bigEndian: true, byteOrderMark: false);
    Encoding escaping = new PythonSurrogateEscapeEncoding(bigEndianUtf32);

    // Decoding the first sequence must fail with a decoder fallback error.
    Assert.Throws<DecoderFallbackException>(() => { escaping.GetChars(_bytes1); });

    // The second sequence is expected to decode to a single U+000A.
    char[] secondDecoded = escaping.GetChars(_bytes2);
    Assert.AreEqual("\u000a", secondDecoded);
}
// Note: UTF-7 is not round-trip safe in general

/// <summary>
/// Wraps <paramref name="enc"/> in a surrogate-escape encoding and verifies that
/// <paramref name="bytes"/> survive a decode/encode round trip, exercising both the
/// allocating and the buffer-filling overloads in each direction.
/// </summary>
/// <param name="enc">The encoding to wrap.</param>
/// <param name="bytes">The bytes to decode and then re-encode.</param>
private static void TestRoundTrip(Encoding enc, byte[] bytes) {
    Encoding wrapped = new PythonSurrogateEscapeEncoding(enc);

    // Decode into a buffer sized by GetCharCount, then via the allocating overload; both must agree.
    int charCount = wrapped.GetCharCount(bytes);
    char[] bufferDecoded = new char[charCount];
    wrapped.GetChars(bytes, 0, bytes.Length, bufferDecoded, 0);
    char[] directDecoded = wrapped.GetChars(bytes);
    Assert.AreEqual(bufferDecoded, directDecoded);

    // Encode via the allocating overload, then into a buffer sized by GetByteCount; both must agree.
    byte[] directEncoded = wrapped.GetBytes(bufferDecoded);
    int byteCount = wrapped.GetByteCount(bufferDecoded, 0, bufferDecoded.Length);
    byte[] bufferEncoded = new byte[byteCount];
    wrapped.GetBytes(bufferDecoded, 0, bufferDecoded.Length, bufferEncoded, 0);
    Assert.AreEqual(directEncoded, bufferEncoded);

    // The re-encoded bytes must match the original input.
    Assert.AreEqual(bytes, directEncoded);
}
/// <summary>
/// Decodes the fixture's <c>bytes</c> through a surrogate-escape wrapper around UTF-8
/// and compares the result with the expected text.
/// </summary>
public void TestCompare256WithUtf8() {
    Encoding utf8Escaping = new PythonSurrogateEscapeEncoding(Encoding.UTF8);
    char[] decoded = utf8Escaping.GetChars(bytes);

    // Expected text: code points below 0x80 unchanged, 0x80-0xFF as escape characters U+DC80-U+DCFF.
    // Presumably captured from CPython (the original local was named python_chars) - confirm.
    string expectedText = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\udc80\udc81\udc82\udc83\udc84\udc85\udc86\udc87\udc88\udc89\udc8a\udc8b\udc8c\udc8d\udc8e\udc8f\udc90\udc91\udc92\udc93\udc94\udc95\udc96\udc97\udc98\udc99\udc9a\udc9b\udc9c\udc9d\udc9e\udc9f\udca0\udca1\udca2\udca3\udca4\udca5\udca6\udca7\udca8\udca9\udcaa\udcab\udcac\udcad\udcae\udcaf\udcb0\udcb1\udcb2\udcb3\udcb4\udcb5\udcb6\udcb7\udcb8\udcb9\udcba\udcbb\udcbc\udcbd\udcbe\udcbf\udcc0\udcc1\udcc2\udcc3\udcc4\udcc5\udcc6\udcc7\udcc8\udcc9\udcca\udccb\udccc\udccd\udcce\udccf\udcd0\udcd1\udcd2\udcd3\udcd4\udcd5\udcd6\udcd7\udcd8\udcd9\udcda\udcdb\udcdc\udcdd\udcde\udcdf\udce0\udce1\udce2\udce3\udce4\udce5\udce6\udce7\udce8\udce9\udcea\udceb\udcec\udced\udcee\udcef\udcf0\udcf1\udcf2\udcf3\udcf4\udcf5\udcf6\udcf7\udcf8\udcf9\udcfa\udcfb\udcfc\udcfd\udcfe\udcff";

    Assert.AreEqual(expectedText, decoded);
}
/// <summary>
/// Decodes the fixture's <c>bytes</c> through a surrogate-escape wrapper around
/// <c>StringOps.Latin1Encoding</c> and compares the result with the expected text.
/// </summary>
public void TestCompare256WithLatin1() {
    Encoding latin1Escaping = new PythonSurrogateEscapeEncoding(StringOps.Latin1Encoding);
    char[] decoded = latin1Escaping.GetChars(bytes);

    // Expected text contains no escape characters: it runs straight through U+0000-U+00FF.
    // Presumably captured from CPython (the original local was named python_chars) - confirm.
    string expectedText = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0¡¢£¤¥¦§¨©ª«¬\xad®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";

    Assert.AreEqual(expectedText, decoded);
}
/// <summary>
/// Decodes the fixture's <c>bytes</c> through a surrogate-escape wrapper around UTF-32,
/// compares the characters with the expected code points, then checks that encoding
/// those characters reproduces the original bytes.
/// </summary>
public void TestWithUtf32() {
    Encoding utf32Escaping = new PythonSurrogateEscapeEncoding(Encoding.UTF32);
    char[] decoded = utf32Escaping.GetChars(bytes);

    int[] expectedCodePoints = {
        0x0000dcd8, 0x0000dcd9, 0x0000dcda, 0x0000dcdb,
        0x0000dcdc, 0x0000dcdd, 0x0000dcde, 0x0000dcdf
    };

    // BMP code points map to a single char; anything above is expanded to a surrogate pair.
    char[] expectedChars = expectedCodePoints
        .SelectMany(codePoint => {
            if (codePoint <= 0xffff) {
                return ((char)codePoint).ToString();
            }
            return char.ConvertFromUtf32(codePoint);
        })
        .ToArray();
    Assert.AreEqual(expectedChars, decoded);

    // byte[] python_bytes = ??? - CPython fails on encoding the string it decoded itself; a bug in CPython?
    byte[] reencoded = utf32Escaping.GetBytes(decoded);
    Assert.AreEqual(bytes, reencoded);
}
/// <summary>
/// Decodes the fixture's <c>bytes</c> through a surrogate-escape wrapper around
/// little-endian UTF-16, compares the characters with the expected code points, then
/// checks that encoding those characters reproduces the original bytes.
/// </summary>
public void TestCompare256Utf16() {
    Encoding utf16Escaping = new PythonSurrogateEscapeEncoding(Encoding.Unicode);
    char[] decoded = utf16Escaping.GetChars(bytes);

    int[] expectedCodePoints = {
        0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0b0a, 0x0d0c, 0x0f0e,
        0x1110, 0x1312, 0x1514, 0x1716, 0x1918, 0x1b1a, 0x1d1c, 0x1f1e,
        0x2120, 0x2322, 0x2524, 0x2726, 0x2928, 0x2b2a, 0x2d2c, 0x2f2e,
        0x3130, 0x3332, 0x3534, 0x3736, 0x3938, 0x3b3a, 0x3d3c, 0x3f3e,
        0x4140, 0x4342, 0x4544, 0x4746, 0x4948, 0x4b4a, 0x4d4c, 0x4f4e,
        0x5150, 0x5352, 0x5554, 0x5756, 0x5958, 0x5b5a, 0x5d5c, 0x5f5e,
        0x6160, 0x6362, 0x6564, 0x6766, 0x6968, 0x6b6a, 0x6d6c, 0x6f6e,
        0x7170, 0x7372, 0x7574, 0x7776, 0x7978, 0x7b7a, 0x7d7c, 0x7f7e,
        0x8180, 0x8382, 0x8584, 0x8786, 0x8988, 0x8b8a, 0x8d8c, 0x8f8e,
        0x9190, 0x9392, 0x9594, 0x9796, 0x9998, 0x9b9a, 0x9d9c, 0x9f9e,
        0xa1a0, 0xa3a2, 0xa5a4, 0xa7a6, 0xa9a8, 0xabaa, 0xadac, 0xafae,
        0xb1b0, 0xb3b2, 0xb5b4, 0xb7b6, 0xb9b8, 0xbbba, 0xbdbc, 0xbfbe,
        0xc1c0, 0xc3c2, 0xc5c4, 0xc7c6, 0xc9c8, 0xcbca, 0xcdcc, 0xcfce,
        0xd1d0, 0xd3d2, 0xd5d4, 0xd7d6,
        // The surrogate range: individual escape characters around one code point above the BMP.
        0xdcd8, 0xdcd9, 0x1069dc, 0xdcde, 0xdcdf,
        0xe1e0, 0xe3e2, 0xe5e4, 0xe7e6, 0xe9e8, 0xebea, 0xedec, 0xefee,
        0xf1f0, 0xf3f2, 0xf5f4, 0xf7f6, 0xf9f8, 0xfbfa, 0xfdfc, 0xfffe
    };

    // BMP code points map to a single char; anything above is expanded to a surrogate pair.
    char[] expectedChars = expectedCodePoints
        .SelectMany(codePoint => {
            if (codePoint <= 0xffff) {
                return ((char)codePoint).ToString();
            }
            return char.ConvertFromUtf32(codePoint);
        })
        .ToArray();
    Assert.AreEqual(expectedChars, decoded);

    // byte[] python_bytes = ??? - CPython fails to encode the string it decoded itself; a bug in CPython?
    byte[] reencoded = utf16Escaping.GetBytes(decoded);
    Assert.AreEqual(bytes, reencoded);
}
/// <summary>
/// Exercises a surrogate-escape wrapper around UTF-7: decoding the fixture's <c>bytes</c>
/// as-is must fail, decoding them with every '+' byte removed must match the expected text,
/// and the encoded form is compared with both the .NET-style and the CPython-style output.
/// </summary>
/// <remarks>
/// The filtered input is kept in a local instead of being assigned back to the fixture's
/// <c>bytes</c> field, so this test no longer alters state that other tests read.
/// </remarks>
public void TestCompare256WithUtf7() {
    Encoding penc = new PythonSurrogateEscapeEncoding(Encoding.UTF7);

    // The following Python output is produced with python 3.4 but is not correct: it is missing the '+' character
    string python_chars = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\udc80\udc81\udc82\udc83\udc84\udc85\udc86\udc87\udc88\udc89\udc8a\udc8b\udc8c\udc8d\udc8e\udc8f\udc90\udc91\udc92\udc93\udc94\udc95\udc96\udc97\udc98\udc99\udc9a\udc9b\udc9c\udc9d\udc9e\udc9f\udca0\udca1\udca2\udca3\udca4\udca5\udca6\udca7\udca8\udca9\udcaa\udcab\udcac\udcad\udcae\udcaf\udcb0\udcb1\udcb2\udcb3\udcb4\udcb5\udcb6\udcb7\udcb8\udcb9\udcba\udcbb\udcbc\udcbd\udcbe\udcbf\udcc0\udcc1\udcc2\udcc3\udcc4\udcc5\udcc6\udcc7\udcc8\udcc9\udcca\udccb\udccc\udccd\udcce\udccf\udcd0\udcd1\udcd2\udcd3\udcd4\udcd5\udcd6\udcd7\udcd8\udcd9\udcda\udcdb\udcdc\udcdd\udcde\udcdf\udce0\udce1\udce2\udce3\udce4\udce5\udce6\udce7\udce8\udce9\udcea\udceb\udcec\udced\udcee\udcef\udcf0\udcf1\udcf2\udcf3\udcf4\udcf5\udcf6\udcf7\udcf8\udcf9\udcfa\udcfb\udcfc\udcfd\udcfe\udcff";

    // Our implementation will refuse to decode (correctly) because the ',' after '+' is not valid thus requires escaping,
    // but escaping of chars under 128 is not allowed.
    Assert.Throws<DecoderFallbackException>(() => penc.GetChars(bytes));

    // Let's try again without the '+'.
    // A local copy is used so the fixture's 'bytes' field stays untouched for other tests.
    byte[] bytesWithoutPlus = bytes.Where(i => i != (byte)'+').ToArray();
    char[] chars = penc.GetChars(bytesWithoutPlus);
    Assert.AreEqual(python_chars, chars);

    // Now the encoding part
    byte[] encoded_bytes = penc.GetBytes(chars);
    byte[] expected_bytes = "+AAAAAQACAAMABAAFAAYABwAI-\t\n+AAsADA-\r+AA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAf- +ACEAIgAjACQAJQAm-'()+ACo-,-./0123456789:+ADsAPAA9AD4-?+AEA-ABCDEFGHIJKLMNOPQRSTUVWXYZ+AFsAXABdAF4AXwBg-abcdefghijklmnopqrstuvwxyz+AHsAfAB9AH4Af9yA3IHcgtyD3ITchdyG3IfciNyJ3Irci9yM3I3cjtyP3JDckdyS3JPclNyV3Jbcl9yY3Jncmtyb3Jzcndye3J/coNyh3KLco9yk3KXcptyn3Kjcqdyq3KvcrNyt3K7cr9yw3LHcstyz3LTctdy23LfcuNy53Lrcu9y83L3cvty/3MDcwdzC3MPcxNzF3Mbcx9zI3MncytzL3MzczdzO3M/c0NzR3NLc09zU3NXc1tzX3Njc2dza3Nvc3Nzd3N7c39zg3OHc4tzj3OTc5dzm3Ofc6Nzp3Orc69zs3O3c7tzv3PDc8dzy3PPc9Nz13Pbc99z43Pnc+tz73Pzc/dz+3P8-"
        .Select(c => (byte)c).ToArray();
    Assert.AreEqual(expected_bytes, encoded_bytes);

    // Encoding the given chars with CPython produces the following byte string
    byte[] python_bytes = "+AAAAAQACAAMABAAFAAYABwAI\t\n+AAsADA\r+AA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAf !\"#$%&'()*,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[+AFw]^_`abcdefghijklmnopqrstuvwxyz{|}+AH4Af9yA3IHcgtyD3ITchdyG3IfciNyJ3Irci9yM3I3cjtyP3JDckdyS3JPclNyV3Jbcl9yY3Jncmtyb3Jzcndye3J/coNyh3KLco9yk3KXcptyn3Kjcqdyq3KvcrNyt3K7cr9yw3LHcstyz3LTctdy23LfcuNy53Lrcu9y83L3cvty/3MDcwdzC3MPcxNzF3Mbcx9zI3MncytzL3MzczdzO3M/c0NzR3NLc09zU3NXc1tzX3Njc2dza3Nvc3Nzd3N7c39zg3OHc4tzj3OTc5dzm3Ofc6Nzp3Orc69zs3O3c7tzv3PDc8dzy3PPc9Nz13Pbc99z43Pnc+tz73Pzc/dz+3P8-"
        .Select(c => (byte)c).ToArray();

    // The sequences expected_bytes and python_bytes are NOT equal: .NET ends encoded blocks (starting with '+') with '-'
    // and encodes some additional characters, like !"#$%&*;<=>@{|}
    // Encoding those characters is optional, and terminating the encoded blocks with '-' is also optional.
    // CPython does not do it, resulting in a more compact encoding.
    // However, they both decode to the same text, although, again, CPython's version cannot be decoded using surrogateescape
    char[] dotnet_decoded = penc.GetChars(encoded_bytes);
    char[] python_decoded = Encoding.UTF7.GetChars(python_bytes);
    Assert.AreEqual(chars, python_decoded);
    Assert.AreEqual(chars, dotnet_decoded);

    // The plain (unwrapped) UTF-7 decoder must also recover the same text from the .NET-style bytes.
    dotnet_decoded = Encoding.UTF7.GetChars(encoded_bytes);
    Assert.AreEqual(chars, dotnet_decoded);
}
/// <summary>
/// Decodes the fixture's <c>bytes</c> through a surrogate-escape wrapper around code page 1252
/// and compares the result position by position with the expected text, tolerating
/// mismatches only at a fixed set of indices.
/// </summary>
public void TestCompare256WithWindows1252() {
    Encoding cp1252Escaping = new PythonSurrogateEscapeEncoding(Encoding.GetEncoding(1252));

    // NOTE(review): the expected name mentions iso-8859-1 although code page 1252 is requested - confirm this is intended.
    Assert.AreEqual("iso-8859-1-surrogateescape", cp1252Escaping.WebName);

    char[] decodedChars = cp1252Escaping.GetChars(bytes);
    string expectedText = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f€\udc81‚ƒ„…†‡ˆ‰Š‹Œ\udc8dŽ\udc8f\udc90‘’“”•–—˜™š›œ\udc9džŸ\xa0¡¢£¤¥¦§¨©ª«¬\xad®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ";
    string decodedText = new string(decodedChars);
    Assert.AreEqual(expectedText.Length, decodedText.Length);

    // Known differences between Windows and Python (Unicode) implementation of Windows-1252
    // https://en.wikipedia.org/wiki/Windows-1252
    int[] toleratedPositions = { 0x81, 0x8d, 0x8f, 0x90, 0x9d };

    for (int position = 0; position < decodedText.Length; position++) {
        if (decodedText[position] == expectedText[position]) {
            continue;
        }

        // A mismatch is acceptable only at one of the tolerated positions.
        CollectionAssert.Contains(toleratedPositions, position);
    }
}