public void TestUtf32BE() {
    // Decoding through the surrogate-pass wrapper around big-endian UTF-32 (no BOM):
    // surrogate code points that do not form a well-formed pair must come out
    // as the corresponding UTF-16 code units.
    Encoding codec = new PythonSurrogatePassEncoding(new UTF32Encoding(bigEndian: true, byteOrderMark: false));

    // A high surrogate on its own.
    char[] loneHigh = codec.GetChars("\x00\x00\xd8\x10".AsBytes());
    Assert.AreEqual("\ud810", loneHigh);

    // A low surrogate on its own.
    char[] loneLow = codec.GetChars("\x00\x00\xdc\n".AsBytes());
    Assert.AreEqual("\udc0a", loneLow);

    // Two surrogates in the wrong order (low first, then high).
    char[] reversedPair = codec.GetChars("\x00\x00\xdeQ\x00\x00\xda/".AsBytes());
    Assert.AreEqual("\ude51\uda2f", reversedPair);
}
public void TestUtf16BE() {
    // Decoding through the surrogate-pass wrapper around big-endian UTF-16:
    // unpaired or mis-ordered surrogates are expected to be returned unchanged.
    Encoding codec = new PythonSurrogatePassEncoding(Encoding.BigEndianUnicode);

    // A high surrogate on its own.
    char[] loneHigh = codec.GetChars("\xd8\x10".AsBytes());
    Assert.AreEqual("\ud810", loneHigh);

    // A low surrogate on its own.
    char[] loneLow = codec.GetChars("\xdc\n".AsBytes());
    Assert.AreEqual("\udc0a", loneLow);

    // Two surrogates in the wrong order (low first, then high).
    char[] reversedPair = codec.GetChars("\xdeQ\xda/".AsBytes());
    Assert.AreEqual("\ude51\uda2f", reversedPair);
}
public void TestUtf16LE() {
    // Decoding through the surrogate-pass wrapper around little-endian UTF-16:
    // same scenarios as the big-endian test, with the bytes of each code unit swapped.
    Encoding codec = new PythonSurrogatePassEncoding(Encoding.Unicode);

    // A high surrogate on its own.
    char[] loneHigh = codec.GetChars("\x10\xd8".AsBytes());
    Assert.AreEqual("\ud810", loneHigh);

    // A low surrogate on its own.
    char[] loneLow = codec.GetChars("\n\xdc".AsBytes());
    Assert.AreEqual("\udc0a", loneLow);

    // Two surrogates in the wrong order (low first, then high).
    char[] reversedPair = codec.GetChars("Q\xde/\xda".AsBytes());
    Assert.AreEqual("\ude51\uda2f", reversedPair);
}
public void TestUtf7() {
    // "surrogatepass" is not supported for UTF-7 per se, but UTF-7 is supposed to
    // decode any surrogate characters from its ASCII-mangled form without
    // requiring any fallback support.
    Encoding codec = new PythonSurrogatePassEncoding(new UTF7Encoding(allowOptionals: true));

    // A high surrogate on its own, embedded in plain text.
    char[] loneHigh = codec.GetChars("abc+2BA-xyz".AsBytes());
    Assert.AreEqual("abc\ud810xyz", loneHigh);

    // A low surrogate on its own, embedded in plain text.
    char[] loneLow = codec.GetChars("abc+3Ao-xyz".AsBytes());
    Assert.AreEqual("abc\udc0axyz", loneLow);

    // Two surrogates in the wrong order (low first, then high).
    char[] reversedPair = codec.GetChars("abc+3lHaLw-xyz".AsBytes());
    Assert.AreEqual("abc\ude51\uda2fxyz", reversedPair);
}
public void TestAscii() {
    // 'surrogatepass' is supported only for UTF-8, UTF-16LE, UTF-16BE, UTF-32LE, and UTF-32BE.
    // Nevertheless, it can be used with other encodings as long as there are no encoding errors.
    Encoding codec = new PythonSurrogatePassEncoding(Encoding.ASCII);

    // Clean ASCII decodes normally.
    char[] plain = codec.GetChars("abc".AsBytes());
    Assert.AreEqual("abc", plain);

    // Attempting to decode surrogates from ASCII must throw.
    // Note that this is CPython 3.5 behaviour; CPython 3.4 will blindly extract
    // UTF-8 encoded surrogates from ASCII.

    // Lone high surrogate in UTF-8.
    Assert.Throws<DecoderFallbackException>(
        () => codec.GetChars("\xed\xa0\x90".AsBytes()));

    // Lone low surrogate in UTF-8.
    Assert.Throws<DecoderFallbackException>(
        () => codec.GetChars("\xed\xb0\x8a".AsBytes()));

    // Invalid surrogate pair (low, high) in UTF-8.
    Assert.Throws<DecoderFallbackException>(
        () => codec.GetChars("\xed\xb9\x91\xed\xa8\xaf".AsBytes()));
}
public void TestUtf8() {
    // Exercises the surrogate-pass wrapper around UTF-8 in two ways:
    //  1. one-shot decoding of three-byte sequences that encode surrogates, and
    //  2. failure reporting (Index / BytesUnknown of DecoderFallbackException) when such a
    //     sequence is broken or truncated, both one-shot and through a stateful decoder
    //     that is fed the input in two chunks.
    Encoding codec = new PythonSurrogatePassEncoding(Encoding.UTF8);

    // ---- Well-formed three-byte surrogate encodings ----

    // A high surrogate on its own.
    char[] loneHigh = codec.GetChars("abc\xed\xa0\x90xyz".AsBytes());
    Assert.AreEqual("abc\ud810xyz", loneHigh);

    // A low surrogate on its own.
    char[] loneLow = codec.GetChars("abc\xed\xb0\x8axyz".AsBytes());
    Assert.AreEqual("abc\udc0axyz", loneLow);

    // Two surrogates in the wrong order (low first, then high).
    char[] reversedPair = codec.GetChars("abc\xed\xb9\x91\xed\xa8\xafxyz".AsBytes());
    Assert.AreEqual("abc\ude51\uda2fxyz", reversedPair);

    // Two surrogates in the right order (high, low), each encoded separately.
    char[] orderedPair = codec.GetChars("abc\xed\xa8\xaf\xed\xb9\x91xyz".AsBytes());
    Assert.AreEqual("abc\uda2f\ude51xyz", orderedPair);

    // ---- Broken sequences ----

    // Output buffer shared by all chunked-decoder calls below.
    var buffer = new char[9];
    int counted;
    int written;

    // Lead byte immediately followed by a non-continuation byte ('-').
    var brokenAfterLead = "abc\xed-\xa0\x90xyz".AsBytes();
    Assert.That(
        () => codec.GetChars(brokenAfterLead),
        Throws.TypeOf<DecoderFallbackException>()
            .With.Property("Index").EqualTo(3)
            .And.Property("BytesUnknown").One.EqualTo(0xed));

    var decoder = codec.GetDecoder();

    // First chunk ends right after the lead byte: only "abc" is produced.
    counted = decoder.GetCharCount(brokenAfterLead, 0, 4, flush: false);
    Assert.That(counted, Is.EqualTo(3));
    written = decoder.GetChars(brokenAfterLead, 0, 4, buffer, 0, flush: false);
    Assert.That(written, Is.EqualTo(3));
    // Second chunk exposes the error; the expected Index is negative, presumably
    // because the offending byte was handed over in the previous call.
    Assert.That(
        () => decoder.GetCharCount(brokenAfterLead, 4, 4, flush: false),
        Throws.TypeOf<DecoderFallbackException>()
            .With.Property("Index").EqualTo(-1)
            .And.Property("BytesUnknown").One.EqualTo(0xed));

    // Lead byte plus one continuation byte, then a non-continuation byte ('-').
    var brokenAfterSecond = "abc\xed\xa0-\x90xyz".AsBytes();
    Assert.That(
        () => codec.GetChars(brokenAfterSecond),
        Throws.TypeOf<DecoderFallbackException>()
            .With.Property("Index").EqualTo(3)
            .And.Property("BytesUnknown").One.EqualTo(0xed));

    // Split after the lead byte.
    decoder.Reset();
    counted = decoder.GetCharCount(brokenAfterSecond, 0, 4, flush: false);
    Assert.That(counted, Is.EqualTo(3));
    written = decoder.GetChars(brokenAfterSecond, 0, 4, buffer, 0, flush: false);
    Assert.That(written, Is.EqualTo(3));
    Assert.That(
        () => decoder.GetCharCount(brokenAfterSecond, 4, 4, flush: false),
        Throws.TypeOf<DecoderFallbackException>()
            .With.Property("Index").EqualTo(-1)
            .And.Property("BytesUnknown").One.EqualTo(0xed));

    // Split after the first continuation byte.
    decoder.Reset();
    counted = decoder.GetCharCount(brokenAfterSecond, 0, 5, flush: false);
    Assert.That(counted, Is.EqualTo(3));
    written = decoder.GetChars(brokenAfterSecond, 0, 5, buffer, 0, flush: false);
    Assert.That(written, Is.EqualTo(3));
    Assert.That(
        () => decoder.GetCharCount(brokenAfterSecond, 5, 3, flush: false),
        Throws.TypeOf<DecoderFallbackException>()
            .With.Property("Index").EqualTo(-2)
            .And.Property("BytesUnknown").One.EqualTo(0xed));

    // Unfinished surrogate sequence in the middle of the input.
    var truncatedInMiddle = "abc\xed\xa0xyz".AsBytes();
    Assert.That(
        () => codec.GetChars(truncatedInMiddle),
        Throws.TypeOf<DecoderFallbackException>()
            .With.Property("Index").EqualTo(3)
            .And.Property("BytesUnknown").One.EqualTo(0xed));

    decoder.Reset();
    counted = decoder.GetCharCount(truncatedInMiddle, 0, 5, flush: false);
    Assert.That(counted, Is.EqualTo(3));
    written = decoder.GetChars(truncatedInMiddle, 0, 5, buffer, 0, flush: false);
    Assert.That(written, Is.EqualTo(3));
    Assert.That(
        () => decoder.GetCharCount(truncatedInMiddle, 5, 2, flush: false),
        Throws.TypeOf<DecoderFallbackException>()
            .With.Property("Index").EqualTo(-2)
            .And.Property("BytesUnknown").One.EqualTo(0xed));

    // Unfinished surrogate sequence at the very end of the input.
    var truncatedAtEnd = "abcxyz\xed\xa0".AsBytes();
    Assert.That(
        () => codec.GetChars(truncatedAtEnd),
        Throws.TypeOf<DecoderFallbackException>()
            .With.Property("Index").EqualTo(6)
            .And.Property("BytesUnknown").One.EqualTo(0xed));

    decoder.Reset();
    counted = decoder.GetCharCount(truncatedAtEnd, 0, 7, flush: false);
    Assert.That(counted, Is.EqualTo(6));
    written = decoder.GetChars(truncatedAtEnd, 0, 7, buffer, 0, flush: false);
    Assert.That(written, Is.EqualTo(6));
    // The final call flushes, so the incomplete sequence has to be reported here.
    Assert.That(
        () => decoder.GetCharCount(truncatedAtEnd, 7, 1, flush: true),
        Throws.TypeOf<DecoderFallbackException>()
            .With.Property("Index").EqualTo(-1)
            .And.Property("BytesUnknown").One.EqualTo(0xed));
}