Example #1
0
            public void TestIncrementalWithUtf32()
            {
                // Four 4-byte little-endian units whose values are the surrogate code points
                // 0xD9D8 and 0xDBDA (high-surrogate range), then 0xDDDC and 0xDFDE (low-surrogate range).
                byte[] input =
                {
                    0xd8, 0xd9, 0x00, 0x00,
                    0xda, 0xdb, 0x00, 0x00,
                    0xdc, 0xdd, 0x00, 0x00,
                    0xde, 0xdf, 0x00, 0x00
                };
                Encoding surrogatePass = new PythonSurrogatePassEncoding(Encoding.UTF32);

                // Delegate the actual checks to the shared incremental-decoding helper.
                SurrogateTestHelpers.IncrementalTest(surrogatePass, input, roundTrip: false);
            }
Example #2
0
            public void TestIncrementalWithUtf8()
            {
                // Four 3-byte sequences carrying the code points 0xD9D8, 0xDBDA, 0xDDDC, 0xDFDE:
                // a lone high surrogate (invalid), a high-low surrogate pair (valid),
                // and a lone low surrogate (invalid).
                byte[] input =
                {
                    0xed, 0xa7, 0x98,
                    0xed, 0xaf, 0x9a,
                    0xed, 0xb7, 0x9c,
                    0xed, 0xbf, 0x9e
                };
                Encoding surrogatePass = new PythonSurrogatePassEncoding(Encoding.UTF8);

                // Delegate the actual checks to the shared incremental-decoding helper.
                SurrogateTestHelpers.IncrementalTest(surrogatePass, input, roundTrip: false);
            }
Example #3
0
            public void TestIncrementalWithtUtf16()
            {
                // Four 2-byte little-endian units with the values 0xD9D8, 0xDBDA, 0xDDDC, 0xDFDE:
                // a lone high surrogate (invalid), a high-low surrogate pair (valid),
                // and a lone low surrogate (invalid).
                byte[] input =
                {
                    0xd8, 0xd9,
                    0xda, 0xdb,
                    0xdc, 0xdd,
                    0xde, 0xdf
                };
                Encoding surrogatePass = new PythonSurrogatePassEncoding(Encoding.Unicode);

                // Delegate the actual checks to the shared incremental-decoding helper.
                SurrogateTestHelpers.IncrementalTest(surrogatePass, input, roundTrip: false);
            }
Example #4
0
            public void TestUtf16BE()
            {
                // Decoding big-endian UTF-16 through the surrogate-pass wrapper must yield
                // the surrogate code units unchanged.
                Encoding surrogatePass = new PythonSurrogatePassEncoding(Encoding.BigEndianUnicode);

                // A high surrogate on its own.
                var loneHigh = surrogatePass.GetChars("\xd8\x10".AsBytes());
                Assert.AreEqual("\ud810", loneHigh);

                // A low surrogate on its own.
                var loneLow = surrogatePass.GetChars("\xdc\n".AsBytes());
                Assert.AreEqual("\udc0a", loneLow);

                // Two surrogates in the wrong order (low first, then high).
                var reversedPair = surrogatePass.GetChars("\xdeQ\xda/".AsBytes());
                Assert.AreEqual("\ude51\uda2f", reversedPair);
            }
Example #5
0
            public void TestUtf32BE()
            {
                // Decoding big-endian UTF-32 (no BOM) through the surrogate-pass wrapper must
                // yield the surrogate code units unchanged.
                var bigEndianUtf32 = new UTF32Encoding(bigEndian: true, byteOrderMark: false);
                Encoding surrogatePass = new PythonSurrogatePassEncoding(bigEndianUtf32);

                // A high surrogate on its own.
                var loneHigh = surrogatePass.GetChars("\x00\x00\xd8\x10".AsBytes());
                Assert.AreEqual("\ud810", loneHigh);

                // A low surrogate on its own.
                var loneLow = surrogatePass.GetChars("\x00\x00\xdc\n".AsBytes());
                Assert.AreEqual("\udc0a", loneLow);

                // Two surrogates in the wrong order (low first, then high).
                var reversedPair = surrogatePass.GetChars("\x00\x00\xdeQ\x00\x00\xda/".AsBytes());
                Assert.AreEqual("\ude51\uda2f", reversedPair);
            }
Example #6
0
            public void TestUtf32LE()
            {
                // Encoding to little-endian UTF-32 (no BOM) through the surrogate-pass wrapper
                // must emit each surrogate as its own 4-byte unit.
                var littleEndianUtf32 = new UTF32Encoding(bigEndian: false, byteOrderMark: false);
                Encoding surrogatePass = new PythonSurrogatePassEncoding(littleEndianUtf32);

                // A high surrogate on its own.
                var expectedLoneHigh = "\x10\xd8\x00\x00".AsBytes();
                Assert.AreEqual(expectedLoneHigh, surrogatePass.GetBytes("\ud810"));

                // A low surrogate on its own.
                var expectedLoneLow = "\n\xdc\x00\x00".AsBytes();
                Assert.AreEqual(expectedLoneLow, surrogatePass.GetBytes("\udc0a"));

                // Two surrogates in the wrong order (low first, then high).
                var expectedReversedPair = "Q\xde\x00\x00/\xda\x00\x00".AsBytes();
                Assert.AreEqual(expectedReversedPair, surrogatePass.GetBytes("\ude51\uda2f"));
            }
Example #7
0
            public void TestUtf16LE()
            {
                // Decoding little-endian UTF-16 through the surrogate-pass wrapper must yield
                // the surrogate code units unchanged.
                Encoding surrogatePass = new PythonSurrogatePassEncoding(Encoding.Unicode);

                // A high surrogate on its own.
                var loneHigh = surrogatePass.GetChars("\x10\xd8".AsBytes());
                Assert.AreEqual("\ud810", loneHigh);

                // A low surrogate on its own.
                var loneLow = surrogatePass.GetChars("\n\xdc".AsBytes());
                Assert.AreEqual("\udc0a", loneLow);

                // Two surrogates in the wrong order (low first, then high).
                var reversedPair = surrogatePass.GetChars("Q\xde/\xda".AsBytes());
                Assert.AreEqual("\ude51\uda2f", reversedPair);
            }
Example #8
0
            public void TestUtf8()
            {
                // Encoding to UTF-8 through the surrogate-pass wrapper must emit every surrogate
                // as a 3-byte sequence, leaving the surrounding ASCII text untouched.
                Encoding surrogatePass = new PythonSurrogatePassEncoding(Encoding.UTF8);

                // A high surrogate on its own.
                var expectedLoneHigh = "abc\xed\xa0\x90xyz".AsBytes();
                Assert.AreEqual(expectedLoneHigh, surrogatePass.GetBytes("abc\ud810xyz"));

                // A low surrogate on its own.
                var expectedLoneLow = "abc\xed\xb0\x8axyz".AsBytes();
                Assert.AreEqual(expectedLoneLow, surrogatePass.GetBytes("abc\udc0axyz"));

                // Two surrogates in the wrong order (low first, then high).
                var expectedReversedPair = "abc\xed\xb9\x91\xed\xa8\xafxyz".AsBytes();
                Assert.AreEqual(expectedReversedPair, surrogatePass.GetBytes("abc\ude51\uda2fxyz"));
            }
Example #9
0
            public void TestUtf7()
            {
                // "surrogatepass" is not supported for UTF-7 per se,
                // but UTF-7 is supposed to decode any surrogate characters from its ASCII-mangled form
                // without requiring any fallback support.
                var utf7 = new UTF7Encoding(allowOptionals: true);
                Encoding surrogatePass = new PythonSurrogatePassEncoding(utf7);

                // A high surrogate on its own.
                var loneHigh = surrogatePass.GetChars("abc+2BA-xyz".AsBytes());
                Assert.AreEqual("abc\ud810xyz", loneHigh);

                // A low surrogate on its own.
                var loneLow = surrogatePass.GetChars("abc+3Ao-xyz".AsBytes());
                Assert.AreEqual("abc\udc0axyz", loneLow);

                // Two surrogates in the wrong order (low first, then high).
                var reversedPair = surrogatePass.GetChars("abc+3lHaLw-xyz".AsBytes());
                Assert.AreEqual("abc\ude51\uda2fxyz", reversedPair);
            }
Example #10
0
            public void TestAscii()
            {
                // 'surrogatepass' is supported only for UTF-8, UTF-16LE, UTF-16BE, UTF-32LE, and UTF-32BE;
                // nevertheless, it can be used with other encodings as long as there are no decoding errors.
                Encoding surrogatePass = new PythonSurrogatePassEncoding(Encoding.ASCII);

                // Plain ASCII input decodes normally.
                var cleanText = surrogatePass.GetChars("abc".AsBytes());
                Assert.AreEqual("abc", cleanText);

                // Attempting to decode surrogates from ASCII will throw an exception.
                // Note that this is CPython 3.5 behaviour; CPython 3.4 will blindly extract
                // UTF-8 encoded surrogates from ASCII.
                var utf8EncodedSurrogates = new[]
                {
                    "\xed\xa0\x90",             // lone high surrogate in UTF-8
                    "\xed\xb0\x8a",             // lone low surrogate in UTF-8
                    "\xed\xb9\x91\xed\xa8\xaf"  // invalid surrogate pair (low, high) in UTF-8
                };

                foreach (var rejected in utf8EncodedSurrogates)
                {
                    Assert.Throws<DecoderFallbackException>(() => surrogatePass.GetChars(rejected.AsBytes()));
                }
            }
Example #11
0
            public void TestAscii()
            {
                // 'surrogatepass' is supported only for UTF-8, UTF-16LE, UTF-16BE, UTF-32LE, and UTF-32BE;
                // nevertheless, it can be used with other encodings as long as there are no encoding errors.
                Encoding surrogatePass = new PythonSurrogatePassEncoding(Encoding.ASCII);

                // Plain ASCII text encodes normally.
                var expectedCleanBytes = "abc".AsBytes();
                Assert.AreEqual(expectedCleanBytes, surrogatePass.GetBytes("abc"));

                // Attempting to encode surrogates to ASCII will throw an exception.
                // Note that this is CPython 3.5 behaviour; CPython 3.4 will happily contaminate
                // ASCII with UTF-8 encoded surrogates.
                var surrogateTexts = new[]
                {
                    "\ud810",       // lone high surrogate
                    "\udc0a",       // lone low surrogate
                    "\ude51\uda2f"  // invalid surrogate pair (low, high)
                };

                foreach (var rejected in surrogateTexts)
                {
                    Assert.Throws<EncoderFallbackException>(() => surrogatePass.GetBytes(rejected));
                }
            }
Example #12
0
            // Checks UTF-8 decoding through PythonSurrogatePassEncoding: first that well-formed
            // 3-byte surrogate sequences decode to the corresponding UTF-16 code units, then that
            // broken or truncated sequences raise DecoderFallbackException, both via the one-shot
            // Encoding.GetChars call and via a stateful Decoder that is fed the input in two chunks.
            public void TestUtf8()
            {
                Encoding penc = new PythonSurrogatePassEncoding(Encoding.UTF8);

                // lone high surrogate
                Assert.AreEqual("abc\ud810xyz", penc.GetChars("abc\xed\xa0\x90xyz".AsBytes()));

                // lone low surrogate
                Assert.AreEqual("abc\udc0axyz", penc.GetChars("abc\xed\xb0\x8axyz".AsBytes()));

                // invalid surrogate pair (low, high)
                Assert.AreEqual("abc\ude51\uda2fxyz", penc.GetChars("abc\xed\xb9\x91\xed\xa8\xafxyz".AsBytes()));

                // valid surrogate pair (high, low)
                Assert.AreEqual("abc\uda2f\ude51xyz", penc.GetChars("abc\xed\xa8\xaf\xed\xb9\x91xyz".AsBytes()));

                // Scratch output buffer reused by every dec.GetChars call below.
                var chars = new char[9];

                // broken lone high surrogate
                // ('-' is inserted right after the lead byte 0xed)
                var bytes = "abc\xed-\xa0\x90xyz".AsBytes();

                // One-shot decoding: the error is expected at offset 3, the position of 0xed.
                Assert.That(() => penc.GetChars(bytes),
                            Throws.TypeOf <DecoderFallbackException>()
                            .With.Property("Index").EqualTo(3)
                            .And.Property("BytesUnknown").One.EqualTo(0xed));

                var dec = penc.GetDecoder();

                // Chunked decoding: the first chunk ("abc" + 0xed) yields only the 3 ASCII chars.
                // The failure is expected while processing the second chunk, with Index -1
                // because the reported 0xed was supplied 1 byte before that chunk starts.
                Assert.That(dec.GetCharCount(bytes, 0, 4, flush: false), Is.EqualTo(3));
                Assert.That(dec.GetChars(bytes, 0, 4, chars, 0, flush: false), Is.EqualTo(3));
                Assert.That(() => dec.GetCharCount(bytes, 4, 4, flush: false),
                            Throws.TypeOf <DecoderFallbackException>()
                            .With.Property("Index").EqualTo(-1)
                            .And.Property("BytesUnknown").One.EqualTo(0xed));

                // broken in a different way
                // ('-' is inserted after the second byte of the sequence)
                bytes = "abc\xed\xa0-\x90xyz".AsBytes();
                Assert.That(() => penc.GetChars(bytes),
                            Throws.TypeOf <DecoderFallbackException>()
                            .With.Property("Index").EqualTo(3)
                            .And.Property("BytesUnknown").One.EqualTo(0xed));

                // Split after the lead byte: the second chunk starts at 0xa0, expected Index is -1.
                dec.Reset();
                Assert.That(dec.GetCharCount(bytes, 0, 4, flush: false), Is.EqualTo(3));
                Assert.That(dec.GetChars(bytes, 0, 4, chars, 0, flush: false), Is.EqualTo(3));
                Assert.That(() => dec.GetCharCount(bytes, 4, 4, flush: false),
                            Throws.TypeOf <DecoderFallbackException>()
                            .With.Property("Index").EqualTo(-1)
                            .And.Property("BytesUnknown").One.EqualTo(0xed));

                // Split after the second byte: the second chunk starts at '-', expected Index is -2
                // because 0xed was supplied 2 bytes before that chunk starts.
                dec.Reset();
                Assert.That(dec.GetCharCount(bytes, 0, 5, flush: false), Is.EqualTo(3));
                Assert.That(dec.GetChars(bytes, 0, 5, chars, 0, flush: false), Is.EqualTo(3));
                Assert.That(() => dec.GetCharCount(bytes, 5, 3, flush: false),
                            Throws.TypeOf <DecoderFallbackException>()
                            .With.Property("Index").EqualTo(-2)
                            .And.Property("BytesUnknown").One.EqualTo(0xed));

                // unfinished surrogate sequence in the middle
                // (only two of the three bytes are present before "xyz")
                bytes = "abc\xed\xa0xyz".AsBytes();
                Assert.That(() => penc.GetChars(bytes),
                            Throws.TypeOf <DecoderFallbackException>()
                            .With.Property("Index").EqualTo(3)
                            .And.Property("BytesUnknown").One.EqualTo(0xed));

                // Split after the two sequence bytes: the second chunk starts at 'x', expected Index is -2.
                dec.Reset();
                Assert.That(dec.GetCharCount(bytes, 0, 5, flush: false), Is.EqualTo(3));
                Assert.That(dec.GetChars(bytes, 0, 5, chars, 0, flush: false), Is.EqualTo(3));
                Assert.That(() => dec.GetCharCount(bytes, 5, 2, flush: false),
                            Throws.TypeOf <DecoderFallbackException>()
                            .With.Property("Index").EqualTo(-2)
                            .And.Property("BytesUnknown").One.EqualTo(0xed));

                // unfinished surrogate sequence at the end
                bytes = "abcxyz\xed\xa0".AsBytes();
                Assert.That(() => penc.GetChars(bytes),
                            Throws.TypeOf <DecoderFallbackException>()
                            .With.Property("Index").EqualTo(6)
                            .And.Property("BytesUnknown").One.EqualTo(0xed));

                // Split after the lead byte; the final 1-byte chunk is passed with flush: true,
                // and the failure is expected on that call with Index -1.
                dec.Reset();
                Assert.That(dec.GetCharCount(bytes, 0, 7, flush: false), Is.EqualTo(6));
                Assert.That(dec.GetChars(bytes, 0, 7, chars, 0, flush: false), Is.EqualTo(6));
                Assert.That(() => dec.GetCharCount(bytes, 7, 1, flush: true),
                            Throws.TypeOf <DecoderFallbackException>()
                            .With.Property("Index").EqualTo(-1)
                            .And.Property("BytesUnknown").One.EqualTo(0xed));
            }