Esempio n. 1
0
        // Encrypt4 turns ptLen bytes at pt into ciphertext at ct by xoring them with
        // a counter-mode keystream produced from the expanded key in ks (round keys
        // 0 through 14, 16 bytes each). Whole 16-byte blocks are handled four at a
        // time, any left-over whole blocks one at a time, and a final partial block
        // through a 16-byte scratch buffer so that no byte outside [0, ptLen) of pt
        // or ct is touched. The starting counter is the tag with the top bit of its
        // last byte set, as AES-GCM-SIV requires; only the lowest 32-bit lane of the
        // counter is incremented between blocks.
        private static void Encrypt4(byte *pt, int ptLen, byte *ct, byte *tag, byte *ks)
        {
            if (ptLen == 0)
            {
                return;
            }

            int wholeBlocks  = Math.DivRem(ptLen, 16, out int tailBytes);
            int tailOffset   = ptLen - tailBytes;
            int singleBlocks = wholeBlocks % 4;
            int quadBlocks   = wholeBlocks - singleBlocks;

            // The counter is kept as four 32-bit lanes so it can be incremented
            // directly; it is reinterpreted as bytes only when fed to the cipher.
            var topBit  = Sse.StaticCast<uint, byte>(Sse2.SetVector128(0x80000000, 0, 0, 0));
            var counter = Sse.StaticCast<byte, int>(Sse2.Or(Sse2.LoadVector128(tag), topBit));

            var inc1 = Sse2.SetVector128(0, 0, 0, 1);
            var inc2 = Sse2.SetVector128(0, 0, 0, 2);

            // Four blocks per iteration.
            for (int first = 0; first < quadBlocks; first += 4)
            {
                var plusTwo = Sse2.Add(counter, inc2);

                var s0 = Sse.StaticCast<int, byte>(counter);
                var s1 = Sse.StaticCast<int, byte>(Sse2.Add(counter, inc1));
                var s2 = Sse.StaticCast<int, byte>(plusTwo);
                var s3 = Sse.StaticCast<int, byte>(Sse2.Add(plusTwo, inc1));
                counter = Sse2.Add(plusTwo, inc2);

                // Initial whitening with round key 0.
                var roundKey = Sse2.LoadVector128(ks);
                s0 = Sse2.Xor(s0, roundKey);
                s1 = Sse2.Xor(s1, roundKey);
                s2 = Sse2.Xor(s2, roundKey);
                s3 = Sse2.Xor(s3, roundKey);

                // Rounds 1 through 13.
                for (int round = 1; round <= 13; round++)
                {
                    roundKey = Sse2.LoadVector128(ks + round * 16);
                    s0 = Aes.Encrypt(s0, roundKey);
                    s1 = Aes.Encrypt(s1, roundKey);
                    s2 = Aes.Encrypt(s2, roundKey);
                    s3 = Aes.Encrypt(s3, roundKey);
                }

                // Final round with round key 14.
                roundKey = Sse2.LoadVector128(ks + 14 * 16);
                s0 = Aes.EncryptLast(s0, roundKey);
                s1 = Aes.EncryptLast(s1, roundKey);
                s2 = Aes.EncryptLast(s2, roundKey);
                s3 = Aes.EncryptLast(s3, roundKey);

                // All four input blocks are read before any output block is written.
                byte *src = pt + first * 16;
                byte *dst = ct + first * 16;

                s0 = Sse2.Xor(s0, Sse2.LoadVector128(src));
                s1 = Sse2.Xor(s1, Sse2.LoadVector128(src + 16));
                s2 = Sse2.Xor(s2, Sse2.LoadVector128(src + 32));
                s3 = Sse2.Xor(s3, Sse2.LoadVector128(src + 48));

                Sse2.Store(dst, s0);
                Sse2.Store(dst + 16, s1);
                Sse2.Store(dst + 32, s2);
                Sse2.Store(dst + 48, s3);
            }

            // Up to three whole blocks that did not fill a group of four.
            for (int k = 0; k < singleBlocks; k++)
            {
                int offset = (quadBlocks + k) * 16;

                var state = Sse2.Xor(Sse.StaticCast<int, byte>(counter), Sse2.LoadVector128(ks));
                counter = Sse2.Add(counter, inc1);

                for (int round = 1; round <= 13; round++)
                {
                    state = Aes.Encrypt(state, Sse2.LoadVector128(ks + round * 16));
                }

                state = Aes.EncryptLast(state, Sse2.LoadVector128(ks + 14 * 16));
                state = Sse2.Xor(state, Sse2.LoadVector128(pt + offset));
                Sse2.Store(ct + offset, state);
            }

            if (tailBytes == 0)
            {
                return;
            }

            // Partial final block: stage it in a 16-byte scratch buffer, xor the
            // keystream over the whole buffer, and copy back only tailBytes bytes.
            byte *scratch = stackalloc byte[16];
            new Span<byte>(pt + tailOffset, tailBytes).CopyTo(new Span<byte>(scratch, 16));

            var last = Sse2.Xor(Sse.StaticCast<int, byte>(counter), Sse2.LoadVector128(ks));

            for (int round = 1; round <= 13; round++)
            {
                last = Aes.Encrypt(last, Sse2.LoadVector128(ks + round * 16));
            }

            last = Aes.EncryptLast(last, Sse2.LoadVector128(ks + 14 * 16));
            Sse2.Store(scratch, Sse2.Xor(last, Sse2.LoadVector128(scratch)));

            new Span<byte>(scratch, tailBytes).CopyTo(new Span<byte>(ct + tailOffset, tailBytes));
        }
Esempio n. 2
0
        // DecryptPowersTable decrypts ctLen bytes from ct and writes them to pt. While
        // decrypting, it updates the POLYVAL value in polyval. In order to decrypt and
        // update the POLYVAL value, it uses the expanded key from ks and the table of
        // powers in htbl. Decryption processes 6 blocks of data in parallel.
        //
        // Parameters, as used by the code below:
        //   ct      - ciphertext input; read 16 bytes at a time, with a final partial
        //             block of ctLen % 16 bytes if present.
        //   ctLen   - number of ciphertext bytes; counted down as data is consumed.
        //   pt      - plaintext output; receives exactly as many bytes as were in ct.
        //   polyval - 16-byte running POLYVAL value; loaded on entry, stored on exit.
        //   htbl    - table of 16-byte entries, of which entries 0 through 5 are read.
        //             Presumably entry k holds the (k + 1)-th power of the hash key;
        //             confirm against the code that builds the table.
        //   tag     - 16-byte tag from which the initial counter block is derived.
        //   ks      - expanded key; round keys 0 through 14 (16 bytes each) are read.
        //
        // Structure: a first batch of six blocks is decrypted without hashing, then
        // each further batch of six is decrypted while the plaintext of the previous
        // batch is folded into POLYVAL (AES rounds and carry-less multiplications are
        // interleaved). After the batches, remaining whole blocks are handled one at
        // a time and a final partial block is zero-padded before being hashed.
        private static void DecryptPowersTable(byte *ct, int ctLen, byte *pt, byte *polyval, byte *htbl, byte *tag, byte *ks)
        {
            Vector128 <ulong> sCtr1, sCtr2, sCtr3, sCtr4, sCtr5, sCtr6, tmp0, tmp1, tmp2, tmp3, tmp4, h;

            // poly is the constant used by the two-step reduction below (only its high
            // qword, 0xc200000000000000, is selected by the 0x10 multiplications).
            // t is the running POLYVAL accumulator.
            var poly = Sse.StaticCast <uint, ulong>(Sse2.SetVector128(0xc2000000, 0, 0, 1));
            var t    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(polyval));

            // Initial counter block: the tag with the top bit of its last byte set.
            var orMask = Sse.StaticCast <uint, byte>(Sse2.SetVector128(0x80000000, 0, 0, 0));
            var ctr    = Sse2.Or(Sse2.LoadVector128(tag), orMask);

            // Increments applied to the lowest 32-bit lane of the counter.
            var one = Sse2.SetVector128(0, 0, 0, 1);
            var two = Sse2.SetVector128(0, 0, 0, 2);

            // Number of 16-byte blocks consumed so far; used to index into ct and pt.
            int blocks = 0;

            if (ctLen >= 96)
            {
                // First batch of six blocks: decrypt only. The resulting plaintext is
                // left in ctr1..ctr6 and is hashed either by the next loop iteration
                // or by the code that follows the loop.
                var ctr1 = ctr;
                var ctr2 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), one));
                var ctr3 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), two));
                var ctr4 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr3), one));
                var ctr5 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr3), two));
                var ctr6 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr5), one));
                ctr = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr5), two));

                // Whitening with round key 0.
                var key = Sse2.LoadVector128(ks);
                ctr1 = Sse2.Xor(ctr1, key);
                ctr2 = Sse2.Xor(ctr2, key);
                ctr3 = Sse2.Xor(ctr3, key);
                ctr4 = Sse2.Xor(ctr4, key);
                ctr5 = Sse2.Xor(ctr5, key);
                ctr6 = Sse2.Xor(ctr6, key);

                // Rounds 1 through 13.
                for (int i = 1; i < 14; ++i)
                {
                    key  = Sse2.LoadVector128(&ks[i * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);
                }

                // Final round with round key 14.
                key  = Sse2.LoadVector128(&ks[14 * 16]);
                ctr1 = Aes.EncryptLast(ctr1, key);
                ctr2 = Aes.EncryptLast(ctr2, key);
                ctr3 = Aes.EncryptLast(ctr3, key);
                ctr4 = Aes.EncryptLast(ctr4, key);
                ctr5 = Aes.EncryptLast(ctr5, key);
                ctr6 = Aes.EncryptLast(ctr6, key);

                // Keystream xor ciphertext = plaintext.
                ctr1 = Sse2.Xor(ctr1, Sse2.LoadVector128(&ct[0 * 16]));
                ctr2 = Sse2.Xor(ctr2, Sse2.LoadVector128(&ct[1 * 16]));
                ctr3 = Sse2.Xor(ctr3, Sse2.LoadVector128(&ct[2 * 16]));
                ctr4 = Sse2.Xor(ctr4, Sse2.LoadVector128(&ct[3 * 16]));
                ctr5 = Sse2.Xor(ctr5, Sse2.LoadVector128(&ct[4 * 16]));
                ctr6 = Sse2.Xor(ctr6, Sse2.LoadVector128(&ct[5 * 16]));

                Sse2.Store(&pt[0 * 16], ctr1);
                Sse2.Store(&pt[1 * 16], ctr2);
                Sse2.Store(&pt[2 * 16], ctr3);
                Sse2.Store(&pt[3 * 16], ctr4);
                Sse2.Store(&pt[4 * 16], ctr5);
                Sse2.Store(&pt[5 * 16], ctr6);

                ctLen  -= 96;
                blocks += 6;

                // Steady state: decrypt the next six blocks while hashing the six
                // plaintext blocks produced by the previous batch.
                while (ctLen >= 96)
                {
                    // Save the previous batch's plaintext before ctr1..ctr6 are reused.
                    sCtr6 = Sse.StaticCast <byte, ulong>(ctr6);
                    sCtr5 = Sse.StaticCast <byte, ulong>(ctr5);
                    sCtr4 = Sse.StaticCast <byte, ulong>(ctr4);
                    sCtr3 = Sse.StaticCast <byte, ulong>(ctr3);
                    sCtr2 = Sse.StaticCast <byte, ulong>(ctr2);
                    sCtr1 = Sse.StaticCast <byte, ulong>(ctr1);

                    // Six fresh counter blocks for this batch.
                    ctr1 = ctr;
                    ctr2 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), one));
                    ctr3 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), two));
                    ctr4 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr3), one));
                    ctr5 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr3), two));
                    ctr6 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr5), one));
                    ctr  = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr5), two));

                    key  = Sse2.LoadVector128(ks);
                    ctr1 = Sse2.Xor(ctr1, key);
                    ctr2 = Sse2.Xor(ctr2, key);
                    ctr3 = Sse2.Xor(ctr3, key);
                    ctr4 = Sse2.Xor(ctr4, key);
                    ctr5 = Sse2.Xor(ctr5, key);
                    ctr6 = Sse2.Xor(ctr6, key);

                    // Newest saved block times htbl[0]. Three accumulators are started:
                    // tmp1 = high x high (0x11), tmp2 = low x low (0x00),
                    // tmp0 = sum of the two cross products (0x01 and 0x10).
                    tmp3 = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(htbl));
                    tmp1 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x11);
                    tmp2 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x00);
                    tmp0 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x01);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x10);
                    tmp0 = Sse2.Xor(tmp3, tmp0);

                    key  = Sse2.LoadVector128(&ks[1 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    // Saved block 5 times htbl[1], added to the accumulators.
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x10);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x00);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x01);
                    tmp0 = Sse2.Xor(tmp0, tmp3);

                    key  = Sse2.LoadVector128(&ks[2 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    // Saved block 4 times htbl[2].
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x10);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x00);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x01);
                    tmp0 = Sse2.Xor(tmp0, tmp3);

                    key  = Sse2.LoadVector128(&ks[3 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    // Saved block 3 times htbl[3].
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x10);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x00);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x01);
                    tmp0 = Sse2.Xor(tmp0, tmp3);

                    key  = Sse2.LoadVector128(&ks[4 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    // Saved block 2 times htbl[4].
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x10);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x00);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x01);
                    tmp0 = Sse2.Xor(tmp0, tmp3);

                    key  = Sse2.LoadVector128(&ks[5 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    key  = Sse2.LoadVector128(&ks[6 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    key  = Sse2.LoadVector128(&ks[7 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    // The oldest saved block absorbs the running POLYVAL value before
                    // being multiplied by htbl[5].
                    sCtr1 = Sse2.Xor(t, sCtr1);
                    tmp4  = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));
                    tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x01);
                    tmp0  = Sse2.Xor(tmp3, tmp0);
                    tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x11);
                    tmp1  = Sse2.Xor(tmp3, tmp1);
                    tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x00);
                    tmp2  = Sse2.Xor(tmp3, tmp2);
                    tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x10);
                    tmp0  = Sse2.Xor(tmp3, tmp0);

                    key  = Sse2.LoadVector128(&ks[8 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    // Distribute the cross terms: their upper 8 bytes go into the high
                    // half of the 256-bit product (tmp4), their lower 8 bytes into the
                    // low half (t).
                    tmp3 = Sse2.ShiftRightLogical128BitLane(tmp0, 8);
                    tmp4 = Sse2.Xor(tmp3, tmp1);
                    tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp0, 8);
                    t    = Sse2.Xor(tmp3, tmp2);

                    key  = Sse2.LoadVector128(&ks[9 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    // Reduction, step 1 of 2: swap the 64-bit halves of t and xor with
                    // (low qword of t) x (high qword of poly).
                    tmp1 = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                    t    = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                    t    = Sse2.Xor(tmp1, t);

                    key  = Sse2.LoadVector128(&ks[10 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    key  = Sse2.LoadVector128(&ks[11 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    key  = Sse2.LoadVector128(&ks[12 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    key  = Sse2.LoadVector128(&ks[13 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    key  = Sse2.LoadVector128(&ks[14 * 16]);
                    ctr1 = Aes.EncryptLast(ctr1, key);
                    ctr2 = Aes.EncryptLast(ctr2, key);
                    ctr3 = Aes.EncryptLast(ctr3, key);
                    ctr4 = Aes.EncryptLast(ctr4, key);
                    ctr5 = Aes.EncryptLast(ctr5, key);
                    ctr6 = Aes.EncryptLast(ctr6, key);

                    // Keystream xor ciphertext = plaintext of this batch; it stays in
                    // ctr1..ctr6 so the next pass (or the post-loop code) can hash it.
                    ctr1 = Sse2.Xor(ctr1, Sse2.LoadVector128(&ct[(blocks + 0) * 16]));
                    ctr2 = Sse2.Xor(ctr2, Sse2.LoadVector128(&ct[(blocks + 1) * 16]));
                    ctr3 = Sse2.Xor(ctr3, Sse2.LoadVector128(&ct[(blocks + 2) * 16]));
                    ctr4 = Sse2.Xor(ctr4, Sse2.LoadVector128(&ct[(blocks + 3) * 16]));
                    ctr5 = Sse2.Xor(ctr5, Sse2.LoadVector128(&ct[(blocks + 4) * 16]));
                    ctr6 = Sse2.Xor(ctr6, Sse2.LoadVector128(&ct[(blocks + 5) * 16]));

                    // Reduction, step 2 of 2, then add the high half of the product.
                    tmp1 = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                    t    = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                    t    = Sse2.Xor(tmp1, t);
                    t    = Sse2.Xor(tmp4, t);

                    Sse2.Store(&pt[(blocks + 0) * 16], ctr1);
                    Sse2.Store(&pt[(blocks + 1) * 16], ctr2);
                    Sse2.Store(&pt[(blocks + 2) * 16], ctr3);
                    Sse2.Store(&pt[(blocks + 3) * 16], ctr4);
                    Sse2.Store(&pt[(blocks + 4) * 16], ctr5);
                    Sse2.Store(&pt[(blocks + 5) * 16], ctr6);

                    ctLen  -= 96;
                    blocks += 6;
                }

                // The plaintext of the last six-block batch is still in ctr1..ctr6 and
                // has not been hashed yet; fold it into POLYVAL now, using the same
                // multiply/accumulate/reduce sequence as inside the loop.
                sCtr6 = Sse.StaticCast <byte, ulong>(ctr6);
                sCtr5 = Sse.StaticCast <byte, ulong>(ctr5);
                sCtr4 = Sse.StaticCast <byte, ulong>(ctr4);
                sCtr3 = Sse.StaticCast <byte, ulong>(ctr3);
                sCtr2 = Sse.StaticCast <byte, ulong>(ctr2);
                sCtr1 = Sse.StaticCast <byte, ulong>(ctr1);

                tmp3 = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(htbl));
                tmp0 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x10);
                tmp1 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x11);
                tmp2 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x00);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x01);
                tmp0 = Sse2.Xor(tmp3, tmp0);

                h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x10);
                tmp0 = Sse2.Xor(tmp0, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x11);
                tmp1 = Sse2.Xor(tmp1, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x00);
                tmp2 = Sse2.Xor(tmp2, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x01);
                tmp0 = Sse2.Xor(tmp0, tmp3);

                h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x10);
                tmp0 = Sse2.Xor(tmp0, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x11);
                tmp1 = Sse2.Xor(tmp1, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x00);
                tmp2 = Sse2.Xor(tmp2, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x01);
                tmp0 = Sse2.Xor(tmp0, tmp3);

                h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x10);
                tmp0 = Sse2.Xor(tmp0, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x11);
                tmp1 = Sse2.Xor(tmp1, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x00);
                tmp2 = Sse2.Xor(tmp2, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x01);
                tmp0 = Sse2.Xor(tmp0, tmp3);

                h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x10);
                tmp0 = Sse2.Xor(tmp0, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x11);
                tmp1 = Sse2.Xor(tmp1, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x00);
                tmp2 = Sse2.Xor(tmp2, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x01);
                tmp0 = Sse2.Xor(tmp0, tmp3);

                // Oldest block absorbs the running POLYVAL value, then times htbl[5].
                sCtr1 = Sse2.Xor(t, sCtr1);
                tmp4  = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));
                tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x11);
                tmp1  = Sse2.Xor(tmp3, tmp1);
                tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x00);
                tmp2  = Sse2.Xor(tmp3, tmp2);
                tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x10);
                tmp0  = Sse2.Xor(tmp3, tmp0);
                tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x01);
                tmp0  = Sse2.Xor(tmp3, tmp0);

                // Distribute the cross terms into the high (tmp4) and low (t) halves.
                tmp3 = Sse2.ShiftRightLogical128BitLane(tmp0, 8);
                tmp4 = Sse2.Xor(tmp3, tmp1);
                tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp0, 8);
                t    = Sse2.Xor(tmp3, tmp2);

                // Two reduction steps back to back, then add the high half.
                tmp1 = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                t    = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                t    = Sse2.Xor(tmp1, t);
                tmp1 = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                t    = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                t    = Sse2.Xor(tmp1, t);
                t    = Sse2.Xor(tmp4, t);
            }

            // From here on blocks are processed one at a time, so only the first table
            // entry is needed.
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(htbl));

            // Remaining whole blocks: decrypt one block, then update POLYVAL as
            // t = reduce((t xor plaintext) x h).
            while (ctLen >= 16)
            {
                var tmp = ctr;
                ctr = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), one));
                tmp = Sse2.Xor(tmp, Sse2.LoadVector128(ks));

                for (int i = 1; i < 14; ++i)
                {
                    tmp = Aes.Encrypt(tmp, Sse2.LoadVector128(&ks[i * 16]));
                }

                tmp = Aes.EncryptLast(tmp, Sse2.LoadVector128(&ks[14 * 16]));
                tmp = Sse2.Xor(tmp, Sse2.LoadVector128(&ct[blocks * 16]));
                Sse2.Store(&pt[blocks * 16], tmp);

                // 128 x 128 carry-less multiply: tmp1 = low half, tmp4 = high half,
                // tmp2 = cross terms, which are then split between the two halves.
                t    = Sse2.Xor(Sse.StaticCast <byte, ulong>(tmp), t);
                tmp1 = Pclmulqdq.CarrylessMultiply(t, h, 0x00);
                tmp4 = Pclmulqdq.CarrylessMultiply(t, h, 0x11);
                tmp2 = Pclmulqdq.CarrylessMultiply(t, h, 0x10);
                tmp3 = Pclmulqdq.CarrylessMultiply(t, h, 0x01);
                tmp2 = Sse2.Xor(tmp3, tmp2);
                tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                tmp2 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                tmp1 = Sse2.Xor(tmp1, tmp3);
                tmp4 = Sse2.Xor(tmp2, tmp4);

                // Two reduction steps; shuffle control 78 swaps the 64-bit halves.
                tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
                tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
                tmp1 = Sse2.Xor(tmp2, tmp3);
                tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
                tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
                tmp1 = Sse2.Xor(tmp2, tmp3);
                t    = Sse2.Xor(tmp1, tmp4);

                ctLen -= 16;
                ++blocks;
            }

            // Final partial block (1 to 15 bytes), staged through a 16-byte scratch
            // buffer so that no bytes beyond the end of ct or pt are accessed.
            if (ctLen > 0)
            {
                byte *b = stackalloc byte[16];
                new Span <byte>(ct + blocks * 16, ctLen).CopyTo(new Span <byte>(b, 16));
                var tmp = Sse2.Xor(ctr, Sse2.LoadVector128(ks));

                for (int i = 1; i < 14; ++i)
                {
                    tmp = Aes.Encrypt(tmp, Sse2.LoadVector128(&ks[i * 16]));
                }

                tmp = Aes.EncryptLast(tmp, Sse2.LoadVector128(&ks[14 * 16]));
                tmp = Sse2.Xor(tmp, Sse2.LoadVector128(b));
                Sse2.Store(b, tmp);

                // Emit only the real plaintext bytes, then zero the rest of the scratch
                // buffer so that the block fed to POLYVAL is the plaintext padded with
                // zeros (the bytes past ctLen currently hold keystream-derived data).
                new Span <byte>(b, ctLen).CopyTo(new Span <byte>(&pt[blocks * 16], ctLen));
                new Span <byte>(b + ctLen, 16 - ctLen).Clear();

                // Same single-block multiply and two-step reduction as in the loop above.
                t    = Sse2.Xor(Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(b)), t);
                tmp1 = Pclmulqdq.CarrylessMultiply(t, h, 0x00);
                tmp4 = Pclmulqdq.CarrylessMultiply(t, h, 0x11);
                tmp2 = Pclmulqdq.CarrylessMultiply(t, h, 0x10);
                tmp3 = Pclmulqdq.CarrylessMultiply(t, h, 0x01);
                tmp2 = Sse2.Xor(tmp3, tmp2);
                tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                tmp2 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                tmp1 = Sse2.Xor(tmp1, tmp3);
                tmp4 = Sse2.Xor(tmp2, tmp4);

                tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
                tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
                tmp1 = Sse2.Xor(tmp2, tmp3);
                tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
                tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
                tmp1 = Sse2.Xor(tmp2, tmp3);
                t    = Sse2.Xor(tmp1, tmp4);
            }

            // Write the updated POLYVAL value back to the caller's buffer.
            Sse2.Store(polyval, Sse.StaticCast <ulong, byte>(t));
        }