예제 #1
0
        /// <summary>
        /// Local-variable scenario: both operands are loaded through <c>Sse2.LoadVector128</c> into
        /// locals, combined with <c>Ssse3.AlignRight</c> using an immediate of 2, written to the
        /// output buffer and then validated against the same two locals.
        /// </summary>
        public void RunLclVarScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_Load));

            // The operands deliberately live in locals; the validator receives the very same values.
            var firstOperand  = Sse2.LoadVector128((Int16 *)(_dataTable.inArray1Ptr));
            var secondOperand = Sse2.LoadVector128((Int16 *)(_dataTable.inArray2Ptr));
            var aligned       = Ssse3.AlignRight(firstOperand, secondOperand, 2);

            // Publish the vector to the output buffer, then check it.
            Unsafe.Write(_dataTable.outArrayPtr, aligned);
            ValidateResult(firstOperand, secondOperand, _dataTable.outArrayPtr);
        }
예제 #2
0
        /// <summary>
        /// Local-variable scenario: both operands are fetched with <c>Unsafe.Read</c> as
        /// <c>Vector128&lt;Int64&gt;</c> locals, combined with <c>Ssse3.AlignRight</c> using an
        /// immediate of 8, written to the output buffer and then validated against the same locals.
        /// </summary>
        public void RunLclVarScenario_UnsafeRead()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

            // The operands deliberately live in locals; the validator receives the very same values.
            var firstOperand  = Unsafe.Read <Vector128 <Int64> >(_dataTable.inArray1Ptr);
            var secondOperand = Unsafe.Read <Vector128 <Int64> >(_dataTable.inArray2Ptr);
            var aligned       = Ssse3.AlignRight(firstOperand, secondOperand, 8);

            // Publish the vector to the output buffer, then check it.
            Unsafe.Write(_dataTable.outArrayPtr, aligned);
            ValidateResult(firstOperand, secondOperand, _dataTable.outArrayPtr);
        }
예제 #3
0
        /// <summary>
        /// Class-variable scenario: <c>_clsVar1</c> and <c>_clsVar2</c> are passed directly to
        /// <c>Ssse3.AlignRight</c> with an immediate of 0; the result is written to the output
        /// buffer and validated against the same two members.
        /// </summary>
        public void RunClsVarScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario));

            // The members are used as operands in place, without being copied to locals first.
            var aligned = Ssse3.AlignRight(_clsVar1, _clsVar2, 0);

            Unsafe.Write(_dataTable.outArrayPtr, aligned);
            ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
        }
예제 #4
0
        /// <summary>
        /// Basic scenario: both operands are produced by <c>Sse2.LoadAlignedVector128</c> calls
        /// nested directly in the <c>Ssse3.AlignRight</c> call (immediate 0); the result is written
        /// to the output buffer and validated against the raw input buffers.
        /// </summary>
        public void RunBasicScenario_LoadAligned()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_LoadAligned));

            // The loads stay nested in the argument list rather than being hoisted into locals.
            var aligned = Ssse3.AlignRight(Sse2.LoadAlignedVector128((UInt16 *)(_dataTable.inArray1Ptr)),
                                           Sse2.LoadAlignedVector128((UInt16 *)(_dataTable.inArray2Ptr)),
                                           0);

            Unsafe.Write(_dataTable.outArrayPtr, aligned);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
        }
예제 #5
0
        /// <summary>
        /// Basic scenario: both operands are produced by <c>Unsafe.Read</c> calls nested directly in
        /// the <c>Ssse3.AlignRight</c> call (immediate 0); the result is written to the output
        /// buffer and validated against the raw input buffers.
        /// </summary>
        public void RunBasicScenario_UnsafeRead()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_UnsafeRead));

            // The reads stay nested in the argument list rather than being hoisted into locals.
            var aligned = Ssse3.AlignRight(Unsafe.Read <Vector128 <UInt16> >(_dataTable.inArray1Ptr),
                                           Unsafe.Read <Vector128 <UInt16> >(_dataTable.inArray2Ptr),
                                           0);

            Unsafe.Write(_dataTable.outArrayPtr, aligned);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
        }
예제 #6
0
        // Rotates rows 2-4 of a state whose rows are each split into a low and a high 128-bit half
        // holding two 64-bit lanes. Writing a row as [lane0 lane1 | lane2 lane3]:
        //   row2 [a b | c d] -> [b c | d a]
        //   row3 [a b | c d] -> [c d | a b]
        //   row4 [a b | c d] -> [d a | b c]
        // row1l and row1h are accepted but neither read nor written. b0 is caller-provided scratch;
        // on return it holds the value row3l had on entry.
        private static void diagonalize(ref Vector128 <ulong> row1l, ref Vector128 <ulong> row2l, ref Vector128 <ulong> row3l, ref Vector128 <ulong> row4l,
                                        ref Vector128 <ulong> row1h, ref Vector128 <ulong> row2h, ref Vector128 <ulong> row3h, ref Vector128 <ulong> row4h, ref Vector128 <ulong> b0)
        {
            // AlignRight(x, y, 8) produces [upper lane of y, lower lane of x].
            // Both new halves of row 2 are computed before either ref is overwritten.
            Vector128 <ulong> row2NewLow  = Ssse3.AlignRight(row2h.AsSByte(), row2l.AsSByte(), 8).AsUInt64();
            Vector128 <ulong> row2NewHigh = Ssse3.AlignRight(row2l.AsSByte(), row2h.AsSByte(), 8).AsUInt64();
            row2l = row2NewLow;
            row2h = row2NewHigh;

            // Row 3: exchange the halves, using b0 as the temporary.
            b0    = row3l;
            row3l = row3h;
            row3h = b0;

            // Row 4: same two cross-half extractions as row 2, but stored in the opposite halves.
            Vector128 <ulong> row4NewHigh = Ssse3.AlignRight(row4h.AsSByte(), row4l.AsSByte(), 8).AsUInt64();
            Vector128 <ulong> row4NewLow  = Ssse3.AlignRight(row4l.AsSByte(), row4h.AsSByte(), 8).AsUInt64();
            row4l = row4NewLow;
            row4h = row4NewHigh;
        }
예제 #7
0
        // PolyvalPowersTable updates the POLYVAL value in polyval to include length bytes
        // of data from input, using the precomputed table of key powers given in htbl.
        // If the length is not divisible by 16, the trailing partial block is padded with
        // zeros to a full 16 bytes before it is absorbed.
        //
        //   polyval - 16-byte accumulator; loaded on entry and overwritten with the result.
        //   htbl    - table of 16-byte entries; entries 0..7 (128 bytes) may be read. Entry k is
        //             multiplied with the block that has k blocks after it in its group, so it
        //             presumably holds the (k+1)-th power of the key - confirm against the code
        //             that builds the table.
        //   input   - message bytes; exactly length bytes are read.
        //   length  - number of input bytes. 0 is a no-op; a negative value throws
        //             ArgumentOutOfRangeException (it would otherwise index htbl and input at
        //             negative offsets).
        //
        // Processing order: (1) the leading full blocks that do not fill a group of eight,
        // (2) all remaining full blocks in groups of eight, (3) the trailing partial block.
        //
        // CarrylessMultiply selector: bit 0 picks the 64-bit half of the first operand and bit 4
        // the half of the second, so 0x00 = low*low, 0x11 = high*high and 0x01/0x10 are the two
        // cross products. Within a group tmp0 accumulates low*low, tmp1 high*high and tmp2 the
        // cross terms; the group is then folded into the pair (xhi, t), which still needs the
        // two reduction steps against poly.
        private static void PolyvalPowersTable(byte *polyval, byte *htbl, byte *input, int length)
        {
            if (length < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(length));
            }

            if (length == 0)
            {
                return;
            }

            int blocks = Math.DivRem(length, 16, out int remainder16);
            // Bytes of whole 16-byte blocks that precede the groups of eight blocks.
            int remainder128 = length % 128 - remainder16;
            Vector128 <ulong> tmp0, tmp1, tmp2, tmp3, tmp4;

            var xhi  = Sse2.SetZeroVector128 <ulong>();
            var poly = Sse.StaticCast <uint, ulong>(Sse2.SetVector128(0xc2000000, 0, 0, 1));
            var t    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(polyval));

            if (remainder128 != 0)
            {
                // Leading short group: the first block absorbs the running value t and is paired
                // with the highest table entry needed; later blocks walk down the table.
                int remainder128Blocks = remainder128 / 16;
                blocks -= remainder128Blocks;

                var data = Sse2.Xor(t, Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(input)));
                var h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[(remainder128Blocks - 1) * 16]));

                tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                tmp2 = Sse2.Xor(tmp2, tmp3);

                for (int i = 1; i < remainder128Blocks; ++i)
                {
                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&input[i * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[(remainder128Blocks - i - 1) * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                }

                // Fold the cross terms into the high/low halves; (xhi, t) is the unreduced product.
                tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                xhi  = Sse2.Xor(tmp3, tmp1);
                t    = Sse2.Xor(tmp0, tmp2);
            }

            if (blocks != 0)
            {
                var fixedInput = input + remainder128;

                if (remainder128 == 0)
                {
                    // First group of eight when there was no leading short group: t is still the
                    // value loaded from polyval, so no reduction has to be interleaved here.
                    var data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[7 * 16]));
                    var h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[0 * 16]));

                    tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[6 * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[5 * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[4 * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[3 * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[2 * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[1 * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[6 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    // Only the first block of the group absorbs the running value.
                    data = Sse2.Xor(t, Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[0 * 16])));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[7 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                    tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                    xhi  = Sse2.Xor(tmp3, tmp1);
                    t    = Sse2.Xor(tmp0, tmp2);
                }

                // Remaining groups of eight. If the first group was handled above, start at block 8.
                // The two reduction steps for the previous group's (xhi, t) are interleaved with
                // the multiplications of this group and finish before block i absorbs t.
                for (int i = remainder128 == 0 ? 8 : 0; i < blocks; i += 8)
                {
                    var data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 7) * 16]));
                    var h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[0 * 16]));

                    tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 6) * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    // Reduction step 1 (started): multiply t's low half by poly's high half and
                    // swap t's halves; the XOR that completes the step follows the next block.
                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 5) * 16]));
                    tmp4 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                    t    = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    // Reduction step 1 (completed).
                    t    = Sse2.Xor(t, tmp4);
                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 4) * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    // Reduction step 2 (started).
                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 3) * 16]));
                    tmp4 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                    t    = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    // Reduction step 2 (completed).
                    t    = Sse2.Xor(t, tmp4);
                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 2) * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    // Add the previous group's high half; t is now fully reduced.
                    t    = Sse2.Xor(t, xhi);
                    data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 1) * 16]));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[6 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    // Only the first block of the group absorbs the running value.
                    data = Sse2.Xor(t, Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[i * 16])));
                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[7 * 16]));

                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                    tmp2 = Sse2.Xor(tmp2, tmp3);

                    tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                    tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                    xhi  = Sse2.Xor(tmp3, tmp1);
                    t    = Sse2.Xor(tmp0, tmp2);
                }
            }

            if (blocks != 0 || remainder128 != 0)
            {
                // Final reduction of the last group's (xhi, t): two steps, then add the high half.
                tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                t    = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                t    = Sse2.Xor(tmp3, t);
                tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                t    = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                t    = Sse2.Xor(tmp3, t);
                t    = Sse2.Xor(xhi, t);
            }

            if (remainder16 != 0)
            {
                // Trailing partial block: copy it into a 16-byte scratch buffer. The buffer is
                // cleared explicitly because the contents of stackalloc memory are undefined;
                // the zero padding must not depend on the runtime zero-initializing locals.
                byte *b = stackalloc byte[16];
                var padded = new Span <byte>(b, 16);
                padded.Clear();
                new Span <byte>(input + length - remainder16, remainder16).CopyTo(padded);

                var data = Sse2.Xor(t, Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(b)));
                var h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(htbl));

                tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
                tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
                tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
                tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
                tmp2 = Sse2.Xor(tmp2, tmp3);
                tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                xhi  = Sse2.Xor(tmp3, tmp1);
                t    = Sse2.Xor(tmp0, tmp2);

                tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                t    = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                t    = Sse2.Xor(tmp3, t);
                tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                t    = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                t    = Sse2.Xor(tmp3, t);
                t    = Sse2.Xor(xhi, t);
            }

            Sse2.Store(polyval, Sse.StaticCast <ulong, byte>(t));
        }
예제 #8
0
        // Entry point. When Ssse3 is supported, runs Ssse3.AlignRight on two sbyte vectors with the
        // immediates 27, 5, 250 and 228, then once more with 27 through reflection, and compares
        // every output with the expected bytes. Returns Pass when all checks succeed (or when
        // Ssse3 is not supported) and Fail otherwise.
        static unsafe int Main(string[] args)
        {
            int testResult = Pass;

            if (!Ssse3.IsSupported)
            {
                return testResult;
            }

            sbyte[] firstInput  = { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 };
            sbyte[] secondInput = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };

            using (TestTable <sbyte> sbyteTable = new TestTable <sbyte>(firstInput, secondInput, new sbyte[16]))
            {
                // Checks the table's output against 'expected'; on a mismatch prints the output
                // array and records the failure.
                void VerifyOutput(sbyte[] expected)
                {
                    bool matches = sbyteTable.CheckResult((x, y, z) =>
                    {
                        for (int i = 0; i < expected.Length; i++)
                        {
                            if (z[i] != expected[i])
                            {
                                return false;
                            }
                        }

                        return true;
                    });

                    if (matches)
                    {
                        return;
                    }

                    Console.WriteLine("SSE AlignRight failed on sbyte:");
                    foreach (var item in sbyteTable.outArray)
                    {
                        Console.Write(item + ", ");
                    }
                    Console.WriteLine();
                    testResult = Fail;
                }

                sbyte[] shiftedBy27 = { 27, 28, 29, 30, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
                sbyte[] shiftedBy5  = { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 };
                sbyte[] allZero     = new sbyte[16];

                var upper = Unsafe.Read <Vector128 <sbyte> >(sbyteTable.inArray1Ptr);
                var lower = Unsafe.Read <Vector128 <sbyte> >(sbyteTable.inArray2Ptr);

                // Immediate 27: only the last five bytes of the first input remain.
                var aligned = Ssse3.AlignRight(upper, lower, 27);
                Unsafe.Write(sbyteTable.outArrayPtr, aligned);
                VerifyOutput(shiftedBy27);

                // Immediate 5: the window spans both inputs.
                aligned = Ssse3.AlignRight(upper, lower, 5);
                Unsafe.Write(sbyteTable.outArrayPtr, aligned);
                VerifyOutput(shiftedBy5);

                // Immediates past the 32-byte concatenation must yield all zeros.
                aligned = Ssse3.AlignRight(upper, lower, 250);
                Unsafe.Write(sbyteTable.outArrayPtr, aligned);
                VerifyOutput(allZero);

                aligned = Ssse3.AlignRight(upper, lower, 228);
                Unsafe.Write(sbyteTable.outArrayPtr, aligned);
                VerifyOutput(allZero);

                // Same operation as the first case, invoked through reflection.
                var alignRightMethod = typeof(Ssse3).GetMethod(nameof(Ssse3.AlignRight), new Type[] { upper.GetType(), lower.GetType(), typeof(byte) });
                aligned = (Vector128 <sbyte>) alignRightMethod.Invoke(null, new object[] { upper, lower, (byte)(27) });
                Unsafe.Write(sbyteTable.outArrayPtr, aligned);
                VerifyOutput(shiftedBy27);
            }

            return testResult;
        }
예제 #9
0
        //
        // NOTE(casey): Single block version
        //
        public static unsafe Vector128 <byte> Hash(ReadOnlySpan <byte> Seed128Init, ReadOnlySpan <byte> SourceInit)
        {
            Vector128 <byte> xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;       // NOTE(casey): xmm0-xmm7 are the hash accumulation lanes
            Vector128 <byte> xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; // NOTE(casey): xmm8-xmm15 hold values to be appended (residual, length)

            int Len = SourceInit.Length;

            fixed(byte *sourceInitPtr = SourceInit)
            fixed(byte *seedInitPtr = Seed128Init)
            {
                byte *rax = sourceInitPtr;
                byte *rcx = seedInitPtr;

                //
                // NOTE(casey): Seed the eight hash registers
                //

                xmm0 = Sse2.LoadVector128(rcx + 0x00);
                xmm1 = Sse2.LoadVector128(rcx + 0x10);
                xmm2 = Sse2.LoadVector128(rcx + 0x20);
                xmm3 = Sse2.LoadVector128(rcx + 0x30);

                xmm4 = Sse2.LoadVector128(rcx + 0x40);
                xmm5 = Sse2.LoadVector128(rcx + 0x50);
                xmm6 = Sse2.LoadVector128(rcx + 0x60);
                xmm7 = Sse2.LoadVector128(rcx + 0x70);

                // MEOW_DUMP_STATE("Seed", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0);

                //
                // NOTE(casey): Hash all full 256-byte blocks
                //

                int BlockCount = (SourceInit.Length >> 8);

                if (BlockCount > MEOW_PREFETCH_LIMIT)
                {
                    // NOTE(casey): For large input, modern Intel x64's can't hit full speed without prefetching, so we use this loop
                    while (BlockCount-- > 0)
                    {
                        Sse.Prefetch0(rax + MEOW_PREFETCH + 0x00);
                        Sse.Prefetch0(rax + MEOW_PREFETCH + 0x40);
                        Sse.Prefetch0(rax + MEOW_PREFETCH + 0x80);
                        Sse.Prefetch0(rax + MEOW_PREFETCH + 0xc0);

                        MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0x00);
                        MEOW_MIX(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, rax + 0x20);
                        MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x40);
                        MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x60);
                        MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x80);
                        MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0xa0);
                        MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0xc0);
                        MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xe0);

                        rax += 0x100;
                    }
                }
                else
                {
                    // NOTE(casey): For small input, modern Intel x64's can't hit full speed _with_ prefetching (because of port pressure), so we use this loop.
                    while (BlockCount-- > 0)
                    {
                        MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0x00);
                        MEOW_MIX(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, rax + 0x20);
                        MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x40);
                        MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x60);
                        MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x80);
                        MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0xa0);
                        MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0xc0);
                        MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xe0);

                        rax += 0x100;
                    }
                }

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostBlocks", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                //
                // NOTE(casey): Load any less-than-32-byte residual
                //

                xmm9  = Vector128 <byte> .Zero;
                xmm11 = Vector128 <byte> .Zero;

                //
                // TODO(casey): I need to put more thought into how the end-of-buffer stuff is actually working out here,
                // because I _think_ it may be possible to remove the first branch (on Len8) and let the mask zero out the
                // result, but it would take a little thought to make sure it couldn't read off the end of the buffer due
                // to the & 0xf on the align computation.
                //

                // NOTE(casey): First, we have to load the part that is _not_ 16-byte aligned
                byte *Last = (byte *)sourceInitPtr + (Len & ~0xf);
                int   Len8 = (Len & 0xf);
                if (Len8 > 0)
                {
                    // NOTE(casey): Load the mask early
                    fixed(byte *MeowMaskLen = s_meowMaskLen)
                    {
                        xmm8 = Sse2.LoadVector128(&MeowMaskLen[0x10 - Len8]);
                    }

                    byte *LastOk = (byte *)((((ulong)(((byte *)sourceInitPtr) + Len - 1)) | (MEOW_PAGESIZE - 1)) - 16);
                    int   Align  = (Last > LastOk) ? ((int)(ulong)Last) & 0xf : 0;

                    fixed(byte *MeowShiftAdjust = s_meowShiftAdjust)
                    {
                        xmm10 = Sse2.LoadVector128(&MeowShiftAdjust[Align]);
                    }

                    xmm9 = Sse2.LoadVector128(Last - Align);
                    xmm9 = Ssse3.Shuffle(xmm9, xmm10);

                    // NOTE(jeffr): and off the extra bytes
                    xmm9 = Sse2.And(xmm9, xmm8);
                }

                // NOTE(casey): Next, we have to load the part that _is_ 16-byte aligned
                if ((Len & 0x10) != 0)
                {
                    xmm11 = xmm9;
                    xmm9  = Sse2.LoadVector128(Last - 0x10);
                }

                //
                // NOTE(casey): Construct the residual and length ingests
                //

                xmm8  = xmm9;
                xmm10 = xmm9;
                xmm8  = Ssse3.AlignRight(xmm8, xmm11, 15);
                xmm10 = Ssse3.AlignRight(xmm10, xmm11, 1);

                // NOTE(casey): We have room for a 128-bit nonce and a 64-bit nonce here, but
                // the decision was made to leave them zero'd so as not to confuse people
                // about how to use them or what security implications they had.
                xmm12 = Vector128 <byte> .Zero;
                xmm13 = Vector128 <byte> .Zero;
                xmm14 = Vector128 <byte> .Zero;
                xmm15 = Vector128.Create((ulong)Len, 0).AsByte();
                xmm12 = Ssse3.AlignRight(xmm12, xmm15, 15);
                xmm14 = Ssse3.AlignRight(xmm14, xmm15, 1);

#if MEOW_DUMP
                MEOW_DUMP_STATE("Residuals", xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
#endif

                // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty
                MEOW_MIX_REG(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, xmm8, xmm9, xmm10, xmm11);

                // NOTE(casey): Append the length, to avoid problems with our 32-byte padding
                MEOW_MIX_REG(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, xmm12, xmm13, xmm14, xmm15);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostAppend", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                //
                // NOTE(casey): Hash all full 32-byte blocks
                //
                int LaneCount = (Len >> 5) & 0x7;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x00); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x20); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x40); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0x60); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0x80); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xa0); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0xc0); --LaneCount;

                //
                // NOTE(casey): Mix the eight lanes down to one 128-bit hash
                //

MixDown:

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostLanes", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
                MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
                MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
                MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);
                MEOW_SHUFFLE(ref xmm4, ref xmm5, xmm6, ref xmm0, ref xmm1, xmm2);
                MEOW_SHUFFLE(ref xmm5, ref xmm6, xmm7, ref xmm1, ref xmm2, xmm3);
                MEOW_SHUFFLE(ref xmm6, ref xmm7, xmm0, ref xmm2, ref xmm3, xmm4);
                MEOW_SHUFFLE(ref xmm7, ref xmm0, xmm1, ref xmm3, ref xmm4, xmm5);
                MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
                MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
                MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
                MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostMix", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                xmm0 = AddQ(xmm0, xmm2);
                xmm1 = AddQ(xmm1, xmm3);
                xmm4 = AddQ(xmm4, xmm6);
                xmm5 = AddQ(xmm5, xmm7);
                xmm0 = Sse2.Xor(xmm0, xmm1);
                xmm4 = Sse2.Xor(xmm4, xmm5);
                xmm0 = AddQ(xmm0, xmm4);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostFold", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                return(xmm0);
            }
        }
예제 #10
0
        /// <summary>
        /// Finishes a streaming hash: absorbs the bytes still sitting in <c>state.Buffer</c> together
        /// with the total length, mixes the eight lanes down and folds them into one 128-bit value.
        /// </summary>
        /// <param name="state">
        /// Streaming state. Its eight lanes (<c>xmm0</c>..<c>xmm7</c>), <c>TotalLengthInBytes</c> and
        /// <c>Buffer</c> are the inputs; the lanes are copied into locals and not written back here.
        /// </param>
        /// <param name="store128">
        /// Optional destination for the eight 16-byte lanes as they are immediately before the final fold.
        /// Pass an empty span to skip the store; otherwise it must be at least 128 bytes long.
        /// </param>
        /// <returns>The folded 128-bit hash value.</returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="store128"/> is non-empty but shorter than 128 bytes.
        /// </exception>
        public static unsafe Vector128<byte> End(ref State state, Span<byte> store128)
        {
            // Eight lanes of 16 bytes each are written when a destination is supplied.
            const int LaneStoreSize = 8 * 0x10;

            // Comparing a Span against null only detects the default span, so a too-short
            // (or zero-length) destination used to reach the raw stores below. Validate the
            // length up front instead, before any hashing work is done.
            bool storeLanes = !store128.IsEmpty;
            if (storeLanes && store128.Length < LaneStoreSize)
            {
                throw new ArgumentException("store128 must be empty or at least 128 bytes long.", nameof(store128));
            }

            long len = state.TotalLengthInBytes;

            Vector128<byte> xmm0 = state.xmm0;
            Vector128<byte> xmm1 = state.xmm1;
            Vector128<byte> xmm2 = state.xmm2;
            Vector128<byte> xmm3 = state.xmm3;
            Vector128<byte> xmm4 = state.xmm4;
            Vector128<byte> xmm5 = state.xmm5;
            Vector128<byte> xmm6 = state.xmm6;
            Vector128<byte> xmm7 = state.xmm7;

            Vector128<byte> xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;

            fixed (byte* rax = state.Buffer)
            {
                xmm9  = Vector128<byte>.Zero;
                xmm11 = Vector128<byte>.Zero;

                // Position of the trailing partial 16-byte chunk inside the buffer, and its size.
                byte* last = rax + (len & 0xf0);
                long  len8 = len & 0xf;

                if (len8 > 0)
                {
                    // Pick the mask that matches the residual length; ANDing it with the loaded
                    // chunk presumably clears the bytes past the residual (table is defined elsewhere).
                    fixed (byte* meowMaskLen = s_meowMaskLen)
                    {
                        xmm8 = Sse2.LoadVector128(&meowMaskLen[0x10 - len8]);
                    }

                    xmm9 = Sse2.LoadVector128(last);
                    xmm9 = Sse2.And(xmm9, xmm8);
                }

                // When a whole 16-byte chunk precedes the partial one, the partial chunk moves
                // to xmm11 and the whole chunk is loaded into xmm9.
                if ((len & 0x10) != 0)
                {
                    xmm11 = xmm9;
                    xmm9  = Sse2.LoadVector128(last - 0x10);
                }

                // Residual ingests: xmm8 / xmm10 are byte-shifted views across xmm9:xmm11.
                xmm8  = Ssse3.AlignRight(xmm9, xmm11, 15);
                xmm10 = Ssse3.AlignRight(xmm9, xmm11, 1);

                // Length ingests: the length sits in the low 64 bits of xmm15, everything else is zero.
                xmm12 = Vector128<byte>.Zero;
                xmm13 = Vector128<byte>.Zero;
                xmm14 = Vector128<byte>.Zero;
                xmm15 = Vector128.Create((ulong)len, 0).AsByte();
                xmm12 = Ssse3.AlignRight(xmm12, xmm15, 15);
                xmm14 = Ssse3.AlignRight(xmm14, xmm15, 1);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostBlocks", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
                MEOW_DUMP_STATE("Residuals", xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
#endif

                // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty
                MEOW_MIX_REG(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, xmm8, xmm9, xmm10, xmm11);

                // NOTE(casey): Append the length, to avoid problems with our 32-byte padding
                MEOW_MIX_REG(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, xmm12, xmm13, xmm14, xmm15);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostAppend", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                //
                // NOTE(casey): Hash all full 32-byte blocks
                //
                // At most seven blocks are mixed; each step uses a rotated lane assignment,
                // so the ladder is kept unrolled rather than turned into a loop.
                long laneCount = (len >> 5) & 0x7;
                if (laneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x00);
                if (--laneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x20);
                if (--laneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x40);
                if (--laneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0x60);
                if (--laneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0x80);
                if (--laneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xa0);
                if (--laneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0xc0);

                //
                // NOTE(casey): Mix the eight lanes down to one 128-bit hash
                //

MixDown:

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostLanes", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
                MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
                MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
                MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);
                MEOW_SHUFFLE(ref xmm4, ref xmm5, xmm6, ref xmm0, ref xmm1, xmm2);
                MEOW_SHUFFLE(ref xmm5, ref xmm6, xmm7, ref xmm1, ref xmm2, xmm3);
                MEOW_SHUFFLE(ref xmm6, ref xmm7, xmm0, ref xmm2, ref xmm3, xmm4);
                MEOW_SHUFFLE(ref xmm7, ref xmm0, xmm1, ref xmm3, ref xmm4, xmm5);
                MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
                MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
                MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
                MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostMix", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                // Optionally hand the eight pre-fold lanes back to the caller (length checked above).
                if (storeLanes)
                {
                    fixed (byte* store128Ptr = store128)
                    {
                        Sse2.Store(store128Ptr + 0x00, xmm0);
                        Sse2.Store(store128Ptr + 0x10, xmm1);
                        Sse2.Store(store128Ptr + 0x20, xmm2);
                        Sse2.Store(store128Ptr + 0x30, xmm3);
                        Sse2.Store(store128Ptr + 0x40, xmm4);
                        Sse2.Store(store128Ptr + 0x50, xmm5);
                        Sse2.Store(store128Ptr + 0x60, xmm6);
                        Sse2.Store(store128Ptr + 0x70, xmm7);
                    }
                }

                // Fold the eight lanes into xmm0.
                xmm0 = AddQ(xmm0, xmm2);
                xmm1 = AddQ(xmm1, xmm3);
                xmm4 = AddQ(xmm4, xmm6);
                xmm5 = AddQ(xmm5, xmm7);
                xmm0 = Sse2.Xor(xmm0, xmm1);
                xmm4 = Sse2.Xor(xmm4, xmm5);
                xmm0 = AddQ(xmm0, xmm4);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostFold", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                return xmm0;
            }
        }
예제 #11
0
 /// <summary>
 /// Runs <see cref="Ssse3.AlignRight(Vector128{sbyte}, Vector128{sbyte}, byte)"/> on two vectors of
 /// 64-bit lanes by viewing them as signed bytes and viewing the result as 64-bit lanes again.
 /// </summary>
 /// <param name="x">Left operand; read only, never written through the reference.</param>
 /// <param name="y">Right operand; read only, never written through the reference.</param>
 /// <param name="m">Mask operand forwarded unchanged to <c>AlignRight</c>.</param>
 /// <returns>The aligned result reinterpreted as <see cref="Vector128{T}"/> of <see cref="ulong"/>.</returns>
 private static Vector128<ulong> alignr_ulong(ref Vector128<ulong> x, ref Vector128<ulong> y, byte m)
 {
     Vector128<sbyte> leftBytes  = x.AsSByte();
     Vector128<sbyte> rightBytes = y.AsSByte();
     Vector128<sbyte> aligned    = Ssse3.AlignRight(leftBytes, rightBytes, m);
     return aligned.AsUInt64();
 }
예제 #12
0
 /// <summary>
 /// C-intrinsic-style alias for <see cref="Ssse3.AlignRight(Vector128{uint}, Vector128{uint}, byte)"/>.
 /// </summary>
 /// <param name="left">First operand, forwarded as the <c>left</c> argument.</param>
 /// <param name="right">Second operand, forwarded as the <c>right</c> argument.</param>
 /// <param name="mask">Mask operand, forwarded unchanged.</param>
 /// <returns>Whatever <c>Ssse3.AlignRight</c> produces for the given operands.</returns>
 public static Vector128<uint> _mm_alignr_epi8(Vector128<uint> left, Vector128<uint> right, byte mask)
     => Ssse3.AlignRight(left, right, mask);
예제 #13
0
 /// <summary>
 /// Runs <see cref="Ssse3.AlignRight(Vector128{sbyte}, Vector128{sbyte}, byte)"/> on two vectors of
 /// 64-bit lanes by viewing them as signed bytes and viewing the result as 64-bit lanes again.
 /// </summary>
 /// <remarks>
 /// Uses the dedicated <c>AsSByte</c>/<c>AsUInt64</c> reinterpret helpers: the single-type-argument
 /// <c>As&lt;T&gt;()</c> form only existed in pre-release runtimes and does not compile against
 /// released ones, where the generic helper needs both type arguments.
 /// </remarks>
 /// <param name="x">Left operand; read only, never written through the reference.</param>
 /// <param name="y">Right operand; read only, never written through the reference.</param>
 /// <param name="m">Mask operand forwarded unchanged to <c>AlignRight</c>.</param>
 /// <returns>The aligned result reinterpreted as <see cref="Vector128{T}"/> of <see cref="ulong"/>.</returns>
 private static Vector128<ulong> alignr_ulong(ref Vector128<ulong> x, ref Vector128<ulong> y, byte m) =>
     Ssse3.AlignRight(x.AsSByte(), y.AsSByte(), m).AsUInt64();
예제 #14
0
        // DecryptPowersTable decrypts ctLen bytes from ct and writes them to pt. While
        // decrypting, it updates the POLYVAL value in polyval. In order to decrypt and
        // update the POLYVAL value, it uses the expanded key from ks and the table of
        // powers in htbl. Decryption processes 6 blocks of data in parallel.
        private static void DecryptPowersTable(byte *ct, int ctLen, byte *pt, byte *polyval, byte *htbl, byte *tag, byte *ks)
        {
            Vector128 <ulong> sCtr1, sCtr2, sCtr3, sCtr4, sCtr5, sCtr6, tmp0, tmp1, tmp2, tmp3, tmp4, h;

            var poly = Sse.StaticCast <uint, ulong>(Sse2.SetVector128(0xc2000000, 0, 0, 1));
            var t    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(polyval));

            var orMask = Sse.StaticCast <uint, byte>(Sse2.SetVector128(0x80000000, 0, 0, 0));
            var ctr    = Sse2.Or(Sse2.LoadVector128(tag), orMask);

            var one = Sse2.SetVector128(0, 0, 0, 1);
            var two = Sse2.SetVector128(0, 0, 0, 2);

            int blocks = 0;

            if (ctLen >= 96)
            {
                var ctr1 = ctr;
                var ctr2 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), one));
                var ctr3 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), two));
                var ctr4 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr3), one));
                var ctr5 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr3), two));
                var ctr6 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr5), one));
                ctr = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr5), two));

                var key = Sse2.LoadVector128(ks);
                ctr1 = Sse2.Xor(ctr1, key);
                ctr2 = Sse2.Xor(ctr2, key);
                ctr3 = Sse2.Xor(ctr3, key);
                ctr4 = Sse2.Xor(ctr4, key);
                ctr5 = Sse2.Xor(ctr5, key);
                ctr6 = Sse2.Xor(ctr6, key);

                for (int i = 1; i < 14; ++i)
                {
                    key  = Sse2.LoadVector128(&ks[i * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);
                }

                key  = Sse2.LoadVector128(&ks[14 * 16]);
                ctr1 = Aes.EncryptLast(ctr1, key);
                ctr2 = Aes.EncryptLast(ctr2, key);
                ctr3 = Aes.EncryptLast(ctr3, key);
                ctr4 = Aes.EncryptLast(ctr4, key);
                ctr5 = Aes.EncryptLast(ctr5, key);
                ctr6 = Aes.EncryptLast(ctr6, key);

                ctr1 = Sse2.Xor(ctr1, Sse2.LoadVector128(&ct[0 * 16]));
                ctr2 = Sse2.Xor(ctr2, Sse2.LoadVector128(&ct[1 * 16]));
                ctr3 = Sse2.Xor(ctr3, Sse2.LoadVector128(&ct[2 * 16]));
                ctr4 = Sse2.Xor(ctr4, Sse2.LoadVector128(&ct[3 * 16]));
                ctr5 = Sse2.Xor(ctr5, Sse2.LoadVector128(&ct[4 * 16]));
                ctr6 = Sse2.Xor(ctr6, Sse2.LoadVector128(&ct[5 * 16]));

                Sse2.Store(&pt[0 * 16], ctr1);
                Sse2.Store(&pt[1 * 16], ctr2);
                Sse2.Store(&pt[2 * 16], ctr3);
                Sse2.Store(&pt[3 * 16], ctr4);
                Sse2.Store(&pt[4 * 16], ctr5);
                Sse2.Store(&pt[5 * 16], ctr6);

                ctLen  -= 96;
                blocks += 6;

                while (ctLen >= 96)
                {
                    sCtr6 = Sse.StaticCast <byte, ulong>(ctr6);
                    sCtr5 = Sse.StaticCast <byte, ulong>(ctr5);
                    sCtr4 = Sse.StaticCast <byte, ulong>(ctr4);
                    sCtr3 = Sse.StaticCast <byte, ulong>(ctr3);
                    sCtr2 = Sse.StaticCast <byte, ulong>(ctr2);
                    sCtr1 = Sse.StaticCast <byte, ulong>(ctr1);

                    ctr1 = ctr;
                    ctr2 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), one));
                    ctr3 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), two));
                    ctr4 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr3), one));
                    ctr5 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr3), two));
                    ctr6 = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr5), one));
                    ctr  = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr5), two));

                    key  = Sse2.LoadVector128(ks);
                    ctr1 = Sse2.Xor(ctr1, key);
                    ctr2 = Sse2.Xor(ctr2, key);
                    ctr3 = Sse2.Xor(ctr3, key);
                    ctr4 = Sse2.Xor(ctr4, key);
                    ctr5 = Sse2.Xor(ctr5, key);
                    ctr6 = Sse2.Xor(ctr6, key);

                    tmp3 = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(htbl));
                    tmp1 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x11);
                    tmp2 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x00);
                    tmp0 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x01);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x10);
                    tmp0 = Sse2.Xor(tmp3, tmp0);

                    key  = Sse2.LoadVector128(&ks[1 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x10);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x00);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x01);
                    tmp0 = Sse2.Xor(tmp0, tmp3);

                    key  = Sse2.LoadVector128(&ks[2 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x10);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x00);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x01);
                    tmp0 = Sse2.Xor(tmp0, tmp3);

                    key  = Sse2.LoadVector128(&ks[3 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x10);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x00);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x01);
                    tmp0 = Sse2.Xor(tmp0, tmp3);

                    key  = Sse2.LoadVector128(&ks[4 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x10);
                    tmp0 = Sse2.Xor(tmp0, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x11);
                    tmp1 = Sse2.Xor(tmp1, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x00);
                    tmp2 = Sse2.Xor(tmp2, tmp3);
                    tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x01);
                    tmp0 = Sse2.Xor(tmp0, tmp3);

                    key  = Sse2.LoadVector128(&ks[5 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    key  = Sse2.LoadVector128(&ks[6 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    key  = Sse2.LoadVector128(&ks[7 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    sCtr1 = Sse2.Xor(t, sCtr1);
                    tmp4  = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));
                    tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x01);
                    tmp0  = Sse2.Xor(tmp3, tmp0);
                    tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x11);
                    tmp1  = Sse2.Xor(tmp3, tmp1);
                    tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x00);
                    tmp2  = Sse2.Xor(tmp3, tmp2);
                    tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x10);
                    tmp0  = Sse2.Xor(tmp3, tmp0);

                    key  = Sse2.LoadVector128(&ks[8 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    tmp3 = Sse2.ShiftRightLogical128BitLane(tmp0, 8);
                    tmp4 = Sse2.Xor(tmp3, tmp1);
                    tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp0, 8);
                    t    = Sse2.Xor(tmp3, tmp2);

                    key  = Sse2.LoadVector128(&ks[9 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    tmp1 = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                    t    = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                    t    = Sse2.Xor(tmp1, t);

                    key  = Sse2.LoadVector128(&ks[10 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    key  = Sse2.LoadVector128(&ks[11 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    key  = Sse2.LoadVector128(&ks[12 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    key  = Sse2.LoadVector128(&ks[13 * 16]);
                    ctr1 = Aes.Encrypt(ctr1, key);
                    ctr2 = Aes.Encrypt(ctr2, key);
                    ctr3 = Aes.Encrypt(ctr3, key);
                    ctr4 = Aes.Encrypt(ctr4, key);
                    ctr5 = Aes.Encrypt(ctr5, key);
                    ctr6 = Aes.Encrypt(ctr6, key);

                    key  = Sse2.LoadVector128(&ks[14 * 16]);
                    ctr1 = Aes.EncryptLast(ctr1, key);
                    ctr2 = Aes.EncryptLast(ctr2, key);
                    ctr3 = Aes.EncryptLast(ctr3, key);
                    ctr4 = Aes.EncryptLast(ctr4, key);
                    ctr5 = Aes.EncryptLast(ctr5, key);
                    ctr6 = Aes.EncryptLast(ctr6, key);

                    ctr1 = Sse2.Xor(ctr1, Sse2.LoadVector128(&ct[(blocks + 0) * 16]));
                    ctr2 = Sse2.Xor(ctr2, Sse2.LoadVector128(&ct[(blocks + 1) * 16]));
                    ctr3 = Sse2.Xor(ctr3, Sse2.LoadVector128(&ct[(blocks + 2) * 16]));
                    ctr4 = Sse2.Xor(ctr4, Sse2.LoadVector128(&ct[(blocks + 3) * 16]));
                    ctr5 = Sse2.Xor(ctr5, Sse2.LoadVector128(&ct[(blocks + 4) * 16]));
                    ctr6 = Sse2.Xor(ctr6, Sse2.LoadVector128(&ct[(blocks + 5) * 16]));

                    tmp1 = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                    t    = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                    t    = Sse2.Xor(tmp1, t);
                    t    = Sse2.Xor(tmp4, t);

                    Sse2.Store(&pt[(blocks + 0) * 16], ctr1);
                    Sse2.Store(&pt[(blocks + 1) * 16], ctr2);
                    Sse2.Store(&pt[(blocks + 2) * 16], ctr3);
                    Sse2.Store(&pt[(blocks + 3) * 16], ctr4);
                    Sse2.Store(&pt[(blocks + 4) * 16], ctr5);
                    Sse2.Store(&pt[(blocks + 5) * 16], ctr6);

                    ctLen  -= 96;
                    blocks += 6;
                }

                sCtr6 = Sse.StaticCast <byte, ulong>(ctr6);
                sCtr5 = Sse.StaticCast <byte, ulong>(ctr5);
                sCtr4 = Sse.StaticCast <byte, ulong>(ctr4);
                sCtr3 = Sse.StaticCast <byte, ulong>(ctr3);
                sCtr2 = Sse.StaticCast <byte, ulong>(ctr2);
                sCtr1 = Sse.StaticCast <byte, ulong>(ctr1);

                tmp3 = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(htbl));
                tmp0 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x10);
                tmp1 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x11);
                tmp2 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x00);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr6, tmp3, 0x01);
                tmp0 = Sse2.Xor(tmp3, tmp0);

                h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x10);
                tmp0 = Sse2.Xor(tmp0, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x11);
                tmp1 = Sse2.Xor(tmp1, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x00);
                tmp2 = Sse2.Xor(tmp2, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr5, h, 0x01);
                tmp0 = Sse2.Xor(tmp0, tmp3);

                h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x10);
                tmp0 = Sse2.Xor(tmp0, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x11);
                tmp1 = Sse2.Xor(tmp1, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x00);
                tmp2 = Sse2.Xor(tmp2, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr4, h, 0x01);
                tmp0 = Sse2.Xor(tmp0, tmp3);

                h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x10);
                tmp0 = Sse2.Xor(tmp0, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x11);
                tmp1 = Sse2.Xor(tmp1, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x00);
                tmp2 = Sse2.Xor(tmp2, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr3, h, 0x01);
                tmp0 = Sse2.Xor(tmp0, tmp3);

                h    = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x10);
                tmp0 = Sse2.Xor(tmp0, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x11);
                tmp1 = Sse2.Xor(tmp1, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x00);
                tmp2 = Sse2.Xor(tmp2, tmp3);
                tmp3 = Pclmulqdq.CarrylessMultiply(sCtr2, h, 0x01);
                tmp0 = Sse2.Xor(tmp0, tmp3);

                sCtr1 = Sse2.Xor(t, sCtr1);
                tmp4  = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));
                tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x11);
                tmp1  = Sse2.Xor(tmp3, tmp1);
                tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x00);
                tmp2  = Sse2.Xor(tmp3, tmp2);
                tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x10);
                tmp0  = Sse2.Xor(tmp3, tmp0);
                tmp3  = Pclmulqdq.CarrylessMultiply(sCtr1, tmp4, 0x01);
                tmp0  = Sse2.Xor(tmp3, tmp0);

                tmp3 = Sse2.ShiftRightLogical128BitLane(tmp0, 8);
                tmp4 = Sse2.Xor(tmp3, tmp1);
                tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp0, 8);
                t    = Sse2.Xor(tmp3, tmp2);

                tmp1 = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                t    = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                t    = Sse2.Xor(tmp1, t);
                tmp1 = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
                t    = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
                t    = Sse2.Xor(tmp1, t);
                t    = Sse2.Xor(tmp4, t);
            }

            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(htbl));

            while (ctLen >= 16)
            {
                var tmp = ctr;
                ctr = Sse.StaticCast <int, byte>(Sse2.Add(Sse.StaticCast <byte, int>(ctr), one));
                tmp = Sse2.Xor(tmp, Sse2.LoadVector128(ks));

                for (int i = 1; i < 14; ++i)
                {
                    tmp = Aes.Encrypt(tmp, Sse2.LoadVector128(&ks[i * 16]));
                }

                tmp = Aes.EncryptLast(tmp, Sse2.LoadVector128(&ks[14 * 16]));
                tmp = Sse2.Xor(tmp, Sse2.LoadVector128(&ct[blocks * 16]));
                Sse2.Store(&pt[blocks * 16], tmp);

                t    = Sse2.Xor(Sse.StaticCast <byte, ulong>(tmp), t);
                tmp1 = Pclmulqdq.CarrylessMultiply(t, h, 0x00);
                tmp4 = Pclmulqdq.CarrylessMultiply(t, h, 0x11);
                tmp2 = Pclmulqdq.CarrylessMultiply(t, h, 0x10);
                tmp3 = Pclmulqdq.CarrylessMultiply(t, h, 0x01);
                tmp2 = Sse2.Xor(tmp3, tmp2);
                tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                tmp2 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                tmp1 = Sse2.Xor(tmp1, tmp3);
                tmp4 = Sse2.Xor(tmp2, tmp4);

                tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
                tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
                tmp1 = Sse2.Xor(tmp2, tmp3);
                tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
                tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
                tmp1 = Sse2.Xor(tmp2, tmp3);
                t    = Sse2.Xor(tmp1, tmp4);

                ctLen -= 16;
                ++blocks;
            }

            if (ctLen > 0)
            {
                byte *b = stackalloc byte[16];
                new Span <byte>(ct + blocks * 16, ctLen).CopyTo(new Span <byte>(b, 16));
                var tmp = Sse2.Xor(ctr, Sse2.LoadVector128(ks));

                for (int i = 1; i < 14; ++i)
                {
                    tmp = Aes.Encrypt(tmp, Sse2.LoadVector128(&ks[i * 16]));
                }

                tmp = Aes.EncryptLast(tmp, Sse2.LoadVector128(&ks[14 * 16]));
                tmp = Sse2.Xor(tmp, Sse2.LoadVector128(b));
                Sse2.Store(b, tmp);

                new Span <byte>(b, ctLen).CopyTo(new Span <byte>(&pt[blocks * 16], ctLen));
                new Span <byte>(b + ctLen, 16 - ctLen).Clear();

                t    = Sse2.Xor(Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(b)), t);
                tmp1 = Pclmulqdq.CarrylessMultiply(t, h, 0x00);
                tmp4 = Pclmulqdq.CarrylessMultiply(t, h, 0x11);
                tmp2 = Pclmulqdq.CarrylessMultiply(t, h, 0x10);
                tmp3 = Pclmulqdq.CarrylessMultiply(t, h, 0x01);
                tmp2 = Sse2.Xor(tmp3, tmp2);
                tmp3 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
                tmp2 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
                tmp1 = Sse2.Xor(tmp1, tmp3);
                tmp4 = Sse2.Xor(tmp2, tmp4);

                tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
                tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
                tmp1 = Sse2.Xor(tmp2, tmp3);
                tmp2 = Pclmulqdq.CarrylessMultiply(tmp1, poly, 0x10);
                tmp3 = Sse.StaticCast <uint, ulong>(Sse2.Shuffle(Sse.StaticCast <ulong, uint>(tmp1), 78));
                tmp1 = Sse2.Xor(tmp2, tmp3);
                t    = Sse2.Xor(tmp1, tmp4);
            }

            Sse2.Store(polyval, Sse.StaticCast <ulong, byte>(t));
        }