Esempio n. 1
0
        /// <summary>
        /// Absorbs one message block into the hash context, dispatching to the widest mixing
        /// routine that is both compiled in and reported as supported at run time.
        /// </summary>
        /// <param name="s">Hash context that the selected mixing routine updates.</param>
        /// <param name="input">Pointer to the message block to absorb.</param>
        private static unsafe void compress(Blake2bContext* s, byte* input)
        {
            ulong* words = (ulong*)input;

#if FAST_SPAN
            if (!BitConverter.IsLittleEndian)
            {
                // Big-endian host: decode the little-endian message words into the context's
                // own buffer and mix from there instead of from the caller's memory.
                var block = new ReadOnlySpan<byte>(input, BlockBytes);
                words = (ulong*)s->b;
                for (int w = 0; w < BlockWords; w++)
                {
                    words[w] = BinaryPrimitives.ReadUInt64LittleEndian(block.Slice(w * WordSize, WordSize));
                }
            }
#endif

#if USE_INTRINSICS
#if USE_AVX2
            if (Avx2.IsSupported)
            {
                mixAvx2(s, words);
                return;
            }
#endif
            if (Sse41.IsSupported)
            {
                mixSse41(s, words);
                return;
            }
#endif

            // Portable fallback when no vectorized path was selected above.
            mixScalar(s, words);
        }
Esempio n. 2
0
        /// <summary>
        /// Absorbs one message block into the hash context by handing it to a mixing routine.
        /// </summary>
        /// <param name="s">Hash context that the mixing routine updates.</param>
        /// <param name="input">Pointer to the message block, reinterpreted as 64-bit words.</param>
        private static unsafe void compress(Blake2bContext* s, byte* input)
        {
            ulong* words = (ulong*)input;

#if NETCOREAPP3_0
            // NOTE(review): the leading 'false' makes this condition always false, so the
            // SSE4.1 routine is never called even when the CPU supports it. Confirm whether
            // the vectorized path was disabled on purpose before removing the constant.
            if (false && Sse41.IsSupported)
            {
                mixSse41(s, words);
                return;
            }
#endif

            mixScalar(s, words);
        }
    /// <summary>
    /// Applies a reduced, rotation-free mixing pass to the chaining value stored in
    /// <paramref name="s"/>.
    /// </summary>
    /// <remarks>
    /// One column step and one diagonal step are performed. Each of the eight quarter
    /// operations does only <c>a += x; a += b; d ^= a; c += d; b ^= c;</c> with a single
    /// message word and no rotations, so only the even-indexed message words
    /// (m[0], m[2], ..., m[14]) take part.
    /// NOTE(review): this is not the full BLAKE2b compression function; confirm that callers
    /// only rely on it as a simplified variant.
    /// </remarks>
    /// <param name="s">Context whose h[0..7] are read and then updated in place; t[0], t[1] and f[0] are read.</param>
    /// <param name="m">Pointer to the message words; the even indices 0 through 14 are read.</param>
    private static void mixSimplified(Blake2bContext *s, ulong *m)
    {
        // Only the even-indexed message words are consumed by the steps below.
        ulong m00 = m[0];
        ulong m02 = m[2];
        ulong m04 = m[4];
        ulong m06 = m[6];
        ulong m08 = m[8];
        ulong m10 = m[10];
        ulong m12 = m[12];
        ulong m14 = m[14];

        // First half of the working state: the current chaining value.
        ulong v00 = s->h[0];
        ulong v01 = s->h[1];
        ulong v02 = s->h[2];
        ulong v03 = s->h[3];
        ulong v04 = s->h[4];
        ulong v05 = s->h[5];
        ulong v06 = s->h[6];
        ulong v07 = s->h[7];

        // Second half of the working state: the BLAKE2b initialization vector constants.
        ulong v08 = 0x6A09E667F3BCC908ul;
        ulong v09 = 0xBB67AE8584CAA73Bul;
        ulong v10 = 0x3C6EF372FE94F82Bul;
        ulong v11 = 0xA54FF53A5F1D36F1ul;
        ulong v12 = 0x510E527FADE682D1ul;
        ulong v13 = 0x9B05688C2B3E6C1Ful;
        ulong v14 = 0x1F83D9ABFB41BD6Bul;
        ulong v15 = 0x5BE0CD19137E2179ul;

        // Fold in t[0], t[1] (presumably the block counter) and f[0] (presumably the
        // final-block flag) -- TODO confirm against the Blake2bContext definition.
        v12 ^= s->t[0];
        v13 ^= s->t[1];
        v14 ^= s->f[0];

        // Column step: quarters (v00,v04,v08,v12), (v01,v05,v09,v13),
        // (v02,v06,v10,v14), (v03,v07,v11,v15).
        v00 += m00;
        v00 += v04;
        v12 ^= v00;
        v08 += v12;
        v04 ^= v08;

        v01 += m02;
        v01 += v05;
        v13 ^= v01;
        v09 += v13;
        v05 ^= v09;

        v02 += m04;
        v02 += v06;
        v14 ^= v02;
        v10 += v14;
        v06 ^= v10;

        v03 += m06;
        v03 += v07;
        v15 ^= v03;
        v11 += v15;
        v07 ^= v11;

        // Diagonal step: quarters (v00,v05,v10,v15), (v01,v06,v11,v12),
        // (v02,v07,v08,v13), (v03,v04,v09,v14).
        v00 += m08;
        v00 += v05;
        v15 ^= v00;
        v10 += v15;
        v05 ^= v10;

        v01 += m10;
        v01 += v06;
        v12 ^= v01;
        v11 += v12;
        v06 ^= v11;

        v02 += m12;
        v02 += v07;
        v13 ^= v02;
        v08 += v13;
        v07 ^= v08;

        v03 += m14;
        v03 += v04;
        v14 ^= v03;
        v09 += v14;
        v04 ^= v09;

        // Feed-forward: XOR both halves of the working state into the chaining value.
        s->h[0] ^= v00 ^ v08;
        s->h[1] ^= v01 ^ v09;
        s->h[2] ^= v02 ^ v10;
        s->h[3] ^= v03 ^ v11;
        s->h[4] ^= v04 ^ v12;
        s->h[5] ^= v05 ^ v13;
        s->h[6] ^= v06 ^ v14;
        s->h[7] ^= v07 ^ v15;
    }
Esempio n. 4
0
    /// <summary>
    /// Reinterprets <paramref name="data"/> as 64-bit words and forwards the block to
    /// <c>mixSimplified</c>.
    /// </summary>
    /// <param name="s">Hash context passed through to the mixing routine.</param>
    /// <param name="data">Pointer to the message block bytes.</param>
    private void compress(Blake2bContext *s, byte *data) => mixSimplified(s, (ulong *)data);
Esempio n. 5
0
        /// <summary>
        /// 128-bit SIMD implementation of the block mixing step: runs twelve rounds over the
        /// working state and folds the result back into the chaining value held in
        /// <paramref name="s"/>.
        /// </summary>
        /// <remarks>
        /// The working state is kept in eight <c>Vector128&lt;ulong&gt;</c> values (row1..row4,
        /// each split into a low and a high half). Every round follows the same sequence:
        /// g1, g2, diagonalize, g1, g2, undiagonalize, with a fresh pair of message vectors
        /// (b0, b1) assembled before each g1/g2 call. The helpers g1, g2, diagonalize,
        /// undiagonalize, alignr_ulong, blend_ulong and shuffle_ulong are defined outside this method.
        /// </remarks>
        /// <param name="s">Context whose htf.h, htf.t and htf.f are read; htf.h is overwritten with the updated chaining value.</param>
        /// <param name="m">Pointer to the message block; sixteen 64-bit words (m[0]..m[15]) are loaded from it.</param>
        unsafe private static void mixSse41(Blake2bContext *s, ulong *m)
        {
            // Rows 1 and 2 of the working state are the eight chaining words h[0..7].
            var hptr  = s->htf.h;
            var row1l = Unsafe.As <ulong, Vector128 <ulong> >(ref hptr[0]);
            var row1h = Unsafe.As <ulong, Vector128 <ulong> >(ref hptr[2]);
            var row2l = Unsafe.As <ulong, Vector128 <ulong> >(ref hptr[4]);
            var row2h = Unsafe.As <ulong, Vector128 <ulong> >(ref hptr[6]);

            // Rows 3 and 4 start from the eight words V.iv[0..7].
            var row3l = Unsafe.As <ulong, Vector128 <ulong> >(ref V.iv[0]);
            var row3h = Unsafe.As <ulong, Vector128 <ulong> >(ref V.iv[2]);
            var row4l = Unsafe.As <ulong, Vector128 <ulong> >(ref V.iv[4]);
            var row4h = Unsafe.As <ulong, Vector128 <ulong> >(ref V.iv[6]);

            // XOR the 128 bits at htf.t into row4l and the 128 bits at htf.f into row4h.
            // presumably t is the block counter and f the finalization flags -- TODO confirm
            row4l = Sse2.Xor(row4l, Sse2.LoadVector128(s->htf.t));
            row4h = Sse2.Xor(row4h, Sse2.LoadVector128(s->htf.f));

            //ROUND 1
            // First half of the message block, two words per vector (m0 = m[0..1], ... m3 = m[6..7]).
            var m0 = Sse2.LoadVector128(m);
            var m1 = Sse2.LoadVector128(m + 2);
            var m2 = Sse2.LoadVector128(m + 4);
            var m3 = Sse2.LoadVector128(m + 6);

            var b0 = Sse2.UnpackLow(m0, m1);
            var b1 = Sse2.UnpackLow(m2, m3);

            // Two 16-byte vectors loaded from V.rm and handed to g2 (r16) and g1 (r24).
            // By their names these are presumably byte-shuffle masks for 16- and 24-bit
            // rotations -- TODO confirm against the definition of V.rm.
            var r16 = Sse2.LoadVector128((sbyte *)V.rm);
            var r24 = Sse2.LoadVector128((sbyte *)V.rm + 16);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = Sse2.UnpackHigh(m0, m1);
            b1 = Sse2.UnpackHigh(m2, m3);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            // b0 is passed by ref here and reassigned right after; presumably the helper uses it as scratch -- TODO confirm.
            diagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            // Second half of the message block (m4 = m[8..9], ... m7 = m[14..15]).
            var m4 = Sse2.LoadVector128(m + 8);
            var m5 = Sse2.LoadVector128(m + 10);
            var m6 = Sse2.LoadVector128(m + 12);
            var m7 = Sse2.LoadVector128(m + 14);

            b0 = Sse2.UnpackLow(m4, m5);
            b1 = Sse2.UnpackLow(m6, m7);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = Sse2.UnpackHigh(m4, m5);
            b1 = Sse2.UnpackHigh(m6, m7);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            undiagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            //ROUND 2
            b0 = Sse2.UnpackLow(m7, m2);
            b1 = Sse2.UnpackHigh(m4, m6);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = Sse2.UnpackLow(m5, m4);
            b1 = alignr_ulong(ref m3, ref m7, 8);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            diagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            b0 = shuffle_ulong(ref m0, 0b_01_00_11_10);
            b1 = Sse2.UnpackHigh(m5, m2);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = Sse2.UnpackLow(m6, m1);
            b1 = Sse2.UnpackHigh(m3, m1);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            undiagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            //ROUND 3
            b0 = alignr_ulong(ref m6, ref m5, 8);
            b1 = Sse2.UnpackHigh(m2, m7);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = Sse2.UnpackLow(m4, m0);
            b1 = blend_ulong(ref m1, ref m6, 0b_1111_0000);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            diagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            b0 = blend_ulong(ref m5, ref m1, 0b_1111_0000);
            b1 = Sse2.UnpackHigh(m3, m4);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = Sse2.UnpackLow(m7, m3);
            b1 = alignr_ulong(ref m2, ref m0, 8);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            undiagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            //ROUND 4
            b0 = Sse2.UnpackHigh(m3, m1);
            b1 = Sse2.UnpackHigh(m6, m5);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = Sse2.UnpackHigh(m4, m0);
            b1 = Sse2.UnpackLow(m6, m7);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            diagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            b0 = blend_ulong(ref m1, ref m2, 0b_1111_0000);
            b1 = blend_ulong(ref m2, ref m7, 0b_1111_0000);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = Sse2.UnpackLow(m3, m5);
            b1 = Sse2.UnpackLow(m0, m4);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            undiagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            //ROUND 5
            b0 = Sse2.UnpackHigh(m4, m2);
            b1 = Sse2.UnpackLow(m1, m5);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = blend_ulong(ref m0, ref m3, 0b_1111_0000);
            b1 = blend_ulong(ref m2, ref m7, 0b_1111_0000);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            diagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            b0 = blend_ulong(ref m7, ref m5, 0b_1111_0000);
            b1 = blend_ulong(ref m3, ref m1, 0b_1111_0000);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = alignr_ulong(ref m6, ref m0, 8);
            b1 = blend_ulong(ref m4, ref m6, 0b_1111_0000);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            undiagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            //ROUND 6
            b0 = Sse2.UnpackLow(m1, m3);
            b1 = Sse2.UnpackLow(m0, m4);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = Sse2.UnpackLow(m6, m5);
            b1 = Sse2.UnpackHigh(m5, m1);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            diagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            b0 = blend_ulong(ref m2, ref m3, 0b_1111_0000);
            b1 = Sse2.UnpackHigh(m7, m0);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = Sse2.UnpackHigh(m6, m2);
            b1 = blend_ulong(ref m7, ref m4, 0b_1111_0000);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            undiagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            //ROUND 7
            b0 = blend_ulong(ref m6, ref m0, 0b_1111_0000);
            b1 = Sse2.UnpackLow(m7, m2);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = Sse2.UnpackHigh(m2, m7);
            b1 = alignr_ulong(ref m5, ref m6, 8);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            diagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            b0 = Sse2.UnpackLow(m0, m3);
            b1 = shuffle_ulong(ref m4, 0b_01_00_11_10);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = Sse2.UnpackHigh(m3, m1);
            b1 = blend_ulong(ref m1, ref m5, 0b_1111_0000);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            undiagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            //ROUND 8
            b0 = Sse2.UnpackHigh(m6, m3);
            b1 = blend_ulong(ref m6, ref m1, 0b_1111_0000);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = alignr_ulong(ref m7, ref m5, 8);
            b1 = Sse2.UnpackHigh(m0, m4);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            diagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            b0 = Sse2.UnpackHigh(m2, m7);
            b1 = Sse2.UnpackLow(m4, m1);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = Sse2.UnpackLow(m0, m2);
            b1 = Sse2.UnpackLow(m3, m5);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            undiagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            //ROUND 9
            b0 = Sse2.UnpackLow(m3, m7);
            b1 = alignr_ulong(ref m0, ref m5, 8);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = Sse2.UnpackHigh(m7, m4);
            b1 = alignr_ulong(ref m4, ref m1, 8);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            diagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            b0 = m6;
            b1 = alignr_ulong(ref m5, ref m0, 8);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = blend_ulong(ref m1, ref m3, 0b_1111_0000);
            b1 = m2;

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            undiagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            //ROUND 10
            b0 = Sse2.UnpackLow(m5, m4);
            b1 = Sse2.UnpackHigh(m3, m0);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = Sse2.UnpackLow(m1, m2);
            b1 = blend_ulong(ref m3, ref m2, 0b_1111_0000);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            diagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            b0 = Sse2.UnpackHigh(m7, m4);
            b1 = Sse2.UnpackHigh(m1, m6);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = alignr_ulong(ref m7, ref m5, 8);
            b1 = Sse2.UnpackLow(m6, m0);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            undiagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            //ROUND 11
            // Uses the same message word selection as round 1.
            b0 = Sse2.UnpackLow(m0, m1);
            b1 = Sse2.UnpackLow(m2, m3);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = Sse2.UnpackHigh(m0, m1);
            b1 = Sse2.UnpackHigh(m2, m3);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            diagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            b0 = Sse2.UnpackLow(m4, m5);
            b1 = Sse2.UnpackLow(m6, m7);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = Sse2.UnpackHigh(m4, m5);
            b1 = Sse2.UnpackHigh(m6, m7);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            undiagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            //ROUND 12
            // Uses the same message word selection as round 2.
            b0 = Sse2.UnpackLow(m7, m2);
            b1 = Sse2.UnpackHigh(m4, m6);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = Sse2.UnpackLow(m5, m4);
            b1 = alignr_ulong(ref m3, ref m7, 8);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            diagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            b0 = shuffle_ulong(ref m0, 0b_01_00_11_10);
            b1 = Sse2.UnpackHigh(m5, m2);

            g1(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r24);

            b0 = Sse2.UnpackLow(m6, m1);
            b1 = Sse2.UnpackHigh(m3, m1);

            g2(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0, ref b1, ref r16);
            undiagonalize(ref row1l, ref row2l, ref row3l, ref row4l, ref row1h, ref row2h, ref row3h, ref row4h, ref b0);

            // Feed-forward, first half: h[0..3] = h[0..3] ^ row1 ^ row3.
            row1l = Sse2.Xor(row1l, row3l);
            row1h = Sse2.Xor(row1h, row3h);
            row1l = Sse2.Xor(row1l, Sse2.LoadVector128(hptr));
            row1h = Sse2.Xor(row1h, Sse2.LoadVector128(hptr + 2));
            Sse2.Store(hptr, row1l);
            Sse2.Store(hptr + 2, row1h);

            // Feed-forward, second half: h[4..7] = h[4..7] ^ row2 ^ row4.
            row2l = Sse2.Xor(row2l, row4l);
            row2h = Sse2.Xor(row2h, row4h);
            row2l = Sse2.Xor(row2l, Sse2.LoadVector128(hptr + 4));
            row2h = Sse2.Xor(row2h, Sse2.LoadVector128(hptr + 6));
            Sse2.Store(hptr + 4, row2l);
            Sse2.Store(hptr + 6, row2h);
        }
Esempio n. 6
0
        unsafe private static void mixAvx2(Blake2bContext *s, ulong *m)
        {
            var row1 = Avx.LoadVector256(s->h);
            var row2 = Avx.LoadVector256(s->h + 4);

            var row3 = v256iv0;
            var row4 = v256iv1;

            row4 = Avx2.Xor(row4, Avx.LoadVector256(s->t));             // reads into f[] as well

            //ROUND 1
            var m0 = Avx2.BroadcastVector128ToVector256(m);
            var m1 = Avx2.BroadcastVector128ToVector256(m + 2);
            var m2 = Avx2.BroadcastVector128ToVector256(m + 4);
            var m3 = Avx2.BroadcastVector128ToVector256(m + 6);

            var r24 = v256rm0;
            var r16 = v256rm1;

            var t0 = Avx2.UnpackLow(m0, m1);
            var t1 = Avx2.UnpackLow(m2, m3);
            var b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m0, m1);
            t1 = Avx2.UnpackHigh(m2, m3);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            var m4 = Avx2.BroadcastVector128ToVector256(m + 8);
            var m5 = Avx2.BroadcastVector128ToVector256(m + 10);
            var m6 = Avx2.BroadcastVector128ToVector256(m + 12);
            var m7 = Avx2.BroadcastVector128ToVector256(m + 14);

            t0 = Avx2.UnpackLow(m4, m5);
            t1 = Avx2.UnpackLow(m6, m7);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m4, m5);
            t1 = Avx2.UnpackHigh(m6, m7);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 2
            t0 = Avx2.UnpackLow(m7, m2);
            t1 = Avx2.UnpackHigh(m4, m6);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m5, m4);
            t1 = Avx2.AlignRight(m3, m7, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = Avx2.Shuffle(m0.AsUInt32(), 0b_01_00_11_10).AsUInt64();
            t1 = Avx2.UnpackHigh(m5, m2);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m6, m1);
            t1 = Avx2.UnpackHigh(m3, m1);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 3
            t0 = Avx2.AlignRight(m6, m5, 8);
            t1 = Avx2.UnpackHigh(m2, m7);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m4, m0);
            t1 = Avx2.Blend(m1.AsUInt32(), m6.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = Avx2.Blend(m5.AsUInt32(), m1.AsUInt32(), 0b_1100_1100).AsUInt64();
            t1 = Avx2.UnpackHigh(m3, m4);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m7, m3);
            t1 = Avx2.AlignRight(m2, m0, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 4
            t0 = Avx2.UnpackHigh(m3, m1);
            t1 = Avx2.UnpackHigh(m6, m5);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m4, m0);
            t1 = Avx2.UnpackLow(m6, m7);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = Avx2.Blend(m1.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64();
            t1 = Avx2.Blend(m2.AsUInt32(), m7.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m3, m5);
            t1 = Avx2.UnpackLow(m0, m4);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 5
            t0 = Avx2.UnpackHigh(m4, m2);
            t1 = Avx2.UnpackLow(m1, m5);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.Blend(m0.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64();
            t1 = Avx2.Blend(m2.AsUInt32(), m7.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = Avx2.Blend(m7.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64();
            t1 = Avx2.Blend(m3.AsUInt32(), m1.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.AlignRight(m6, m0, 8);
            t1 = Avx2.Blend(m4.AsUInt32(), m6.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 6
            t0 = Avx2.UnpackLow(m1, m3);
            t1 = Avx2.UnpackLow(m0, m4);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m6, m5);
            t1 = Avx2.UnpackHigh(m5, m1);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = Avx2.Blend(m2.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64();
            t1 = Avx2.UnpackHigh(m7, m0);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m6, m2);
            t1 = Avx2.Blend(m7.AsUInt32(), m4.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 7
            t0 = Avx2.Blend(m6.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64();
            t1 = Avx2.UnpackLow(m7, m2);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m2, m7);
            t1 = Avx2.AlignRight(m5, m6, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = Avx2.UnpackLow(m0, m3);
            t1 = Avx2.Shuffle(m4.AsUInt32(), 0b_01_00_11_10).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m3, m1);
            t1 = Avx2.Blend(m1.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 8
            t0 = Avx2.UnpackHigh(m6, m3);
            t1 = Avx2.Blend(m6.AsUInt32(), m1.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.AlignRight(m7, m5, 8);
            t1 = Avx2.UnpackHigh(m0, m4);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = Avx2.UnpackHigh(m2, m7);
            t1 = Avx2.UnpackLow(m4, m1);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m0, m2);
            t1 = Avx2.UnpackLow(m3, m5);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 9
            t0 = Avx2.UnpackLow(m3, m7);
            t1 = Avx2.AlignRight(m0, m5, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m7, m4);
            t1 = Avx2.AlignRight(m4, m1, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = m6;
            t1 = Avx2.AlignRight(m5, m0, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64();
            t1 = m2;
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 10
            t0 = Avx2.UnpackLow(m5, m4);
            t1 = Avx2.UnpackHigh(m3, m0);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m1, m2);
            t1 = Avx2.Blend(m3.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = Avx2.UnpackHigh(m7, m4);
            t1 = Avx2.UnpackHigh(m1, m6);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.AlignRight(m7, m5, 8);
            t1 = Avx2.UnpackLow(m6, m0);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 11
            t0 = Avx2.UnpackLow(m0, m1);
            t1 = Avx2.UnpackLow(m2, m3);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m0, m1);
            t1 = Avx2.UnpackHigh(m2, m3);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = Avx2.UnpackLow(m4, m5);
            t1 = Avx2.UnpackLow(m6, m7);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m4, m5);
            t1 = Avx2.UnpackHigh(m6, m7);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 12
            t0 = Avx2.UnpackLow(m7, m2);
            t1 = Avx2.UnpackHigh(m4, m6);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m5, m4);
            t1 = Avx2.AlignRight(m3, m7, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = Avx2.Shuffle(m0.AsUInt32(), 0b_01_00_11_10).AsUInt64();
            t1 = Avx2.UnpackHigh(m5, m2);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m6, m1);
            t1 = Avx2.UnpackHigh(m3, m1);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            row1 = Avx2.Xor(row1, row3);
            row2 = Avx2.Xor(row2, row4);
            row1 = Avx2.Xor(row1, Avx2.LoadVector256(s->h));
            row2 = Avx2.Xor(row2, Avx2.LoadVector256(s->h + 4));

            Avx2.Store(s->h, row1);
            Avx2.Store(s->h + 4, row2);
        }
Esempio n. 7
0
        unsafe private static void mixScalar(Blake2bContext *s, ulong *m)
        {
            unchecked
            {
                ulong m00 = m[00];
                ulong m01 = m[01];
                ulong m02 = m[02];
                ulong m03 = m[03];
                ulong m04 = m[04];
                ulong m05 = m[05];
                ulong m06 = m[06];
                ulong m07 = m[07];
                ulong m08 = m[08];
                ulong m09 = m[09];
                ulong m10 = m[10];
                ulong m11 = m[11];
                ulong m12 = m[12];
                ulong m13 = m[13];
                ulong m14 = m[14];
                ulong m15 = m[15];

                ulong v00 = s->htf.h[0];
                ulong v01 = s->htf.h[1];
                ulong v02 = s->htf.h[2];
                ulong v03 = s->htf.h[3];
                ulong v04 = s->htf.h[4];
                ulong v05 = s->htf.h[5];
                ulong v06 = s->htf.h[6];
                ulong v07 = s->htf.h[7];

                ulong v08 = 0x6A09E667F3BCC908ul;
                ulong v09 = 0xBB67AE8584CAA73Bul;
                ulong v10 = 0x3C6EF372FE94F82Bul;
                ulong v11 = 0xA54FF53A5F1D36F1ul;
                ulong v12 = 0x510E527FADE682D1ul;
                ulong v13 = 0x9B05688C2B3E6C1Ful;
                ulong v14 = 0x1F83D9ABFB41BD6Bul;
                ulong v15 = 0x5BE0CD19137E2179ul;

                v12 ^= s->htf.t[0];
                v13 ^= s->htf.t[1];
                v14 ^= s->htf.f[0];

                //ROUND 1
                v00 += m00;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 32);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 24);

                v01 += m02;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 32);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 24);

                v02 += m04;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 32);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 24);

                v03 += m06;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 32);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 24);

                v02 += m05;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 16);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 63);

                v03 += m07;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 16);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 63);

                v00 += m01;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 16);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 63);

                v01 += m03;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 16);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 63);

                v00 += m08;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 32);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 24);

                v01 += m10;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 32);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 24);

                v02 += m12;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 32);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 24);

                v03 += m14;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 32);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 24);

                v02 += m13;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 16);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 63);

                v03 += m15;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 16);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 63);

                v00 += m09;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 16);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 63);

                v01 += m11;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 16);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 63);

                //ROUND 2
                v00 += m14;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 32);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 24);

                v01 += m04;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 32);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 24);

                v02 += m09;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 32);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 24);

                v03 += m13;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 32);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 24);

                v02 += m15;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 16);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 63);

                v03 += m06;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 16);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 63);

                v00 += m10;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 16);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 63);

                v01 += m08;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 16);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 63);

                v00 += m01;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 32);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 24);

                v01 += m00;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 32);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 24);

                v02 += m11;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 32);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 24);

                v03 += m05;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 32);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 24);

                v02 += m07;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 16);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 63);

                v03 += m03;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 16);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 63);

                v00 += m12;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 16);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 63);

                v01 += m02;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 16);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 63);

                //ROUND 3
                v00 += m11;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 32);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 24);

                v01 += m12;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 32);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 24);

                v02 += m05;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 32);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 24);

                v03 += m15;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 32);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 24);

                v02 += m02;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 16);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 63);

                v03 += m13;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 16);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 63);

                v00 += m08;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 16);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 63);

                v01 += m00;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 16);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 63);

                v00 += m10;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 32);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 24);

                v01 += m03;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 32);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 24);

                v02 += m07;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 32);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 24);

                v03 += m09;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 32);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 24);

                v02 += m01;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 16);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 63);

                v03 += m04;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 16);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 63);

                v00 += m14;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 16);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 63);

                v01 += m06;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 16);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 63);

                //ROUND 4
                v00 += m07;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 32);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 24);

                v01 += m03;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 32);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 24);

                v02 += m13;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 32);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 24);

                v03 += m11;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 32);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 24);

                v02 += m12;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 16);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 63);

                v03 += m14;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 16);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 63);

                v00 += m09;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 16);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 63);

                v01 += m01;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 16);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 63);

                v00 += m02;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 32);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 24);

                v01 += m05;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 32);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 24);

                v02 += m04;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 32);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 24);

                v03 += m15;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 32);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 24);

                v02 += m00;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 16);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 63);

                v03 += m08;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 16);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 63);

                v00 += m06;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 16);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 63);

                v01 += m10;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 16);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 63);

                //ROUND 5
                v00 += m09;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 32);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 24);

                v01 += m05;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 32);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 24);

                v02 += m02;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 32);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 24);

                v03 += m10;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 32);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 24);

                v02 += m04;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 16);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 63);

                v03 += m15;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 16);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 63);

                v00 += m00;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 16);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 63);

                v01 += m07;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 16);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 63);

                v00 += m14;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 32);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 24);

                v01 += m11;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 32);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 24);

                v02 += m06;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 32);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 24);

                v03 += m03;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 32);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 24);

                v02 += m08;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 16);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 63);

                v03 += m13;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 16);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 63);

                v00 += m01;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 16);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 63);

                v01 += m12;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 16);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 63);

                //ROUND 6
                v00 += m02;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 32);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 24);

                v01 += m06;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 32);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 24);

                v02 += m00;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 32);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 24);

                v03 += m08;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 32);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 24);

                v02 += m11;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 16);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 63);

                v03 += m03;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 16);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 63);

                v00 += m12;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 16);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 63);

                v01 += m10;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 16);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 63);

                v00 += m04;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 32);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 24);

                v01 += m07;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 32);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 24);

                v02 += m15;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 32);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 24);

                v03 += m01;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 32);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 24);

                v02 += m14;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 16);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 63);

                v03 += m09;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 16);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 63);

                v00 += m13;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 16);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 63);

                v01 += m05;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 16);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 63);

                //ROUND 7
                v00 += m12;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 32);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 24);

                v01 += m01;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 32);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 24);

                v02 += m14;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 32);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 24);

                v03 += m04;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 32);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 24);

                v02 += m13;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 16);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 63);

                v03 += m10;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 16);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 63);

                v00 += m05;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 16);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 63);

                v01 += m15;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 16);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 63);

                v00 += m00;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 32);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 24);

                v01 += m06;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 32);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 24);

                v02 += m09;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 32);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 24);

                v03 += m08;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 32);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 24);

                v02 += m02;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 16);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 63);

                v03 += m11;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 16);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 63);

                v00 += m07;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 16);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 63);

                v01 += m03;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 16);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 63);

                //ROUND 8
                v00 += m13;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 32);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 24);

                v01 += m07;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 32);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 24);

                v02 += m12;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 32);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 24);

                v03 += m03;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 32);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 24);

                v02 += m01;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 16);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 63);

                v03 += m09;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 16);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 63);

                v00 += m11;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 16);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 63);

                v01 += m14;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 16);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 63);

                v00 += m05;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 32);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 24);

                v01 += m15;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 32);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 24);

                v02 += m08;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 32);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 24);

                v03 += m02;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 32);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 24);

                v02 += m06;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 16);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 63);

                v03 += m10;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 16);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 63);

                v00 += m00;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 16);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 63);

                v01 += m04;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 16);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 63);

                //ROUND 9
                v00 += m06;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 32);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 24);

                v01 += m14;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 32);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 24);

                v02 += m11;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 32);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 24);

                v03 += m00;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 32);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 24);

                v02 += m03;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 16);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 63);

                v03 += m08;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 16);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 63);

                v00 += m15;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 16);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 63);

                v01 += m09;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 16);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 63);

                v00 += m12;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 32);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 24);

                v01 += m13;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 32);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 24);

                v02 += m01;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 32);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 24);

                v03 += m10;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 32);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 24);

                v02 += m04;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 16);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 63);

                v03 += m05;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 16);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 63);

                v00 += m02;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 16);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 63);

                v01 += m07;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 16);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 63);

                //ROUND 10
                v00 += m10;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 32);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 24);

                v01 += m08;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 32);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 24);

                v02 += m07;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 32);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 24);

                v03 += m01;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 32);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 24);

                v02 += m06;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 16);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 63);

                v03 += m05;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 16);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 63);

                v00 += m02;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 16);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 63);

                v01 += m04;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 16);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 63);

                v00 += m15;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 32);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 24);

                v01 += m09;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 32);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 24);

                v02 += m03;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 32);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 24);

                v03 += m13;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 32);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 24);

                v02 += m12;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 16);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 63);

                v03 += m00;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 16);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 63);

                v00 += m11;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 16);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 63);

                v01 += m14;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 16);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 63);

                //ROUND 11
                v00 += m00;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 32);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 24);

                v01 += m02;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 32);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 24);

                v02 += m04;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 32);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 24);

                v03 += m06;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 32);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 24);

                v02 += m05;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 16);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 63);

                v03 += m07;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 16);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 63);

                v00 += m01;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 16);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 63);

                v01 += m03;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 16);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 63);

                v00 += m08;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 32);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 24);

                v01 += m10;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 32);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 24);

                v02 += m12;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 32);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 24);

                v03 += m14;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 32);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 24);

                v02 += m13;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 16);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 63);

                v03 += m15;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 16);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 63);

                v00 += m09;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 16);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 63);

                v01 += m11;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 16);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 63);

                //ROUND 12
                v00 += m14;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 32);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 24);

                v01 += m04;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 32);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 24);

                v02 += m09;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 32);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 24);

                v03 += m13;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 32);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 24);

                v02 += m15;
                v02 += v06;
                v14 ^= v02;
                v14  = ror(v14, 16);
                v10 += v14;
                v06 ^= v10;
                v06  = ror(v06, 63);

                v03 += m06;
                v03 += v07;
                v15 ^= v03;
                v15  = ror(v15, 16);
                v11 += v15;
                v07 ^= v11;
                v07  = ror(v07, 63);

                v00 += m10;
                v00 += v04;
                v12 ^= v00;
                v12  = ror(v12, 16);
                v08 += v12;
                v04 ^= v08;
                v04  = ror(v04, 63);

                v01 += m08;
                v01 += v05;
                v13 ^= v01;
                v13  = ror(v13, 16);
                v09 += v13;
                v05 ^= v09;
                v05  = ror(v05, 63);

                v00 += m01;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 32);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 24);

                v01 += m00;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 32);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 24);

                v02 += m11;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 32);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 24);

                v03 += m05;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 32);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 24);

                v02 += m07;
                v02 += v07;
                v13 ^= v02;
                v13  = ror(v13, 16);
                v08 += v13;
                v07 ^= v08;
                v07  = ror(v07, 63);

                v03 += m03;
                v03 += v04;
                v14 ^= v03;
                v14  = ror(v14, 16);
                v09 += v14;
                v04 ^= v09;
                v04  = ror(v04, 63);

                v00 += m12;
                v00 += v05;
                v15 ^= v00;
                v15  = ror(v15, 16);
                v10 += v15;
                v05 ^= v10;
                v05  = ror(v05, 63);

                v01 += m02;
                v01 += v06;
                v12 ^= v01;
                v12  = ror(v12, 16);
                v11 += v12;
                v06 ^= v11;
                v06  = ror(v06, 63);

                s->htf.h[0] ^= v00 ^ v08;
                s->htf.h[1] ^= v01 ^ v09;
                s->htf.h[2] ^= v02 ^ v10;
                s->htf.h[3] ^= v03 ^ v11;
                s->htf.h[4] ^= v04 ^ v12;
                s->htf.h[5] ^= v05 ^ v13;
                s->htf.h[6] ^= v06 ^ v14;
                s->htf.h[7] ^= v07 ^ v15;
            }
        }
Esempio n. 8
0
        unsafe private static void mixScalar(Blake2bContext *s, ulong *m)
        {
            ulong m00 = m[00];
            ulong m01 = m[01];
            ulong m02 = m[02];
            ulong m03 = m[03];
            ulong m04 = m[04];
            ulong m05 = m[05];
            ulong m06 = m[06];
            ulong m07 = m[07];
            ulong m08 = m[08];
            ulong m09 = m[09];
            ulong m10 = m[10];
            ulong m11 = m[11];
            ulong m12 = m[12];
            ulong m13 = m[13];
            ulong m14 = m[14];
            ulong m15 = m[15];

            ulong v00 = s->h[0];
            ulong v01 = s->h[1];
            ulong v02 = s->h[2];
            ulong v03 = s->h[3];
            ulong v04 = s->h[4];
            ulong v05 = s->h[5];
            ulong v06 = s->h[6];
            ulong v07 = s->h[7];

            ulong v08 = 0x6A09E667F3BCC908ul;
            ulong v09 = 0xBB67AE8584CAA73Bul;
            ulong v10 = 0x3C6EF372FE94F82Bul;
            ulong v11 = 0xA54FF53A5F1D36F1ul;
            ulong v12 = 0x510E527FADE682D1ul;
            ulong v13 = 0x9B05688C2B3E6C1Ful;
            ulong v14 = 0x1F83D9ABFB41BD6Bul;
            ulong v15 = 0x5BE0CD19137E2179ul;

            v12 ^= s->t[0];
            v13 ^= s->t[1];
            v14 ^= s->f[0];

            //ROUND 1
            v00 += m00;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v01 += m02;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v02 += m04;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v03 += m06;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v02 += m05;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 63) ^ (v06 << 1);

            v03 += m07;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v00 += m01;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v01 += m03;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v00 += m08;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v01 += m10;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v02 += m12;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v03 += m14;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v02 += m13;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v03 += m15;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v00 += m09;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v01 += m11;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 63) ^ (v06 << 1);

            //ROUND 2
            v00 += m14;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v01 += m04;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v02 += m09;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v03 += m13;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v02 += m15;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 63) ^ (v06 << 1);

            v03 += m06;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v00 += m10;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v01 += m08;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v00 += m01;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v01 += m00;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v02 += m11;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v03 += m05;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v02 += m07;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v03 += m03;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v00 += m12;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v01 += m02;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 63) ^ (v06 << 1);

            //ROUND 3
            v00 += m11;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v01 += m12;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v02 += m05;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v03 += m15;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v02 += m02;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 63) ^ (v06 << 1);

            v03 += m13;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v00 += m08;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v01 += m00;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v00 += m10;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v01 += m03;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v02 += m07;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v03 += m09;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v02 += m01;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v03 += m04;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v00 += m14;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v01 += m06;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 63) ^ (v06 << 1);

            //ROUND 4
            v00 += m07;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v01 += m03;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v02 += m13;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v03 += m11;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v02 += m12;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 63) ^ (v06 << 1);

            v03 += m14;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v00 += m09;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v01 += m01;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v00 += m02;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v01 += m05;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v02 += m04;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v03 += m15;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v02 += m00;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v03 += m08;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v00 += m06;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v01 += m10;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 63) ^ (v06 << 1);

            //ROUND 5
            v00 += m09;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v01 += m05;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v02 += m02;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v03 += m10;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v02 += m04;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 63) ^ (v06 << 1);

            v03 += m15;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v00 += m00;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v01 += m07;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v00 += m14;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v01 += m11;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v02 += m06;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v03 += m03;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v02 += m08;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v03 += m13;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v00 += m01;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v01 += m12;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 63) ^ (v06 << 1);

            //ROUND 6
            v00 += m02;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v01 += m06;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v02 += m00;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v03 += m08;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v02 += m11;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 63) ^ (v06 << 1);

            v03 += m03;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v00 += m12;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v01 += m10;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v00 += m04;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v01 += m07;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v02 += m15;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v03 += m01;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v02 += m14;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v03 += m09;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v00 += m13;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v01 += m05;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 63) ^ (v06 << 1);

            //ROUND 7
            v00 += m12;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v01 += m01;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v02 += m14;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v03 += m04;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v02 += m13;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 63) ^ (v06 << 1);

            v03 += m10;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v00 += m05;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v01 += m15;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v00 += m00;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v01 += m06;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v02 += m09;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v03 += m08;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v02 += m02;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v03 += m11;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v00 += m07;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v01 += m03;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 63) ^ (v06 << 1);

            //ROUND 8
            v00 += m13;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v01 += m07;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v02 += m12;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v03 += m03;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v02 += m01;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 63) ^ (v06 << 1);

            v03 += m09;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v00 += m11;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v01 += m14;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v00 += m05;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v01 += m15;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v02 += m08;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v03 += m02;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v02 += m06;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v03 += m10;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v00 += m00;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v01 += m04;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 63) ^ (v06 << 1);

            //ROUND 9
            v00 += m06;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v01 += m14;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v02 += m11;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v03 += m00;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v02 += m03;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 63) ^ (v06 << 1);

            v03 += m08;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v00 += m15;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v01 += m09;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v00 += m12;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v01 += m13;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v02 += m01;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v03 += m10;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v02 += m04;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v03 += m05;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v00 += m02;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v01 += m07;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 63) ^ (v06 << 1);

            //ROUND 10
            v00 += m10;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v01 += m08;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v02 += m07;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v03 += m01;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v02 += m06;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 63) ^ (v06 << 1);

            v03 += m05;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v00 += m02;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v01 += m04;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v00 += m15;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v01 += m09;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v02 += m03;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v03 += m13;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v02 += m12;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v03 += m00;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v00 += m11;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v01 += m14;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 63) ^ (v06 << 1);

            //ROUND 11
            v00 += m00;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v01 += m02;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v02 += m04;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v03 += m06;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v02 += m05;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 63) ^ (v06 << 1);

            v03 += m07;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v00 += m01;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v01 += m03;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v00 += m08;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v01 += m10;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v02 += m12;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v03 += m14;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v02 += m13;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v03 += m15;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v00 += m09;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v01 += m11;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 63) ^ (v06 << 1);

            //ROUND 12
            v00 += m14;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v01 += m04;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v02 += m09;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v03 += m13;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v02 += m15;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 63) ^ (v06 << 1);

            v03 += m06;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v00 += m10;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v01 += m08;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v00 += m01;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 32) ^ (v15 << 32);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 24) ^ (v05 << 40);

            v01 += m00;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 32) ^ (v12 << 32);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 24) ^ (v06 << 40);

            v02 += m11;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 32) ^ (v13 << 32);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 24) ^ (v07 << 40);

            v03 += m05;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 32) ^ (v14 << 32);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 24) ^ (v04 << 40);

            v02 += m07;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 48);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 63) ^ (v07 << 1);

            v03 += m03;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 48);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 63) ^ (v04 << 1);

            v00 += m12;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 48);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 63) ^ (v05 << 1);

            v01 += m02;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 48);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 63) ^ (v06 << 1);

            s->h[0] ^= v00 ^ v08;
            s->h[1] ^= v01 ^ v09;
            s->h[2] ^= v02 ^ v10;
            s->h[3] ^= v03 ^ v11;
            s->h[4] ^= v04 ^ v12;
            s->h[5] ^= v05 ^ v13;
            s->h[6] ^= v06 ^ v14;
            s->h[7] ^= v07 ^ v15;
        }