Beispiel #1
0
        /// <summary>
        /// Hands one message block to the mixing routine: the SSE4.1 variant when it is compiled in
        /// (USE_INTRINSICS) and reported as supported, the scalar variant otherwise.
        /// </summary>
        /// <param name="s">
        /// Hashing context passed through to the mixer. In FAST_SPAN builds on a big-endian host its
        /// <c>b</c> buffer is overwritten with the decoded message words.
        /// </param>
        /// <param name="input">
        /// Pointer to the message block. It is reinterpreted as 32-bit words directly, except on the
        /// big-endian path, where <c>BlockBytes</c> bytes are read from it and decoded.
        /// </param>
        unsafe private static void compress(Blake2sContext *s, byte *input)
        {
            // Default: treat the caller's bytes as the word array without copying.
            uint *words = (uint *)input;

#if FAST_SPAN
            if (!BitConverter.IsLittleEndian)
            {
                // The words are stored little-endian, so on a big-endian host each one is
                // decoded explicitly and written into the context's own buffer, which then
                // becomes the word array handed to the mixer.
                var block = new ReadOnlySpan <byte>(input, BlockBytes);
                words = (uint *)s->b;

                int offset = 0;
                for (int w = 0; w < BlockWords; w++)
                {
                    words[w] = BinaryPrimitives.ReadUInt32LittleEndian(block.Slice(offset, WordSize));
                    offset += WordSize;
                }
            }
#endif

#if USE_INTRINSICS
            if (Sse41.IsSupported)
            {
                mixSse41(s, words);
                return;
            }
#endif

            // Fallback when intrinsics are not compiled in or SSE4.1 is unavailable.
            mixScalar(s, words);
        }
Beispiel #2
0
        unsafe private static void mixScalar(Blake2sContext *s, uint *m)
        {
            uint m00 = m[00];
            uint m01 = m[01];
            uint m02 = m[02];
            uint m03 = m[03];
            uint m04 = m[04];
            uint m05 = m[05];
            uint m06 = m[06];
            uint m07 = m[07];
            uint m08 = m[08];
            uint m09 = m[09];
            uint m10 = m[10];
            uint m11 = m[11];
            uint m12 = m[12];
            uint m13 = m[13];
            uint m14 = m[14];
            uint m15 = m[15];

            uint v00 = s->h[0];
            uint v01 = s->h[1];
            uint v02 = s->h[2];
            uint v03 = s->h[3];
            uint v04 = s->h[4];
            uint v05 = s->h[5];
            uint v06 = s->h[6];
            uint v07 = s->h[7];

            uint v08 = 0x6A09E667u;
            uint v09 = 0xBB67AE85u;
            uint v10 = 0x3C6EF372u;
            uint v11 = 0xA54FF53Au;
            uint v12 = 0x510E527Fu;
            uint v13 = 0x9B05688Cu;
            uint v14 = 0x1F83D9ABu;
            uint v15 = 0x5BE0CD19u;

            v12 ^= s->t[0];
            v13 ^= s->t[1];
            v14 ^= s->f[0];

            //ROUND 1
            v00 += m00;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 16);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 12) ^ (v04 << 20);

            v01 += m02;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 16);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 12) ^ (v05 << 20);

            v02 += m04;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 16);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 12) ^ (v06 << 20);

            v03 += m06;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 16);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 12) ^ (v07 << 20);

            v02 += m05;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 8) ^ (v14 << 24);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 7) ^ (v06 << 25);

            v03 += m07;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 8) ^ (v15 << 24);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 7) ^ (v07 << 25);

            v00 += m01;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 8) ^ (v12 << 24);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 7) ^ (v04 << 25);

            v01 += m03;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 8) ^ (v13 << 24);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 7) ^ (v05 << 25);

            v00 += m08;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 16);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 12) ^ (v05 << 20);

            v01 += m10;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 16);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 12) ^ (v06 << 20);

            v02 += m12;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 16);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 12) ^ (v07 << 20);

            v03 += m14;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 16);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 12) ^ (v04 << 20);

            v02 += m13;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 8) ^ (v13 << 24);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 7) ^ (v07 << 25);

            v03 += m15;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 8) ^ (v14 << 24);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 7) ^ (v04 << 25);

            v00 += m09;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 8) ^ (v15 << 24);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 7) ^ (v05 << 25);

            v01 += m11;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 8) ^ (v12 << 24);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 7) ^ (v06 << 25);

            //ROUND 2
            v00 += m14;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 16);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 12) ^ (v04 << 20);

            v01 += m04;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 16);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 12) ^ (v05 << 20);

            v02 += m09;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 16);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 12) ^ (v06 << 20);

            v03 += m13;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 16);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 12) ^ (v07 << 20);

            v02 += m15;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 8) ^ (v14 << 24);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 7) ^ (v06 << 25);

            v03 += m06;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 8) ^ (v15 << 24);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 7) ^ (v07 << 25);

            v00 += m10;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 8) ^ (v12 << 24);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 7) ^ (v04 << 25);

            v01 += m08;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 8) ^ (v13 << 24);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 7) ^ (v05 << 25);

            v00 += m01;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 16);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 12) ^ (v05 << 20);

            v01 += m00;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 16);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 12) ^ (v06 << 20);

            v02 += m11;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 16);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 12) ^ (v07 << 20);

            v03 += m05;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 16);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 12) ^ (v04 << 20);

            v02 += m07;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 8) ^ (v13 << 24);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 7) ^ (v07 << 25);

            v03 += m03;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 8) ^ (v14 << 24);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 7) ^ (v04 << 25);

            v00 += m12;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 8) ^ (v15 << 24);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 7) ^ (v05 << 25);

            v01 += m02;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 8) ^ (v12 << 24);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 7) ^ (v06 << 25);

            //ROUND 3
            v00 += m11;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 16);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 12) ^ (v04 << 20);

            v01 += m12;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 16);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 12) ^ (v05 << 20);

            v02 += m05;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 16);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 12) ^ (v06 << 20);

            v03 += m15;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 16);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 12) ^ (v07 << 20);

            v02 += m02;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 8) ^ (v14 << 24);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 7) ^ (v06 << 25);

            v03 += m13;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 8) ^ (v15 << 24);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 7) ^ (v07 << 25);

            v00 += m08;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 8) ^ (v12 << 24);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 7) ^ (v04 << 25);

            v01 += m00;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 8) ^ (v13 << 24);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 7) ^ (v05 << 25);

            v00 += m10;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 16);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 12) ^ (v05 << 20);

            v01 += m03;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 16);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 12) ^ (v06 << 20);

            v02 += m07;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 16);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 12) ^ (v07 << 20);

            v03 += m09;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 16);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 12) ^ (v04 << 20);

            v02 += m01;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 8) ^ (v13 << 24);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 7) ^ (v07 << 25);

            v03 += m04;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 8) ^ (v14 << 24);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 7) ^ (v04 << 25);

            v00 += m14;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 8) ^ (v15 << 24);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 7) ^ (v05 << 25);

            v01 += m06;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 8) ^ (v12 << 24);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 7) ^ (v06 << 25);

            //ROUND 4
            v00 += m07;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 16);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 12) ^ (v04 << 20);

            v01 += m03;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 16);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 12) ^ (v05 << 20);

            v02 += m13;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 16);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 12) ^ (v06 << 20);

            v03 += m11;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 16);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 12) ^ (v07 << 20);

            v02 += m12;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 8) ^ (v14 << 24);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 7) ^ (v06 << 25);

            v03 += m14;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 8) ^ (v15 << 24);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 7) ^ (v07 << 25);

            v00 += m09;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 8) ^ (v12 << 24);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 7) ^ (v04 << 25);

            v01 += m01;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 8) ^ (v13 << 24);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 7) ^ (v05 << 25);

            v00 += m02;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 16);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 12) ^ (v05 << 20);

            v01 += m05;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 16);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 12) ^ (v06 << 20);

            v02 += m04;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 16);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 12) ^ (v07 << 20);

            v03 += m15;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 16);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 12) ^ (v04 << 20);

            v02 += m00;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 8) ^ (v13 << 24);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 7) ^ (v07 << 25);

            v03 += m08;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 8) ^ (v14 << 24);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 7) ^ (v04 << 25);

            v00 += m06;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 8) ^ (v15 << 24);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 7) ^ (v05 << 25);

            v01 += m10;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 8) ^ (v12 << 24);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 7) ^ (v06 << 25);

            //ROUND 5
            v00 += m09;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 16);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 12) ^ (v04 << 20);

            v01 += m05;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 16);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 12) ^ (v05 << 20);

            v02 += m02;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 16);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 12) ^ (v06 << 20);

            v03 += m10;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 16);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 12) ^ (v07 << 20);

            v02 += m04;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 8) ^ (v14 << 24);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 7) ^ (v06 << 25);

            v03 += m15;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 8) ^ (v15 << 24);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 7) ^ (v07 << 25);

            v00 += m00;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 8) ^ (v12 << 24);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 7) ^ (v04 << 25);

            v01 += m07;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 8) ^ (v13 << 24);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 7) ^ (v05 << 25);

            v00 += m14;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 16);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 12) ^ (v05 << 20);

            v01 += m11;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 16);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 12) ^ (v06 << 20);

            v02 += m06;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 16);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 12) ^ (v07 << 20);

            v03 += m03;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 16);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 12) ^ (v04 << 20);

            v02 += m08;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 8) ^ (v13 << 24);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 7) ^ (v07 << 25);

            v03 += m13;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 8) ^ (v14 << 24);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 7) ^ (v04 << 25);

            v00 += m01;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 8) ^ (v15 << 24);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 7) ^ (v05 << 25);

            v01 += m12;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 8) ^ (v12 << 24);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 7) ^ (v06 << 25);

            //ROUND 6
            v00 += m02;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 16);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 12) ^ (v04 << 20);

            v01 += m06;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 16);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 12) ^ (v05 << 20);

            v02 += m00;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 16);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 12) ^ (v06 << 20);

            v03 += m08;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 16);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 12) ^ (v07 << 20);

            v02 += m11;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 8) ^ (v14 << 24);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 7) ^ (v06 << 25);

            v03 += m03;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 8) ^ (v15 << 24);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 7) ^ (v07 << 25);

            v00 += m12;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 8) ^ (v12 << 24);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 7) ^ (v04 << 25);

            v01 += m10;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 8) ^ (v13 << 24);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 7) ^ (v05 << 25);

            v00 += m04;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 16);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 12) ^ (v05 << 20);

            v01 += m07;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 16);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 12) ^ (v06 << 20);

            v02 += m15;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 16);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 12) ^ (v07 << 20);

            v03 += m01;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 16);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 12) ^ (v04 << 20);

            v02 += m14;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 8) ^ (v13 << 24);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 7) ^ (v07 << 25);

            v03 += m09;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 8) ^ (v14 << 24);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 7) ^ (v04 << 25);

            v00 += m13;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 8) ^ (v15 << 24);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 7) ^ (v05 << 25);

            v01 += m05;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 8) ^ (v12 << 24);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 7) ^ (v06 << 25);

            //ROUND 7
            v00 += m12;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 16);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 12) ^ (v04 << 20);

            v01 += m01;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 16);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 12) ^ (v05 << 20);

            v02 += m14;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 16);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 12) ^ (v06 << 20);

            v03 += m04;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 16);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 12) ^ (v07 << 20);

            v02 += m13;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 8) ^ (v14 << 24);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 7) ^ (v06 << 25);

            v03 += m10;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 8) ^ (v15 << 24);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 7) ^ (v07 << 25);

            v00 += m05;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 8) ^ (v12 << 24);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 7) ^ (v04 << 25);

            v01 += m15;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 8) ^ (v13 << 24);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 7) ^ (v05 << 25);

            v00 += m00;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 16);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 12) ^ (v05 << 20);

            v01 += m06;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 16);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 12) ^ (v06 << 20);

            v02 += m09;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 16);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 12) ^ (v07 << 20);

            v03 += m08;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 16);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 12) ^ (v04 << 20);

            v02 += m02;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 8) ^ (v13 << 24);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 7) ^ (v07 << 25);

            v03 += m11;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 8) ^ (v14 << 24);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 7) ^ (v04 << 25);

            v00 += m07;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 8) ^ (v15 << 24);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 7) ^ (v05 << 25);

            v01 += m03;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 8) ^ (v12 << 24);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 7) ^ (v06 << 25);

            //ROUND 8
            v00 += m13;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 16);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 12) ^ (v04 << 20);

            v01 += m07;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 16);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 12) ^ (v05 << 20);

            v02 += m12;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 16);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 12) ^ (v06 << 20);

            v03 += m03;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 16);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 12) ^ (v07 << 20);

            v02 += m01;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 8) ^ (v14 << 24);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 7) ^ (v06 << 25);

            v03 += m09;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 8) ^ (v15 << 24);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 7) ^ (v07 << 25);

            v00 += m11;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 8) ^ (v12 << 24);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 7) ^ (v04 << 25);

            v01 += m14;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 8) ^ (v13 << 24);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 7) ^ (v05 << 25);

            v00 += m05;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 16);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 12) ^ (v05 << 20);

            v01 += m15;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 16);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 12) ^ (v06 << 20);

            v02 += m08;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 16);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 12) ^ (v07 << 20);

            v03 += m02;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 16);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 12) ^ (v04 << 20);

            v02 += m06;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 8) ^ (v13 << 24);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 7) ^ (v07 << 25);

            v03 += m10;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 8) ^ (v14 << 24);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 7) ^ (v04 << 25);

            v00 += m00;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 8) ^ (v15 << 24);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 7) ^ (v05 << 25);

            v01 += m04;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 8) ^ (v12 << 24);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 7) ^ (v06 << 25);

            //ROUND 9
            v00 += m06;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 16);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 12) ^ (v04 << 20);

            v01 += m14;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 16);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 12) ^ (v05 << 20);

            v02 += m11;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 16);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 12) ^ (v06 << 20);

            v03 += m00;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 16);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 12) ^ (v07 << 20);

            v02 += m03;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 8) ^ (v14 << 24);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 7) ^ (v06 << 25);

            v03 += m08;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 8) ^ (v15 << 24);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 7) ^ (v07 << 25);

            v00 += m15;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 8) ^ (v12 << 24);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 7) ^ (v04 << 25);

            v01 += m09;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 8) ^ (v13 << 24);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 7) ^ (v05 << 25);

            v00 += m12;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 16);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 12) ^ (v05 << 20);

            v01 += m13;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 16);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 12) ^ (v06 << 20);

            v02 += m01;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 16);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 12) ^ (v07 << 20);

            v03 += m10;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 16);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 12) ^ (v04 << 20);

            v02 += m04;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 8) ^ (v13 << 24);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 7) ^ (v07 << 25);

            v03 += m05;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 8) ^ (v14 << 24);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 7) ^ (v04 << 25);

            v00 += m02;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 8) ^ (v15 << 24);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 7) ^ (v05 << 25);

            v01 += m07;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 8) ^ (v12 << 24);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 7) ^ (v06 << 25);

            //ROUND 10
            v00 += m10;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 16) ^ (v12 << 16);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 12) ^ (v04 << 20);

            v01 += m08;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 16) ^ (v13 << 16);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 12) ^ (v05 << 20);

            v02 += m07;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 16) ^ (v14 << 16);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 12) ^ (v06 << 20);

            v03 += m01;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 16) ^ (v15 << 16);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 12) ^ (v07 << 20);

            v02 += m06;
            v02 += v06;
            v14 ^= v02;
            v14  = (v14 >> 8) ^ (v14 << 24);
            v10 += v14;
            v06 ^= v10;
            v06  = (v06 >> 7) ^ (v06 << 25);

            v03 += m05;
            v03 += v07;
            v15 ^= v03;
            v15  = (v15 >> 8) ^ (v15 << 24);
            v11 += v15;
            v07 ^= v11;
            v07  = (v07 >> 7) ^ (v07 << 25);

            v00 += m02;
            v00 += v04;
            v12 ^= v00;
            v12  = (v12 >> 8) ^ (v12 << 24);
            v08 += v12;
            v04 ^= v08;
            v04  = (v04 >> 7) ^ (v04 << 25);

            v01 += m04;
            v01 += v05;
            v13 ^= v01;
            v13  = (v13 >> 8) ^ (v13 << 24);
            v09 += v13;
            v05 ^= v09;
            v05  = (v05 >> 7) ^ (v05 << 25);

            v00 += m15;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 16) ^ (v15 << 16);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 12) ^ (v05 << 20);

            v01 += m09;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 16) ^ (v12 << 16);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 12) ^ (v06 << 20);

            v02 += m03;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 16) ^ (v13 << 16);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 12) ^ (v07 << 20);

            v03 += m13;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 16) ^ (v14 << 16);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 12) ^ (v04 << 20);

            v02 += m12;
            v02 += v07;
            v13 ^= v02;
            v13  = (v13 >> 8) ^ (v13 << 24);
            v08 += v13;
            v07 ^= v08;
            v07  = (v07 >> 7) ^ (v07 << 25);

            v03 += m00;
            v03 += v04;
            v14 ^= v03;
            v14  = (v14 >> 8) ^ (v14 << 24);
            v09 += v14;
            v04 ^= v09;
            v04  = (v04 >> 7) ^ (v04 << 25);

            v00 += m11;
            v00 += v05;
            v15 ^= v00;
            v15  = (v15 >> 8) ^ (v15 << 24);
            v10 += v15;
            v05 ^= v10;
            v05  = (v05 >> 7) ^ (v05 << 25);

            v01 += m14;
            v01 += v06;
            v12 ^= v01;
            v12  = (v12 >> 8) ^ (v12 << 24);
            v11 += v12;
            v06 ^= v11;
            v06  = (v06 >> 7) ^ (v06 << 25);

            s->h[0] ^= v00 ^ v08;
            s->h[1] ^= v01 ^ v09;
            s->h[2] ^= v02 ^ v10;
            s->h[3] ^= v03 ^ v11;
            s->h[4] ^= v04 ^ v12;
            s->h[5] ^= v05 ^ v13;
            s->h[6] ^= v06 ^ v14;
            s->h[7] ^= v07 ^ v15;
        }
Beispiel #3
0
        unsafe private static void mixSse41(Blake2sContext *s, uint *m)
        {
            var row1 = Sse2.LoadVector128(s->h);
            var row2 = Sse2.LoadVector128(s->h + 4);

            var row3 = v128iv0;
            var row4 = v128iv1;

            row4 = Sse2.Xor(row4, Sse2.LoadVector128(s->t));             // reads into f[] as well

            var m0 = Sse2.LoadVector128(m);
            var m1 = Sse2.LoadVector128(m + 4);
            var m2 = Sse2.LoadVector128(m + 8);
            var m3 = Sse2.LoadVector128(m + 12);

            var r16 = v128rm0;
            var r8  = v128rm1;

            //ROUND 1
#if OLD_INTRINSICS
            var b0 = Sse.StaticCast <float, uint>(Sse.Shuffle(Sse.StaticCast <uint, float>(m0), Sse.StaticCast <uint, float>(m1), 0b_10_00_10_00));
#else
            var b0 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 0b_10_00_10_00).AsUInt32();
#endif

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

#if OLD_INTRINSICS
            b0 = Sse.StaticCast <float, uint>(Sse.Shuffle(Sse.StaticCast <uint, float>(m0), Sse.StaticCast <uint, float>(m1), 0b_11_01_11_01));
#else
            b0 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 0b_11_01_11_01).AsUInt32();
#endif

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //DIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_10_01_00_11);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_00_11_10_01);

#if OLD_INTRINSICS
            b0 = Sse.StaticCast <float, uint>(Sse.Shuffle(Sse.StaticCast <uint, float>(m2), Sse.StaticCast <uint, float>(m3), 0b_10_00_10_00));
#else
            b0 = Sse.Shuffle(m2.AsSingle(), m3.AsSingle(), 0b_10_00_10_00).AsUInt32();
#endif

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

#if OLD_INTRINSICS
            b0 = Sse.StaticCast <float, uint>(Sse.Shuffle(Sse.StaticCast <uint, float>(m2), Sse.StaticCast <uint, float>(m3), 0b_11_01_11_01));
#else
            b0 = Sse.Shuffle(m2.AsSingle(), m3.AsSingle(), 0b_11_01_11_01).AsUInt32();
#endif

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //UNDIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_00_11_10_01);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_10_01_00_11);

            //ROUND 2
#if OLD_INTRINSICS
            var t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(m2), 0b_00_00_11_00));
#else
            var t0 = Sse41.Blend(m1.AsUInt16(), m2.AsUInt16(), 0b_00_00_11_00).AsUInt32();
#endif
            var t1 = Sse2.ShiftLeftLogical128BitLane(m3, 4);
#if OLD_INTRINSICS
            var t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_11_11_00_00));
#else
            var t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_10_01_00_11);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.Shuffle(m2, 0b_00_00_10_00);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(m3), 0b_11_00_00_00));
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_11_11_00_00));
#else
            t1 = Sse41.Blend(m1.AsUInt16(), m3.AsUInt16(), 0b_11_00_00_00).AsUInt32();
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_10_11_00_01);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //DIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_10_01_00_11);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_00_11_10_01);

            t0 = Sse2.ShiftLeftLogical128BitLane(m1, 4);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m2), Sse.StaticCast <uint, ushort>(t0), 0b_00_11_00_00));
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(t1), 0b_11_11_00_00));
#else
            t1 = Sse41.Blend(m2.AsUInt16(), t0.AsUInt16(), 0b_00_11_00_00).AsUInt32();
            t2 = Sse41.Blend(m0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_10_11_00_01);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.UnpackHigh(m0, m1);
            t1 = Sse2.ShiftLeftLogical128BitLane(m3, 4);
#if OLD_INTRINSICS
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_11_00));
#else
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_10_11_00_01);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //UNDIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_00_11_10_01);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_10_01_00_11);

            //ROUND 3
            t0 = Sse2.UnpackHigh(m2, m3);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m3), Sse.StaticCast <uint, ushort>(m1), 0b_00_00_11_00));
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_11_11));
#else
            t1 = Sse41.Blend(m3.AsUInt16(), m1.AsUInt16(), 0b_00_00_11_00).AsUInt32();
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_11).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_11_01_00_10);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.UnpackLow(m2, m0);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(m0), 0b_11_11_00_00));
#else
            t1 = Sse41.Blend(t0.AsUInt16(), m0.AsUInt16(), 0b_11_11_00_00).AsUInt32();
#endif
            t2 = Sse2.ShiftLeftLogical128BitLane(m3, 8);
#if OLD_INTRINSICS
            b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t2), 0b_11_00_00_00));
#else
            b0 = Sse41.Blend(t1.AsUInt16(), t2.AsUInt16(), 0b_11_00_00_00).AsUInt32();
#endif

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //DIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_10_01_00_11);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_00_11_10_01);

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m2), 0b_00_11_11_00));
#else
            t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_11_11_00).AsUInt32();
#endif
            t1 = Sse2.ShiftRightLogical128BitLane(m1, 12);
#if OLD_INTRINSICS
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_00_11));
#else
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_01_00_11_10);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.ShiftLeftLogical128BitLane(m3, 4);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m1), 0b_00_11_00_11));
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_11_00_00_00));
#else
            t1 = Sse41.Blend(m0.AsUInt16(), m1.AsUInt16(), 0b_00_11_00_11).AsUInt32();
            t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_00_01_10_11);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //UNDIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_00_11_10_01);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_10_01_00_11);

            //ROUND 4
            t0 = Sse2.UnpackHigh(m0, m1);
            t1 = Sse2.UnpackHigh(t0, m2);
#if OLD_INTRINSICS
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(m3), 0b_00_00_11_00));
#else
            t2 = Sse41.Blend(t1.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_11_01_00_10);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.ShiftLeftLogical128BitLane(m2, 8);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m3), Sse.StaticCast <uint, ushort>(m0), 0b_00_00_11_00));
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_11_00_00_00));
#else
            t1 = Sse41.Blend(m3.AsUInt16(), m0.AsUInt16(), 0b_00_00_11_00).AsUInt32();
            t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_10_00_01_11);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //DIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_10_01_00_11);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_00_11_10_01);

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m1), 0b_00_00_11_11));
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(m3), 0b_11_00_00_00));
#else
            t0 = Sse41.Blend(m0.AsUInt16(), m1.AsUInt16(), 0b_00_00_11_11).AsUInt32();
            t1 = Sse41.Blend(t0.AsUInt16(), m3.AsUInt16(), 0b_11_00_00_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t1, 0b_11_00_01_10);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.UnpackLow(m0, m2);
            t1 = Sse2.UnpackHigh(m1, m2);
#if OLD_INTRINSICS
            b0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(t1), Sse.StaticCast <uint, ulong>(t0)));
#else
            b0 = Sse2.UnpackLow(t1.AsUInt64(), t0.AsUInt64()).AsUInt32();
#endif

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //UNDIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_00_11_10_01);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_10_01_00_11);

            //ROUND 5
#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(m1), Sse.StaticCast <uint, ulong>(m2)));
            t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m0), Sse.StaticCast <uint, ulong>(m2)));
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_11_00_11));
#else
            t0 = Sse2.UnpackLow(m1.AsUInt64(), m2.AsUInt64()).AsUInt32();
            t1 = Sse2.UnpackHigh(m0.AsUInt64(), m2.AsUInt64()).AsUInt32();
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_11).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_10_00_01_11);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m1), Sse.StaticCast <uint, ulong>(m3)));
            t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(m0), Sse.StaticCast <uint, ulong>(m1)));
            b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_11_00_11));
#else
            t0 = Sse2.UnpackHigh(m1.AsUInt64(), m3.AsUInt64()).AsUInt32();
            t1 = Sse2.UnpackLow(m0.AsUInt64(), m1.AsUInt64()).AsUInt32();
            b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_11).AsUInt32();
#endif

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //DIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_10_01_00_11);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_00_11_10_01);

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m3), Sse.StaticCast <uint, ulong>(m1)));
            t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m2), Sse.StaticCast <uint, ulong>(m0)));
            b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_00_11_00_11));
#else
            t0 = Sse2.UnpackHigh(m3.AsUInt64(), m1.AsUInt64()).AsUInt32();
            t1 = Sse2.UnpackHigh(m2.AsUInt64(), m0.AsUInt64()).AsUInt32();
            b0 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_11_00_11).AsUInt32();
#endif

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m2), 0b_00_00_00_11));
#else
            t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_00_00_11).AsUInt32();
#endif
            t1 = Sse2.ShiftLeftLogical128BitLane(t0, 8);
#if OLD_INTRINSICS
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(m3), 0b_00_00_11_11));
#else
            t2 = Sse41.Blend(t1.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_11).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_01_10_00_11);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //UNDIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_00_11_10_01);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_10_01_00_11);

            //ROUND 6
            t0 = Sse2.UnpackHigh(m0, m1);
            t1 = Sse2.UnpackLow(m0, m2);
#if OLD_INTRINSICS
            b0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(t0), Sse.StaticCast <uint, ulong>(t1)));
#else
            b0 = Sse2.UnpackLow(t0.AsUInt64(), t1.AsUInt64()).AsUInt32();
#endif

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.ShiftRightLogical128BitLane(m2, 4);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m3), 0b_00_00_00_11));
            b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_00_11_11_00));
#else
            t1 = Sse41.Blend(m0.AsUInt16(), m3.AsUInt16(), 0b_00_00_00_11).AsUInt32();
            b0 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_11_11_00).AsUInt32();
#endif

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //DIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_10_01_00_11);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_00_11_10_01);

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(m0), 0b_00_00_11_00));
#else
            t0 = Sse41.Blend(m1.AsUInt16(), m0.AsUInt16(), 0b_00_00_11_00).AsUInt32();
#endif
            t1 = Sse2.ShiftRightLogical128BitLane(m3, 4);
#if OLD_INTRINSICS
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_11_00_00));
#else
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_01_10_11_00);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(m1), Sse.StaticCast <uint, ulong>(m2)));
#else
            t0 = Sse2.UnpackLow(m1.AsUInt64(), m2.AsUInt64()).AsUInt32();
#endif
            t1 = Sse2.Shuffle(m3, 0b_00_10_00_01);
#if OLD_INTRINSICS
            b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_11_00_11));
#else
            b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_11).AsUInt32();
#endif

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //UNDIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_00_11_10_01);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_10_01_00_11);

            //ROUND 7
            t0 = Sse2.ShiftLeftLogical128BitLane(m1, 12);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m3), 0b_00_11_00_11));
            b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_11_00_00_00));
#else
            t1 = Sse41.Blend(m0.AsUInt16(), m3.AsUInt16(), 0b_00_11_00_11).AsUInt32();
            b0 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32();
#endif

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m3), Sse.StaticCast <uint, ushort>(m2), 0b_00_11_00_00));
#else
            t0 = Sse41.Blend(m3.AsUInt16(), m2.AsUInt16(), 0b_00_11_00_00).AsUInt32();
#endif
            t1 = Sse2.ShiftRightLogical128BitLane(m1, 4);
#if OLD_INTRINSICS
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_00_11));
#else
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_10_01_11_00);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //DIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_10_01_00_11);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_00_11_10_01);

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(m0), Sse.StaticCast <uint, ulong>(m2)));
#else
            t0 = Sse2.UnpackLow(m0.AsUInt64(), m2.AsUInt64()).AsUInt32();
#endif
            t1 = Sse2.ShiftRightLogical128BitLane(m1, 4);
#if OLD_INTRINSICS
            b0 = Sse2.Shuffle(Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_11_00)), 0b_10_11_01_00);
#else
            b0 = Sse2.Shuffle(Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_00).AsUInt32(), 0b_10_11_01_00);
#endif

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.UnpackHigh(m1, m2);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m0), Sse.StaticCast <uint, ulong>(t0)));
#else
            t1 = Sse2.UnpackHigh(m0.AsUInt64(), t0.AsUInt64()).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t1, 0b_11_00_01_10);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //UNDIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_00_11_10_01);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_10_01_00_11);

            //ROUND 8
            t0 = Sse2.UnpackHigh(m0, m1);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(m3), 0b_00_00_11_11));
#else
            t1 = Sse41.Blend(t0.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_11).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t1, 0b_10_00_11_01);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m2), Sse.StaticCast <uint, ushort>(m3), 0b_00_11_00_00));
#else
            t0 = Sse41.Blend(m2.AsUInt16(), m3.AsUInt16(), 0b_00_11_00_00).AsUInt32();
#endif
            t1 = Sse2.ShiftRightLogical128BitLane(m0, 4);
#if OLD_INTRINSICS
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_00_11));
#else
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_01_00_10_11);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //DIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_10_01_00_11);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_00_11_10_01);

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m0), Sse.StaticCast <uint, ulong>(m3)));
            t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(m1), Sse.StaticCast <uint, ulong>(m2)));
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_11_11_00));
#else
            t0 = Sse2.UnpackHigh(m0.AsUInt64(), m3.AsUInt64()).AsUInt32();
            t1 = Sse2.UnpackLow(m1.AsUInt64(), m2.AsUInt64()).AsUInt32();
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_11_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_00_10_11_01);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.UnpackLow(m0, m1);
            t1 = Sse2.UnpackHigh(m1, m2);
#if OLD_INTRINSICS
            b0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(t0), Sse.StaticCast <uint, ulong>(t1)));
#else
            b0 = Sse2.UnpackLow(t0.AsUInt64(), t1.AsUInt64()).AsUInt32();
#endif

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //UNDIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_00_11_10_01);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_10_01_00_11);

            //ROUND 9
            t0 = Sse2.UnpackHigh(m1, m3);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(t0), Sse.StaticCast <uint, ulong>(m0)));
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(m2), 0b_11_00_00_00));
            b0 = Sse.StaticCast <ushort, uint>(Sse2.ShuffleHigh(Sse.StaticCast <uint, ushort>(t2), 0b_01_00_11_10));
#else
            t1 = Sse2.UnpackLow(t0.AsUInt64(), m0.AsUInt64()).AsUInt32();
            t2 = Sse41.Blend(t1.AsUInt16(), m2.AsUInt16(), 0b_11_00_00_00).AsUInt32();
            b0 = Sse2.ShuffleHigh(t2.AsUInt16(), 0b_01_00_11_10).AsUInt32();
#endif

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.UnpackHigh(m0, m3);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m2), Sse.StaticCast <uint, ushort>(t0), 0b_11_11_00_00));
#else
            t1 = Sse41.Blend(m2.AsUInt16(), t0.AsUInt16(), 0b_11_11_00_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t1, 0b_00_10_01_11);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //DIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_10_01_00_11);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_00_11_10_01);

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m2), Sse.StaticCast <uint, ushort>(m0), 0b_00_00_11_00));
#else
            t0 = Sse41.Blend(m2.AsUInt16(), m0.AsUInt16(), 0b_00_00_11_00).AsUInt32();
#endif
            t1 = Sse2.ShiftLeftLogical128BitLane(t0, 4);
#if OLD_INTRINSICS
            b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(m3), 0b_00_00_11_11));
#else
            b0 = Sse41.Blend(t1.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_11).AsUInt32();
#endif

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(m0), 0b_00_11_00_00));
#else
            t0 = Sse41.Blend(m1.AsUInt16(), m0.AsUInt16(), 0b_00_11_00_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t0, 0b_01_00_11_10);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //UNDIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_00_11_10_01);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_10_01_00_11);

            //ROUND 10
#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m2), 0b_00_00_00_11));
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(m2), 0b_00_11_00_00));
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_00_00_11_11));
#else
            t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_00_00_11).AsUInt32();
            t1 = Sse41.Blend(m1.AsUInt16(), m2.AsUInt16(), 0b_00_11_00_00).AsUInt32();
            t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_00_11_11).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_01_11_00_10);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.ShiftLeftLogical128BitLane(m0, 4);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(t0), 0b_11_00_00_00));
#else
            t1 = Sse41.Blend(m1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t1, 0b_01_10_00_11);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //DIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_10_01_00_11);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_00_11_10_01);

            t0 = Sse2.UnpackHigh(m0, m3);
            t1 = Sse2.UnpackLow(m2, m3);
#if OLD_INTRINSICS
            t2 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(t0), Sse.StaticCast <uint, ulong>(t1)));
#else
            t2 = Sse2.UnpackHigh(t0.AsUInt64(), t1.AsUInt64()).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_11_00_10_01);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m3), Sse.StaticCast <uint, ushort>(m2), 0b_11_00_00_00));
#else
            t0 = Sse41.Blend(m3.AsUInt16(), m2.AsUInt16(), 0b_11_00_00_00).AsUInt32();
#endif
            t1 = Sse2.UnpackLow(m0, m3);
#if OLD_INTRINSICS
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_11_11));
#else
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_11).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_00_01_10_11);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //UNDIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_00_11_10_01);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_10_01_00_11);

            row1 = Sse2.Xor(row1, row3);
            row2 = Sse2.Xor(row2, row4);
            row1 = Sse2.Xor(row1, Sse2.LoadVector128(s->h));
            row2 = Sse2.Xor(row2, Sse2.LoadVector128(s->h + 4));
            Sse2.Store(s->h, row1);
            Sse2.Store(s->h + 4, row2);
        }