/// <summary>
/// Processes a single input block: picks the word view of the block and hands it to the
/// SSE4.1 mixer when available (and compiled in), otherwise to the scalar mixer.
/// </summary>
/// <param name="s">Hashing context whose state is updated by the selected mixer.</param>
/// <param name="input">Pointer to the block bytes; reinterpreted as 32-bit words.</param>
unsafe private static void compress(Blake2sContext* s, byte* input)
{
	// By default the block bytes are consumed in place as native-endian words.
	uint* words = (uint*)input;

#if FAST_SPAN
	if (!BitConverter.IsLittleEndian)
	{
		// Big-endian host: decode each word as little-endian into the context buffer
		// and mix from there instead of from the raw input.
		var bytes = new ReadOnlySpan<byte>(input, BlockBytes);
		uint* decoded = (uint*)s->b;

		for (int index = 0, offset = 0; index < BlockWords; index++, offset += WordSize)
		{
			decoded[index] = BinaryPrimitives.ReadUInt32LittleEndian(bytes.Slice(offset, WordSize));
		}

		words = decoded;
	}
#endif

#if USE_INTRINSICS
	if (Sse41.IsSupported)
	{
		mixSse41(s, words);
		return;
	}
#endif

	mixScalar(s, words);
}
/// <summary>
/// Scalar block mixer. Builds a 16-word working state from the chaining words in
/// <c>s->h</c> and eight constants (the BLAKE2s IV words) combined with <c>s->t[0]</c>,
/// <c>s->t[1]</c> and <c>s->f[0]</c>, runs ten fully unrolled rounds over the sixteen
/// message words, then folds the result back into <c>s->h</c>.
/// </summary>
/// <param name="s">Hashing context; <c>h</c> is read and updated, <c>t</c> and <c>f</c> are only read.</param>
/// <param name="m">Pointer to the sixteen 32-bit message words of the current block.</param>
unsafe private static void mixScalar(Blake2sContext* s, uint* m)
{
	uint* h = s->h;

	// Message words, copied to locals once up front.
	uint w0 = m[0], w1 = m[1], w2 = m[2], w3 = m[3];
	uint w4 = m[4], w5 = m[5], w6 = m[6], w7 = m[7];
	uint w8 = m[8], w9 = m[9], w10 = m[10], w11 = m[11];
	uint w12 = m[12], w13 = m[13], w14 = m[14], w15 = m[15];

	// Upper half of the working state: the current chaining value.
	uint v0 = h[0], v1 = h[1], v2 = h[2], v3 = h[3];
	uint v4 = h[4], v5 = h[5], v6 = h[6], v7 = h[7];

	// Lower half: constants, with the counter words and first flag word mixed in.
	uint v8 = 0x6A09E667u;
	uint v9 = 0xBB67AE85u;
	uint v10 = 0x3C6EF372u;
	uint v11 = 0xA54FF53Au;
	uint v12 = 0x510E527Fu ^ s->t[0];
	uint v13 = 0x9B05688Cu ^ s->t[1];
	uint v14 = 0x1F83D9ABu ^ s->f[0];
	uint v15 = 0x5BE0CD19u;

	// Each round mixes the four columns, then the four diagonals. The four mixes in
	// a group touch disjoint state words, so their relative order is irrelevant.

	// ROUND 1
	mixScalarG(ref v0, ref v4, ref v8, ref v12, w0, w1);
	mixScalarG(ref v1, ref v5, ref v9, ref v13, w2, w3);
	mixScalarG(ref v2, ref v6, ref v10, ref v14, w4, w5);
	mixScalarG(ref v3, ref v7, ref v11, ref v15, w6, w7);
	mixScalarG(ref v0, ref v5, ref v10, ref v15, w8, w9);
	mixScalarG(ref v1, ref v6, ref v11, ref v12, w10, w11);
	mixScalarG(ref v2, ref v7, ref v8, ref v13, w12, w13);
	mixScalarG(ref v3, ref v4, ref v9, ref v14, w14, w15);

	// ROUND 2
	mixScalarG(ref v0, ref v4, ref v8, ref v12, w14, w10);
	mixScalarG(ref v1, ref v5, ref v9, ref v13, w4, w8);
	mixScalarG(ref v2, ref v6, ref v10, ref v14, w9, w15);
	mixScalarG(ref v3, ref v7, ref v11, ref v15, w13, w6);
	mixScalarG(ref v0, ref v5, ref v10, ref v15, w1, w12);
	mixScalarG(ref v1, ref v6, ref v11, ref v12, w0, w2);
	mixScalarG(ref v2, ref v7, ref v8, ref v13, w11, w7);
	mixScalarG(ref v3, ref v4, ref v9, ref v14, w5, w3);

	// ROUND 3
	mixScalarG(ref v0, ref v4, ref v8, ref v12, w11, w8);
	mixScalarG(ref v1, ref v5, ref v9, ref v13, w12, w0);
	mixScalarG(ref v2, ref v6, ref v10, ref v14, w5, w2);
	mixScalarG(ref v3, ref v7, ref v11, ref v15, w15, w13);
	mixScalarG(ref v0, ref v5, ref v10, ref v15, w10, w14);
	mixScalarG(ref v1, ref v6, ref v11, ref v12, w3, w6);
	mixScalarG(ref v2, ref v7, ref v8, ref v13, w7, w1);
	mixScalarG(ref v3, ref v4, ref v9, ref v14, w9, w4);

	// ROUND 4
	mixScalarG(ref v0, ref v4, ref v8, ref v12, w7, w9);
	mixScalarG(ref v1, ref v5, ref v9, ref v13, w3, w1);
	mixScalarG(ref v2, ref v6, ref v10, ref v14, w13, w12);
	mixScalarG(ref v3, ref v7, ref v11, ref v15, w11, w14);
	mixScalarG(ref v0, ref v5, ref v10, ref v15, w2, w6);
	mixScalarG(ref v1, ref v6, ref v11, ref v12, w5, w10);
	mixScalarG(ref v2, ref v7, ref v8, ref v13, w4, w0);
	mixScalarG(ref v3, ref v4, ref v9, ref v14, w15, w8);

	// ROUND 5
	mixScalarG(ref v0, ref v4, ref v8, ref v12, w9, w0);
	mixScalarG(ref v1, ref v5, ref v9, ref v13, w5, w7);
	mixScalarG(ref v2, ref v6, ref v10, ref v14, w2, w4);
	mixScalarG(ref v3, ref v7, ref v11, ref v15, w10, w15);
	mixScalarG(ref v0, ref v5, ref v10, ref v15, w14, w1);
	mixScalarG(ref v1, ref v6, ref v11, ref v12, w11, w12);
	mixScalarG(ref v2, ref v7, ref v8, ref v13, w6, w8);
	mixScalarG(ref v3, ref v4, ref v9, ref v14, w3, w13);

	// ROUND 6
	mixScalarG(ref v0, ref v4, ref v8, ref v12, w2, w12);
	mixScalarG(ref v1, ref v5, ref v9, ref v13, w6, w10);
	mixScalarG(ref v2, ref v6, ref v10, ref v14, w0, w11);
	mixScalarG(ref v3, ref v7, ref v11, ref v15, w8, w3);
	mixScalarG(ref v0, ref v5, ref v10, ref v15, w4, w13);
	mixScalarG(ref v1, ref v6, ref v11, ref v12, w7, w5);
	mixScalarG(ref v2, ref v7, ref v8, ref v13, w15, w14);
	mixScalarG(ref v3, ref v4, ref v9, ref v14, w1, w9);

	// ROUND 7
	mixScalarG(ref v0, ref v4, ref v8, ref v12, w12, w5);
	mixScalarG(ref v1, ref v5, ref v9, ref v13, w1, w15);
	mixScalarG(ref v2, ref v6, ref v10, ref v14, w14, w13);
	mixScalarG(ref v3, ref v7, ref v11, ref v15, w4, w10);
	mixScalarG(ref v0, ref v5, ref v10, ref v15, w0, w7);
	mixScalarG(ref v1, ref v6, ref v11, ref v12, w6, w3);
	mixScalarG(ref v2, ref v7, ref v8, ref v13, w9, w2);
	mixScalarG(ref v3, ref v4, ref v9, ref v14, w8, w11);

	// ROUND 8
	mixScalarG(ref v0, ref v4, ref v8, ref v12, w13, w11);
	mixScalarG(ref v1, ref v5, ref v9, ref v13, w7, w14);
	mixScalarG(ref v2, ref v6, ref v10, ref v14, w12, w1);
	mixScalarG(ref v3, ref v7, ref v11, ref v15, w3, w9);
	mixScalarG(ref v0, ref v5, ref v10, ref v15, w5, w0);
	mixScalarG(ref v1, ref v6, ref v11, ref v12, w15, w4);
	mixScalarG(ref v2, ref v7, ref v8, ref v13, w8, w6);
	mixScalarG(ref v3, ref v4, ref v9, ref v14, w2, w10);

	// ROUND 9
	mixScalarG(ref v0, ref v4, ref v8, ref v12, w6, w15);
	mixScalarG(ref v1, ref v5, ref v9, ref v13, w14, w9);
	mixScalarG(ref v2, ref v6, ref v10, ref v14, w11, w3);
	mixScalarG(ref v3, ref v7, ref v11, ref v15, w0, w8);
	mixScalarG(ref v0, ref v5, ref v10, ref v15, w12, w2);
	mixScalarG(ref v1, ref v6, ref v11, ref v12, w13, w7);
	mixScalarG(ref v2, ref v7, ref v8, ref v13, w1, w4);
	mixScalarG(ref v3, ref v4, ref v9, ref v14, w10, w5);

	// ROUND 10
	mixScalarG(ref v0, ref v4, ref v8, ref v12, w10, w2);
	mixScalarG(ref v1, ref v5, ref v9, ref v13, w8, w4);
	mixScalarG(ref v2, ref v6, ref v10, ref v14, w7, w6);
	mixScalarG(ref v3, ref v7, ref v11, ref v15, w1, w5);
	mixScalarG(ref v0, ref v5, ref v10, ref v15, w15, w11);
	mixScalarG(ref v1, ref v6, ref v11, ref v12, w9, w14);
	mixScalarG(ref v2, ref v7, ref v8, ref v13, w3, w12);
	mixScalarG(ref v3, ref v4, ref v9, ref v14, w13, w0);

	// Fold both halves of the working state back into the chaining value.
	h[0] ^= v0 ^ v8;
	h[1] ^= v1 ^ v9;
	h[2] ^= v2 ^ v10;
	h[3] ^= v3 ^ v11;
	h[4] ^= v4 ^ v12;
	h[5] ^= v5 ^ v13;
	h[6] ^= v6 ^ v14;
	h[7] ^= v7 ^ v15;
}

/// <summary>
/// One full mix of four state words with two message words: add/xor/rotate steps using
/// right-rotations by 16 and 12 for <paramref name="x"/>, then by 8 and 7 for <paramref name="y"/>.
/// </summary>
[System.Runtime.CompilerServices.MethodImpl(System.Runtime.CompilerServices.MethodImplOptions.AggressiveInlining)]
private static void mixScalarG(ref uint a, ref uint b, ref uint c, ref uint d, uint x, uint y)
{
	// First half: message word x, rotations 16 and 12.
	a += x;
	a += b;
	d ^= a;
	d = (d >> 16) | (d << 16);
	c += d;
	b ^= c;
	b = (b >> 12) | (b << 20);

	// Second half: message word y, rotations 8 and 7.
	a += y;
	a += b;
	d ^= a;
	d = (d >> 8) | (d << 24);
	c += d;
	b ^= c;
	b = (b >> 7) | (b << 25);
}
unsafe private static void mixSse41(Blake2sContext *s, uint *m) { var row1 = Sse2.LoadVector128(s->h); var row2 = Sse2.LoadVector128(s->h + 4); var row3 = v128iv0; var row4 = v128iv1; row4 = Sse2.Xor(row4, Sse2.LoadVector128(s->t)); // reads into f[] as well var m0 = Sse2.LoadVector128(m); var m1 = Sse2.LoadVector128(m + 4); var m2 = Sse2.LoadVector128(m + 8); var m3 = Sse2.LoadVector128(m + 12); var r16 = v128rm0; var r8 = v128rm1; //ROUND 1 #if OLD_INTRINSICS var b0 = Sse.StaticCast <float, uint>(Sse.Shuffle(Sse.StaticCast <uint, float>(m0), Sse.StaticCast <uint, float>(m1), 0b_10_00_10_00)); #else var b0 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 0b_10_00_10_00).AsUInt32(); #endif //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); #if OLD_INTRINSICS b0 = Sse.StaticCast <float, uint>(Sse.Shuffle(Sse.StaticCast <uint, float>(m0), Sse.StaticCast <uint, float>(m1), 0b_11_01_11_01)); #else b0 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 0b_11_01_11_01).AsUInt32(); #endif //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //DIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_00_11_10_01); #if OLD_INTRINSICS b0 = Sse.StaticCast <float, uint>(Sse.Shuffle(Sse.StaticCast <uint, float>(m2), Sse.StaticCast <uint, 
float>(m3), 0b_10_00_10_00)); #else b0 = Sse.Shuffle(m2.AsSingle(), m3.AsSingle(), 0b_10_00_10_00).AsUInt32(); #endif //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); #if OLD_INTRINSICS b0 = Sse.StaticCast <float, uint>(Sse.Shuffle(Sse.StaticCast <uint, float>(m2), Sse.StaticCast <uint, float>(m3), 0b_11_01_11_01)); #else b0 = Sse.Shuffle(m2.AsSingle(), m3.AsSingle(), 0b_11_01_11_01).AsUInt32(); #endif //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //UNDIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_10_01_00_11); //ROUND 2 #if OLD_INTRINSICS var t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(m2), 0b_00_00_11_00)); #else var t0 = Sse41.Blend(m1.AsUInt16(), m2.AsUInt16(), 0b_00_00_11_00).AsUInt32(); #endif var t1 = Sse2.ShiftLeftLogical128BitLane(m3, 4); #if OLD_INTRINSICS var t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_11_11_00_00)); #else var t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_10_01_00_11); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = 
Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); t0 = Sse2.Shuffle(m2, 0b_00_00_10_00); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(m3), 0b_11_00_00_00)); t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_11_11_00_00)); #else t1 = Sse41.Blend(m1.AsUInt16(), m3.AsUInt16(), 0b_11_00_00_00).AsUInt32(); t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_10_11_00_01); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //DIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_00_11_10_01); t0 = Sse2.ShiftLeftLogical128BitLane(m1, 4); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m2), Sse.StaticCast <uint, ushort>(t0), 0b_00_11_00_00)); t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(t1), 0b_11_11_00_00)); #else t1 = Sse41.Blend(m2.AsUInt16(), t0.AsUInt16(), 0b_00_11_00_00).AsUInt32(); t2 = Sse41.Blend(m0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_10_11_00_01); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 
= Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); t0 = Sse2.UnpackHigh(m0, m1); t1 = Sse2.ShiftLeftLogical128BitLane(m3, 4); #if OLD_INTRINSICS t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_11_00)); #else t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_10_11_00_01); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //UNDIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_10_01_00_11); //ROUND 3 t0 = Sse2.UnpackHigh(m2, m3); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m3), Sse.StaticCast <uint, ushort>(m1), 0b_00_00_11_00)); t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_11_11)); #else t1 = Sse41.Blend(m3.AsUInt16(), m1.AsUInt16(), 0b_00_00_11_00).AsUInt32(); t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_11).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_11_01_00_10); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = 
Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); t0 = Sse2.UnpackLow(m2, m0); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(m0), 0b_11_11_00_00)); #else t1 = Sse41.Blend(t0.AsUInt16(), m0.AsUInt16(), 0b_11_11_00_00).AsUInt32(); #endif t2 = Sse2.ShiftLeftLogical128BitLane(m3, 8); #if OLD_INTRINSICS b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t2), 0b_11_00_00_00)); #else b0 = Sse41.Blend(t1.AsUInt16(), t2.AsUInt16(), 0b_11_00_00_00).AsUInt32(); #endif //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //DIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_00_11_10_01); #if OLD_INTRINSICS t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m2), 0b_00_11_11_00)); #else t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_11_11_00).AsUInt32(); #endif t1 = Sse2.ShiftRightLogical128BitLane(m1, 12); #if OLD_INTRINSICS t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_00_11)); #else t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_01_00_11_10); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else 
row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); t0 = Sse2.ShiftLeftLogical128BitLane(m3, 4); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m1), 0b_00_11_00_11)); t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_11_00_00_00)); #else t1 = Sse41.Blend(m0.AsUInt16(), m1.AsUInt16(), 0b_00_11_00_11).AsUInt32(); t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_00_01_10_11); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //UNDIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_10_01_00_11); //ROUND 4 t0 = Sse2.UnpackHigh(m0, m1); t1 = Sse2.UnpackHigh(t0, m2); #if OLD_INTRINSICS t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(m3), 0b_00_00_11_00)); #else t2 = Sse41.Blend(t1.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_11_01_00_10); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), 
Sse2.ShiftLeftLogical(row2, 20)); t0 = Sse2.ShiftLeftLogical128BitLane(m2, 8); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m3), Sse.StaticCast <uint, ushort>(m0), 0b_00_00_11_00)); t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_11_00_00_00)); #else t1 = Sse41.Blend(m3.AsUInt16(), m0.AsUInt16(), 0b_00_00_11_00).AsUInt32(); t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_10_00_01_11); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //DIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_00_11_10_01); #if OLD_INTRINSICS t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m1), 0b_00_00_11_11)); t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(m3), 0b_11_00_00_00)); #else t0 = Sse41.Blend(m0.AsUInt16(), m1.AsUInt16(), 0b_00_00_11_11).AsUInt32(); t1 = Sse41.Blend(t0.AsUInt16(), m3.AsUInt16(), 0b_11_00_00_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t1, 0b_11_00_01_10); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); t0 
= Sse2.UnpackLow(m0, m2); t1 = Sse2.UnpackHigh(m1, m2); #if OLD_INTRINSICS b0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(t1), Sse.StaticCast <uint, ulong>(t0))); #else b0 = Sse2.UnpackLow(t1.AsUInt64(), t0.AsUInt64()).AsUInt32(); #endif //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //UNDIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_10_01_00_11); //ROUND 5 #if OLD_INTRINSICS t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(m1), Sse.StaticCast <uint, ulong>(m2))); t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m0), Sse.StaticCast <uint, ulong>(m2))); t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_11_00_11)); #else t0 = Sse2.UnpackLow(m1.AsUInt64(), m2.AsUInt64()).AsUInt32(); t1 = Sse2.UnpackHigh(m0.AsUInt64(), m2.AsUInt64()).AsUInt32(); t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_11).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_10_00_01_11); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); #if OLD_INTRINSICS t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m1), Sse.StaticCast <uint, 
ulong>(m3))); t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(m0), Sse.StaticCast <uint, ulong>(m1))); b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_11_00_11)); #else t0 = Sse2.UnpackHigh(m1.AsUInt64(), m3.AsUInt64()).AsUInt32(); t1 = Sse2.UnpackLow(m0.AsUInt64(), m1.AsUInt64()).AsUInt32(); b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_11).AsUInt32(); #endif //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //DIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_00_11_10_01); #if OLD_INTRINSICS t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m3), Sse.StaticCast <uint, ulong>(m1))); t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m2), Sse.StaticCast <uint, ulong>(m0))); b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_00_11_00_11)); #else t0 = Sse2.UnpackHigh(m3.AsUInt64(), m1.AsUInt64()).AsUInt32(); t1 = Sse2.UnpackHigh(m2.AsUInt64(), m0.AsUInt64()).AsUInt32(); b0 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_11_00_11).AsUInt32(); #endif //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), 
Sse2.ShiftLeftLogical(row2, 20)); #if OLD_INTRINSICS t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m2), 0b_00_00_00_11)); #else t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_00_00_11).AsUInt32(); #endif t1 = Sse2.ShiftLeftLogical128BitLane(t0, 8); #if OLD_INTRINSICS t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(m3), 0b_00_00_11_11)); #else t2 = Sse41.Blend(t1.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_11).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_01_10_00_11); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //UNDIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_10_01_00_11); //ROUND 6 t0 = Sse2.UnpackHigh(m0, m1); t1 = Sse2.UnpackLow(m0, m2); #if OLD_INTRINSICS b0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(t0), Sse.StaticCast <uint, ulong>(t1))); #else b0 = Sse2.UnpackLow(t0.AsUInt64(), t1.AsUInt64()).AsUInt32(); #endif //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); t0 = Sse2.ShiftRightLogical128BitLane(m2, 4); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, 
ushort>(m3), 0b_00_00_00_11)); b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_00_11_11_00)); #else t1 = Sse41.Blend(m0.AsUInt16(), m3.AsUInt16(), 0b_00_00_00_11).AsUInt32(); b0 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_11_11_00).AsUInt32(); #endif //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //DIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_00_11_10_01); #if OLD_INTRINSICS t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(m0), 0b_00_00_11_00)); #else t0 = Sse41.Blend(m1.AsUInt16(), m0.AsUInt16(), 0b_00_00_11_00).AsUInt32(); #endif t1 = Sse2.ShiftRightLogical128BitLane(m3, 4); #if OLD_INTRINSICS t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_11_00_00)); #else t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_01_10_11_00); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); #if OLD_INTRINSICS t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(m1), Sse.StaticCast <uint, ulong>(m2))); #else t0 = 
Sse2.UnpackLow(m1.AsUInt64(), m2.AsUInt64()).AsUInt32(); #endif t1 = Sse2.Shuffle(m3, 0b_00_10_00_01); #if OLD_INTRINSICS b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_11_00_11)); #else b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_11).AsUInt32(); #endif //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //UNDIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_10_01_00_11); //ROUND 7 t0 = Sse2.ShiftLeftLogical128BitLane(m1, 12); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m3), 0b_00_11_00_11)); b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_11_00_00_00)); #else t1 = Sse41.Blend(m0.AsUInt16(), m3.AsUInt16(), 0b_00_11_00_11).AsUInt32(); b0 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32(); #endif //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); #if OLD_INTRINSICS t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m3), Sse.StaticCast <uint, ushort>(m2), 0b_00_11_00_00)); #else t0 = Sse41.Blend(m3.AsUInt16(), m2.AsUInt16(), 
0b_00_11_00_00).AsUInt32(); #endif t1 = Sse2.ShiftRightLogical128BitLane(m1, 4); #if OLD_INTRINSICS t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_00_11)); #else t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_10_01_11_00); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //DIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_00_11_10_01); #if OLD_INTRINSICS t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(m0), Sse.StaticCast <uint, ulong>(m2))); #else t0 = Sse2.UnpackLow(m0.AsUInt64(), m2.AsUInt64()).AsUInt32(); #endif t1 = Sse2.ShiftRightLogical128BitLane(m1, 4); #if OLD_INTRINSICS b0 = Sse2.Shuffle(Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_11_00)), 0b_10_11_01_00); #else b0 = Sse2.Shuffle(Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_00).AsUInt32(), 0b_10_11_01_00); #endif //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); t0 = Sse2.UnpackHigh(m1, m2); #if OLD_INTRINSICS t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m0), 
Sse.StaticCast <uint, ulong>(t0))); #else t1 = Sse2.UnpackHigh(m0.AsUInt64(), t0.AsUInt64()).AsUInt32(); #endif b0 = Sse2.Shuffle(t1, 0b_11_00_01_10); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //UNDIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_10_01_00_11); //ROUND 8 t0 = Sse2.UnpackHigh(m0, m1); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(m3), 0b_00_00_11_11)); #else t1 = Sse41.Blend(t0.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_11).AsUInt32(); #endif b0 = Sse2.Shuffle(t1, 0b_10_00_11_01); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); #if OLD_INTRINSICS t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m2), Sse.StaticCast <uint, ushort>(m3), 0b_00_11_00_00)); #else t0 = Sse41.Blend(m2.AsUInt16(), m3.AsUInt16(), 0b_00_11_00_00).AsUInt32(); #endif t1 = Sse2.ShiftRightLogical128BitLane(m0, 4); #if OLD_INTRINSICS t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_00_11)); #else t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_01_00_10_11); //G2 row1 = 
Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //DIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_00_11_10_01); #if OLD_INTRINSICS t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m0), Sse.StaticCast <uint, ulong>(m3))); t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(m1), Sse.StaticCast <uint, ulong>(m2))); t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_11_11_00)); #else t0 = Sse2.UnpackHigh(m0.AsUInt64(), m3.AsUInt64()).AsUInt32(); t1 = Sse2.UnpackLow(m1.AsUInt64(), m2.AsUInt64()).AsUInt32(); t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_11_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_00_10_11_01); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); t0 = Sse2.UnpackLow(m0, m1); t1 = Sse2.UnpackHigh(m1, m2); #if OLD_INTRINSICS b0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(t0), Sse.StaticCast <uint, ulong>(t1))); #else b0 = Sse2.UnpackLow(t0.AsUInt64(), t1.AsUInt64()).AsUInt32(); #endif //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, 
uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //UNDIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_10_01_00_11); //ROUND 9 t0 = Sse2.UnpackHigh(m1, m3); #if OLD_INTRINSICS t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(t0), Sse.StaticCast <uint, ulong>(m0))); t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(m2), 0b_11_00_00_00)); b0 = Sse.StaticCast <ushort, uint>(Sse2.ShuffleHigh(Sse.StaticCast <uint, ushort>(t2), 0b_01_00_11_10)); #else t1 = Sse2.UnpackLow(t0.AsUInt64(), m0.AsUInt64()).AsUInt32(); t2 = Sse41.Blend(t1.AsUInt16(), m2.AsUInt16(), 0b_11_00_00_00).AsUInt32(); b0 = Sse2.ShuffleHigh(t2.AsUInt16(), 0b_01_00_11_10).AsUInt32(); #endif //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); t0 = Sse2.UnpackHigh(m0, m3); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m2), Sse.StaticCast <uint, ushort>(t0), 0b_11_11_00_00)); #else t1 = Sse41.Blend(m2.AsUInt16(), t0.AsUInt16(), 0b_11_11_00_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t1, 0b_00_10_01_11); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), 
r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //DIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_00_11_10_01); #if OLD_INTRINSICS t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m2), Sse.StaticCast <uint, ushort>(m0), 0b_00_00_11_00)); #else t0 = Sse41.Blend(m2.AsUInt16(), m0.AsUInt16(), 0b_00_00_11_00).AsUInt32(); #endif t1 = Sse2.ShiftLeftLogical128BitLane(t0, 4); #if OLD_INTRINSICS b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(m3), 0b_00_00_11_11)); #else b0 = Sse41.Blend(t1.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_11).AsUInt32(); #endif //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); #if OLD_INTRINSICS t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(m0), 0b_00_11_00_00)); #else t0 = Sse41.Blend(m1.AsUInt16(), m0.AsUInt16(), 0b_00_11_00_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t0, 0b_01_00_11_10); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //UNDIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 
0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_10_01_00_11); //ROUND 10 #if OLD_INTRINSICS t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m2), 0b_00_00_00_11)); t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(m2), 0b_00_11_00_00)); t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_00_00_11_11)); #else t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_00_00_11).AsUInt32(); t1 = Sse41.Blend(m1.AsUInt16(), m2.AsUInt16(), 0b_00_11_00_00).AsUInt32(); t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_00_11_11).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_01_11_00_10); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); t0 = Sse2.ShiftLeftLogical128BitLane(m0, 4); #if OLD_INTRINSICS t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(t0), 0b_11_00_00_00)); #else t1 = Sse41.Blend(m1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32(); #endif b0 = Sse2.Shuffle(t1, 0b_01_10_00_11); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //DIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_10_01_00_11); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = 
Sse2.Shuffle(row2, 0b_00_11_10_01); t0 = Sse2.UnpackHigh(m0, m3); t1 = Sse2.UnpackLow(m2, m3); #if OLD_INTRINSICS t2 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(t0), Sse.StaticCast <uint, ulong>(t1))); #else t2 = Sse2.UnpackHigh(t0.AsUInt64(), t1.AsUInt64()).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_11_00_10_01); //G1 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20)); #if OLD_INTRINSICS t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m3), Sse.StaticCast <uint, ushort>(m2), 0b_11_00_00_00)); #else t0 = Sse41.Blend(m3.AsUInt16(), m2.AsUInt16(), 0b_11_00_00_00).AsUInt32(); #endif t1 = Sse2.UnpackLow(m0, m3); #if OLD_INTRINSICS t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_11_11)); #else t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_11).AsUInt32(); #endif b0 = Sse2.Shuffle(t2, 0b_00_01_10_11); //G2 row1 = Sse2.Add(Sse2.Add(row1, b0), row2); row4 = Sse2.Xor(row4, row1); #if OLD_INTRINSICS row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8)); #else row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32(); #endif row3 = Sse2.Add(row3, row4); row2 = Sse2.Xor(row2, row3); row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25)); //UNDIAGONALIZE row4 = Sse2.Shuffle(row4, 0b_00_11_10_01); row3 = Sse2.Shuffle(row3, 0b_01_00_11_10); row2 = Sse2.Shuffle(row2, 0b_10_01_00_11); row1 = Sse2.Xor(row1, row3); row2 = Sse2.Xor(row2, row4); row1 = Sse2.Xor(row1, Sse2.LoadVector128(s->h)); row2 = Sse2.Xor(row2, Sse2.LoadVector128(s->h + 4)); 
Sse2.Store(s->h, row1); Sse2.Store(s->h + 4, row2); }