/// <summary>
/// Software fallback for a lane-wise XOR of two 256-bit vectors of any struct element type.
/// Reinterprets both operands as ulong lanes, delegates to the ulong overload, and views
/// the result as <typeparamref name="T"/> again (XOR is bit-pattern-only, so the
/// reinterpretation is lossless).
/// </summary>
public static Vector256<T> Xor_Software<T>(Vector256<T> left, Vector256<T> right) where T : struct
{
    Vector256<ulong> xored = Xor_Software(left.AsUInt64(), right.AsUInt64());
    return xored.As<ulong, T>();
}
/// <summary>
/// XXH3 512-bit accumulate step: folds one 64-byte input stripe into the
/// accumulator lanes using the stripe's secret bytes. Dispatches to AVX2
/// (256-bit lanes), SSE2 (128-bit lanes) or a portable scalar loop.
/// Mirrors the reference XXH3_accumulate_512 from xxHash.
/// </summary>
/// <param name="acc">Accumulator (AccNb 64-bit lanes), updated in place.</param>
/// <param name="input">Input stripe; assumed to provide at least StripeLen bytes — TODO confirm callers guarantee this.</param>
/// <param name="secret">Secret bytes XORed with the input; same length assumption as <paramref name="input"/>.</param>
private unsafe static void Xxh3Accumulate512(Span<ulong> acc, ReadOnlySpan<byte> input, ReadOnlySpan<byte> secret)
{
    if (Avx2.IsSupported)
    {
        fixed (ulong* pAcc = acc)
        {
            fixed (byte* pInput = input, pSecret = secret)
            {
                Vector256<ulong>* xAcc = (Vector256<ulong>*)pAcc;
                Vector256<byte>* xInput = (Vector256<byte>*)pInput;
                Vector256<byte>* xSecret = (Vector256<byte>*)pSecret;
                // StripeLen / 32 == number of 256-bit lanes per stripe.
                for (ulong i = 0; i < StripeLen / 32; i++)
                {
                    Vector256<byte> dataVec = xInput[i];
                    Vector256<byte> keyVec = xSecret[i];
                    Vector256<byte> dataKey = Avx2.Xor(dataVec, keyVec);
                    // Imm 0b00110001 places each 64-bit lane's high 32 bits into the even
                    // positions, so Multiply (pmuludq) computes low32 * high32 per lane —
                    // the vector form of Mult32To64((uint)dataKey, dataKey >> 32) below.
                    Vector256<uint> dataKeyLo = Avx2.Shuffle(dataKey.AsUInt32(), 0b00110001);
                    Vector256<ulong> product = Avx2.Multiply(dataKey.AsUInt32(), dataKeyLo);
                    // Imm 0b01001110 swaps the two 64-bit words of each 128-bit half —
                    // the vector form of "acc[i ^ 1] += dataVal" in the scalar path.
                    Vector256<uint> dataSwap = Avx2.Shuffle(dataVec.AsUInt32(), 0b01001110);
                    Vector256<ulong> sum = Avx2.Add(xAcc[i], dataSwap.AsUInt64());
                    xAcc[i] = Avx2.Add(product, sum);
                }
            }
        }
    }
    else if (Sse2.IsSupported)
    {
        fixed (ulong* pAcc = acc)
        {
            fixed (byte* pInput = input, pSecret = secret)
            {
                Vector128<ulong>* xAcc = (Vector128<ulong>*)pAcc;
                Vector128<byte>* xInput = (Vector128<byte>*)pInput;
                Vector128<byte>* xSecret = (Vector128<byte>*)pSecret;
                // StripeLen / 16 == number of 128-bit lanes per stripe.
                for (ulong i = 0; i < StripeLen / 16; i++)
                {
                    // Same algorithm as the AVX2 branch, on 128-bit lanes.
                    Vector128<byte> dataVec = xInput[i];
                    Vector128<byte> keyVec = xSecret[i];
                    Vector128<byte> dataKey = Sse2.Xor(dataVec, keyVec);
                    Vector128<uint> dataKeyLo = Sse2.Shuffle(dataKey.AsUInt32(), 0b00110001);
                    Vector128<ulong> product = Sse2.Multiply(dataKey.AsUInt32(), dataKeyLo);
                    Vector128<uint> dataSwap = Sse2.Shuffle(dataVec.AsUInt32(), 0b01001110);
                    Vector128<ulong> sum = Sse2.Add(xAcc[i], dataSwap.AsUInt64());
                    xAcc[i] = Sse2.Add(product, sum);
                }
            }
        }
    }
    else
    {
        // Portable scalar path (XXH3_accumulate_512_scalar): each 64-bit word of the
        // stripe is XORed with the secret, its halves multiplied into acc[i], and the
        // raw word added into the neighbouring lane (i ^ 1).
        for (int i = 0; i < AccNb; i++)
        {
            ulong dataVal = BinaryPrimitives.ReadUInt64LittleEndian(input.Slice(i * sizeof(ulong)));
            ulong dataKey = dataVal ^ BinaryPrimitives.ReadUInt64LittleEndian(secret.Slice(i * sizeof(ulong)));
            acc[i ^ 1] += dataVal;
            acc[i] += Mult32To64((uint)dataKey, dataKey >> 32);
        }
    }
}
/// <summary>
/// Advances the nibble-packed field (two cells per byte) by one generation.
/// Maintains rolling per-row horizontal neighbour sums — pairs (left+right) and
/// triples (left+centre+right) — for the rows above, at and below the row being
/// updated, so each cell's eight-neighbour count is assembled from three
/// precomputed line sums. AVX2 path only; processes 32 bytes (64 cells) per
/// inner-loop iteration. Border rows 0 and HEIGHT-1 are not rewritten.
/// </summary>
public override void Step()
{
    fixed (byte* currentFieldPtr = field,
           upperLineSumOf2Ptr = upperLineSumOf2, upperLineSumOf3Ptr = upperLineSumOf3,
           middleLineSumOf2Ptr = middleLineSumOf2, middleLineSumOf3Ptr = middleLineSumOf3,
           lowerLineSumOf2Ptr = lowerLineSumOf2, lowerLineSumOf3Ptr = lowerLineSumOf3)
    {
        byte* upper2 = upperLineSumOf2Ptr, upper3 = upperLineSumOf3Ptr;
        byte* middle2 = middleLineSumOf2Ptr, middle3 = middleLineSumOf3Ptr;
        byte* lower2 = lowerLineSumOf2Ptr, lower3 = lowerLineSumOf3Ptr;
        byte* nextLinePtr = currentFieldPtr + LINE_WIDTH;

        // Prime the pipeline: row 0 is a border, so its "upper"/"middle" sums are
        // zero and the "lower" sums come from row 1.
        for (int x = 0; x < LINE_WIDTH; x += 32)
        {
            Avx2.Store(upper2 + x, Vector256<byte>.Zero);
            Avx2.Store(upper3 + x, Vector256<byte>.Zero);
            Avx2.Store(middle2 + x, Vector256<byte>.Zero);
            Avx2.Store(middle3 + x, Vector256<byte>.Zero);
            Vector256<byte> nextLeft = Avx2.LoadVector256(nextLinePtr + x - 8);
            Vector256<byte> nextCenter = Avx2.LoadVector256(nextLinePtr + x);
            Vector256<byte> nextRight = Avx2.LoadVector256(nextLinePtr + x + 8);
            // left+right neighbour nibbles: shift by one nibble inside each 64-bit
            // word; the nibble crossing a word boundary is pulled from the loads
            // offset by +-8 bytes (one 64-bit word).
            Vector256<byte> lowerSum2 = Avx2.Add(
                Avx2.Add(
                    Avx2.ShiftRightLogical(nextCenter.AsUInt64(), 4),
                    Avx2.ShiftLeftLogical(nextCenter.AsUInt64(), 4)),
                Avx2.Add(
                    Avx2.ShiftRightLogical(nextLeft.AsUInt64(), 60),
                    Avx2.ShiftLeftLogical(nextRight.AsUInt64(), 60))).AsByte();
            Avx2.Store(lower2 + x, lowerSum2);
            Vector256<byte> lowerSum3 = Avx2.Add(lowerSum2, nextCenter);
            Avx2.Store(lower3 + x, lowerSum3);
        }

        for (int y = 1; y < HEIGHT - 1; y++)
        {
            nextLinePtr += LINE_WIDTH;
            // Rotate the line-sum buffers one row down: lower -> middle -> upper;
            // the old upper buffers are recycled for the new lower sums.
            byte* temp2 = upper2;
            byte* temp3 = upper3;
            upper2 = middle2; upper3 = middle3;
            middle2 = lower2; middle3 = lower3;
            lower2 = temp2; lower3 = temp3;

            for (int x = 0; x < LINE_WIDTH; x += 32)
            {
                Vector256<byte> nextLeft = Avx2.LoadVector256(nextLinePtr + x - 8);
                Vector256<byte> nextCenter = Avx2.LoadVector256(nextLinePtr + x);
                Vector256<byte> nextRight = Avx2.LoadVector256(nextLinePtr + x + 8);
                Vector256<byte> lowerSum2 = Avx2.Add(
                    Avx2.Add(
                        Avx2.ShiftRightLogical(nextCenter.AsUInt64(), 4),
                        Avx2.ShiftLeftLogical(nextCenter.AsUInt64(), 4)),
                    Avx2.Add(
                        Avx2.ShiftRightLogical(nextLeft.AsUInt64(), 60),
                        Avx2.ShiftLeftLogical(nextRight.AsUInt64(), 60))).AsByte();
                // Fix: the original stored lowerSum2 to lower2 twice and recomputed
                // Add(lowerSum2, nextCenter) for the lower3 store; each value is now
                // computed and stored exactly once (no behavioral change).
                Avx2.Store(lower2 + x, lowerSum2);
                Vector256<byte> lowerSum3 = Avx2.Add(lowerSum2, nextCenter);
                Avx2.Store(lower3 + x, lowerSum3);
                // Eight-neighbour count = pair sum of the current row (centre excluded)
                // + triple sums of the rows above and below.
                Vector256<byte> neighbours = Avx2.Add(
                    Avx2.LoadVector256(middle2 + x),
                    Avx2.Add(Avx2.LoadVector256(upper3 + x), lowerSum3));
                Vector256<byte> alive = Avx2.LoadVector256(nextLinePtr - LINE_WIDTH + x);
                // Move each cell's own state bit to bit 3 of its nibble, so
                // (count | alive << 3) forms a 4-bit index into the rule table.
                alive = Avx2.ShiftLeftLogical(alive.AsUInt64(), (byte)3).AsByte();
                Vector256<byte> mask = Avx2.Or(neighbours, alive);
                // Apply the rule table to the two nibbles of every byte separately;
                // Shuffle reads only the low 4 bits of each index byte.
                Vector256<byte> mask_hi = Avx2.And(mask, v_hi);
                Vector256<byte> mask_lo = Avx2.And(mask, v_lo);
                mask_hi = Avx2.ShiftRightLogical(mask_hi.AsUInt64(), 4).AsByte();
                Vector256<byte> shouldBeAlive_hi = Avx2.Shuffle(v_lookup, mask_hi);
                Vector256<byte> shouldBeAlive_lo = Avx2.Shuffle(v_lookup, mask_lo);
                shouldBeAlive_hi = Avx2.ShiftLeftLogical(shouldBeAlive_hi.AsUInt64(), 4).AsByte();
                Vector256<byte> shouldBeAlive = Avx2.Or(shouldBeAlive_hi, shouldBeAlive_lo);
                Avx2.Store(nextLinePtr - LINE_WIDTH + x, shouldBeAlive);
            }
            // Clear the border nibbles at both ends of the row just written.
            *(nextLinePtr - LINE_WIDTH) &= 0xF0;
            *(nextLinePtr - 1) &= 0x0F;
        }
    }
}
/// <summary>
/// Software fallback for AndNot on two 256-bit vectors of any struct element type.
/// Bit-pattern operation: view the operands as ulong lanes, run the ulong overload,
/// and reinterpret the result back to <typeparamref name="T"/>.
/// </summary>
public static Vector256<T> AndNot_Software<T>(Vector256<T> left, Vector256<T> right) where T : struct =>
    AndNot_Software(left.AsUInt64(), right.AsUInt64()).As<ulong, T>();
/// <summary>
/// Software fallback for bitwise NOT of a 256-bit vector of any struct element type.
/// Delegates to the ulong overload through a lossless lane reinterpretation.
/// </summary>
public static Vector256<T> Not_Software<T>(Vector256<T> vector) where T : struct =>
    Not_Software(vector.AsUInt64()).As<ulong, T>();
/// <summary>
/// ChaCha20 stream cipher core: XORs <paramref name="bytes"/> bytes of message
/// <paramref name="m"/> with the keystream generated from the 16-word state
/// <paramref name="x"/> and writes the result to <paramref name="c"/>.
/// The 64-bit block counter held in x[12] (low) and x[13] (high) is advanced
/// in place as blocks are consumed.
/// Wide-to-narrow dispatch (layout mirrors the libsodium/dolbeau vectorized
/// implementation):
///  1. AVX2, 8 blocks (512 bytes) per iteration — eight counters are spread
///     across lanes, 20 rounds run on all blocks at once, then results are
///     transposed back with UnpackLow/High + Permute2x128 in two 4-block
///     ("ONEOCTO") store passes interleaved at 32-byte granularity.
///  2. SSE2, 4 blocks (256 bytes) per iteration, same counter-spreading idea
///     with OneQuad doing the transpose/XOR/store.
///  3. SSE2/SSSE3 single block (64 bytes) per iteration; rot16/rot8 are done
///     with byte shuffles, rot12/rot7 with shift+xor.
///  4. Final partial block: keystream is written to a 64-byte stack buffer,
///     XORed byte-wise, and the buffer is zeroed afterwards to avoid leaving
///     keystream on the stack.
/// NOTE(review): only the 512-byte path checks Avx2.IsSupported; the later
/// paths use Sse2/Ssse3 unconditionally — presumably callers only reach this
/// on x86 with SSSE3 available; confirm against the call site.
/// NOTE(review): m and c are advanced with +-32/+-48 offsets inside the wide
/// paths; net advance per iteration is exactly the bytes consumed.
/// </summary>
public static unsafe void ChaCha20(uint *x, byte *m, byte *c, ulong bytes) { if (Avx2.IsSupported && bytes >= 512) { Vector256 <uint> x_0 = Vector256.Create(x[0]); Vector256 <uint> x_1 = Vector256.Create(x[1]); Vector256 <uint> x_2 = Vector256.Create(x[2]); Vector256 <uint> x_3 = Vector256.Create(x[3]); Vector256 <uint> x_4 = Vector256.Create(x[4]); Vector256 <uint> x_5 = Vector256.Create(x[5]); Vector256 <uint> x_6 = Vector256.Create(x[6]); Vector256 <uint> x_7 = Vector256.Create(x[7]); Vector256 <uint> x_8 = Vector256.Create(x[8]); Vector256 <uint> x_9 = Vector256.Create(x[9]); Vector256 <uint> x_10 = Vector256.Create(x[10]); Vector256 <uint> x_11 = Vector256.Create(x[11]); Vector256 <uint> x_12; Vector256 <uint> x_13; Vector256 <uint> x_14 = Vector256.Create(x[14]); Vector256 <uint> x_15 = Vector256.Create(x[15]); Vector256 <uint> orig0 = x_0; Vector256 <uint> orig1 = x_1; Vector256 <uint> orig2 = x_2; Vector256 <uint> orig3 = x_3; Vector256 <uint> orig4 = x_4; Vector256 <uint> orig5 = x_5; Vector256 <uint> orig6 = x_6; Vector256 <uint> orig7 = x_7; Vector256 <uint> orig8 = x_8; Vector256 <uint> orig9 = x_9; Vector256 <uint> orig10 = x_10; Vector256 <uint> orig11 = x_11; Vector256 <uint> orig12; Vector256 <uint> orig13; Vector256 <uint> orig14 = x_14; Vector256 <uint> orig15 = x_15; while (bytes >= 512) { Vector256 <uint> addv12 = Vector256.Create(0, 1, 2, 3).AsUInt32(); Vector256 <uint> addv13 = Vector256.Create(4, 5, 6, 7).AsUInt32(); Vector256 <uint> permute = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7).AsUInt32(); Vector256 <uint> t12, t13; x_0 = orig0; x_1 = orig1; x_2 = orig2; x_3 = orig3; x_4 = orig4; x_5 = orig5; x_6 = orig6; x_7 = orig7; x_8 = orig8; x_9 = orig9; x_10 = orig10; x_11 = orig11; x_14 = orig14; x_15 = orig15; uint in12 = x[12]; uint in13 = x[13]; ulong in1213 = in12 | ((ulong)in13 << 32); x_12 = x_13 = Avx2.BroadcastScalarToVector256(Sse2.X64.ConvertScalarToVector128UInt64(in1213)).AsUInt32(); t12 = Avx2.Add(addv12.AsUInt64(), // per-lane block offsets 0..3 / 4..7 added to the broadcast 64-bit counter
x_12.AsUInt64()).AsUInt32(); t13 = Avx2.Add(addv13.AsUInt64(), x_13.AsUInt64()).AsUInt32(); x_12 = Avx2.UnpackLow(t12, t13); x_13 = Avx2.UnpackHigh(t12, t13); t12 = Avx2.UnpackLow(x_12, x_13); t13 = Avx2.UnpackHigh(x_12, x_13); x_12 = Avx2.PermuteVar8x32(t12, permute); x_13 = Avx2.PermuteVar8x32(t13, permute); orig12 = x_12; orig13 = x_13; in1213 += 8; x[12] = (uint)(in1213 & 0xFFFFFFFF); x[13] = (uint)((in1213 >> 32) & 0xFFFFFFFF); for (int i = 0; i < 20; i += 2) { Vec256Round(ref x_0, ref x_4, ref x_8, ref x_12, ref x_1, ref x_5, ref x_9, ref x_13, ref x_2, ref x_6, ref x_10, ref x_14, ref x_3, ref x_7, ref x_11, ref x_15); Vec256Round(ref x_0, ref x_5, ref x_10, ref x_15, ref x_1, ref x_6, ref x_11, ref x_12, ref x_2, ref x_7, ref x_8, ref x_13, ref x_3, ref x_4, ref x_9, ref x_14); } Vector256 <uint> t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, t_13, t_14, t_15; t_0 = t_1 = t_2 = t_3 = t_4 = t_5 = t_6 = t_7 = t_8 = t_9 = t_10 = t_11 = t_12 = t_13 = t_14 = t_15 = Vector256.Create((uint)0); // ONEOCTO enter
OneQuadUnpack(ref x_0, ref x_1, ref x_2, ref x_3, ref t_0, ref t_1, ref t_2, ref t_3, ref orig0, ref orig1, ref orig2, ref orig3); OneQuadUnpack(ref x_4, ref x_5, ref x_6, ref x_7, ref t_4, ref t_5, ref t_6, ref t_7, ref orig4, ref orig5, ref orig6, ref orig7); t_0 = Avx2.Permute2x128(x_0, x_4, 0x20); t_4 = Avx2.Permute2x128(x_0, x_4, 0x31); t_1 = Avx2.Permute2x128(x_1, x_5, 0x20); t_5 = Avx2.Permute2x128(x_1, x_5, 0x31); t_2 = Avx2.Permute2x128(x_2, x_6, 0x20); t_6 = Avx2.Permute2x128(x_2, x_6, 0x31); t_3 = Avx2.Permute2x128(x_3, x_7, 0x20); t_7 = Avx2.Permute2x128(x_3, x_7, 0x31); t_0 = Avx2.Xor(t_0, Avx.LoadVector256(m).AsUInt32()); t_1 = Avx2.Xor(t_1, Avx.LoadVector256(m + 64).AsUInt32()); t_2 = Avx2.Xor(t_2, Avx.LoadVector256(m + 128).AsUInt32()); t_3 = Avx2.Xor(t_3, Avx.LoadVector256(m + 192).AsUInt32()); t_4 = Avx2.Xor(t_4, Avx.LoadVector256(m + 256).AsUInt32()); t_5 = Avx2.Xor(t_5, Avx.LoadVector256(m + 320).AsUInt32()); t_6 = 
Avx2.Xor(t_6, Avx.LoadVector256(m + 384).AsUInt32()); t_7 = Avx2.Xor(t_7, Avx.LoadVector256(m + 448).AsUInt32()); Avx.Store(c, t_0.AsByte()); Avx.Store(c + 64, t_1.AsByte()); Avx.Store(c + 128, t_2.AsByte()); Avx.Store(c + 192, t_3.AsByte()); Avx.Store(c + 256, t_4.AsByte()); Avx.Store(c + 320, t_5.AsByte()); Avx.Store(c + 384, t_6.AsByte()); Avx.Store(c + 448, t_7.AsByte()); // ONEOCTO exit
m += 32; c += 32; // ONEOCTO enter
OneQuadUnpack(ref x_8, ref x_9, ref x_10, ref x_11, ref t_8, ref t_9, ref t_10, ref t_11, ref orig8, ref orig9, ref orig10, ref orig11); OneQuadUnpack(ref x_12, ref x_13, ref x_14, ref x_15, ref t_12, ref t_13, ref t_14, ref t_15, ref orig12, ref orig13, ref orig14, ref orig15); t_8 = Avx2.Permute2x128(x_8, x_12, 0x20); t_12 = Avx2.Permute2x128(x_8, x_12, 0x31); t_9 = Avx2.Permute2x128(x_9, x_13, 0x20); t_13 = Avx2.Permute2x128(x_9, x_13, 0x31); t_10 = Avx2.Permute2x128(x_10, x_14, 0x20); t_14 = Avx2.Permute2x128(x_10, x_14, 0x31); t_11 = Avx2.Permute2x128(x_11, x_15, 0x20); t_15 = Avx2.Permute2x128(x_11, x_15, 0x31); t_8 = Avx2.Xor(t_8, Avx.LoadVector256(m).AsUInt32()); t_9 = Avx2.Xor(t_9, Avx.LoadVector256(m + 64).AsUInt32()); t_10 = Avx2.Xor(t_10, Avx.LoadVector256(m + 128).AsUInt32()); t_11 = Avx2.Xor(t_11, Avx.LoadVector256(m + 192).AsUInt32()); t_12 = Avx2.Xor(t_12, Avx.LoadVector256(m + 256).AsUInt32()); t_13 = Avx2.Xor(t_13, Avx.LoadVector256(m + 320).AsUInt32()); t_14 = Avx2.Xor(t_14, Avx.LoadVector256(m + 384).AsUInt32()); t_15 = Avx2.Xor(t_15, Avx.LoadVector256(m + 448).AsUInt32()); Avx.Store(c, t_8.AsByte()); Avx.Store(c + 64, t_9.AsByte()); Avx.Store(c + 128, t_10.AsByte()); Avx.Store(c + 192, t_11.AsByte()); Avx.Store(c + 256, t_12.AsByte()); Avx.Store(c + 320, t_13.AsByte()); Avx.Store(c + 384, t_14.AsByte()); Avx.Store(c + 448, t_15.AsByte()); // ONEOCTO exit
m -= 32; c -= 32; bytes -= 512; c += 512; m += 512; } } if (bytes >= 256) { Vector128 <uint> x_0 = Vector128.Create(x[0]); Vector128 <uint> x_1 = Vector128.Create(x[1]); 
// --- 4-block SSE2 path: same structure as above at 128-bit width ---
Vector128 <uint> x_2 = Vector128.Create(x[2]); Vector128 <uint> x_3 = Vector128.Create(x[3]); Vector128 <uint> x_4 = Vector128.Create(x[4]); Vector128 <uint> x_5 = Vector128.Create(x[5]); Vector128 <uint> x_6 = Vector128.Create(x[6]); Vector128 <uint> x_7 = Vector128.Create(x[7]); Vector128 <uint> x_8 = Vector128.Create(x[8]); Vector128 <uint> x_9 = Vector128.Create(x[9]); Vector128 <uint> x_10 = Vector128.Create(x[10]); Vector128 <uint> x_11 = Vector128.Create(x[11]); Vector128 <uint> x_12; Vector128 <uint> x_13; Vector128 <uint> x_14 = Vector128.Create(x[14]); Vector128 <uint> x_15 = Vector128.Create(x[15]); Vector128 <uint> orig0 = x_0; Vector128 <uint> orig1 = x_1; Vector128 <uint> orig2 = x_2; Vector128 <uint> orig3 = x_3; Vector128 <uint> orig4 = x_4; Vector128 <uint> orig5 = x_5; Vector128 <uint> orig6 = x_6; Vector128 <uint> orig7 = x_7; Vector128 <uint> orig8 = x_8; Vector128 <uint> orig9 = x_9; Vector128 <uint> orig10 = x_10; Vector128 <uint> orig11 = x_11; Vector128 <uint> orig12; Vector128 <uint> orig13; Vector128 <uint> orig14 = x_14; Vector128 <uint> orig15 = x_15; Vector128 <uint> t12, t13; while (bytes >= 256) { Vector128 <uint> addv12 = Vector128.Create(0, 1).AsUInt32(); Vector128 <uint> addv13 = Vector128.Create(2, 3).AsUInt32(); x_0 = orig0; x_1 = orig1; x_2 = orig2; x_3 = orig3; x_4 = orig4; x_5 = orig5; x_6 = orig6; x_7 = orig7; x_8 = orig8; x_9 = orig9; x_10 = orig10; x_11 = orig11; x_14 = orig14; x_15 = orig15; uint in12 = x[12]; uint in13 = x[13]; ulong in1213 = in12 | ((ulong)in13) << 32; t12 = Vector128.Create(in1213).AsUInt32(); t13 = Vector128.Create(in1213).AsUInt32(); x_12 = Sse2.Add(Vector128.AsUInt64 <uint>(addv12), Vector128.AsUInt64 <uint>(t12)).AsUInt32(); x_13 = Sse2.Add(Vector128.AsUInt64 <uint>(addv13), Vector128.AsUInt64 <uint>(t13)).AsUInt32(); t12 = Sse2.UnpackLow(x_12, x_13); t13 = Sse2.UnpackHigh(x_12, x_13); x_12 = Sse2.UnpackLow(t12, t13); x_13 = Sse2.UnpackHigh(t12, t13); orig12 = x_12; orig13 = x_13; in1213 += 4; x[12] 
= (uint)(in1213 & 0xFFFFFFFF); x[13] = (uint)(in1213 >> 32 & 0xFFFFFFFF); for (int i = 0; i < 20; i += 2) { Vec128QuarterRound(ref x_0, ref x_4, ref x_8, ref x_12); Vec128QuarterRound(ref x_1, ref x_5, ref x_9, ref x_13); Vec128QuarterRound(ref x_2, ref x_6, ref x_10, ref x_14); Vec128QuarterRound(ref x_3, ref x_7, ref x_11, ref x_15); Vec128QuarterRound(ref x_0, ref x_5, ref x_10, ref x_15); Vec128QuarterRound(ref x_1, ref x_6, ref x_11, ref x_12); Vec128QuarterRound(ref x_2, ref x_7, ref x_8, ref x_13); Vec128QuarterRound(ref x_3, ref x_4, ref x_9, ref x_14); } OneQuad(ref x_0, ref x_1, ref x_2, ref x_3, ref orig0, ref orig1, ref orig2, ref orig3, m, c); m += 16; c += 16; OneQuad(ref x_4, ref x_5, ref x_6, ref x_7, ref orig4, ref orig5, ref orig6, ref orig7, m, c); m += 16; c += 16; OneQuad(ref x_8, ref x_9, ref x_10, ref x_11, ref orig8, ref orig9, ref orig10, ref orig11, m, c); m += 16; c += 16; OneQuad(ref x_12, ref x_13, ref x_14, ref x_15, ref orig12, ref orig13, ref orig14, ref orig15, m, c); m -= 48; c -= 48; bytes -= 256; c += 256; m += 256; } } while (bytes >= 64) { Vector128 <uint> x_0 = Sse2.LoadVector128(x); Vector128 <uint> x_1 = Sse2.LoadVector128(x + 4); Vector128 <uint> x_2 = Sse2.LoadVector128(x + 8); Vector128 <uint> x_3 = Sse2.LoadVector128(x + 12); Vector128 <uint> t_1; for (int i = 0; i < 20; i += 2) { x_0 = Sse2.Add(x_0, x_1); x_3 = Sse2.Xor(x_3, x_0); x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32(); x_2 = Sse2.Add(x_2, x_3); x_1 = Sse2.Xor(x_1, x_2); t_1 = x_1; x_1 = Sse2.ShiftLeftLogical(x_1, 12); t_1 = Sse2.ShiftRightLogical(t_1, 20); x_1 = Sse2.Xor(x_1, t_1); x_0 = Sse2.Add(x_0, x_1); x_3 = Sse2.Xor(x_3, x_0); x_0 = Sse2.Shuffle(x_0, 147); x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32(); x_2 = Sse2.Add(x_2, x_3); x_3 = Sse2.Shuffle(x_3, 78); x_1 = Sse2.Xor(x_1, x_2); x_2 = Sse2.Shuffle(x_2, 57); t_1 = x_1; x_1 = Sse2.ShiftLeftLogical(x_1, 7); t_1 = Sse2.ShiftRightLogical(t_1, 25); x_1 = Sse2.Xor(x_1, t_1); x_0 = 
Sse2.Add(x_0, x_1); x_3 = Sse2.Xor(x_3, x_0); x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32(); x_2 = Sse2.Add(x_2, x_3); x_1 = Sse2.Xor(x_1, x_2); t_1 = x_1; x_1 = Sse2.ShiftLeftLogical(x_1, 12); t_1 = Sse2.ShiftRightLogical(t_1, 20); x_1 = Sse2.Xor(x_1, t_1); x_0 = Sse2.Add(x_0, x_1); x_3 = Sse2.Xor(x_3, x_0); x_0 = Sse2.Shuffle(x_0, 57); x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32(); x_2 = Sse2.Add(x_2, x_3); x_3 = Sse2.Shuffle(x_3, 78); x_1 = Sse2.Xor(x_1, x_2); x_2 = Sse2.Shuffle(x_2, 147); t_1 = x_1; x_1 = Sse2.ShiftLeftLogical(x_1, 7); t_1 = Sse2.ShiftRightLogical(t_1, 25); x_1 = Sse2.Xor(x_1, t_1); } x_0 = Sse2.Add(x_0, Sse2.LoadVector128(x)); x_1 = Sse2.Add(x_1, Sse2.LoadVector128(x + 4)); x_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8)); x_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12)); x_0 = Sse2.Xor(x_0.AsByte(), Sse2.LoadVector128(m)).AsUInt32(); x_1 = Sse2.Xor(x_1.AsByte(), Sse2.LoadVector128(m + 16)).AsUInt32(); x_2 = Sse2.Xor(x_2.AsByte(), Sse2.LoadVector128(m + 32)).AsUInt32(); x_3 = Sse2.Xor(x_3.AsByte(), Sse2.LoadVector128(m + 48)).AsUInt32(); Sse2.Store(c, x_0.AsByte()); Sse2.Store(c + 16, x_1.AsByte()); Sse2.Store(c + 32, x_2.AsByte()); Sse2.Store(c + 48, x_3.AsByte()); uint in12 = x[12]; uint in13 = x[13]; in12++; if (in12 == 0) { in13++; } x[12] = in12; x[13] = in13; bytes -= 64; c += 64; m += 64; } if (bytes > 0) { Vector128 <uint> x_0 = Sse2.LoadVector128(x); Vector128 <uint> x_1 = Sse2.LoadVector128(x + 4); Vector128 <uint> x_2 = Sse2.LoadVector128(x + 8); Vector128 <uint> x_3 = Sse2.LoadVector128(x + 12); Vector128 <uint> t_1; for (int i = 0; i < 20; i += 2) { x_0 = Sse2.Add(x_0, x_1); x_3 = Sse2.Xor(x_3, x_0); x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32(); x_2 = Sse2.Add(x_2, x_3); x_1 = Sse2.Xor(x_1, x_2); t_1 = x_1; x_1 = Sse2.ShiftLeftLogical(x_1, 12); t_1 = Sse2.ShiftRightLogical(t_1, 20); x_1 = Sse2.Xor(x_1, t_1); x_0 = Sse2.Add(x_0, x_1); x_3 = Sse2.Xor(x_3, x_0); x_0 = Sse2.Shuffle(x_0, 0x93); x_3 = 
Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32(); x_2 = Sse2.Add(x_2, x_3); x_3 = Sse2.Shuffle(x_3, 0x4e); x_1 = Sse2.Xor(x_1, x_2); x_2 = Sse2.Shuffle(x_2, 0x39); t_1 = x_1; x_1 = Sse2.ShiftLeftLogical(x_1, 7); t_1 = Sse2.ShiftRightLogical(t_1, 25); x_1 = Sse2.Xor(x_1, t_1); x_0 = Sse2.Add(x_0, x_1); x_3 = Sse2.Xor(x_3, x_0); x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32(); x_2 = Sse2.Add(x_2, x_3); x_1 = Sse2.Xor(x_1, x_2); t_1 = x_1; x_1 = Sse2.ShiftLeftLogical(x_1, 12); t_1 = Sse2.ShiftRightLogical(t_1, 20); x_1 = Sse2.Xor(x_1, t_1); x_0 = Sse2.Add(x_0, x_1); x_3 = Sse2.Xor(x_3, x_0); x_0 = Sse2.Shuffle(x_0, 0x39); x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32(); x_2 = Sse2.Add(x_2, x_3); x_3 = Sse2.Shuffle(x_3, 0x4e); x_1 = Sse2.Xor(x_1, x_2); x_2 = Sse2.Shuffle(x_2, 0x93); t_1 = x_1; x_1 = Sse2.ShiftLeftLogical(x_1, 7); t_1 = Sse2.ShiftRightLogical(t_1, 25); x_1 = Sse2.Xor(x_1, t_1); } x_0 = Sse2.Add(x_0, Sse2.LoadVector128(x)); x_1 = Sse2.Add(x_1, Sse2.LoadVector128(x + 4)); x_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8)); x_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12)); byte *partialblock = stackalloc byte[64]; Sse2.Store(partialblock, Vector128.AsByte(x_0)); Sse2.Store(partialblock + 16, Vector128.AsByte(x_1)); Sse2.Store(partialblock + 32, Vector128.AsByte(x_2)); Sse2.Store(partialblock + 48, Vector128.AsByte(x_3)); for (ulong i = 0; i < bytes; i++) { c[i] = (byte)(m[i] ^ partialblock[i]); } for (int n = 0; n < 64 / sizeof(int); n++) { ((int *)partialblock)[n] = 0; } } }
/// <summary>
/// Advances the byte-per-cell field by one generation. Maintains rolling
/// per-row horizontal neighbour sums — pairs (left+right) and triples
/// (left+centre+right) — for the rows above, at and below the row being
/// updated, so each cell's eight-neighbour count is assembled from three
/// precomputed line sums. AVX2 path only; processes 32 cells per inner-loop
/// iteration. Border rows 0 and HEIGHT-1 are not rewritten.
/// </summary>
public override void Step()
{
    fixed (byte* currentFieldPtr = field,
           upperLineSumOf2Ptr = upperLineSumOf2, upperLineSumOf3Ptr = upperLineSumOf3,
           middleLineSumOf2Ptr = middleLineSumOf2, middleLineSumOf3Ptr = middleLineSumOf3,
           lowerLineSumOf2Ptr = lowerLineSumOf2, lowerLineSumOf3Ptr = lowerLineSumOf3)
    {
        byte* upper2 = upperLineSumOf2Ptr, upper3 = upperLineSumOf3Ptr;
        byte* middle2 = middleLineSumOf2Ptr, middle3 = middleLineSumOf3Ptr;
        byte* lower2 = lowerLineSumOf2Ptr, lower3 = lowerLineSumOf3Ptr;
        byte* nextLinePtr = currentFieldPtr + WIDTH;

        // Prime the pipeline: row 0 is a border, so its "upper"/"middle" sums are
        // zero and the "lower" sums come from row 1 (unaligned +-1 loads give the
        // left/right neighbours directly).
        for (int x = 0; x < WIDTH; x += 32)
        {
            Avx2.Store(upper2 + x, Vector256<byte>.Zero);
            Avx2.Store(upper3 + x, Vector256<byte>.Zero);
            Avx2.Store(middle2 + x, Vector256<byte>.Zero);
            Avx2.Store(middle3 + x, Vector256<byte>.Zero);
            Vector256<byte> sum2 = Avx2.Add(
                Avx2.LoadVector256(nextLinePtr + x - 1),
                Avx2.LoadVector256(nextLinePtr + x + 1));
            Avx2.Store(lower2 + x, sum2);
            Avx2.Store(lower3 + x, Avx2.Add(sum2, Avx2.LoadVector256(nextLinePtr + x)));
        }

        for (int y = 1; y < HEIGHT - 1; y++)
        {
            nextLinePtr += WIDTH;
            // Rotate the line-sum buffers one row down: lower -> middle -> upper;
            // the old upper buffers are recycled for the new lower sums.
            byte* temp2 = upper2;
            byte* temp3 = upper3;
            upper2 = middle2; upper3 = middle3;
            middle2 = lower2; middle3 = lower3;
            lower2 = temp2; lower3 = temp3;

            for (int x = 0; x < WIDTH; x += 32)
            {
                Vector256<byte> left = Avx2.LoadVector256(nextLinePtr + x - 1);
                Vector256<byte> center = Avx2.LoadVector256(nextLinePtr + x);
                Vector256<byte> right = Avx2.LoadVector256(nextLinePtr + x + 1);
                Vector256<byte> lowerSum2 = Avx2.Add(left, right);
                Vector256<byte> lowerSum3 = Avx2.Add(lowerSum2, center);
                Avx2.Store(lower2 + x, lowerSum2);
                // Fix: reuse lowerSum3 here — the original recomputed
                // Add(lowerSum2, center) for this store (no behavioral change).
                Avx2.Store(lower3 + x, lowerSum3);
                // Eight-neighbour count = pair sum of the current row (centre excluded)
                // + triple sums of the rows above and below.
                Vector256<byte> neighbours = Avx2.Add(
                    Avx2.LoadVector256(middle2 + x),
                    Avx2.Add(Avx2.LoadVector256(upper3 + x), lowerSum3));
                Vector256<byte> alive = Avx2.LoadVector256(nextLinePtr - WIDTH + x);
                // Move each cell's own state bit to bit 3, so (count | alive << 3)
                // forms the index into the 16-entry rule lookup table.
                alive = Avx2.ShiftLeftLogical(alive.AsUInt64(), (byte)3).AsByte();
                Vector256<byte> mask = Avx2.Or(neighbours, alive);
                Vector256<byte> shouldBeAlive = Avx2.Shuffle(v_lookup, mask);
                Avx2.Store(nextLinePtr - WIDTH + x, shouldBeAlive);
            }
            // Clear the border cells at both ends of the row just written.
            *(nextLinePtr - WIDTH) = 0;
            *(nextLinePtr - 1) = 0;
        }
    }
}
/// <summary>
/// Round-trips a 64-bit value through a nibble-compressed byte representation
/// and asserts the restored value equals the input. Requires AVX2.
/// Load phase: the ulong is broadcast, mask1 replicates each of its 8 bytes
/// four times, mask2/mask3 isolate the even bits (0,2,4,6) and odd bits
/// (1,3,5,7) of each byte, Min(..., one) saturates them to 0/1, and the odd
/// bits are shifted into the high nibble — yielding 32 bytes carrying two of
/// the original bits each (one per nibble).
/// Restore phase: low/high nibbles are compared against zero to produce
/// 0x00/0xFF bytes, restore_mask1/restore_mask2 spread them for MoveMask, and
/// the four 32-bit movemasks are interleaved (even bits from the low-nibble
/// lanes, odd bits shifted in from the high-nibble lanes) and recombined into
/// the 64-bit result.
/// NOTE(review): the shuffle/movemask lane arithmetic below is assumed to
/// invert the load phase exactly — the final Assert.AreEqual is what verifies it.
/// </summary>
public unsafe void Test_AVX_BitsToBytesCompressed() { ulong x = 0b0000_0001__0010_0011__0100_0101__0110_0111____1000_1001__1010_1011__1100_1101__1110_1111ul; Vector256 <byte> mask1, mask2, mask3, zero = Vector256 <byte> .Zero, one, ff, low4, hi4, restore_mask1, restore_mask2; byte[] mask1_bytes = new byte[] { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, }; byte[] mask2_bytes = new byte[] { 0x01, 0x04, 0x10, 0x40, 0x01, 0x04, 0x10, 0x40, 0x01, 0x04, 0x10, 0x40, 0x01, 0x04, 0x10, 0x40, 0x01, 0x04, 0x10, 0x40, 0x01, 0x04, 0x10, 0x40, 0x01, 0x04, 0x10, 0x40, 0x01, 0x04, 0x10, 0x40, }; byte[] mask3_bytes = new byte[] { 0x02, 0x08, 0x20, 0x80, 0x02, 0x08, 0x20, 0x80, 0x02, 0x08, 0x20, 0x80, 0x02, 0x08, 0x20, 0x80, 0x02, 0x08, 0x20, 0x80, 0x02, 0x08, 0x20, 0x80, 0x02, 0x08, 0x20, 0x80, 0x02, 0x08, 0x20, 0x80, }; byte[] restore_mask1_bytes = new byte[] { 0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, }; byte[] restore_mask2_bytes = new byte[] { 8, 255, 9, 255, 10, 255, 11, 255, 12, 255, 13, 255, 14, 255, 15, 255, 8, 255, 9, 255, 10, 255, 11, 255, 12, 255, 13, 255, 14, 255, 15, 255, }; fixed(byte *ptr = mask1_bytes) mask1 = Avx2.LoadVector256(ptr); fixed(byte *ptr = mask2_bytes) mask2 = Avx2.LoadVector256(ptr); fixed(byte *ptr = mask3_bytes) mask3 = Avx2.LoadVector256(ptr); fixed(byte *ptr = restore_mask1_bytes) restore_mask1 = Avx2.LoadVector256(ptr); fixed(byte *ptr = restore_mask2_bytes) restore_mask2 = Avx2.LoadVector256(ptr); byte one_byte = 1; one = Avx2.BroadcastScalarToVector256(&one_byte); byte ff_byte = 0xff; ff = Avx2.BroadcastScalarToVector256(&ff_byte); byte lo4_byte = 0x0f; low4 = Avx2.BroadcastScalarToVector256(&lo4_byte); byte hi4_byte = 0xf0; hi4 = Avx2.BroadcastScalarToVector256(&hi4_byte); // ***** load **** // Vector256 <byte> v = Avx2.BroadcastScalarToVector256(&x).AsByte(); v = Avx2.Shuffle(v, mask1); Vector256 <byte> v1 = 
Avx2.And(v, mask2); v1 = Avx2.Min(v1, one); Vector256 <byte> v2 = Avx2.And(v, mask3); v2 = Avx2.Min(v2, one); v2 = Avx2.ShiftLeftLogical(v2.AsUInt64(), 4).AsByte(); v = Avx2.Or(v1, v2); // ***** restore **** // v1 = Avx2.And(v, low4); v2 = Avx2.And(v, hi4); v1 = Avx2.CompareEqual(v1, zero); v1 = Avx2.AndNot(v1, ff); v2 = Avx2.CompareEqual(v2, zero); v2 = Avx2.AndNot(v2, ff); Vector256 <byte> r1 = Avx2.Shuffle(v1, restore_mask1); Vector256 <byte> r2 = Avx2.Shuffle(v1, restore_mask2); Vector256 <byte> r3 = Avx2.Shuffle(v2, restore_mask1); Vector256 <byte> r4 = Avx2.Shuffle(v2, restore_mask2); ulong restored1 = (uint)Avx2.MoveMask(r1); ulong restored2 = (uint)Avx2.MoveMask(r2); ulong restored3 = (uint)Avx2.MoveMask(r3); ulong restored4 = (uint)Avx2.MoveMask(r4); ulong restored_lo = restored1 | (restored3 << 1); ulong restored_hi = restored2 | (restored4 << 1); ulong restored = (restored_lo & 0xFFFF) | (restored_hi & 0xFFFF) << 16 | (restored_lo & 0xFFFF0000) << 16 | (restored_hi & 0xFFFF0000) << 32; Assert.AreEqual(x, restored); }