/// <summary>
/// Bitwise XOR of two 256-bit vectors of any element type.
/// </summary>
/// <remarks>
/// Both operands are reinterpreted as vectors of 64-bit unsigned lanes, handed to the
/// <c>Xor_Software</c> overload that overload resolution picks for those arguments, and the
/// result is reinterpreted back to <typeparamref name="T"/>.
/// </remarks>
/// <param name="left">First operand.</param>
/// <param name="right">Second operand.</param>
/// <returns>The XOR result, viewed as a vector of <typeparamref name="T"/>.</returns>
public static Vector256 <T> Xor_Software <T>(Vector256 <T> left, Vector256 <T> right) where T : struct
{
    Vector256<ulong> leftLanes  = left.AsUInt64();
    Vector256<ulong> rightLanes = right.AsUInt64();
    Vector256<ulong> combined   = Xor_Software(leftLanes, rightLanes);

    return combined.As<ulong, T>();
}
Exemple #2
0
        /// <summary>
        /// Mixes one stripe of <paramref name="input"/> with <paramref name="secret"/> and adds the
        /// result into the accumulator lanes held in <paramref name="acc"/>.
        /// </summary>
        /// <remarks>
        /// Per 64-bit lane: the data word is XORed with the secret word, the low and high 32-bit
        /// halves of that value are multiplied into a 64-bit product which is added to the lane,
        /// and the raw data word is added to the neighbouring lane (lane index XOR 1).
        /// The AVX2 and SSE2 branches walk <c>StripeLen</c> bytes through raw pointers; the scalar
        /// branch walks <c>AccNb</c> lanes through the spans. No length validation is done here.
        /// NOTE(review): the three branches are presumably meant to be equivalent, which requires
        /// <c>StripeLen == AccNb * 8</c> — both constants are defined outside this method.
        /// </remarks>
        /// <param name="acc">Accumulator lanes, updated in place.</param>
        /// <param name="input">Bytes of the stripe to consume.</param>
        /// <param name="secret">Secret bytes paired with <paramref name="input"/>.</param>
        private unsafe static void Xxh3Accumulate512(Span <ulong> acc, ReadOnlySpan <byte> input, ReadOnlySpan <byte> secret)
        {
            // Shuffle control moving the odd (high) 32-bit element of each 64-bit lane into the
            // even slot that the widening multiply reads.
            const byte HighHalfToLow = 0b00110001;
            // Shuffle control exchanging the two 64-bit halves of every 128-bit lane.
            const byte SwapQwords = 0b01001110;

            if (Avx2.IsSupported)
            {
                fixed (ulong* accBase = acc)
                fixed (byte* inputBase = input, secretBase = secret)
                {
                    Vector256<ulong>* accVecs    = (Vector256<ulong>*)accBase;
                    Vector256<byte>*  inputVecs  = (Vector256<byte>*)inputBase;
                    Vector256<byte>*  secretVecs = (Vector256<byte>*)secretBase;

                    for (ulong v = 0; v < StripeLen / 32; v++)
                    {
                        Vector256<byte> data   = inputVecs[v];
                        Vector256<byte> key    = secretVecs[v];
                        Vector256<uint> keyed  = Avx2.Xor(data, key).AsUInt32();
                        Vector256<uint> keyedHigh = Avx2.Shuffle(keyed, HighHalfToLow);
                        Vector256<ulong> mul   = Avx2.Multiply(keyed, keyedHigh);
                        Vector256<ulong> swapped = Avx2.Shuffle(data.AsUInt32(), SwapQwords).AsUInt64();

                        accVecs[v] = Avx2.Add(mul, Avx2.Add(accVecs[v], swapped));
                    }
                }

                return;
            }

            if (Sse2.IsSupported)
            {
                fixed (ulong* accBase = acc)
                fixed (byte* inputBase = input, secretBase = secret)
                {
                    Vector128<ulong>* accVecs    = (Vector128<ulong>*)accBase;
                    Vector128<byte>*  inputVecs  = (Vector128<byte>*)inputBase;
                    Vector128<byte>*  secretVecs = (Vector128<byte>*)secretBase;

                    for (ulong v = 0; v < StripeLen / 16; v++)
                    {
                        Vector128<byte> data   = inputVecs[v];
                        Vector128<byte> key    = secretVecs[v];
                        Vector128<uint> keyed  = Sse2.Xor(data, key).AsUInt32();
                        Vector128<uint> keyedHigh = Sse2.Shuffle(keyed, HighHalfToLow);
                        Vector128<ulong> mul   = Sse2.Multiply(keyed, keyedHigh);
                        Vector128<ulong> swapped = Sse2.Shuffle(data.AsUInt32(), SwapQwords).AsUInt64();

                        accVecs[v] = Sse2.Add(mul, Sse2.Add(accVecs[v], swapped));
                    }
                }

                return;
            }

            // Portable fallback: one 64-bit lane per iteration.
            for (int lane = 0; lane < AccNb; lane++)
            {
                int byteOffset = lane * sizeof(ulong);

                ulong dataWord = BinaryPrimitives.ReadUInt64LittleEndian(input.Slice(byteOffset));
                ulong keyWord  = BinaryPrimitives.ReadUInt64LittleEndian(secret.Slice(byteOffset));
                ulong keyed    = dataWord ^ keyWord;

                acc[lane ^ 1] += dataWord;
                acc[lane]     += Mult32To64((uint)keyed, keyed >> 32);
            }
        }
        /// <summary>
        /// Recomputes rows 1 .. HEIGHT - 2 of <c>field</c> in place, 32 bytes per AVX2 operation.
        /// </summary>
        /// <remarks>
        /// Three pairs of scratch rows (upper / middle / lower) hold, for the row above, the row
        /// being updated and the row below, the sum of each cell's two horizontal neighbours
        /// ("SumOf2") and that sum plus the cell itself ("SumOf3"). The pairs are rotated once per
        /// row, so only the newly exposed row has to be summed. The row above row 1 is taken as all
        /// zero because the upper and middle scratch rows are cleared before the main loop.
        /// Horizontal neighbours are obtained by shifting each 64-bit lane by 4 bits in both
        /// directions; the bits that cross a lane boundary are taken from vectors loaded 8 bytes
        /// before and after the current position.
        /// NOTE(review): the 4-bit shifts and the 0xF0 / 0x0F edge masks suggest one cell per
        /// nibble — confirm against the field layout. <c>v_hi</c>, <c>v_lo</c> and <c>v_lookup</c>
        /// are defined outside this method.
        /// NOTE(review): the loads at <c>x - 8</c> and <c>x + 8</c> reach outside the row being
        /// summed; for the last row this presumably relies on padding after <c>field</c> — verify.
        /// </remarks>
        public override void Step()
        {
            fixed(byte *
                  currentFieldPtr     = field,
                  upperLineSumOf2Ptr  = upperLineSumOf2,
                  upperLineSumOf3Ptr  = upperLineSumOf3,
                  middleLineSumOf2Ptr = middleLineSumOf2,
                  middleLineSumOf3Ptr = middleLineSumOf3,
                  lowerLineSumOf2Ptr  = lowerLineSumOf2,
                  lowerLineSumOf3Ptr  = lowerLineSumOf3)
            {
                // Mutable copies: the pointers declared by `fixed` cannot be reassigned, but the
                // three scratch-row pairs are rotated on every row.
                byte *upper2 = upperLineSumOf2Ptr, upper3 = upperLineSumOf3Ptr;
                byte *middle2 = middleLineSumOf2Ptr, middle3 = middleLineSumOf3Ptr;
                byte *lower2 = lowerLineSumOf2Ptr, lower3 = lowerLineSumOf3Ptr;
                byte *nextLinePtr = currentFieldPtr + LINE_WIDTH;

                // Prime the scratch rows: upper and middle start at zero, lower receives the
                // horizontal sums of field row 1.
                for (int x = 0; x < LINE_WIDTH; x += 32)
                {
                    Avx2.Store(upper2 + x, Vector256 <byte> .Zero);
                    Avx2.Store(upper3 + x, Vector256 <byte> .Zero);
                    Avx2.Store(middle2 + x, Vector256 <byte> .Zero);
                    Avx2.Store(middle3 + x, Vector256 <byte> .Zero);
                    Vector256 <byte> nextLeft   = Avx2.LoadVector256(nextLinePtr + x - 8);
                    Vector256 <byte> nextCenter = Avx2.LoadVector256(nextLinePtr + x);
                    Vector256 <byte> nextRight  = Avx2.LoadVector256(nextLinePtr + x + 8);
                    // (center >> 4) + (center << 4) adds both in-lane neighbours; the >> 60 / << 60
                    // terms supply the 4 bits that cross each 64-bit lane boundary.
                    Vector256 <byte> lowerSum2  =
                        Avx2.Add(
                            Avx2.Add(
                                Avx2.ShiftRightLogical(nextCenter.AsUInt64(), 4), Avx2.ShiftLeftLogical(nextCenter.AsUInt64(), 4)),
                            Avx2.Add(
                                Avx2.ShiftRightLogical(nextLeft.AsUInt64(), 60), Avx2.ShiftLeftLogical(nextRight.AsUInt64(), 60))).AsByte();
                    Avx2.Store(lower2 + x, lowerSum2);
                    Vector256 <byte> lowerSum3 = Avx2.Add(lowerSum2, nextCenter);
                    Avx2.Store(lower3 + x, lowerSum3);
                }

                for (int y = 1; y < HEIGHT - 1; y++)
                {
                    // nextLinePtr now addresses row y + 1; row y is at nextLinePtr - LINE_WIDTH.
                    nextLinePtr += LINE_WIDTH;

                    // Rotate the scratch rows: middle -> upper, lower -> middle, and the old
                    // upper buffers are reused for the sums of row y + 1.
                    byte *temp2 = upper2;
                    byte *temp3 = upper3;
                    upper2  = middle2;
                    upper3  = middle3;
                    middle2 = lower2;
                    middle3 = lower3;
                    lower2  = temp2;
                    lower3  = temp3;

                    for (int x = 0; x < LINE_WIDTH; x += 32)
                    {
                        Vector256 <byte> nextLeft   = Avx2.LoadVector256(nextLinePtr + x - 8);
                        Vector256 <byte> nextCenter = Avx2.LoadVector256(nextLinePtr + x);
                        Vector256 <byte> nextRight  = Avx2.LoadVector256(nextLinePtr + x + 8);
                        Vector256 <byte> lowerSum2  =
                            Avx2.Add(
                                Avx2.Add(
                                    Avx2.ShiftRightLogical(nextCenter.AsUInt64(), 4), Avx2.ShiftLeftLogical(nextCenter.AsUInt64(), 4)),
                                Avx2.Add(
                                    Avx2.ShiftRightLogical(nextLeft.AsUInt64(), 60), Avx2.ShiftLeftLogical(nextRight.AsUInt64(), 60))).AsByte();
                        Vector256 <byte> lowerSum3 = Avx2.Add(lowerSum2, nextCenter);

                        // Each sum is computed once and stored once (previously lowerSum2 was
                        // stored twice and lowerSum3 was recomputed for its store).
                        Avx2.Store(lower2 + x, lowerSum2);
                        Avx2.Store(lower3 + x, lowerSum3);

                        // Neighbour count = side neighbours in this row + full triples above and below.
                        Vector256 <byte> neighbours =
                            Avx2.Add(
                                Avx2.LoadVector256(middle2 + x),
                                Avx2.Add(Avx2.LoadVector256(upper3 + x), lowerSum3));
                        Vector256 <byte> alive = Avx2.LoadVector256(nextLinePtr - LINE_WIDTH + x);

                        // Move the current state 3 bits up and OR it onto the neighbour count to
                        // form the lookup index.
                        alive = Avx2.ShiftLeftLogical(alive.AsUInt64(), (byte)3).AsByte();
                        Vector256 <byte> mask = Avx2.Or(neighbours, alive);

                        // The byte shuffle looks up one index per byte, so the two halves selected
                        // by v_hi / v_lo are looked up separately and merged afterwards.
                        Vector256 <byte> mask_hi = Avx2.And(mask, v_hi);
                        Vector256 <byte> mask_lo = Avx2.And(mask, v_lo);
                        mask_hi = Avx2.ShiftRightLogical(mask_hi.AsUInt64(), 4).AsByte();
                        Vector256 <byte> shouldBeAlive_hi = Avx2.Shuffle(v_lookup, mask_hi);
                        Vector256 <byte> shouldBeAlive_lo = Avx2.Shuffle(v_lookup, mask_lo);
                        shouldBeAlive_hi = Avx2.ShiftLeftLogical(shouldBeAlive_hi.AsUInt64(), 4).AsByte();

                        Vector256 <byte> shouldBeAlive = Avx2.Or(shouldBeAlive_hi, shouldBeAlive_lo);

                        Avx2.Store(nextLinePtr - LINE_WIDTH + x, shouldBeAlive);
                    }
                    // Clear the low 4 bits of the row's first byte and the high 4 bits of its last
                    // byte (the two edge positions of row y).
                    *(byte *)(nextLinePtr - LINE_WIDTH) &= 0xF0;
                    *(byte *)(nextLinePtr - 1)          &= 0x0F;
                }
            }
        }
Exemple #4
0
 /// <summary>
 /// Generic front end for the AND-NOT software fallback on 256-bit vectors.
 /// </summary>
 /// <remarks>
 /// Both operands are reinterpreted as 64-bit unsigned lanes, passed in the same order to the
 /// <c>AndNot_Software</c> overload chosen for those arguments, and the result is reinterpreted
 /// back to <typeparamref name="T"/>. Which operand is complemented is decided by that overload.
 /// </remarks>
 /// <param name="left">First operand.</param>
 /// <param name="right">Second operand.</param>
 /// <returns>The result, viewed as a vector of <typeparamref name="T"/>.</returns>
 public static Vector256 <T> AndNot_Software <T>(Vector256 <T> left, Vector256 <T> right) where T : struct
     => AndNot_Software(left.AsUInt64(), right.AsUInt64()).As<ulong, T>();
Exemple #5
0
 /// <summary>
 /// Generic front end for the bitwise-NOT software fallback on a 256-bit vector.
 /// </summary>
 /// <remarks>
 /// The operand is reinterpreted as 64-bit unsigned lanes, passed to the <c>Not_Software</c>
 /// overload chosen for that argument, and the result is reinterpreted back to
 /// <typeparamref name="T"/>.
 /// </remarks>
 /// <param name="vector">Operand.</param>
 /// <returns>The result, viewed as a vector of <typeparamref name="T"/>.</returns>
 public static Vector256 <T> Not_Software <T>(Vector256 <T> vector) where T : struct
     => Not_Software(vector.AsUInt64()).As<ulong, T>();
Exemple #6
0
        /// <summary>
        /// XORs <paramref name="bytes"/> bytes read from <paramref name="m"/> with a keystream derived
        /// from the 16-word state at <paramref name="x"/> and writes the result to <paramref name="c"/>.
        /// </summary>
        /// <remarks>
        /// The input is consumed in four stages, each handling what the previous one left over:
        /// 512 bytes per iteration with 256-bit vectors (only when AVX2 is supported), 256 bytes per
        /// iteration with 128-bit vectors, 64 bytes per iteration, and finally a partial block of
        /// fewer than 64 bytes.
        /// Words 12 and 13 of <paramref name="x"/> are treated as the low and high halves of a 64-bit
        /// block counter: every full-block stage writes the advanced counter back to
        /// <paramref name="x"/>. The partial-block stage does not advance it.
        /// NOTE(review): only the first stage is guarded by an <c>IsSupported</c> check; the remaining
        /// stages call Sse2 / Ssse3 intrinsics unconditionally, so callers presumably verify support
        /// before calling — confirm.
        /// <c>Vec256Round</c>, <c>OneQuadUnpack</c>, <c>Vec128QuarterRound</c>, <c>OneQuad</c>,
        /// <c>rot16_128</c> and <c>rot8_128</c> are defined outside this method.
        /// </remarks>
        /// <param name="x">Pointer to 16 state words; words 12 and 13 are updated in place.</param>
        /// <param name="m">Pointer to the input bytes.</param>
        /// <param name="c">Pointer to the output buffer; receives <paramref name="bytes"/> bytes.</param>
        /// <param name="bytes">Number of bytes to process.</param>
        public static unsafe void ChaCha20(uint *x, byte *m, byte *c, ulong bytes)
        {
            // Stage 1: 512 bytes (eight 64-byte blocks) per iteration using 256-bit vectors.
            if (Avx2.IsSupported && bytes >= 512)
            {
                // Every state word except the counter words is broadcast to all eight 32-bit lanes.
                Vector256 <uint> x_0  = Vector256.Create(x[0]);
                Vector256 <uint> x_1  = Vector256.Create(x[1]);
                Vector256 <uint> x_2  = Vector256.Create(x[2]);
                Vector256 <uint> x_3  = Vector256.Create(x[3]);
                Vector256 <uint> x_4  = Vector256.Create(x[4]);
                Vector256 <uint> x_5  = Vector256.Create(x[5]);
                Vector256 <uint> x_6  = Vector256.Create(x[6]);
                Vector256 <uint> x_7  = Vector256.Create(x[7]);
                Vector256 <uint> x_8  = Vector256.Create(x[8]);
                Vector256 <uint> x_9  = Vector256.Create(x[9]);
                Vector256 <uint> x_10 = Vector256.Create(x[10]);
                Vector256 <uint> x_11 = Vector256.Create(x[11]);
                // x_12 / x_13 (counter words) differ per lane and are rebuilt inside the loop.
                Vector256 <uint> x_12;
                Vector256 <uint> x_13;
                Vector256 <uint> x_14 = Vector256.Create(x[14]);
                Vector256 <uint> x_15 = Vector256.Create(x[15]);

                // Copies of the initial state; the working registers are reset from these at the
                // top of every iteration and they are also passed to OneQuadUnpack.
                Vector256 <uint> orig0  = x_0;
                Vector256 <uint> orig1  = x_1;
                Vector256 <uint> orig2  = x_2;
                Vector256 <uint> orig3  = x_3;
                Vector256 <uint> orig4  = x_4;
                Vector256 <uint> orig5  = x_5;
                Vector256 <uint> orig6  = x_6;
                Vector256 <uint> orig7  = x_7;
                Vector256 <uint> orig8  = x_8;
                Vector256 <uint> orig9  = x_9;
                Vector256 <uint> orig10 = x_10;
                Vector256 <uint> orig11 = x_11;
                Vector256 <uint> orig12;
                Vector256 <uint> orig13;
                Vector256 <uint> orig14 = x_14;
                Vector256 <uint> orig15 = x_15;

                while (bytes >= 512)
                {
                    // 64-bit counter offsets 0..3 and 4..7, one per block of this iteration.
                    Vector256 <uint> addv12 = Vector256.Create(0, 1, 2, 3).AsUInt32();
                    Vector256 <uint> addv13 = Vector256.Create(4, 5, 6, 7).AsUInt32();
                    // Lane order used after the in-lane unpacks below to gather the counter words.
                    Vector256 <uint> permute = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7).AsUInt32();
                    Vector256 <uint> t12, t13;
                    x_0  = orig0;
                    x_1  = orig1;
                    x_2  = orig2;
                    x_3  = orig3;
                    x_4  = orig4;
                    x_5  = orig5;
                    x_6  = orig6;
                    x_7  = orig7;
                    x_8  = orig8;
                    x_9  = orig9;
                    x_10 = orig10;
                    x_11 = orig11;
                    x_14 = orig14;
                    x_15 = orig15;
                    // Combine state words 12 (low) and 13 (high) into one 64-bit counter.
                    uint  in12 = x[12];
                    uint  in13 = x[13];
                    ulong in1213 = in12 | ((ulong)in13 << 32);
                    x_12 = x_13 = Avx2.BroadcastScalarToVector256(Sse2.X64.ConvertScalarToVector128UInt64(in1213)).AsUInt32();
                    // 64-bit adds, so a carry out of the low word propagates into the high word.
                    t12  = Avx2.Add(addv12.AsUInt64(), x_12.AsUInt64()).AsUInt32();
                    t13  = Avx2.Add(addv13.AsUInt64(), x_13.AsUInt64()).AsUInt32();
                    // Two rounds of 32-bit unpacks plus a cross-lane permute regroup the eight
                    // 64-bit counters into a vector of low words (x_12) and one of high words (x_13).
                    x_12 = Avx2.UnpackLow(t12, t13);
                    x_13 = Avx2.UnpackHigh(t12, t13);
                    t12  = Avx2.UnpackLow(x_12, x_13);
                    t13  = Avx2.UnpackHigh(x_12, x_13);
                    x_12 = Avx2.PermuteVar8x32(t12, permute);
                    x_13 = Avx2.PermuteVar8x32(t13, permute);

                    orig12 = x_12;
                    orig13 = x_13;

                    // Advance the stored counter by the eight blocks produced in this iteration.
                    in1213 += 8;

                    x[12] = (uint)(in1213 & 0xFFFFFFFF);
                    x[13] = (uint)((in1213 >> 32) & 0xFFFFFFFF);
                    // 10 iterations of two Vec256Round calls each: first with the column
                    // grouping of the state words, then with the diagonal grouping.
                    for (int i = 0; i < 20; i += 2)
                    {
                        Vec256Round(ref x_0, ref x_4, ref x_8, ref x_12, ref x_1, ref x_5, ref x_9, ref x_13, ref x_2, ref x_6, ref x_10, ref x_14, ref x_3, ref x_7, ref x_11, ref x_15);
                        Vec256Round(ref x_0, ref x_5, ref x_10, ref x_15, ref x_1, ref x_6, ref x_11, ref x_12, ref x_2, ref x_7, ref x_8, ref x_13, ref x_3, ref x_4, ref x_9, ref x_14);
                    }

                    Vector256 <uint> t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, t_13, t_14, t_15;
                    t_0 = t_1 = t_2 = t_3 = t_4 = t_5 = t_6 = t_7 = t_8 = t_9 = t_10 = t_11 = t_12 = t_13 = t_14 = t_15 = Vector256.Create((uint)0);
                    // ONEOCTO enter
                    // State words 0-7: after OneQuadUnpack, merge 128-bit halves pairwise, XOR with
                    // the input and store. Each vector covers 32 bytes at a 64-byte stride, i.e. the
                    // first half of each of the eight blocks.
                    OneQuadUnpack(ref x_0, ref x_1, ref x_2, ref x_3, ref t_0, ref t_1, ref t_2, ref t_3, ref orig0, ref orig1, ref orig2, ref orig3);
                    OneQuadUnpack(ref x_4, ref x_5, ref x_6, ref x_7, ref t_4, ref t_5, ref t_6, ref t_7, ref orig4, ref orig5, ref orig6, ref orig7);
                    t_0 = Avx2.Permute2x128(x_0, x_4, 0x20);
                    t_4 = Avx2.Permute2x128(x_0, x_4, 0x31);
                    t_1 = Avx2.Permute2x128(x_1, x_5, 0x20);
                    t_5 = Avx2.Permute2x128(x_1, x_5, 0x31);
                    t_2 = Avx2.Permute2x128(x_2, x_6, 0x20);
                    t_6 = Avx2.Permute2x128(x_2, x_6, 0x31);
                    t_3 = Avx2.Permute2x128(x_3, x_7, 0x20);
                    t_7 = Avx2.Permute2x128(x_3, x_7, 0x31);
                    t_0 = Avx2.Xor(t_0, Avx.LoadVector256(m).AsUInt32());
                    t_1 = Avx2.Xor(t_1, Avx.LoadVector256(m + 64).AsUInt32());
                    t_2 = Avx2.Xor(t_2, Avx.LoadVector256(m + 128).AsUInt32());
                    t_3 = Avx2.Xor(t_3, Avx.LoadVector256(m + 192).AsUInt32());
                    t_4 = Avx2.Xor(t_4, Avx.LoadVector256(m + 256).AsUInt32());
                    t_5 = Avx2.Xor(t_5, Avx.LoadVector256(m + 320).AsUInt32());
                    t_6 = Avx2.Xor(t_6, Avx.LoadVector256(m + 384).AsUInt32());
                    t_7 = Avx2.Xor(t_7, Avx.LoadVector256(m + 448).AsUInt32());
                    Avx.Store(c, t_0.AsByte());
                    Avx.Store(c + 64, t_1.AsByte());
                    Avx.Store(c + 128, t_2.AsByte());
                    Avx.Store(c + 192, t_3.AsByte());
                    Avx.Store(c + 256, t_4.AsByte());
                    Avx.Store(c + 320, t_5.AsByte());
                    Avx.Store(c + 384, t_6.AsByte());
                    Avx.Store(c + 448, t_7.AsByte());
                    // ONEOCTO exit

                    // Shift by 32 bytes so the second pass fills the other half of each block.
                    m += 32;
                    c += 32;

                    // ONEOCTO enter
                    // Same sequence for state words 8-15.
                    OneQuadUnpack(ref x_8, ref x_9, ref x_10, ref x_11, ref t_8, ref t_9, ref t_10, ref t_11, ref orig8, ref orig9, ref orig10, ref orig11);
                    OneQuadUnpack(ref x_12, ref x_13, ref x_14, ref x_15, ref t_12, ref t_13, ref t_14, ref t_15, ref orig12, ref orig13, ref orig14, ref orig15);
                    t_8  = Avx2.Permute2x128(x_8, x_12, 0x20);
                    t_12 = Avx2.Permute2x128(x_8, x_12, 0x31);
                    t_9  = Avx2.Permute2x128(x_9, x_13, 0x20);
                    t_13 = Avx2.Permute2x128(x_9, x_13, 0x31);
                    t_10 = Avx2.Permute2x128(x_10, x_14, 0x20);
                    t_14 = Avx2.Permute2x128(x_10, x_14, 0x31);
                    t_11 = Avx2.Permute2x128(x_11, x_15, 0x20);
                    t_15 = Avx2.Permute2x128(x_11, x_15, 0x31);
                    t_8  = Avx2.Xor(t_8, Avx.LoadVector256(m).AsUInt32());
                    t_9  = Avx2.Xor(t_9, Avx.LoadVector256(m + 64).AsUInt32());
                    t_10 = Avx2.Xor(t_10, Avx.LoadVector256(m + 128).AsUInt32());
                    t_11 = Avx2.Xor(t_11, Avx.LoadVector256(m + 192).AsUInt32());
                    t_12 = Avx2.Xor(t_12, Avx.LoadVector256(m + 256).AsUInt32());
                    t_13 = Avx2.Xor(t_13, Avx.LoadVector256(m + 320).AsUInt32());
                    t_14 = Avx2.Xor(t_14, Avx.LoadVector256(m + 384).AsUInt32());
                    t_15 = Avx2.Xor(t_15, Avx.LoadVector256(m + 448).AsUInt32());
                    Avx.Store(c, t_8.AsByte());
                    Avx.Store(c + 64, t_9.AsByte());
                    Avx.Store(c + 128, t_10.AsByte());
                    Avx.Store(c + 192, t_11.AsByte());
                    Avx.Store(c + 256, t_12.AsByte());
                    Avx.Store(c + 320, t_13.AsByte());
                    Avx.Store(c + 384, t_14.AsByte());
                    Avx.Store(c + 448, t_15.AsByte());
                    // ONEOCTO exit
                    // Undo the 32-byte shift, then step past the 512 bytes just processed.
                    m     -= 32;
                    c     -= 32;
                    bytes -= 512;
                    c     += 512;
                    m     += 512;
                }
            }
            // Stage 2: 256 bytes (four 64-byte blocks) per iteration using 128-bit vectors.
            if (bytes >= 256)
            {
                Vector128 <uint> x_0  = Vector128.Create(x[0]);
                Vector128 <uint> x_1  = Vector128.Create(x[1]);
                Vector128 <uint> x_2  = Vector128.Create(x[2]);
                Vector128 <uint> x_3  = Vector128.Create(x[3]);
                Vector128 <uint> x_4  = Vector128.Create(x[4]);
                Vector128 <uint> x_5  = Vector128.Create(x[5]);
                Vector128 <uint> x_6  = Vector128.Create(x[6]);
                Vector128 <uint> x_7  = Vector128.Create(x[7]);
                Vector128 <uint> x_8  = Vector128.Create(x[8]);
                Vector128 <uint> x_9  = Vector128.Create(x[9]);
                Vector128 <uint> x_10 = Vector128.Create(x[10]);
                Vector128 <uint> x_11 = Vector128.Create(x[11]);
                // Counter words are rebuilt per iteration, as in stage 1.
                Vector128 <uint> x_12;
                Vector128 <uint> x_13;
                Vector128 <uint> x_14   = Vector128.Create(x[14]);
                Vector128 <uint> x_15   = Vector128.Create(x[15]);
                Vector128 <uint> orig0  = x_0;
                Vector128 <uint> orig1  = x_1;
                Vector128 <uint> orig2  = x_2;
                Vector128 <uint> orig3  = x_3;
                Vector128 <uint> orig4  = x_4;
                Vector128 <uint> orig5  = x_5;
                Vector128 <uint> orig6  = x_6;
                Vector128 <uint> orig7  = x_7;
                Vector128 <uint> orig8  = x_8;
                Vector128 <uint> orig9  = x_9;
                Vector128 <uint> orig10 = x_10;
                Vector128 <uint> orig11 = x_11;
                Vector128 <uint> orig12;
                Vector128 <uint> orig13;
                Vector128 <uint> orig14 = x_14;
                Vector128 <uint> orig15 = x_15;
                Vector128 <uint> t12, t13;

                while (bytes >= 256)
                {
                    // 64-bit counter offsets 0..1 and 2..3, one per block of this iteration.
                    Vector128 <uint> addv12 = Vector128.Create(0, 1).AsUInt32();
                    Vector128 <uint> addv13 = Vector128.Create(2, 3).AsUInt32();

                    x_0  = orig0;
                    x_1  = orig1;
                    x_2  = orig2;
                    x_3  = orig3;
                    x_4  = orig4;
                    x_5  = orig5;
                    x_6  = orig6;
                    x_7  = orig7;
                    x_8  = orig8;
                    x_9  = orig9;
                    x_10 = orig10;
                    x_11 = orig11;
                    x_14 = orig14;
                    x_15 = orig15;

                    // Combine state words 12 (low) and 13 (high) into one 64-bit counter.
                    uint  in12   = x[12];
                    uint  in13   = x[13];
                    ulong in1213 = in12 | ((ulong)in13) << 32;
                    t12 = Vector128.Create(in1213).AsUInt32();
                    t13 = Vector128.Create(in1213).AsUInt32();

                    // 64-bit adds, so a carry out of the low word propagates into the high word.
                    x_12 = Sse2.Add(Vector128.AsUInt64 <uint>(addv12), Vector128.AsUInt64 <uint>(t12)).AsUInt32();
                    x_13 = Sse2.Add(Vector128.AsUInt64 <uint>(addv13), Vector128.AsUInt64 <uint>(t13)).AsUInt32();

                    // Two rounds of 32-bit unpacks regroup the four 64-bit counters into a vector
                    // of low words (x_12) and a vector of high words (x_13).
                    t12 = Sse2.UnpackLow(x_12, x_13);
                    t13 = Sse2.UnpackHigh(x_12, x_13);

                    x_12 = Sse2.UnpackLow(t12, t13);
                    x_13 = Sse2.UnpackHigh(t12, t13);

                    orig12 = x_12;
                    orig13 = x_13;

                    // Advance the stored counter by the four blocks produced in this iteration.
                    in1213 += 4;

                    x[12] = (uint)(in1213 & 0xFFFFFFFF);
                    x[13] = (uint)(in1213 >> 32 & 0xFFFFFFFF);

                    // 10 iterations: four quarter rounds on the column grouping of the state
                    // words followed by four on the diagonal grouping.
                    for (int i = 0; i < 20; i += 2)
                    {
                        Vec128QuarterRound(ref x_0, ref x_4, ref x_8, ref x_12);
                        Vec128QuarterRound(ref x_1, ref x_5, ref x_9, ref x_13);
                        Vec128QuarterRound(ref x_2, ref x_6, ref x_10, ref x_14);
                        Vec128QuarterRound(ref x_3, ref x_7, ref x_11, ref x_15);
                        Vec128QuarterRound(ref x_0, ref x_5, ref x_10, ref x_15);
                        Vec128QuarterRound(ref x_1, ref x_6, ref x_11, ref x_12);
                        Vec128QuarterRound(ref x_2, ref x_7, ref x_8, ref x_13);
                        Vec128QuarterRound(ref x_3, ref x_4, ref x_9, ref x_14);
                    }
                    // OneQuad receives four state words, their originals and the m / c pointers;
                    // the pointers are advanced 16 bytes between calls.
                    OneQuad(ref x_0, ref x_1, ref x_2, ref x_3, ref orig0, ref orig1, ref orig2, ref orig3, m, c);
                    m += 16;
                    c += 16;
                    OneQuad(ref x_4, ref x_5, ref x_6, ref x_7, ref orig4, ref orig5, ref orig6, ref orig7, m, c);
                    m += 16;
                    c += 16;
                    OneQuad(ref x_8, ref x_9, ref x_10, ref x_11, ref orig8, ref orig9, ref orig10, ref orig11, m, c);
                    m += 16;
                    c += 16;
                    OneQuad(ref x_12, ref x_13, ref x_14, ref x_15, ref orig12, ref orig13, ref orig14, ref orig15, m, c);
                    // Undo the three 16-byte shifts, then step past the 256 bytes just processed.
                    m     -= 48;
                    c     -= 48;
                    bytes -= 256;
                    c     += 256;
                    m     += 256;
                }
            }
            // Stage 3: one 64-byte block per iteration; each vector holds one row of four state words.
            while (bytes >= 64)
            {
                Vector128 <uint> x_0 = Sse2.LoadVector128(x);
                Vector128 <uint> x_1 = Sse2.LoadVector128(x + 4);
                Vector128 <uint> x_2 = Sse2.LoadVector128(x + 8);
                Vector128 <uint> x_3 = Sse2.LoadVector128(x + 12);
                Vector128 <uint> t_1;

                // Each iteration runs the add / xor / rotate sequence twice; the 32-bit element
                // shuffles (147, 78, 57) rotate the rows between the halves and back again.
                // Rotations by 12 and 7 are built from a left shift, a right shift and an XOR;
                // the byte shuffles use rot16_128 / rot8_128 (presumably rotations by 16 and 8
                // bits — defined elsewhere).
                for (int i = 0; i < 20; i += 2)
                {
                    x_0 = Sse2.Add(x_0, x_1);
                    x_3 = Sse2.Xor(x_3, x_0);
                    x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32();

                    x_2 = Sse2.Add(x_2, x_3);
                    x_1 = Sse2.Xor(x_1, x_2);

                    t_1 = x_1;
                    x_1 = Sse2.ShiftLeftLogical(x_1, 12);
                    t_1 = Sse2.ShiftRightLogical(t_1, 20);
                    x_1 = Sse2.Xor(x_1, t_1);

                    x_0 = Sse2.Add(x_0, x_1);
                    x_3 = Sse2.Xor(x_3, x_0);
                    x_0 = Sse2.Shuffle(x_0, 147);
                    x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32();

                    x_2 = Sse2.Add(x_2, x_3);
                    x_3 = Sse2.Shuffle(x_3, 78);
                    x_1 = Sse2.Xor(x_1, x_2);
                    x_2 = Sse2.Shuffle(x_2, 57);

                    t_1 = x_1;
                    x_1 = Sse2.ShiftLeftLogical(x_1, 7);
                    t_1 = Sse2.ShiftRightLogical(t_1, 25);
                    x_1 = Sse2.Xor(x_1, t_1);

                    x_0 = Sse2.Add(x_0, x_1);
                    x_3 = Sse2.Xor(x_3, x_0);
                    x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32();

                    x_2 = Sse2.Add(x_2, x_3);
                    x_1 = Sse2.Xor(x_1, x_2);

                    t_1 = x_1;
                    x_1 = Sse2.ShiftLeftLogical(x_1, 12);
                    t_1 = Sse2.ShiftRightLogical(t_1, 20);
                    x_1 = Sse2.Xor(x_1, t_1);

                    x_0 = Sse2.Add(x_0, x_1);
                    x_3 = Sse2.Xor(x_3, x_0);
                    x_0 = Sse2.Shuffle(x_0, 57);
                    x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32();

                    x_2 = Sse2.Add(x_2, x_3);
                    x_3 = Sse2.Shuffle(x_3, 78);
                    x_1 = Sse2.Xor(x_1, x_2);
                    x_2 = Sse2.Shuffle(x_2, 147);

                    t_1 = x_1;
                    x_1 = Sse2.ShiftLeftLogical(x_1, 7);
                    t_1 = Sse2.ShiftRightLogical(t_1, 25);
                    x_1 = Sse2.Xor(x_1, t_1);
                }
                // Add the original state back in, then XOR the 64 keystream bytes with the input.
                x_0 = Sse2.Add(x_0, Sse2.LoadVector128(x));
                x_1 = Sse2.Add(x_1, Sse2.LoadVector128(x + 4));
                x_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8));
                x_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12));
                x_0 = Sse2.Xor(x_0.AsByte(), Sse2.LoadVector128(m)).AsUInt32();
                x_1 = Sse2.Xor(x_1.AsByte(), Sse2.LoadVector128(m + 16)).AsUInt32();
                x_2 = Sse2.Xor(x_2.AsByte(), Sse2.LoadVector128(m + 32)).AsUInt32();
                x_3 = Sse2.Xor(x_3.AsByte(), Sse2.LoadVector128(m + 48)).AsUInt32();
                Sse2.Store(c, x_0.AsByte());
                Sse2.Store(c + 16, x_1.AsByte());
                Sse2.Store(c + 32, x_2.AsByte());
                Sse2.Store(c + 48, x_3.AsByte());

                // Increment the counter by one block, carrying from word 12 into word 13 on wrap.
                uint in12 = x[12];
                uint in13 = x[13];
                in12++;
                if (in12 == 0)
                {
                    in13++;
                }
                x[12] = in12;
                x[13] = in13;

                bytes -= 64;
                c     += 64;
                m     += 64;
            }
            // Stage 4: fewer than 64 bytes remain. One more block is generated into a scratch
            // buffer and only `bytes` bytes of it are used. The counter in x is not advanced here.
            if (bytes > 0)
            {
                Vector128 <uint> x_0 = Sse2.LoadVector128(x);
                Vector128 <uint> x_1 = Sse2.LoadVector128(x + 4);
                Vector128 <uint> x_2 = Sse2.LoadVector128(x + 8);
                Vector128 <uint> x_3 = Sse2.LoadVector128(x + 12);
                Vector128 <uint> t_1;
                // Same round sequence as stage 3 (shuffle controls written in hex here:
                // 0x93 = 147, 0x4e = 78, 0x39 = 57).
                for (int i = 0; i < 20; i += 2)
                {
                    x_0 = Sse2.Add(x_0, x_1);
                    x_3 = Sse2.Xor(x_3, x_0);
                    x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32();

                    x_2 = Sse2.Add(x_2, x_3);
                    x_1 = Sse2.Xor(x_1, x_2);

                    t_1 = x_1;
                    x_1 = Sse2.ShiftLeftLogical(x_1, 12);
                    t_1 = Sse2.ShiftRightLogical(t_1, 20);
                    x_1 = Sse2.Xor(x_1, t_1);

                    x_0 = Sse2.Add(x_0, x_1);
                    x_3 = Sse2.Xor(x_3, x_0);
                    x_0 = Sse2.Shuffle(x_0, 0x93);
                    x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32();

                    x_2 = Sse2.Add(x_2, x_3);
                    x_3 = Sse2.Shuffle(x_3, 0x4e);
                    x_1 = Sse2.Xor(x_1, x_2);
                    x_2 = Sse2.Shuffle(x_2, 0x39);

                    t_1 = x_1;
                    x_1 = Sse2.ShiftLeftLogical(x_1, 7);
                    t_1 = Sse2.ShiftRightLogical(t_1, 25);
                    x_1 = Sse2.Xor(x_1, t_1);

                    x_0 = Sse2.Add(x_0, x_1);
                    x_3 = Sse2.Xor(x_3, x_0);
                    x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32();

                    x_2 = Sse2.Add(x_2, x_3);
                    x_1 = Sse2.Xor(x_1, x_2);

                    t_1 = x_1;
                    x_1 = Sse2.ShiftLeftLogical(x_1, 12);
                    t_1 = Sse2.ShiftRightLogical(t_1, 20);
                    x_1 = Sse2.Xor(x_1, t_1);

                    x_0 = Sse2.Add(x_0, x_1);
                    x_3 = Sse2.Xor(x_3, x_0);
                    x_0 = Sse2.Shuffle(x_0, 0x39);
                    x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32();

                    x_2 = Sse2.Add(x_2, x_3);
                    x_3 = Sse2.Shuffle(x_3, 0x4e);
                    x_1 = Sse2.Xor(x_1, x_2);
                    x_2 = Sse2.Shuffle(x_2, 0x93);

                    t_1 = x_1;
                    x_1 = Sse2.ShiftLeftLogical(x_1, 7);
                    t_1 = Sse2.ShiftRightLogical(t_1, 25);
                    x_1 = Sse2.Xor(x_1, t_1);
                }
                x_0 = Sse2.Add(x_0, Sse2.LoadVector128(x));
                x_1 = Sse2.Add(x_1, Sse2.LoadVector128(x + 4));
                x_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8));
                x_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12));
                // Spill the full keystream block to the stack so it can be consumed byte by byte.
                byte *partialblock = stackalloc byte[64];
                Sse2.Store(partialblock, Vector128.AsByte(x_0));
                Sse2.Store(partialblock + 16, Vector128.AsByte(x_1));
                Sse2.Store(partialblock + 32, Vector128.AsByte(x_2));
                Sse2.Store(partialblock + 48, Vector128.AsByte(x_3));

                for (ulong i = 0; i < bytes; i++)
                {
                    c[i] = (byte)(m[i] ^ partialblock[i]);
                }
                // Overwrite the scratch buffer with zeros so unused keystream bytes do not
                // remain on the stack.
                for (int n = 0; n < 64 / sizeof(int); n++)
                {
                    ((int *)partialblock)[n] = 0;
                }
            }
        }
        /// <summary>
        /// Recomputes rows 1 .. HEIGHT - 2 of <c>field</c> in place, 32 cells per AVX2 operation.
        /// </summary>
        /// <remarks>
        /// Horizontal neighbours are read at byte offsets -1 and +1, i.e. one byte per cell.
        /// Three pairs of scratch rows carry, for the row above, the row being updated and the row
        /// below, the sum of each cell's two side neighbours and that sum plus the cell itself.
        /// The pairs are rotated once per row, so each field row is summed only once. The row above
        /// row 1 counts as all zero because two of the scratch pairs are cleared up front.
        /// The lookup index per cell is the neighbour count OR-ed with the current state shifted
        /// left by 3 bits; <c>v_lookup</c> (defined outside this method) maps it to the new state.
        /// After each row its first and last byte are forced to zero.
        /// </remarks>
        public override void Step()
        {
            fixed (byte*
                   fieldStart = field,
                   pinnedUp2  = upperLineSumOf2,
                   pinnedUp3  = upperLineSumOf3,
                   pinnedMid2 = middleLineSumOf2,
                   pinnedMid3 = middleLineSumOf3,
                   pinnedLow2 = lowerLineSumOf2,
                   pinnedLow3 = lowerLineSumOf3)
            {
                // Reassignable aliases for the pinned scratch rows (rotated once per field row).
                byte* above2 = pinnedUp2;
                byte* above3 = pinnedUp3;
                byte* here2  = pinnedMid2;
                byte* here3  = pinnedMid3;
                byte* below2 = pinnedLow2;
                byte* below3 = pinnedLow3;

                // Row whose horizontal sums are produced next; starts at field row 1.
                byte* rowBelow = fieldStart + WIDTH;

                // Clear two scratch pairs and fill the third with the sums of row 1.
                for (int col = 0; col < WIDTH; col += 32)
                {
                    Vector256<byte> zero = Vector256<byte>.Zero;
                    Avx2.Store(above2 + col, zero);
                    Avx2.Store(above3 + col, zero);
                    Avx2.Store(here2 + col, zero);
                    Avx2.Store(here3 + col, zero);

                    Vector256<byte> west  = Avx2.LoadVector256(rowBelow + col - 1);
                    Vector256<byte> east  = Avx2.LoadVector256(rowBelow + col + 1);
                    Vector256<byte> sides = Avx2.Add(west, east);
                    Avx2.Store(below2 + col, sides);
                    Avx2.Store(below3 + col, Avx2.Add(sides, Avx2.LoadVector256(rowBelow + col)));
                }

                for (int row = 1; row < HEIGHT - 1; row++)
                {
                    // The row summed last time is the one updated now; advance to the next one.
                    byte* rowHere = rowBelow;
                    rowBelow += WIDTH;

                    // Rotate scratch pairs: here -> above, below -> here, old above is recycled.
                    byte* recycled2 = above2;
                    byte* recycled3 = above3;
                    above2 = here2;
                    above3 = here3;
                    here2  = below2;
                    here3  = below3;
                    below2 = recycled2;
                    below3 = recycled3;

                    for (int col = 0; col < WIDTH; col += 32)
                    {
                        Vector256<byte> west   = Avx2.LoadVector256(rowBelow + col - 1);
                        Vector256<byte> middle = Avx2.LoadVector256(rowBelow + col);
                        Vector256<byte> east   = Avx2.LoadVector256(rowBelow + col + 1);
                        Vector256<byte> sides  = Avx2.Add(west, east);
                        Vector256<byte> triple = Avx2.Add(sides, middle);

                        Avx2.Store(below2 + col, sides);
                        Avx2.Store(below3 + col, triple);

                        // Side neighbours of this row plus the full triples above and below.
                        Vector256<byte> sameRowSides   = Avx2.LoadVector256(here2 + col);
                        Vector256<byte> tripleAbove    = Avx2.LoadVector256(above3 + col);
                        Vector256<byte> neighbourCount = Avx2.Add(sameRowSides, Avx2.Add(tripleAbove, triple));

                        Vector256<byte> state        = Avx2.LoadVector256(rowHere + col);
                        Vector256<byte> stateShifted = Avx2.ShiftLeftLogical(state.AsUInt64(), (byte)3).AsByte();
                        Vector256<byte> lookupIndex  = Avx2.Or(neighbourCount, stateShifted);

                        Avx2.Store(rowHere + col, Avx2.Shuffle(v_lookup, lookupIndex));
                    }

                    // Zero the first and last byte of the row that was just rewritten.
                    rowHere[0]   = 0;
                    rowBelow[-1] = 0;
                }
            }
        }
        /// <summary>
        /// Round-trip check for a "two bits per byte" AVX2 representation of a 64-bit value.
        /// The 64 bits of <c>x</c> are expanded into a 32-byte vector in which every byte
        /// carries one even-numbered source bit in its low nibble (value 0x01) and the
        /// following odd-numbered source bit in its high nibble (value 0x10). The vector is
        /// then collapsed back into a <see cref="ulong"/> and compared with the original.
        /// </summary>
        public unsafe void Test_AVX_BitsToBytesCompressed()
        {
            // Every Avx2.* call below throws PlatformNotSupportedException on hardware
            // without AVX2; report that explicitly instead of failing with a crash.
            if (!Avx2.IsSupported)
            {
                Assert.Inconclusive("AVX2 is not supported on this machine.");
                return;
            }

            ulong x = 0b0000_0001__0010_0011__0100_0101__0110_0111____1000_1001__1010_1011__1100_1101__1110_1111ul;

            // Byte-shuffle control that repeats each source byte four times. The byte shuffle
            // works independently on each 128-bit lane; since x is broadcast to every 64-bit
            // element, indices 0..3 (lower lane) and 4..7 (upper lane) together select
            // bytes 0..7 of x.
            Vector256<byte> spreadBytes = Vector256.Create(
                (byte)0, 0, 0, 0,
                1, 1, 1, 1,
                2, 2, 2, 2,
                3, 3, 3, 3,
                4, 4, 4, 4,
                5, 5, 5, 5,
                6, 6, 6, 6,
                7, 7, 7, 7);

            // Within each group of four copies: selects bit 0, 2, 4, 6 of the source byte.
            Vector256<byte> evenBitSelect = Vector256.Create(
                (byte)0x01, 0x04, 0x10, 0x40,
                0x01, 0x04, 0x10, 0x40,
                0x01, 0x04, 0x10, 0x40,
                0x01, 0x04, 0x10, 0x40,
                0x01, 0x04, 0x10, 0x40,
                0x01, 0x04, 0x10, 0x40,
                0x01, 0x04, 0x10, 0x40,
                0x01, 0x04, 0x10, 0x40);

            // Within each group of four copies: selects bit 1, 3, 5, 7 of the source byte.
            Vector256<byte> oddBitSelect = Vector256.Create(
                (byte)0x02, 0x08, 0x20, 0x80,
                0x02, 0x08, 0x20, 0x80,
                0x02, 0x08, 0x20, 0x80,
                0x02, 0x08, 0x20, 0x80,
                0x02, 0x08, 0x20, 0x80,
                0x02, 0x08, 0x20, 0x80,
                0x02, 0x08, 0x20, 0x80,
                0x02, 0x08, 0x20, 0x80);

            // Shuffle controls that move bytes 0..7 (respectively 8..15) of each lane to the
            // even byte positions; index 255 has its top bit set, which zeroes that byte.
            Vector256<byte> takeLowHalf = Vector256.Create(
                (byte)0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, 255,
                0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, 255);
            Vector256<byte> takeHighHalf = Vector256.Create(
                (byte)8, 255, 9, 255, 10, 255, 11, 255, 12, 255, 13, 255, 14, 255, 15, 255,
                8, 255, 9, 255, 10, 255, 11, 255, 12, 255, 13, 255, 14, 255, 15, 255);

            Vector256<byte> zero       = Vector256<byte>.Zero;
            Vector256<byte> one        = Vector256.Create((byte)1);
            Vector256<byte> allOnes    = Vector256.Create((byte)0xff);
            Vector256<byte> lowNibble  = Vector256.Create((byte)0x0f);
            Vector256<byte> highNibble = Vector256.Create((byte)0xf0);

            // ***** load **** //
            Vector256<byte> v = Avx2.Shuffle(Vector256.Create(x).AsByte(), spreadBytes);

            // Min(…, 1) normalizes "selected bit is set" to exactly 0 or 1.
            Vector256<byte> evenBits = Avx2.Min(Avx2.And(v, evenBitSelect), one);
            Vector256<byte> oddBits  = Avx2.Min(Avx2.And(v, oddBitSelect), one);

            // Values are 0 or 1, so the 64-bit shift cannot carry across byte boundaries;
            // it simply moves the odd-bit flag into the high nibble (0x10).
            oddBits = Avx2.ShiftLeftLogical(oddBits.AsUInt64(), 4).AsByte();
            v       = Avx2.Or(evenBits, oddBits);

            // ***** restore **** //

            // Turn each non-zero nibble into an all-ones byte so MoveMask can pick up its top
            // bit: CompareEqual yields 0xff for zero, AndNot(a, b) = ~a & b inverts that.
            Vector256<byte> evenSet = Avx2.AndNot(Avx2.CompareEqual(Avx2.And(v, lowNibble), zero), allOnes);
            Vector256<byte> oddSet  = Avx2.AndNot(Avx2.CompareEqual(Avx2.And(v, highNibble), zero), allOnes);

            // After the shuffle the flags sit on even byte positions, so each MoveMask result
            // has its payload on even bit positions and zeros on odd ones (bit 31 is always 0).
            ulong evenFromLow  = (uint)Avx2.MoveMask(Avx2.Shuffle(evenSet, takeLowHalf));
            ulong evenFromHigh = (uint)Avx2.MoveMask(Avx2.Shuffle(evenSet, takeHighHalf));
            ulong oddFromLow   = (uint)Avx2.MoveMask(Avx2.Shuffle(oddSet, takeLowHalf));
            ulong oddFromHigh  = (uint)Avx2.MoveMask(Avx2.Shuffle(oddSet, takeHighHalf));

            // Interleave: even source bits stay on even positions, odd source bits go one up.
            ulong restoredLo = evenFromLow | (oddFromLow << 1);
            ulong restoredHi = evenFromHigh | (oddFromHigh << 1);

            // Each 32-bit mask holds 16 bits from the lower lane (low half) and 16 bits from
            // the upper lane (high half); stitch the four 16-bit pieces back in source order.
            ulong restored =
                (restoredLo & 0xFFFF) |
                ((restoredHi & 0xFFFF) << 16) |
                ((restoredLo & 0xFFFF0000) << 16) |
                ((restoredHi & 0xFFFF0000) << 32);

            Assert.AreEqual(x, restored);
        }