Esempio n. 1
0
    /// <summary>
    /// Divides every signed 16-bit lane of <paramref name="dividend"/> by 10, truncating toward zero.
    /// </summary>
    /// <param name="dividend">Eight signed 16-bit values.</param>
    /// <returns>A vector whose lane <c>i</c> holds <c>dividend[i] / 10</c>.</returns>
    /// <remarks>
    /// Uses the multiply-by-reciprocal identity <c>floor(a * 26215 / 2^18) - (a &lt; 0 ? -1 : 0)</c>,
    /// where 26215 is <c>ceil(2^18 / 10)</c>. The computation stays in 16-bit lanes and needs only SSE2,
    /// so no SSE4.1 or AVX2 support is required.
    /// </remarks>
    public static Vector128<short> DivideBy10(this Vector128<short> dividend)
    {
        // High 16 bits of the 32-bit product, i.e. floor(a * 26215 / 2^16).
        Vector128<short> productHigh = Sse2.MultiplyHigh(dividend, Vector128.Create((short)26215));

        // Two more arithmetic shifts complete the division by 2^18 (floor semantics compose).
        Vector128<short> floored = Sse2.ShiftRightArithmetic(productHigh, 2);

        // -1 for negative lanes, 0 otherwise; subtracting it turns floor into truncation toward zero.
        Vector128<short> signMask = Sse2.ShiftRightArithmetic(dividend, 15);

        return Sse2.Subtract(floored, signMask);
    }
Esempio n. 2
0
        /// <summary>
        /// Checks <c>Sse41.Blend</c> on float, double, short and ushort vectors, using direct calls
        /// with several control bytes plus one reflection-based call per element type.
        /// </summary>
        /// <param name="args">Command-line arguments; not read.</param>
        /// <returns>
        /// <c>Pass</c> when every check succeeds or SSE4.1 is not supported, otherwise <c>Fail</c>.
        /// </returns>
        /// <remarks>
        /// In the lane legends below, <c>S</c> marks a lane expected to equal the second operand and
        /// <c>D</c> a lane expected to equal the first operand, listed from lane 0 upwards.
        /// </remarks>
        static unsafe int Main(string[] args)
        {
            int testResult = Pass;

            // Without SSE4.1 there is nothing to exercise; the initial Pass is returned unchanged.
            if (!Sse41.IsSupported)
            {
                return(testResult);
            }

            using (TestTable <float> floatTable = new TestTable <float>(
                       new float[4] { 1, -5, 100, 0 },
                       new float[4] { 22, -1, -50, 0 },
                       new float[4]))
            {
                // Dumps the current output lanes and marks the whole run as failed.
                void ReportFloatFailure()
                {
                    Console.WriteLine("SSE41 Blend failed on float:");
                    foreach (var item in floatTable.outArray)
                    {
                        Console.Write(item + ", ");
                    }
                    Console.WriteLine();
                    testResult = Fail;
                }

                var first  = Unsafe.Read <Vector128 <float> >(floatTable.inArray1Ptr);
                var second = Unsafe.Read <Vector128 <float> >(floatTable.inArray2Ptr);

                // Control 1 -> SDDD
                var blended = Sse41.Blend(first, second, 1);
                Unsafe.Write(floatTable.outArrayPtr, blended);
                bool ok = floatTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == x[1]) &&
                                                 (z[2] == x[2]) && (z[3] == x[3]));
                if (!ok)
                {
                    ReportFloatFailure();
                }

                // Control 2 -> DSDD
                blended = Sse41.Blend(first, second, 2);
                Unsafe.Write(floatTable.outArrayPtr, blended);
                ok = floatTable.CheckResult((x, y, z) => (z[0] == x[0]) && (z[1] == y[1]) &&
                                            (z[2] == x[2]) && (z[3] == x[3]));
                if (!ok)
                {
                    ReportFloatFailure();
                }

                // Control 4 -> DDSD
                blended = Sse41.Blend(first, second, 4);
                Unsafe.Write(floatTable.outArrayPtr, blended);
                ok = floatTable.CheckResult((x, y, z) => (z[0] == x[0]) && (z[1] == x[1]) &&
                                            (z[2] == y[2]) && (z[3] == x[3]));
                if (!ok)
                {
                    ReportFloatFailure();
                }

                // Control 85 -> SDSD
                blended = Sse41.Blend(first, second, 85);
                Unsafe.Write(floatTable.outArrayPtr, blended);
                ok = floatTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == x[1]) &&
                                            (z[2] == y[2]) && (z[3] == x[3]));
                if (!ok)
                {
                    ReportFloatFailure();
                }

                // Same call resolved through reflection, control 1 -> SDDD
                var blendMethod = typeof(Sse41).GetMethod(nameof(Sse41.Blend), new Type[] { first.GetType(), second.GetType(), typeof(byte) });
                blended = (Vector128 <float>)blendMethod.Invoke(null, new object[] { first, second, (byte)(1) });
                Unsafe.Write(floatTable.outArrayPtr, blended);
                ok = floatTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == x[1]) &&
                                            (z[2] == x[2]) && (z[3] == x[3]));
                if (!ok)
                {
                    ReportFloatFailure();
                }
            }

            using (TestTable <double> doubleTable = new TestTable <double>(
                       new double[2] { 1, -5 },
                       new double[2] { 22, -1 },
                       new double[2]))
            {
                // Dumps the current output lanes and marks the whole run as failed.
                void ReportDoubleFailure()
                {
                    Console.WriteLine("SSE41 Blend failed on double:");
                    foreach (var item in doubleTable.outArray)
                    {
                        Console.Write(item + ", ");
                    }
                    Console.WriteLine();
                    testResult = Fail;
                }

                var first  = Unsafe.Read <Vector128 <double> >(doubleTable.inArray1Ptr);
                var second = Unsafe.Read <Vector128 <double> >(doubleTable.inArray2Ptr);

                // Control 0 -> DD
                var blended = Sse41.Blend(first, second, 0);
                Unsafe.Write(doubleTable.outArrayPtr, blended);
                bool ok = doubleTable.CheckResult((x, y, z) => (z[0] == x[0]) && (z[1] == x[1]));
                if (!ok)
                {
                    ReportDoubleFailure();
                }

                // Control 1 -> SD
                blended = Sse41.Blend(first, second, 1);
                Unsafe.Write(doubleTable.outArrayPtr, blended);
                ok = doubleTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == x[1]));
                if (!ok)
                {
                    ReportDoubleFailure();
                }

                // Control 2 -> DS
                blended = Sse41.Blend(first, second, 2);
                Unsafe.Write(doubleTable.outArrayPtr, blended);
                ok = doubleTable.CheckResult((x, y, z) => (z[0] == x[0]) && (z[1] == y[1]));
                if (!ok)
                {
                    ReportDoubleFailure();
                }

                // Control 51 -> SS
                blended = Sse41.Blend(first, second, 51);
                Unsafe.Write(doubleTable.outArrayPtr, blended);
                ok = doubleTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == y[1]));
                if (!ok)
                {
                    ReportDoubleFailure();
                }

                // Same call resolved through reflection, control 0 -> DD
                var blendMethod = typeof(Sse41).GetMethod(nameof(Sse41.Blend), new Type[] { first.GetType(), second.GetType(), typeof(byte) });
                blended = (Vector128 <double>)blendMethod.Invoke(null, new object[] { first, second, (byte)(0) });
                Unsafe.Write(doubleTable.outArrayPtr, blended);
                ok = doubleTable.CheckResult((x, y, z) => (z[0] == x[0]) && (z[1] == x[1]));
                if (!ok)
                {
                    ReportDoubleFailure();
                }
            }

            using (TestTable <short> shortTable = new TestTable <short>(
                       new short[8] { 1, -5, 100, 0, 1, -5, 100, 0 },
                       new short[8] { 22, -1, -50, 0, 22, -1, -50, 0 },
                       new short[8]))
            {
                // Dumps the current output lanes and marks the whole run as failed.
                void ReportShortFailure()
                {
                    Console.WriteLine("SSE41 Blend failed on short:");
                    foreach (var item in shortTable.outArray)
                    {
                        Console.Write(item + ", ");
                    }
                    Console.WriteLine();
                    testResult = Fail;
                }

                var first  = Unsafe.Read <Vector128 <short> >(shortTable.inArray1Ptr);
                var second = Unsafe.Read <Vector128 <short> >(shortTable.inArray2Ptr);

                // Control 1 -> SDDD DDDD
                var blended = Sse41.Blend(first, second, 1);
                Unsafe.Write(shortTable.outArrayPtr, blended);
                bool ok = shortTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == x[1]) &&
                                                 (z[2] == x[2]) && (z[3] == x[3]) &&
                                                 (z[4] == x[4]) && (z[5] == x[5]) &&
                                                 (z[6] == x[6]) && (z[7] == x[7]));
                if (!ok)
                {
                    ReportShortFailure();
                }

                // Control 2 -> DSDD DDDD
                blended = Sse41.Blend(first, second, 2);
                Unsafe.Write(shortTable.outArrayPtr, blended);
                ok = shortTable.CheckResult((x, y, z) => (z[0] == x[0]) && (z[1] == y[1]) &&
                                            (z[2] == x[2]) && (z[3] == x[3]) &&
                                            (z[4] == x[4]) && (z[5] == x[5]) &&
                                            (z[6] == x[6]) && (z[7] == x[7]));
                if (!ok)
                {
                    ReportShortFailure();
                }

                // Control 4 -> DDSD DDDD
                blended = Sse41.Blend(first, second, 4);
                Unsafe.Write(shortTable.outArrayPtr, blended);
                ok = shortTable.CheckResult((x, y, z) => (z[0] == x[0]) && (z[1] == x[1]) &&
                                            (z[2] == y[2]) && (z[3] == x[3]) &&
                                            (z[4] == x[4]) && (z[5] == x[5]) &&
                                            (z[6] == x[6]) && (z[7] == x[7]));
                if (!ok)
                {
                    ReportShortFailure();
                }

                // Control 85 -> SDSD SDSD
                blended = Sse41.Blend(first, second, 85);
                Unsafe.Write(shortTable.outArrayPtr, blended);
                ok = shortTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == x[1]) &&
                                            (z[2] == y[2]) && (z[3] == x[3]) &&
                                            (z[4] == y[4]) && (z[5] == x[5]) &&
                                            (z[6] == y[6]) && (z[7] == x[7]));
                if (!ok)
                {
                    ReportShortFailure();
                }

                // Same call resolved through reflection, control 1 -> SDDD DDDD
                var blendMethod = typeof(Sse41).GetMethod(nameof(Sse41.Blend), new Type[] { first.GetType(), second.GetType(), typeof(byte) });
                blended = (Vector128 <short>)blendMethod.Invoke(null, new object[] { first, second, (byte)(1) });
                Unsafe.Write(shortTable.outArrayPtr, blended);
                ok = shortTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == x[1]) &&
                                            (z[2] == x[2]) && (z[3] == x[3]) &&
                                            (z[4] == x[4]) && (z[5] == x[5]) &&
                                            (z[6] == x[6]) && (z[7] == x[7]));
                if (!ok)
                {
                    ReportShortFailure();
                }
            }

            using (TestTable <ushort> ushortTable = new TestTable <ushort>(
                       new ushort[8] { 1, 5, 100, 0, 1, 5, 100, 0 },
                       new ushort[8] { 22, 1, 50, 0, 22, 1, 50, 0 },
                       new ushort[8]))
            {
                // Dumps the current output lanes and marks the whole run as failed.
                void ReportUShortFailure()
                {
                    Console.WriteLine("SSE41 Blend failed on ushort:");
                    foreach (var item in ushortTable.outArray)
                    {
                        Console.Write(item + ", ");
                    }
                    Console.WriteLine();
                    testResult = Fail;
                }

                var first  = Unsafe.Read <Vector128 <ushort> >(ushortTable.inArray1Ptr);
                var second = Unsafe.Read <Vector128 <ushort> >(ushortTable.inArray2Ptr);

                // Control 1 -> SDDD DDDD
                var blended = Sse41.Blend(first, second, 1);
                Unsafe.Write(ushortTable.outArrayPtr, blended);
                bool ok = ushortTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == x[1]) &&
                                                  (z[2] == x[2]) && (z[3] == x[3]) &&
                                                  (z[4] == x[4]) && (z[5] == x[5]) &&
                                                  (z[6] == x[6]) && (z[7] == x[7]));
                if (!ok)
                {
                    ReportUShortFailure();
                }

                // Control 2 -> DSDD DDDD
                blended = Sse41.Blend(first, second, 2);
                Unsafe.Write(ushortTable.outArrayPtr, blended);
                ok = ushortTable.CheckResult((x, y, z) => (z[0] == x[0]) && (z[1] == y[1]) &&
                                             (z[2] == x[2]) && (z[3] == x[3]) &&
                                             (z[4] == x[4]) && (z[5] == x[5]) &&
                                             (z[6] == x[6]) && (z[7] == x[7]));
                if (!ok)
                {
                    ReportUShortFailure();
                }

                // Control 4 -> DDSD DDDD
                blended = Sse41.Blend(first, second, 4);
                Unsafe.Write(ushortTable.outArrayPtr, blended);
                ok = ushortTable.CheckResult((x, y, z) => (z[0] == x[0]) && (z[1] == x[1]) &&
                                             (z[2] == y[2]) && (z[3] == x[3]) &&
                                             (z[4] == x[4]) && (z[5] == x[5]) &&
                                             (z[6] == x[6]) && (z[7] == x[7]));
                if (!ok)
                {
                    ReportUShortFailure();
                }

                // Control 85 -> SDSD SDSD
                blended = Sse41.Blend(first, second, 85);
                Unsafe.Write(ushortTable.outArrayPtr, blended);
                ok = ushortTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == x[1]) &&
                                             (z[2] == y[2]) && (z[3] == x[3]) &&
                                             (z[4] == y[4]) && (z[5] == x[5]) &&
                                             (z[6] == y[6]) && (z[7] == x[7]));
                if (!ok)
                {
                    ReportUShortFailure();
                }

                // Same call resolved through reflection, control 1 -> SDDD DDDD
                var blendMethod = typeof(Sse41).GetMethod(nameof(Sse41.Blend), new Type[] { first.GetType(), second.GetType(), typeof(byte) });
                blended = (Vector128 <ushort>)blendMethod.Invoke(null, new object[] { first, second, (byte)(1) });
                Unsafe.Write(ushortTable.outArrayPtr, blended);
                ok = ushortTable.CheckResult((x, y, z) => (z[0] == y[0]) && (z[1] == x[1]) &&
                                             (z[2] == x[2]) && (z[3] == x[3]) &&
                                             (z[4] == x[4]) && (z[5] == x[5]) &&
                                             (z[6] == x[6]) && (z[7] == x[7]));
                if (!ok)
                {
                    ReportUShortFailure();
                }
            }

            return(testResult);
        }
Esempio n. 3
0
    /// <summary>
    /// Divides each signed 16-bit lane of <paramref name="dividend"/> by the matching lane of
    /// <paramref name="divisor"/> using single-precision reciprocal arithmetic, then truncates the
    /// quotient toward zero.
    /// </summary>
    /// <param name="dividend">Eight signed 16-bit numerators.</param>
    /// <param name="divisor">Eight signed 16-bit denominators.</param>
    /// <returns>A vector holding the eight truncated quotients.</returns>
    /// <remarks>
    /// Based on https://stackoverflow.com/a/51458507/347870. The hardware reciprocal estimate is
    /// refined with one Newton-Raphson step; FMA is used for that step when available, and SSE4.1
    /// is used for the final blend when available, with SSE/SSE2 fallbacks for both.
    /// </remarks>
    public static Vector128<short> Divide(this Vector128<short> dividend, Vector128<short> divisor)
    {
        // Sign-extend the odd ("hi") and even ("lo") 16-bit lanes into separate 32-bit vectors.
        Vector128<int> a_hi_epi32       = Sse2.ShiftRightArithmetic(dividend.AsInt32(), 16);
        Vector128<int> a_lo_epi32_shift = Sse2.ShiftLeftLogical(dividend.AsInt32(), 16);
        Vector128<int> a_lo_epi32       = Sse2.ShiftRightArithmetic(a_lo_epi32_shift, 16);

        Vector128<int> b_hi_epi32       = Sse2.ShiftRightArithmetic(divisor.AsInt32(), 16);
        Vector128<int> b_lo_epi32_shift = Sse2.ShiftLeftLogical(divisor.AsInt32(), 16);
        Vector128<int> b_lo_epi32       = Sse2.ShiftRightArithmetic(b_lo_epi32_shift, 16);

        // Convert to 32-bit floats (every 16-bit integer is exactly representable).
        Vector128<float> a_hi = Sse2.ConvertToVector128Single(a_hi_epi32);
        Vector128<float> a_lo = Sse2.ConvertToVector128Single(a_lo_epi32);
        Vector128<float> b_hi = Sse2.ConvertToVector128Single(b_hi_epi32);
        Vector128<float> b_lo = Sse2.ConvertToVector128Single(b_lo_epi32);

        // Approximate reciprocal of the divisor.
        Vector128<float> b_hi_rcp = Sse.Reciprocal(b_hi);
        Vector128<float> b_lo_rcp = Sse.Reciprocal(b_lo);

        // Newton-Raphson correction factor (two - rcp * b). The constant sits slightly above 2,
        // which biases the refined reciprocal upwards before the truncating conversion below.
        Vector128<float> b_hi_inv_1;
        Vector128<float> b_lo_inv_1;
        Vector128<float> two = Vector128.Create(2.00000051757f);

        if (Fma.IsSupported)
        {
            b_hi_inv_1 = Fma.MultiplyAddNegated(b_hi_rcp, b_hi, two);
            b_lo_inv_1 = Fma.MultiplyAddNegated(b_lo_rcp, b_lo, two);
        }
        else
        {
            Vector128<float> b_mul_hi = Sse.Multiply(b_hi_rcp, b_hi);
            Vector128<float> b_mul_lo = Sse.Multiply(b_lo_rcp, b_lo);
            b_hi_inv_1 = Sse.Subtract(two, b_mul_hi);
            b_lo_inv_1 = Sse.Subtract(two, b_mul_lo);
        }

        // Refined reciprocal.
        Vector128<float> b_hi_rcp_1 = Sse.Multiply(b_hi_rcp, b_hi_inv_1);
        Vector128<float> b_lo_rcp_1 = Sse.Multiply(b_lo_rcp, b_lo_inv_1);

        // Perform the division by multiplication.
        Vector128<float> hi = Sse.Multiply(a_hi, b_hi_rcp_1);
        Vector128<float> lo = Sse.Multiply(a_lo, b_lo_rcp_1);

        // Convert back to integers, truncating toward zero.
        Vector128<int> hi_epi32 = Sse2.ConvertToVector128Int32WithTruncation(hi);
        Vector128<int> lo_epi32 = Sse2.ConvertToVector128Int32WithTruncation(lo);

        // Move the odd-lane quotients into the upper 16 bits of each 32-bit lane.
        Vector128<int> hi_epi32_shift = Sse2.ShiftLeftLogical(hi_epi32, 16);

        // Interleave: even 16-bit lanes from lo, odd 16-bit lanes from hi.
        if (Sse41.IsSupported)
        {
            return Sse41.Blend(lo_epi32.AsInt16(), hi_epi32_shift.AsInt16(), 0xAA);
        }

        // SSE2 fallback: keep only the low 16 bits of each even-lane quotient before OR-ing.
        // The mask must be 0x0000FFFF per 32-bit lane; a mask with every 16-bit lane set would be
        // all ones and let the sign extension of a negative lo quotient corrupt the hi quotient.
        Vector128<int> lo_epi32_mask = Sse2.And(lo_epi32, Vector128.Create(0x0000FFFF));
        return Sse2.Or(hi_epi32_shift, lo_epi32_mask).AsInt16();
    }
Esempio n. 4
0
 /// <summary>
 /// Thin wrapper over <c>Sse41.Blend</c> for single-precision vectors.
 /// </summary>
 /// <param name="left">Vector passed as the first operand of the blend.</param>
 /// <param name="right">Vector passed as the second operand of the blend.</param>
 /// <param name="control">Control byte forwarded unchanged to <c>Sse41.Blend</c>.</param>
 /// <returns>The result of <c>Sse41.Blend(left, right, control)</c>.</returns>
 public static Vector128<float> _mm_blend_ps(Vector128<float> left, Vector128<float> right, byte control)
     => Sse41.Blend(left, right, control);
Esempio n. 5
0
 /// <summary>
 /// Thin wrapper over <c>Sse41.Blend</c> for double-precision vectors.
 /// </summary>
 /// <param name="left">Vector passed as the first operand of the blend.</param>
 /// <param name="right">Vector passed as the second operand of the blend.</param>
 /// <param name="control">Control byte forwarded unchanged to <c>Sse41.Blend</c>.</param>
 /// <returns>The result of <c>Sse41.Blend(left, right, control)</c>.</returns>
 public static Vector128<double> _mm_blend_pd(Vector128<double> left, Vector128<double> right, byte control)
     => Sse41.Blend(left, right, control);
Esempio n. 6
0
 /// <summary>
 /// Blends two 64-bit-lane vectors by reinterpreting them as 16-bit lanes and calling
 /// <c>Sse41.Blend</c>, so <paramref name="m"/> is applied per 16-bit word.
 /// </summary>
 /// <param name="x">First operand of the blend; only read.</param>
 /// <param name="y">Second operand of the blend; only read.</param>
 /// <param name="m">Control byte forwarded unchanged to <c>Sse41.Blend</c>.</param>
 /// <returns>The blended words, reinterpreted as two unsigned 64-bit lanes.</returns>
 private static Vector128<ulong> blend_ulong(ref Vector128<ulong> x, ref Vector128<ulong> y, byte m)
 {
     Vector128<ushort> firstWords  = x.AsUInt16();
     Vector128<ushort> secondWords = y.AsUInt16();
     Vector128<ushort> mixedWords  = Sse41.Blend(firstWords, secondWords, m);

     return mixedWords.AsUInt64();
 }
Esempio n. 7
0
 /// <summary>
 /// Thin wrapper over <c>Sse41.Blend</c> for unsigned 16-bit vectors.
 /// </summary>
 /// <param name="left">Vector passed as the first operand of the blend.</param>
 /// <param name="right">Vector passed as the second operand of the blend.</param>
 /// <param name="control">Control byte forwarded unchanged to <c>Sse41.Blend</c>.</param>
 /// <returns>The result of <c>Sse41.Blend(left, right, control)</c>.</returns>
 public static Vector128<ushort> _mm_blend_epi16(Vector128<ushort> left, Vector128<ushort> right, byte control)
     => Sse41.Blend(left, right, control);
Esempio n. 8
0
 /// <summary>
 /// Blends two 64-bit-lane vectors by reinterpreting them as 16-bit lanes and calling
 /// <c>Sse41.Blend</c>, so <paramref name="m"/> is applied per 16-bit word.
 /// </summary>
 /// <param name="x">First operand of the blend; only read.</param>
 /// <param name="y">Second operand of the blend; only read.</param>
 /// <param name="m">Control byte forwarded unchanged to <c>Sse41.Blend</c>.</param>
 /// <returns>The blended words, reinterpreted as two unsigned 64-bit lanes.</returns>
 /// <remarks>
 /// Uses the <c>AsUInt16</c>/<c>AsUInt64</c> reinterpret helpers rather than the single-type-argument
 /// <c>As&lt;T&gt;()</c> form, which is not part of the released <c>Vector128</c> API.
 /// </remarks>
 private static Vector128<ulong> blend_ulong(ref Vector128<ulong> x, ref Vector128<ulong> y, byte m) =>
     Sse41.Blend(x.AsUInt16(), y.AsUInt16(), m).AsUInt64();
Esempio n. 9
0
            /// <summary>
            /// Converts one line of 4-component float pixels into 4-component byte pixels, dividing the
            /// first three components by the fourth and scaling everything to the byte range.
            /// </summary>
            /// <param name="ipstart">Start of the input line; read as <c>float</c> values.</param>
            /// <param name="opstart">Start of the output line; one byte is written per input float.</param>
            /// <param name="cb">Length of the input line in bytes.</param>
            /// <remarks>
            /// When compiled with HWINTRINSICS, full blocks are processed with AVX2 or SSE4.1 and the
            /// remaining pixels fall through to the scalar loop at the end. The vector paths use the
            /// hardware reciprocal estimate and round-to-nearest conversion, while the scalar path uses
            /// an exact division and adds 0.5 before truncating.
            /// </remarks>
            unsafe void IConversionProcessor.ConvertLine(byte *ipstart, byte *opstart, int cb)
            {
                float *ip = (float *)ipstart, ipe = (float *)(ipstart + cb);
                byte * op = opstart;

#if HWINTRINSICS
                if (Avx2.IsSupported)
                {
                    var vzero  = Vector256 <float> .Zero;
                    // Smallest divisor allowed; values at or below it are treated as fully transparent.
                    var vmin   = Vector256.Create(0.5f / byte.MaxValue);
                    var vscale = Vector256.Create((float)byte.MaxValue);

                    // Permutation indices defined elsewhere; presumably they undo the per-128-bit-lane
                    // ordering produced by the pack instructions below — TODO confirm against HWIntrinsics.
                    var vmaskp = Avx.LoadVector256((int *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.PermuteMaskDeinterleave8x32)));

                    // Each iteration consumes four 8-float vectors = 32 floats, which is also
                    // Vector256<byte>.Count and the number of bytes written. Pull the end pointer back
                    // so that `ip <= ipe` only admits full blocks; it is restored after the loop.
                    ipe -= Vector256 <byte> .Count;
                    while (ip <= ipe)
                    {
                        var vf0 = Avx.LoadVector256(ip);
                        var vf1 = Avx.LoadVector256(ip + Vector256 <float> .Count);
                        var vf2 = Avx.LoadVector256(ip + Vector256 <float> .Count * 2);
                        var vf3 = Avx.LoadVector256(ip + Vector256 <float> .Count * 3);
                        ip += Vector256 <byte> .Count;

                        // Rearranged copy of each vector used as the divisor; presumably the 4th component
                        // of each pixel broadcast across that pixel (mask is defined elsewhere).
                        var vfa0 = Avx.Shuffle(vf0, vf0, HWIntrinsics.ShuffleMaskAlpha);
                        var vfa1 = Avx.Shuffle(vf1, vf1, HWIntrinsics.ShuffleMaskAlpha);
                        var vfa2 = Avx.Shuffle(vf2, vf2, HWIntrinsics.ShuffleMaskAlpha);
                        var vfa3 = Avx.Shuffle(vf3, vf3, HWIntrinsics.ShuffleMaskAlpha);

                        // Clamp the divisor from below so the reciprocal stays finite.
                        vfa0 = Avx.Max(vfa0, vmin);
                        vfa1 = Avx.Max(vfa1, vmin);
                        vfa2 = Avx.Max(vfa2, vmin);
                        vfa3 = Avx.Max(vfa3, vmin);

                        // Division via the approximate hardware reciprocal.
                        vf0 = Avx.Multiply(vf0, Avx.Reciprocal(vfa0));
                        vf1 = Avx.Multiply(vf1, Avx.Reciprocal(vfa1));
                        vf2 = Avx.Multiply(vf2, Avx.Reciprocal(vfa2));
                        vf3 = Avx.Multiply(vf3, Avx.Reciprocal(vfa3));

                        // Put the (clamped) divisor value back into the lanes selected by BlendMaskAlpha,
                        // replacing the quotient computed for those lanes.
                        vf0 = Avx.Blend(vf0, vfa0, HWIntrinsics.BlendMaskAlpha);
                        vf1 = Avx.Blend(vf1, vfa1, HWIntrinsics.BlendMaskAlpha);
                        vf2 = Avx.Blend(vf2, vfa2, HWIntrinsics.BlendMaskAlpha);
                        vf3 = Avx.Blend(vf3, vfa3, HWIntrinsics.BlendMaskAlpha);

                        // Zero every lane whose clamped divisor equals vmin, i.e. whose original divisor
                        // was at or below the minimum.
                        vf0 = Avx.BlendVariable(vf0, vzero, HWIntrinsics.AvxCompareEqual(vfa0, vmin));
                        vf1 = Avx.BlendVariable(vf1, vzero, HWIntrinsics.AvxCompareEqual(vfa1, vmin));
                        vf2 = Avx.BlendVariable(vf2, vzero, HWIntrinsics.AvxCompareEqual(vfa2, vmin));
                        vf3 = Avx.BlendVariable(vf3, vzero, HWIntrinsics.AvxCompareEqual(vfa3, vmin));

                        // Scale to the byte range.
                        vf0 = Avx.Multiply(vf0, vscale);
                        vf1 = Avx.Multiply(vf1, vscale);
                        vf2 = Avx.Multiply(vf2, vscale);
                        vf3 = Avx.Multiply(vf3, vscale);

                        var vi0 = Avx.ConvertToVector256Int32(vf0);
                        var vi1 = Avx.ConvertToVector256Int32(vf1);
                        var vi2 = Avx.ConvertToVector256Int32(vf2);
                        var vi3 = Avx.ConvertToVector256Int32(vf3);

                        // Narrow int32 -> int16 -> byte with saturation.
                        var vs0 = Avx2.PackSignedSaturate(vi0, vi1);
                        var vs1 = Avx2.PackSignedSaturate(vi2, vi3);

                        var vb0 = Avx2.PackUnsignedSaturate(vs0, vs1);
                        // Reorder the packed 32-bit groups using the permutation loaded above.
                        vb0 = Avx2.PermuteVar8x32(vb0.AsInt32(), vmaskp).AsByte();

                        Avx.Store(op, vb0);
                        op += Vector256 <byte> .Count;
                    }
                    // Restore the true end pointer for the scalar tail loop.
                    ipe += Vector256 <byte> .Count;
                }
                else if (Sse41.IsSupported)
                {
                    var vzero  = Vector128 <float> .Zero;
                    // Smallest divisor allowed; values at or below it are treated as fully transparent.
                    var vmin   = Vector128.Create(0.5f / byte.MaxValue);
                    var vscale = Vector128.Create((float)byte.MaxValue);

                    // Each iteration consumes four 4-float vectors = 16 floats, which is also
                    // Vector128<byte>.Count and the number of bytes written.
                    ipe -= Vector128 <byte> .Count;
                    while (ip <= ipe)
                    {
                        var vf0 = Sse.LoadVector128(ip);
                        var vf1 = Sse.LoadVector128(ip + Vector128 <float> .Count);
                        var vf2 = Sse.LoadVector128(ip + Vector128 <float> .Count * 2);
                        var vf3 = Sse.LoadVector128(ip + Vector128 <float> .Count * 3);
                        ip += Vector128 <byte> .Count;

                        // Rearranged copy of each vector used as the divisor; presumably the 4th component
                        // broadcast across the pixel (mask is defined elsewhere).
                        var vfa0 = Sse.Shuffle(vf0, vf0, HWIntrinsics.ShuffleMaskAlpha);
                        var vfa1 = Sse.Shuffle(vf1, vf1, HWIntrinsics.ShuffleMaskAlpha);
                        var vfa2 = Sse.Shuffle(vf2, vf2, HWIntrinsics.ShuffleMaskAlpha);
                        var vfa3 = Sse.Shuffle(vf3, vf3, HWIntrinsics.ShuffleMaskAlpha);

                        // Clamp the divisor from below so the reciprocal stays finite.
                        vfa0 = Sse.Max(vfa0, vmin);
                        vfa1 = Sse.Max(vfa1, vmin);
                        vfa2 = Sse.Max(vfa2, vmin);
                        vfa3 = Sse.Max(vfa3, vmin);

                        // Division via the approximate hardware reciprocal.
                        vf0 = Sse.Multiply(vf0, Sse.Reciprocal(vfa0));
                        vf1 = Sse.Multiply(vf1, Sse.Reciprocal(vfa1));
                        vf2 = Sse.Multiply(vf2, Sse.Reciprocal(vfa2));
                        vf3 = Sse.Multiply(vf3, Sse.Reciprocal(vfa3));

                        // Put the (clamped) divisor value back into the lanes selected by BlendMaskAlpha.
                        vf0 = Sse41.Blend(vf0, vfa0, HWIntrinsics.BlendMaskAlpha);
                        vf1 = Sse41.Blend(vf1, vfa1, HWIntrinsics.BlendMaskAlpha);
                        vf2 = Sse41.Blend(vf2, vfa2, HWIntrinsics.BlendMaskAlpha);
                        vf3 = Sse41.Blend(vf3, vfa3, HWIntrinsics.BlendMaskAlpha);

                        // Zero every lane whose clamped divisor equals vmin.
                        vf0 = Sse41.BlendVariable(vf0, vzero, Sse.CompareEqual(vfa0, vmin));
                        vf1 = Sse41.BlendVariable(vf1, vzero, Sse.CompareEqual(vfa1, vmin));
                        vf2 = Sse41.BlendVariable(vf2, vzero, Sse.CompareEqual(vfa2, vmin));
                        vf3 = Sse41.BlendVariable(vf3, vzero, Sse.CompareEqual(vfa3, vmin));

                        // Scale to the byte range.
                        vf0 = Sse.Multiply(vf0, vscale);
                        vf1 = Sse.Multiply(vf1, vscale);
                        vf2 = Sse.Multiply(vf2, vscale);
                        vf3 = Sse.Multiply(vf3, vscale);

                        var vi0 = Sse2.ConvertToVector128Int32(vf0);
                        var vi1 = Sse2.ConvertToVector128Int32(vf1);
                        var vi2 = Sse2.ConvertToVector128Int32(vf2);
                        var vi3 = Sse2.ConvertToVector128Int32(vf3);

                        // Narrow int32 -> int16 -> byte with saturation.
                        var vs0 = Sse2.PackSignedSaturate(vi0, vi1);
                        var vs1 = Sse2.PackSignedSaturate(vi2, vi3);

                        var vb0 = Sse2.PackUnsignedSaturate(vs0, vs1);

                        Sse2.Store(op, vb0);
                        op += Vector128 <byte> .Count;
                    }
                    // Restore the true end pointer for the scalar tail loop.
                    ipe += Vector128 <byte> .Count;
                }
#endif

                // Scalar path: handles the whole line without HWINTRINSICS, or the tail left by the
                // vector loops. NOTE(review): the constants are read back through Vector4 rather than
                // written as plain literals; the reason is not visible in this block.
                float fmax = new Vector4(byte.MaxValue).X, fround = new Vector4(0.5f).X, fmin = fround / fmax;

                while (ip < ipe)
                {
                    float f3 = ip[3];
                    if (f3 < fmin)
                    {
                        // Divisor too small: write all four output bytes as zero in one store.
                        *(uint *)op = 0;
                    }
                    else
                    {
                        // Fold the divide and the scale-to-byte into a single factor.
                        float f3i = fmax / f3;
                        byte  o0  = ClampToByte((int)(ip[0] * f3i + fround));
                        byte  o1  = ClampToByte((int)(ip[1] * f3i + fround));
                        byte  o2  = ClampToByte((int)(ip[2] * f3i + fround));
                        byte  o3  = ClampToByte((int)(f3 * fmax + fround));
                        op[0] = o0;
                        op[1] = o1;
                        op[2] = o2;
                        op[3] = o3;
                    }

                    // Advance one pixel: four floats in, four bytes out.
                    ip += 4;
                    op += 4;
                }
            }
Esempio n. 10
0
            // Converts one line of 4-channel byte pixels to floats, multiplying the first three
            // channels of every pixel by its fourth channel (see the scalar loop at the bottom).
            //
            //   ipstart: first input byte; four bytes are consumed per pixel.
            //   opstart: output buffer, written as floats (one float per input byte).
            //   cb:      number of input bytes to convert.
            //
            // NOTE(review): the scalar tail always consumes 4 bytes per step with no partial-pixel
            // check, so cb is presumably a multiple of 4 -- confirm against callers.
            unsafe void IConversionProcessor.ConvertLine(byte* ipstart, byte* opstart, int cb)
            {
                fixed (float* atstart = &LookupTables.Alpha[0])
                {
                    byte* src = ipstart;
                    byte* srcEnd = ipstart + cb;
                    float* dst = (float*)opstart;
                    float* alphaTable = atstart;

#if HWINTRINSICS
                    if (Avx2.IsSupported)
                    {
                        var scale = Vector256.Create(1f / byte.MaxValue);

                        // Last position from which a whole 32-byte block can still be read.
                        byte* lastBlock = srcEnd - Vector256<byte>.Count;

                        for (; src <= lastBlock; src += Vector256<byte>.Count, dst += Vector256<byte>.Count)
                        {
                            // Widen 4 x 8 bytes to 32-bit integers; all loads happen before any store.
                            var i0 = Avx2.ConvertToVector256Int32(src);
                            var i1 = Avx2.ConvertToVector256Int32(src + Vector256<int>.Count);
                            var i2 = Avx2.ConvertToVector256Int32(src + Vector256<int>.Count * 2);
                            var i3 = Avx2.ConvertToVector256Int32(src + Vector256<int>.Count * 3);

                            // Integer -> float, scaled by 1/255.
                            var p0 = Avx.Multiply(Avx.ConvertToVector256Single(i0), scale);
                            var p1 = Avx.Multiply(Avx.ConvertToVector256Single(i1), scale);
                            var p2 = Avx.Multiply(Avx.ConvertToVector256Single(i2), scale);
                            var p3 = Avx.Multiply(Avx.ConvertToVector256Single(i3), scale);

                            // Presumably broadcasts each pixel's alpha to all of its lanes; the mask
                            // is declared in HWIntrinsics -- confirm there.
                            var a0 = Avx.Shuffle(p0, p0, HWIntrinsics.ShuffleMaskAlpha);
                            var a1 = Avx.Shuffle(p1, p1, HWIntrinsics.ShuffleMaskAlpha);
                            var a2 = Avx.Shuffle(p2, p2, HWIntrinsics.ShuffleMaskAlpha);
                            var a3 = Avx.Shuffle(p3, p3, HWIntrinsics.ShuffleMaskAlpha);

                            // Multiply by the shuffled vector, then take the lanes selected by
                            // BlendMaskAlpha from it (presumably the alpha lane, left unmultiplied).
                            p0 = Avx.Blend(Avx.Multiply(p0, a0), a0, HWIntrinsics.BlendMaskAlpha);
                            p1 = Avx.Blend(Avx.Multiply(p1, a1), a1, HWIntrinsics.BlendMaskAlpha);
                            p2 = Avx.Blend(Avx.Multiply(p2, a2), a2, HWIntrinsics.BlendMaskAlpha);
                            p3 = Avx.Blend(Avx.Multiply(p3, a3), a3, HWIntrinsics.BlendMaskAlpha);

                            Avx.Store(dst, p0);
                            Avx.Store(dst + Vector256<int>.Count, p1);
                            Avx.Store(dst + Vector256<int>.Count * 2, p2);
                            Avx.Store(dst + Vector256<int>.Count * 3, p3);
                        }
                    }
                    else if (Sse41.IsSupported)
                    {
                        var scale = Vector128.Create(1f / byte.MaxValue);

                        // Last position from which a whole 16-byte block can still be read.
                        byte* lastBlock = srcEnd - Vector128<byte>.Count;

                        for (; src <= lastBlock; src += Vector128<byte>.Count, dst += Vector128<byte>.Count)
                        {
                            // Widen 4 x 4 bytes to 32-bit integers; all loads happen before any store.
                            var i0 = Sse41.ConvertToVector128Int32(src);
                            var i1 = Sse41.ConvertToVector128Int32(src + Vector128<int>.Count);
                            var i2 = Sse41.ConvertToVector128Int32(src + Vector128<int>.Count * 2);
                            var i3 = Sse41.ConvertToVector128Int32(src + Vector128<int>.Count * 3);

                            // Integer -> float, scaled by 1/255.
                            var p0 = Sse.Multiply(Sse2.ConvertToVector128Single(i0), scale);
                            var p1 = Sse.Multiply(Sse2.ConvertToVector128Single(i1), scale);
                            var p2 = Sse.Multiply(Sse2.ConvertToVector128Single(i2), scale);
                            var p3 = Sse.Multiply(Sse2.ConvertToVector128Single(i3), scale);

                            // Same shuffle / multiply / blend sequence as the AVX2 path, one pixel per vector.
                            var a0 = Sse.Shuffle(p0, p0, HWIntrinsics.ShuffleMaskAlpha);
                            var a1 = Sse.Shuffle(p1, p1, HWIntrinsics.ShuffleMaskAlpha);
                            var a2 = Sse.Shuffle(p2, p2, HWIntrinsics.ShuffleMaskAlpha);
                            var a3 = Sse.Shuffle(p3, p3, HWIntrinsics.ShuffleMaskAlpha);

                            p0 = Sse41.Blend(Sse.Multiply(p0, a0), a0, HWIntrinsics.BlendMaskAlpha);
                            p1 = Sse41.Blend(Sse.Multiply(p1, a1), a1, HWIntrinsics.BlendMaskAlpha);
                            p2 = Sse41.Blend(Sse.Multiply(p2, a2), a2, HWIntrinsics.BlendMaskAlpha);
                            p3 = Sse41.Blend(Sse.Multiply(p3, a3), a3, HWIntrinsics.BlendMaskAlpha);

                            Sse.Store(dst, p0);
                            Sse.Store(dst + Vector128<int>.Count, p1);
                            Sse.Store(dst + Vector128<int>.Count * 2, p2);
                            Sse.Store(dst + Vector128<int>.Count * 3, p3);
                        }
                    }
#endif

                    // Scalar path: handles whatever the vector loops left over, or the whole line
                    // when neither vector path ran. Each byte is mapped to a float through
                    // LookupTables.Alpha (table contents are declared elsewhere -- presumably
                    // value / 255, TODO confirm).
                    for (; src < srcEnd; src += 4, dst += 4)
                    {
                        // Read the whole pixel before writing any output.
                        float c0 = alphaTable[(uint)src[0]];
                        float c1 = alphaTable[(uint)src[1]];
                        float c2 = alphaTable[(uint)src[2]];
                        float alpha = alphaTable[(uint)src[3]];

                        dst[0] = c0 * alpha;
                        dst[1] = c1 * alpha;
                        dst[2] = c2 * alpha;
                        dst[3] = alpha;
                    }
                }
            }
Esempio n. 11
0
 // Thin wrapper named after the C intrinsic `_mm_blend_ps`: forwards both vectors and the
 // control byte unchanged to Sse41.Blend and returns its result.
 public static __m128 _mm_blend_ps(__m128 left, __m128 right, byte control)
 {
     return Sse41.Blend(left, right, control);
 }
Esempio n. 12
0
        unsafe private static void mixSse41(Blake2sContext *s, uint *m)
        {
            var row1 = Sse2.LoadVector128(s->h);
            var row2 = Sse2.LoadVector128(s->h + 4);

            var row3 = v128iv0;
            var row4 = v128iv1;

            row4 = Sse2.Xor(row4, Sse2.LoadVector128(s->t));             // reads into f[] as well

            var m0 = Sse2.LoadVector128(m);
            var m1 = Sse2.LoadVector128(m + 4);
            var m2 = Sse2.LoadVector128(m + 8);
            var m3 = Sse2.LoadVector128(m + 12);

            var r16 = v128rm0;
            var r8  = v128rm1;

            //ROUND 1
#if OLD_INTRINSICS
            var b0 = Sse.StaticCast <float, uint>(Sse.Shuffle(Sse.StaticCast <uint, float>(m0), Sse.StaticCast <uint, float>(m1), 0b_10_00_10_00));
#else
            var b0 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 0b_10_00_10_00).AsUInt32();
#endif

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

#if OLD_INTRINSICS
            b0 = Sse.StaticCast <float, uint>(Sse.Shuffle(Sse.StaticCast <uint, float>(m0), Sse.StaticCast <uint, float>(m1), 0b_11_01_11_01));
#else
            b0 = Sse.Shuffle(m0.AsSingle(), m1.AsSingle(), 0b_11_01_11_01).AsUInt32();
#endif

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //DIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_10_01_00_11);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_00_11_10_01);

#if OLD_INTRINSICS
            b0 = Sse.StaticCast <float, uint>(Sse.Shuffle(Sse.StaticCast <uint, float>(m2), Sse.StaticCast <uint, float>(m3), 0b_10_00_10_00));
#else
            b0 = Sse.Shuffle(m2.AsSingle(), m3.AsSingle(), 0b_10_00_10_00).AsUInt32();
#endif

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

#if OLD_INTRINSICS
            b0 = Sse.StaticCast <float, uint>(Sse.Shuffle(Sse.StaticCast <uint, float>(m2), Sse.StaticCast <uint, float>(m3), 0b_11_01_11_01));
#else
            b0 = Sse.Shuffle(m2.AsSingle(), m3.AsSingle(), 0b_11_01_11_01).AsUInt32();
#endif

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //UNDIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_00_11_10_01);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_10_01_00_11);

            //ROUND 2
#if OLD_INTRINSICS
            var t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(m2), 0b_00_00_11_00));
#else
            var t0 = Sse41.Blend(m1.AsUInt16(), m2.AsUInt16(), 0b_00_00_11_00).AsUInt32();
#endif
            var t1 = Sse2.ShiftLeftLogical128BitLane(m3, 4);
#if OLD_INTRINSICS
            var t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_11_11_00_00));
#else
            var t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_10_01_00_11);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.Shuffle(m2, 0b_00_00_10_00);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(m3), 0b_11_00_00_00));
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_11_11_00_00));
#else
            t1 = Sse41.Blend(m1.AsUInt16(), m3.AsUInt16(), 0b_11_00_00_00).AsUInt32();
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_10_11_00_01);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //DIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_10_01_00_11);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_00_11_10_01);

            t0 = Sse2.ShiftLeftLogical128BitLane(m1, 4);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m2), Sse.StaticCast <uint, ushort>(t0), 0b_00_11_00_00));
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(t1), 0b_11_11_00_00));
#else
            t1 = Sse41.Blend(m2.AsUInt16(), t0.AsUInt16(), 0b_00_11_00_00).AsUInt32();
            t2 = Sse41.Blend(m0.AsUInt16(), t1.AsUInt16(), 0b_11_11_00_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_10_11_00_01);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.UnpackHigh(m0, m1);
            t1 = Sse2.ShiftLeftLogical128BitLane(m3, 4);
#if OLD_INTRINSICS
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_11_00));
#else
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_10_11_00_01);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //UNDIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_00_11_10_01);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_10_01_00_11);

            //ROUND 3
            t0 = Sse2.UnpackHigh(m2, m3);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m3), Sse.StaticCast <uint, ushort>(m1), 0b_00_00_11_00));
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_11_11));
#else
            t1 = Sse41.Blend(m3.AsUInt16(), m1.AsUInt16(), 0b_00_00_11_00).AsUInt32();
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_11).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_11_01_00_10);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.UnpackLow(m2, m0);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(m0), 0b_11_11_00_00));
#else
            t1 = Sse41.Blend(t0.AsUInt16(), m0.AsUInt16(), 0b_11_11_00_00).AsUInt32();
#endif
            t2 = Sse2.ShiftLeftLogical128BitLane(m3, 8);
#if OLD_INTRINSICS
            b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t2), 0b_11_00_00_00));
#else
            b0 = Sse41.Blend(t1.AsUInt16(), t2.AsUInt16(), 0b_11_00_00_00).AsUInt32();
#endif

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //DIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_10_01_00_11);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_00_11_10_01);

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m2), 0b_00_11_11_00));
#else
            t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_11_11_00).AsUInt32();
#endif
            t1 = Sse2.ShiftRightLogical128BitLane(m1, 12);
#if OLD_INTRINSICS
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_00_11));
#else
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_01_00_11_10);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.ShiftLeftLogical128BitLane(m3, 4);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m1), 0b_00_11_00_11));
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_11_00_00_00));
#else
            t1 = Sse41.Blend(m0.AsUInt16(), m1.AsUInt16(), 0b_00_11_00_11).AsUInt32();
            t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_00_01_10_11);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //UNDIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_00_11_10_01);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_10_01_00_11);

            //ROUND 4
            t0 = Sse2.UnpackHigh(m0, m1);
            t1 = Sse2.UnpackHigh(t0, m2);
#if OLD_INTRINSICS
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(m3), 0b_00_00_11_00));
#else
            t2 = Sse41.Blend(t1.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_11_01_00_10);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.ShiftLeftLogical128BitLane(m2, 8);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m3), Sse.StaticCast <uint, ushort>(m0), 0b_00_00_11_00));
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_11_00_00_00));
#else
            t1 = Sse41.Blend(m3.AsUInt16(), m0.AsUInt16(), 0b_00_00_11_00).AsUInt32();
            t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_10_00_01_11);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //DIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_10_01_00_11);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_00_11_10_01);

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m1), 0b_00_00_11_11));
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(m3), 0b_11_00_00_00));
#else
            t0 = Sse41.Blend(m0.AsUInt16(), m1.AsUInt16(), 0b_00_00_11_11).AsUInt32();
            t1 = Sse41.Blend(t0.AsUInt16(), m3.AsUInt16(), 0b_11_00_00_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t1, 0b_11_00_01_10);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.UnpackLow(m0, m2);
            t1 = Sse2.UnpackHigh(m1, m2);
#if OLD_INTRINSICS
            b0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(t1), Sse.StaticCast <uint, ulong>(t0)));
#else
            b0 = Sse2.UnpackLow(t1.AsUInt64(), t0.AsUInt64()).AsUInt32();
#endif

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //UNDIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_00_11_10_01);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_10_01_00_11);

            //ROUND 5
#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(m1), Sse.StaticCast <uint, ulong>(m2)));
            t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m0), Sse.StaticCast <uint, ulong>(m2)));
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_11_00_11));
#else
            t0 = Sse2.UnpackLow(m1.AsUInt64(), m2.AsUInt64()).AsUInt32();
            t1 = Sse2.UnpackHigh(m0.AsUInt64(), m2.AsUInt64()).AsUInt32();
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_11).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_10_00_01_11);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m1), Sse.StaticCast <uint, ulong>(m3)));
            t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(m0), Sse.StaticCast <uint, ulong>(m1)));
            b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_11_00_11));
#else
            t0 = Sse2.UnpackHigh(m1.AsUInt64(), m3.AsUInt64()).AsUInt32();
            t1 = Sse2.UnpackLow(m0.AsUInt64(), m1.AsUInt64()).AsUInt32();
            b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_11).AsUInt32();
#endif

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //DIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_10_01_00_11);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_00_11_10_01);

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m3), Sse.StaticCast <uint, ulong>(m1)));
            t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m2), Sse.StaticCast <uint, ulong>(m0)));
            b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_00_11_00_11));
#else
            t0 = Sse2.UnpackHigh(m3.AsUInt64(), m1.AsUInt64()).AsUInt32();
            t1 = Sse2.UnpackHigh(m2.AsUInt64(), m0.AsUInt64()).AsUInt32();
            b0 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_11_00_11).AsUInt32();
#endif

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m2), 0b_00_00_00_11));
#else
            t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_00_00_11).AsUInt32();
#endif
            t1 = Sse2.ShiftLeftLogical128BitLane(t0, 8);
#if OLD_INTRINSICS
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(m3), 0b_00_00_11_11));
#else
            t2 = Sse41.Blend(t1.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_11).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_01_10_00_11);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //UNDIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_00_11_10_01);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_10_01_00_11);

            //ROUND 6
            t0 = Sse2.UnpackHigh(m0, m1);
            t1 = Sse2.UnpackLow(m0, m2);
#if OLD_INTRINSICS
            b0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(t0), Sse.StaticCast <uint, ulong>(t1)));
#else
            b0 = Sse2.UnpackLow(t0.AsUInt64(), t1.AsUInt64()).AsUInt32();
#endif

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.ShiftRightLogical128BitLane(m2, 4);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m3), 0b_00_00_00_11));
            b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_00_11_11_00));
#else
            t1 = Sse41.Blend(m0.AsUInt16(), m3.AsUInt16(), 0b_00_00_00_11).AsUInt32();
            b0 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_11_11_00).AsUInt32();
#endif

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //DIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_10_01_00_11);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_00_11_10_01);

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(m0), 0b_00_00_11_00));
#else
            t0 = Sse41.Blend(m1.AsUInt16(), m0.AsUInt16(), 0b_00_00_11_00).AsUInt32();
#endif
            t1 = Sse2.ShiftRightLogical128BitLane(m3, 4);
#if OLD_INTRINSICS
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_11_00_00));
#else
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_01_10_11_00);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(m1), Sse.StaticCast <uint, ulong>(m2)));
#else
            t0 = Sse2.UnpackLow(m1.AsUInt64(), m2.AsUInt64()).AsUInt32();
#endif
            t1 = Sse2.Shuffle(m3, 0b_00_10_00_01);
#if OLD_INTRINSICS
            b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_11_00_11));
#else
            b0 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_00_11).AsUInt32();
#endif

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //UNDIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_00_11_10_01);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_10_01_00_11);

            //ROUND 7
            t0 = Sse2.ShiftLeftLogical128BitLane(m1, 12);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m3), 0b_00_11_00_11));
            b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_11_00_00_00));
#else
            t1 = Sse41.Blend(m0.AsUInt16(), m3.AsUInt16(), 0b_00_11_00_11).AsUInt32();
            b0 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32();
#endif

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m3), Sse.StaticCast <uint, ushort>(m2), 0b_00_11_00_00));
#else
            t0 = Sse41.Blend(m3.AsUInt16(), m2.AsUInt16(), 0b_00_11_00_00).AsUInt32();
#endif
            t1 = Sse2.ShiftRightLogical128BitLane(m1, 4);
#if OLD_INTRINSICS
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_00_11));
#else
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_10_01_11_00);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //DIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_10_01_00_11);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_00_11_10_01);

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(m0), Sse.StaticCast <uint, ulong>(m2)));
#else
            t0 = Sse2.UnpackLow(m0.AsUInt64(), m2.AsUInt64()).AsUInt32();
#endif
            t1 = Sse2.ShiftRightLogical128BitLane(m1, 4);
#if OLD_INTRINSICS
            b0 = Sse2.Shuffle(Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_11_00)), 0b_10_11_01_00);
#else
            b0 = Sse2.Shuffle(Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_00).AsUInt32(), 0b_10_11_01_00);
#endif

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.UnpackHigh(m1, m2);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m0), Sse.StaticCast <uint, ulong>(t0)));
#else
            t1 = Sse2.UnpackHigh(m0.AsUInt64(), t0.AsUInt64()).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t1, 0b_11_00_01_10);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //UNDIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_00_11_10_01);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_10_01_00_11);

            //ROUND 8
            t0 = Sse2.UnpackHigh(m0, m1);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(m3), 0b_00_00_11_11));
#else
            t1 = Sse41.Blend(t0.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_11).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t1, 0b_10_00_11_01);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m2), Sse.StaticCast <uint, ushort>(m3), 0b_00_11_00_00));
#else
            t0 = Sse41.Blend(m2.AsUInt16(), m3.AsUInt16(), 0b_00_11_00_00).AsUInt32();
#endif
            t1 = Sse2.ShiftRightLogical128BitLane(m0, 4);
#if OLD_INTRINSICS
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_00_11));
#else
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_00_11).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_01_00_10_11);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //DIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_10_01_00_11);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_00_11_10_01);

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(m0), Sse.StaticCast <uint, ulong>(m3)));
            t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(m1), Sse.StaticCast <uint, ulong>(m2)));
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_11_11_00));
#else
            t0 = Sse2.UnpackHigh(m0.AsUInt64(), m3.AsUInt64()).AsUInt32();
            t1 = Sse2.UnpackLow(m1.AsUInt64(), m2.AsUInt64()).AsUInt32();
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_11_11_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_00_10_11_01);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.UnpackLow(m0, m1);
            t1 = Sse2.UnpackHigh(m1, m2);
#if OLD_INTRINSICS
            b0 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(t0), Sse.StaticCast <uint, ulong>(t1)));
#else
            b0 = Sse2.UnpackLow(t0.AsUInt64(), t1.AsUInt64()).AsUInt32();
#endif

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //UNDIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_00_11_10_01);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_10_01_00_11);

            //ROUND 9
            t0 = Sse2.UnpackHigh(m1, m3);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ulong, uint>(Sse2.UnpackLow(Sse.StaticCast <uint, ulong>(t0), Sse.StaticCast <uint, ulong>(m0)));
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(m2), 0b_11_00_00_00));
            b0 = Sse.StaticCast <ushort, uint>(Sse2.ShuffleHigh(Sse.StaticCast <uint, ushort>(t2), 0b_01_00_11_10));
#else
            t1 = Sse2.UnpackLow(t0.AsUInt64(), m0.AsUInt64()).AsUInt32();
            t2 = Sse41.Blend(t1.AsUInt16(), m2.AsUInt16(), 0b_11_00_00_00).AsUInt32();
            b0 = Sse2.ShuffleHigh(t2.AsUInt16(), 0b_01_00_11_10).AsUInt32();
#endif

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.UnpackHigh(m0, m3);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m2), Sse.StaticCast <uint, ushort>(t0), 0b_11_11_00_00));
#else
            t1 = Sse41.Blend(m2.AsUInt16(), t0.AsUInt16(), 0b_11_11_00_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t1, 0b_00_10_01_11);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //DIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_10_01_00_11);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_00_11_10_01);

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m2), Sse.StaticCast <uint, ushort>(m0), 0b_00_00_11_00));
#else
            t0 = Sse41.Blend(m2.AsUInt16(), m0.AsUInt16(), 0b_00_00_11_00).AsUInt32();
#endif
            t1 = Sse2.ShiftLeftLogical128BitLane(t0, 4);
#if OLD_INTRINSICS
            b0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(m3), 0b_00_00_11_11));
#else
            b0 = Sse41.Blend(t1.AsUInt16(), m3.AsUInt16(), 0b_00_00_11_11).AsUInt32();
#endif

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(m0), 0b_00_11_00_00));
#else
            t0 = Sse41.Blend(m1.AsUInt16(), m0.AsUInt16(), 0b_00_11_00_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t0, 0b_01_00_11_10);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //UNDIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_00_11_10_01);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_10_01_00_11);

            //ROUND 10
#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m0), Sse.StaticCast <uint, ushort>(m2), 0b_00_00_00_11));
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(m2), 0b_00_11_00_00));
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t1), Sse.StaticCast <uint, ushort>(t0), 0b_00_00_11_11));
#else
            t0 = Sse41.Blend(m0.AsUInt16(), m2.AsUInt16(), 0b_00_00_00_11).AsUInt32();
            t1 = Sse41.Blend(m1.AsUInt16(), m2.AsUInt16(), 0b_00_11_00_00).AsUInt32();
            t2 = Sse41.Blend(t1.AsUInt16(), t0.AsUInt16(), 0b_00_00_11_11).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_01_11_00_10);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

            t0 = Sse2.ShiftLeftLogical128BitLane(m0, 4);
#if OLD_INTRINSICS
            t1 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m1), Sse.StaticCast <uint, ushort>(t0), 0b_11_00_00_00));
#else
            t1 = Sse41.Blend(m1.AsUInt16(), t0.AsUInt16(), 0b_11_00_00_00).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t1, 0b_01_10_00_11);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //DIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_10_01_00_11);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_00_11_10_01);

            t0 = Sse2.UnpackHigh(m0, m3);
            t1 = Sse2.UnpackLow(m2, m3);
#if OLD_INTRINSICS
            t2 = Sse.StaticCast <ulong, uint>(Sse2.UnpackHigh(Sse.StaticCast <uint, ulong>(t0), Sse.StaticCast <uint, ulong>(t1)));
#else
            t2 = Sse2.UnpackHigh(t0.AsUInt64(), t1.AsUInt64()).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_11_00_10_01);

            //G1
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r16));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r16).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 12), Sse2.ShiftLeftLogical(row2, 20));

#if OLD_INTRINSICS
            t0 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(m3), Sse.StaticCast <uint, ushort>(m2), 0b_11_00_00_00));
#else
            t0 = Sse41.Blend(m3.AsUInt16(), m2.AsUInt16(), 0b_11_00_00_00).AsUInt32();
#endif
            t1 = Sse2.UnpackLow(m0, m3);
#if OLD_INTRINSICS
            t2 = Sse.StaticCast <ushort, uint>(Sse41.Blend(Sse.StaticCast <uint, ushort>(t0), Sse.StaticCast <uint, ushort>(t1), 0b_00_00_11_11));
#else
            t2 = Sse41.Blend(t0.AsUInt16(), t1.AsUInt16(), 0b_00_00_11_11).AsUInt32();
#endif
            b0 = Sse2.Shuffle(t2, 0b_00_01_10_11);

            //G2
            row1 = Sse2.Add(Sse2.Add(row1, b0), row2);
            row4 = Sse2.Xor(row4, row1);
#if OLD_INTRINSICS
            row4 = Sse.StaticCast <sbyte, uint>(Ssse3.Shuffle(Sse.StaticCast <uint, sbyte>(row4), r8));
#else
            row4 = Ssse3.Shuffle(row4.AsSByte(), r8).AsUInt32();
#endif

            row3 = Sse2.Add(row3, row4);
            row2 = Sse2.Xor(row2, row3);
            row2 = Sse2.Xor(Sse2.ShiftRightLogical(row2, 7), Sse2.ShiftLeftLogical(row2, 25));

            //UNDIAGONALIZE
            row4 = Sse2.Shuffle(row4, 0b_00_11_10_01);
            row3 = Sse2.Shuffle(row3, 0b_01_00_11_10);
            row2 = Sse2.Shuffle(row2, 0b_10_01_00_11);

            row1 = Sse2.Xor(row1, row3);
            row2 = Sse2.Xor(row2, row4);
            row1 = Sse2.Xor(row1, Sse2.LoadVector128(s->h));
            row2 = Sse2.Xor(row2, Sse2.LoadVector128(s->h + 4));
            Sse2.Store(s->h, row1);
            Sse2.Store(s->h + 4, row2);
        }